In [None]:
%pip install numpy
%pip install scikit-learn
%pip install tensorflow
%pip install pandas
%pip install nltk

In [None]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

# WordNetLemmatizer (instantiated in process_vocabulary) presumably needs the
# WordNet corpus available locally, hence the download before first use.
nltk.download('wordnet')

In [None]:
# Part 1: Data Loading and Preprocessing

def load_vocabulary(file_path):
    """Read a vocabulary file into a list with one entry per line.

    Surrounding whitespace is stripped from every line. Blank lines are kept
    as empty strings, so the position of each entry matches its line number.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The stripped lines, in file order.
    """
    vocabulary = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            vocabulary.append(raw_line.strip())
    return vocabulary

def load_newsgrouplabels(file_path):
    """Read newsgroup names from a map file.

    Only the first whitespace-separated token of each line is kept; anything
    after it on the line is ignored. Blank or whitespace-only lines are
    skipped instead of raising IndexError.

    Args:
        file_path: Path of the map file to read.

    Returns:
        list[str]: Newsgroup names in file order.
    """
    newsgrouplabels = []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Nothing to take a name from (e.g. a trailing empty line).
                continue
            newsgrouplabels.append(parts[0])  # Take only the newsgroup name (first part)
    return newsgrouplabels

def load_labels(file_path):
    """Read one integer label per line.

    Every line must parse as an integer after stripping whitespace; a line
    that does not (including a blank one) raises ValueError.

    Args:
        file_path: Path of the label file to read.

    Returns:
        list[int]: Labels in file order.
    """
    labels = []
    with open(file_path, 'r') as label_file:
        for raw_line in label_file:
            labels.append(int(raw_line.strip()))
    return labels

def load_data(file_path):
    """Read whitespace-separated (docId, wordId, count) integer triples.

    Blank or whitespace-only lines are skipped. Any other line must hold
    exactly three integers, otherwise ValueError is raised.

    Args:
        file_path: Path of the data file to read.

    Returns:
        list[dict]: One dict per triple with keys 'docId', 'wordId' and
        'count', in file order.
    """
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line carries no triple; skipping it cannot shift
                # anything because each record names its own document id.
                continue
            doc_id, word_id, count = map(int, fields)
            data.append({'docId': doc_id, 'wordId': word_id, 'count': count})
    return data

def process_vocabulary(vocabulary):
    """Normalise every vocabulary entry without changing the list length.

    Each entry is lower-cased, stripped of the characters in
    string.punctuation, Porter-stemmed and finally passed through the
    WordNet lemmatizer.

    Exactly one output entry is produced per input entry, in the same order.
    Other code in this file looks words up by position
    (vocabulary[word_id - 1]), so entries must never be dropped here; that is
    why no stop-word filtering is applied. Distinct inputs may normalise to
    the same token, and an entry made only of punctuation may reduce to an
    empty string.

    Args:
        vocabulary: Iterable of raw word strings.

    Returns:
        list[str]: Normalised tokens, aligned index-for-index with the input.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    processed_vocabulary = []
    for word in vocabulary:
        cleaned = word.lower().translate(punctuation_remover)
        processed_vocabulary.append(lemmatizer.lemmatize(stemmer.stem(cleaned)))

    return processed_vocabulary

# Load data
# The vocabulary is normalised right away, so `vocabulary` holds processed tokens.
vocabulary = process_vocabulary(load_vocabulary('vocabulary.txt'))
newsgrouplabels = load_newsgrouplabels('train.map')
train_labels, test_labels = (load_labels(path) for path in ('train.label', 'test.label'))
train_data, test_data = (load_data(path) for path in ('train.data', 'test.data'))

In [None]:
# Convert data to document-term matrix format
def create_document_term_matrix(data, data_labels, vocab_size, vectorizer=None):
    """Build a document-term matrix from (docId, wordId, count) records.

    Word ids are translated to tokens through the module-level `vocabulary`
    list (word id w maps to vocabulary[w - 1]); counts of records that map to
    the same token within a document are summed. Document id d is stored in
    row d - 1.

    Args:
        data: Iterable of dicts with keys 'docId', 'wordId' and 'count'.
        data_labels: One label per document; its length fixes the row count.
        vocab_size: Not used by this function.
        vectorizer: An already fitted vectorizer to reuse. When None, a new
            DictVectorizer is fitted on these documents.

    Returns:
        tuple: (X, labels as a numpy array, vectorizer.feature_names_,
        the vectorizer that produced X).
    """
    # One {token: count} dict per document, filled in a single pass.
    documents = [{} for _ in data_labels]
    for entry in data:
        doc_id = entry['docId']
        word_id = entry['wordId']
        count = entry['count']
        token_counts = documents[doc_id - 1]
        token = vocabulary[word_id - 1]
        token_counts[token] = token_counts.get(token, 0) + count

    if vectorizer is None:
        vectorizer = DictVectorizer()
        X = vectorizer.fit_transform(documents)
    else:
        X = vectorizer.transform(documents)

    # The vectorizer is returned in both cases so callers can unpack uniformly.
    return X, np.array(data_labels), vectorizer.feature_names_, vectorizer


# Create document-term matrices for train and test data
vocab_size = len(vocabulary)

# Training split: fits a fresh vectorizer.
X_train_counts, y_train, feature_names, vectorizer = create_document_term_matrix(
    data=train_data,
    data_labels=train_labels,
    vocab_size=vocab_size,
)

# Test split: reuses the fitted vectorizer so both matrices share one feature space.
X_test_counts, y_test, _, _ = create_document_term_matrix(
    data=test_data,
    data_labels=test_labels,
    vocab_size=vocab_size,
    vectorizer=vectorizer,
)

In [None]:
# Part 2: Train and Evaluate Naive Bayes and Logistic Regression

# Naive Bayes
# fit() returns the estimator itself, so construction and training are chained.
model_nb = MultinomialNB().fit(X_train_counts, y_train)
y_pred_nb = model_nb.predict(X_test_counts)

In [None]:
# Logistic Regression
# fit() returns the estimator itself, so construction and training are chained.
model_lr = LogisticRegression(max_iter=1000).fit(X_train_counts, y_train)
y_pred_lr = model_lr.predict(X_test_counts)

In [None]:
# Part 3: CNN Classifier
# Prepare data for CNN - using word IDs directly as input sequences per document

def create_document_word_id_sequences(data, data_labels, max_sequence_length):
    """Turn (docId, wordId, count) records into fixed-length word-id rows.

    Each record contributes `count` copies of its word id to its document, in
    record order. Rows are cut off at `max_sequence_length` ids and padded on
    the right with 0.

    Document id d is written to row d - 1, the same convention
    create_document_term_matrix uses, so row i lines up with data_labels[i].
    Records whose document id falls outside 1..len(data_labels) are ignored.

    Args:
        data: Iterable of dicts with keys 'docId', 'wordId' and 'count'.
        data_labels: One label per document; its length fixes the row count.
        max_sequence_length: Number of columns in the returned matrix.

    Returns:
        tuple: (X, y) where X is an integer array of shape
        (len(data_labels), max_sequence_length) and y is
        np.array(data_labels).
    """
    num_docs = len(data_labels)
    doc_word_id_lists = [[] for _ in range(num_docs)]
    for item in data:
        doc_index = item['docId'] - 1
        if not 0 <= doc_index < num_docs:
            continue
        word_ids = doc_word_id_lists[doc_index]
        # Stop collecting once the row is full; later ids would be truncated anyway.
        remaining = max_sequence_length - len(word_ids)
        if remaining > 0:
            word_ids.extend([item['wordId']] * min(item['count'], remaining))

    # Pre-filling with zeros supplies the padding and keeps the result 2-D
    # even when there are no documents.
    X_cnn = np.zeros((num_docs, max_sequence_length), dtype=int)
    for row_index, word_ids in enumerate(doc_word_id_lists):
        if word_ids:
            X_cnn[row_index, :len(word_ids)] = word_ids

    y_cnn = np.array(data_labels)
    return X_cnn, y_cnn

# Every document is truncated or zero-padded to this many word ids.
# Adjust as needed, based on document lengths. Can calculate from train data.
max_sequence_length = 500
num_classes = len(newsgrouplabels)
vocab_size_cnn = len(vocabulary)

X_train_cnn, y_train_cnn = create_document_word_id_sequences(
    data=train_data,
    data_labels=train_labels,
    max_sequence_length=max_sequence_length,
)
X_test_cnn, y_test_cnn = create_document_word_id_sequences(
    data=test_data,
    data_labels=test_labels,
    max_sequence_length=max_sequence_length,
)

In [None]:
# CNN Model
embedding_dim = 100
num_filters = 128
kernel_size = 5
pool_size = 4

model_cnn = Sequential([
    # Word ids index the vocabulary starting at 1 (vocabulary[word_id - 1]) and
    # 0 is the padding id, so the largest id is vocab_size_cnn and the
    # embedding table needs vocab_size_cnn + 1 rows.
    Embedding(input_dim=vocab_size_cnn + 1, output_dim=embedding_dim, input_length=max_sequence_length),
    Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu'),
    MaxPooling1D(pool_size=pool_size),
    GlobalMaxPooling1D(),
    Dense(num_classes, activation='softmax')
])


model_cnn.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# sparse_categorical_crossentropy expects class indices in [0, num_classes).
# Shifting by the smallest training label makes 1-based label files work and
# leaves already 0-based labels untouched (assumes every class id from the
# smallest upward is a valid class).
label_offset = int(np.min(y_train_cnn))
y_train_cnn_zero_based = y_train_cnn - label_offset

# validation_split holds out the *last* rows before any shuffling. Permute the
# rows first (fixed seed for repeatability) so the held-out slice does not
# depend on the order of documents in the input file.
shuffle_order = np.random.default_rng(42).permutation(len(X_train_cnn))

# Train CNN
epochs = 10
batch_size = 32
model_cnn.fit(
    X_train_cnn[shuffle_order],
    y_train_cnn_zero_based[shuffle_order],
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    verbose=0,  # Silent training
)

# Evaluate CNN
y_pred_cnn_probs = model_cnn.predict(X_test_cnn, verbose=0) # Silent prediction
# Map the winning output unit back to the original label numbering so the
# predictions are directly comparable with y_test_cnn.
y_pred_cnn = np.argmax(y_pred_cnn_probs, axis=1) + label_offset

In [None]:
# Part 4: Evaluation and Report

def evaluate_classifier(y_true, y_pred, labels, classifier_name):
    """Print a short summary and return per-class metrics plus accuracy.

    Metrics are reported for every class id that occurs in `y_true`. The
    report is looked up by class id rather than by class name, so it works
    whether class ids start at 0 or at 1 and does not break when two classes
    share a name.

    Accuracy is computed directly as the fraction of matching predictions.
    classification_report only emits an 'accuracy' entry when the requested
    labels cover every class seen in y_true and y_pred; it emits 'micro avg'
    instead when y_pred contains a class that y_true does not.

    Args:
        y_true: Ground-truth class ids.
        y_pred: Predicted class ids, same length as `y_true`.
        labels: Class names. Accepted for interface compatibility; not used
            for lookups because class ids need not be valid 0-based indices
            into this list.
        classifier_name: Name shown in the printed header.

    Returns:
        tuple: (table_data, overall_accuracy). `table_data` is a list of
        dicts with keys 'class_id', 'Precision', 'Recall', 'F1-score' and
        'support', one per class in ascending class-id order.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    class_ids = np.unique(y_true)

    # Without target_names the report is keyed by the string form of each id.
    report = classification_report(y_true, y_pred, labels=class_ids, output_dict=True)
    print(f"Classification Report for {classifier_name}:\n")

    table_data = []
    for class_id in class_ids:
        class_metrics = report[str(class_id)]
        table_data.append({
            'class_id': class_id,
            'Precision': class_metrics['precision'],
            'Recall': class_metrics['recall'],
            'F1-score': class_metrics['f1-score'],
            'support': int(class_metrics['support'])
        })

    # Calculate overall accuracy
    overall_accuracy = float(np.mean(y_true == y_pred))
    print(f"Overall Accuracy: {overall_accuracy:.4f}\n")

    return table_data, overall_accuracy


# Evaluate and print reports
nb_table, nb_accuracy = evaluate_classifier(
    y_true=y_test, y_pred=y_pred_nb, labels=newsgrouplabels, classifier_name="Naive Bayes"
)
lr_table, lr_accuracy = evaluate_classifier(
    y_true=y_test, y_pred=y_pred_lr, labels=newsgrouplabels, classifier_name="Logistic Regression"
)
cnn_table, cnn_accuracy = evaluate_classifier(
    y_true=y_test_cnn, y_pred=y_pred_cnn, labels=newsgrouplabels, classifier_name="CNN"
)

# Per-class tables, one heading line followed by the frame.
print("Naive Bayes Results:", pd.DataFrame(nb_table), sep="\n")
print("\nLogistic Regression Results:", pd.DataFrame(lr_table), sep="\n")
print("\nCNN Results:", pd.DataFrame(cnn_table), sep="\n")


# Side-by-side accuracy summary.
print(
    "\nOverall Accuracies:",
    f"Naive Bayes Accuracy: {nb_accuracy:.4f}",
    f"Logistic Regression Accuracy: {lr_accuracy:.4f}",
    f"CNN Accuracy: {cnn_accuracy:.4f}",
    sep="\n",
)