# Newsgroup Text Classification

Classifying newsgroup documents into 20 categories using three approaches:
- Multinomial Naive Bayes
- Logistic Regression
- 1D Convolutional Neural Network (CNN)

Data is represented as document-term matrices built from word frequency counts.

In [None]:
import pandas as pd
import numpy as np
import copy

from tensorflow import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D
from keras.utils import to_categorical

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Read the newsgroup names, one per line, with surrounding whitespace removed.
# They are used later as the class names in the classification reports.
with open("data/newsgrouplabels.txt", 'r') as label_file:
    lines = list(label_file)
news_label = [raw_line.strip() for raw_line in lines]

## Data Preprocessing

In [None]:
col_data = ['data']
col_labels = ['labels']
split_data_labels = ["docId", "wordId", "count"]


def _read_triples(path):
    """Load a "docId wordId count" file into a numeric three-column DataFrame.

    Each line is first read as a single string column, then split on
    whitespace into separate columns. Values that cannot be parsed as
    numbers become NaN (errors='coerce').
    """
    raw = pd.read_csv(path, names=col_data, delimiter=",")
    pieces = raw["data"].str.split(expand=True)
    pieces.columns = split_data_labels
    return pieces.apply(pd.to_numeric, errors='coerce')


# Word-count records for both splits
train_data = _read_triples("data/train.data")
test_data = _read_triples("data/test.data")

# One label per line, read into a single 'labels' column
train_label = pd.read_csv("data/train.label", names=col_labels, delimiter=",")
test_label = pd.read_csv("data/test.label", names=col_labels, delimiter=",")

In [None]:
# Pivot the long (docId, wordId, count) records into document-term matrices:
# one row per docId, one column per wordId, summed counts as values, and 0
# wherever a (document, word) pair does not occur.
pivot_args = {
    "index": "docId",
    "columns": "wordId",
    "values": "count",
    "aggfunc": "sum",
    "fill_value": 0,
}
train_dtm = train_data.pivot_table(**pivot_args)

# Give the test matrix exactly the training columns: words absent from the
# training matrix are dropped, training words missing from test are added as 0.
test_dtm = test_data.pivot_table(**pivot_args).reindex(
    columns=train_dtm.columns, fill_value=0
)

print(train_dtm.shape, test_dtm.shape, sep="\n")

In [None]:
# Features are the document-term matrices themselves; targets are the label
# frames flattened to 1-D arrays.
X_train, X_test = train_dtm, test_dtm
y_train, y_test = (frame.values.ravel() for frame in (train_label, test_label))

## Multinomial Naive Bayes

In [None]:
# Fit Multinomial Naive Bayes on the term counts, then score it on the test split.
nb = MultinomialNB().fit(X_train, y_train)
NB_pred = nb.predict(X_test)

nb_accuracy = accuracy_score(y_test, NB_pred)
print(f"Accuracy: {nb_accuracy}")
print("\nNaive Bayes metrics:")
print(classification_report(y_test, NB_pred, target_names=news_label))

## Logistic Regression

In [None]:
# Logistic regression on the same features, evaluated the same way as above.
# NOTE(review): max_iter=10 caps the solver at ten iterations, which is likely
# too few to converge on a vocabulary-sized feature space — confirm the low
# cap is intentional (e.g. to bound runtime) before comparing models.
lr_model = LogisticRegression(max_iter=10).fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

lr_accuracy = accuracy_score(y_test, lr_pred)
print(f"Accuracy: {lr_accuracy}")
print("\nLogistic Regression metrics:")
print(classification_report(y_test, lr_pred, target_names=news_label))

## CNN Classifier (Conv1D)

In [None]:
def _to_conv_input(dtm):
    """Convert a document-term DataFrame to a tensor with a trailing channel axis."""
    return tf.expand_dims(tf.convert_to_tensor(dtm.values), axis=-1)


def _to_one_hot(label_frame):
    """One-hot encode a label frame, shifting labels down by one to start at 0."""
    return to_categorical(label_frame.values.flatten() - 1, num_classes=20)


# Model inputs: each document is a sequence over word columns with one channel.
train_tensor = _to_conv_input(train_dtm)
test_tensor = _to_conv_input(test_dtm)

# Model targets
train_labels_one_hot = _to_one_hot(train_label)
test_labels_one_hot = _to_one_hot(test_label)

# Convolution / pooling hyperparameters
num_filters = 8
filter_size = 3
pool_size = 2

# One convolution layer, max pooling, then a 20-way softmax over the flattened features.
cnn_model = Sequential([
    Conv1D(
        filters=num_filters,
        kernel_size=filter_size,
        input_shape=(train_dtm.shape[1], 1),
    ),
    MaxPooling1D(pool_size=pool_size),
    Flatten(),
    Dense(units=20, activation='softmax'),
])

cnn_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# The test split doubles as the validation set reported after each epoch.
cnn_model.fit(
    x=train_tensor,
    y=train_labels_one_hot,
    epochs=3,
    validation_data=(test_tensor, test_labels_one_hot),
)

In [None]:
# Per-class probabilities for every test document
CNN_prediction_probs = cnn_model.predict(test_tensor)

# Most probable class per row, shifted back up by one so predictions use the
# same label numbering as y_test.
y_pred = np.argmax(CNN_prediction_probs, axis=1) + 1

print("CNN metrics:")
print(classification_report(y_test, y_pred, target_names=news_label))

## Document Classification Function

Given a raw text document, preprocess it (lowercase, remove stopwords, lemmatize) and classify it using the trained Naive Bayes model — selected as it performed best of the three approaches.

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this section relies on:
#   - 'wordnet'   for WordNetLemmatizer
#   - 'stopwords' for the English stop-word list
#   - 'punkt' / 'punkt_tab' for word_tokenize, which raises LookupError when
#     the Punkt tokenizer data is missing. Older NLTK releases ship it as
#     'punkt', newer ones as 'punkt_tab'; requesting an id the installed
#     release does not know only prints a message, so both are requested.
for resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

stop_words = set(stopwords.words('english'))

# Load the vocabulary, one word per line. The row position in vocab_df is the
# word's 0-based line position in the file.
with open("data/vocabulary.txt", 'r') as vocab:
    lines = [entry.strip() for entry in vocab]
vocab_df = pd.DataFrame(data=lines, columns=['vocab'])

def count_vectorize(filtered_sentence):
    """Turn a list of tokens into a one-row term-count DataFrame.

    Args:
        filtered_sentence: Iterable of already-preprocessed word tokens.

    Returns:
        A DataFrame with a single row and one column per vocabulary word.
        Columns are labelled 1..len(vocab_df) so they line up with the
        ``wordId`` column labels of the document-term matrices; words that
        are not in the vocabulary are ignored.
    """
    # Map each vocabulary word to its 0-based row position once, instead of
    # scanning the whole vocabulary for every token. setdefault keeps the
    # first occurrence if a word is listed twice.
    positions = {}
    for pos, term in enumerate(vocab_df['vocab']):
        positions.setdefault(term, pos)

    vector = [0] * len(vocab_df)
    for word, count in Counter(filtered_sentence).items():
        pos = positions.get(word)
        if pos is not None:
            vector[pos] = count

    # wordId values in the .data files are 1-based line numbers of
    # vocabulary.txt, so the column labels must start at 1. Labelling them
    # from 0 pairs every word with the neighbouring training column when the
    # caller reindexes train_dtm onto these columns.
    # NOTE(review): confirm the 1-based convention against the dataset docs.
    return pd.DataFrame([vector], columns=range(1, len(vocab_df) + 1))

def preprocess_vectorize(document):
    """Read a text file and convert it to a one-row term-count DataFrame.

    The text is lowercased, reduced to runs of word characters, tokenized,
    stripped of English stop words, lemmatized, and finally counted against
    the vocabulary by ``count_vectorize``.

    Args:
        document: Path of the text file to process. Bytes that are not valid
            UTF-8 are replaced rather than raising.

    Returns:
        The DataFrame produced by ``count_vectorize`` for the file's tokens.
    """
    with open(document, "r", encoding="utf-8", errors="replace") as handle:
        raw_text = handle.read()

    # Keep only word-character runs, then let NLTK tokenize the cleaned text.
    word_runs = re.findall(r'\b\w+\b', raw_text.lower())
    tokens = word_tokenize(" ".join(word_runs))

    # Stop words are removed before lemmatizing, so the filter sees surface forms.
    wnl = WordNetLemmatizer()
    lemmas = [wnl.lemmatize(token) for token in tokens if token not in stop_words]

    return count_vectorize(lemmas)

def NB_classify_document(document, train_dtm, y_train):
    """Classify a text file with a Naive Bayes model fit on the training data.

    A fresh MultinomialNB is fit on every call, after the training matrix is
    reindexed onto the columns of the vectorized document.

    Args:
        document: Path of the text file to classify.
        train_dtm: Training document-term DataFrame.
        y_train: Training labels, one per row of ``train_dtm``.

    Returns:
        The array returned by ``predict`` for the single document. The
        matching newsgroup name is also printed.
    """
    doc_vector = preprocess_vectorize(document)

    # Bring the training matrix onto the document's column set; columns the
    # training matrix lacks are filled with 0.
    train_features = train_dtm.reindex(columns=doc_vector.columns, fill_value=0)

    classifier = MultinomialNB().fit(train_features, y_train)
    predicted = classifier.predict(doc_vector)

    # Labels are 1-based while news_label is a 0-based list.
    group_name = news_label[predicted[0] - 1]
    print(f"NB Model predicts this document is in the {group_name} news group")
    return predicted

In [None]:
# Test with sample documents
# Each call vectorizes the given file, refits Naive Bayes on train_dtm / y_train,
# prints the predicted newsgroup name, and returns the predicted label array.
NB_classify_document("data/49960", train_dtm, y_train)
NB_classify_document("data/51060", train_dtm, y_train)
NB_classify_document("data/72052", train_dtm, y_train)
NB_classify_document("data/101725", train_dtm, y_train)