# Newsgroup Text Classification

Classifying newsgroup documents into 20 categories using three approaches:
- Multinomial Naive Bayes
- Logistic Regression
- 1D Convolutional Neural Network (CNN)

Data is represented as document-term matrices built from word frequency counts.

In [1]:
import pandas as pd
import numpy as np
import copy

from tensorflow import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D
from keras.utils import to_categorical

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [2]:
with open("data/newsgrouplabels.txt", 'r') as news_labels_file:
    lines = news_labels_file.readlines()
    news_label = [i.strip() for i in lines]

## Data Preprocessing

In [3]:
col_data = ['data']
col_labels = ['labels']

test_data = pd.read_csv("data/test.data", names=col_data, delimiter=",")
test_label = pd.read_csv("data/test.label", names=col_labels, delimiter=",")
train_data = pd.read_csv("data/train.data", names=col_data, delimiter=",")
train_label = pd.read_csv("data/train.label", names=col_labels, delimiter=",")

# Split "docId wordId count" into separate columns
train_data = train_data["data"].str.split(expand=True)
test_data = test_data['data'].str.split(expand=True)

split_data_labels = ["docId", "wordId", "count"]
train_data.columns = split_data_labels
test_data.columns = split_data_labels

train_data = train_data.apply(pd.to_numeric, errors='coerce')
test_data = test_data.apply(pd.to_numeric, errors='coerce')

In [4]:
# Build document-term matrices
train_dtm = train_data.pivot_table(index="docId", columns="wordId", values="count", aggfunc="sum", fill_value=0)
test_dtm = test_data.pivot_table(index="docId", columns="wordId", values="count", aggfunc="sum", fill_value=0)

# Align test DTM columns to match training DTM (fill missing words with 0)
test_dtm = test_dtm.reindex(columns=train_dtm.columns, fill_value=0)

print(train_dtm.shape)
print(test_dtm.shape)

(11269, 53975)
(7505, 53975)


In [5]:
X_train = train_dtm
y_train = train_label.values.ravel()
X_test = test_dtm
y_test = test_label.values.ravel()

## Multinomial Naive Bayes

In [6]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
NB_pred = nb.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, NB_pred)}")
print("\nNaive Bayes metrics:")
print(classification_report(y_test, NB_pred, target_names=news_label))

Accuracy: 0.7846768820786143

Naive Bayes metrics:
                          precision    recall  f1-score   support

             alt.atheism       0.70      0.75      0.73       318
           comp.graphics       0.67      0.76      0.71       389
 comp.os.ms-windows.misc       0.82      0.53      0.65       391
comp.sys.ibm.pc.hardware       0.60      0.77      0.68       392
   comp.sys.mac.hardware       0.79      0.72      0.75       383
          comp.windows.x       0.82      0.78      0.80       390
            misc.forsale       0.91      0.62      0.73       382
               rec.autos       0.79      0.90      0.84       395
         rec.motorcycles       0.94      0.89      0.91       397
      rec.sport.baseball       0.96      0.88      0.92       397
        rec.sport.hockey       0.94      0.96      0.95       399
               sci.crypt       0.75      0.91      0.82       395
         sci.electronics       0.78      0.66      0.72       393
                 sci.med

## Logistic Regression

In [7]:
lr_model = LogisticRegression(max_iter=10)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, lr_pred)}")
print("\nLogistic Regression metrics:")
print(classification_report(y_test, lr_pred, target_names=news_label))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.3589606928714191

Logistic Regression metrics:
                          precision    recall  f1-score   support

             alt.atheism       0.21      0.29      0.25       318
           comp.graphics       0.30      0.47      0.36       389
 comp.os.ms-windows.misc       0.56      0.06      0.11       391
comp.sys.ibm.pc.hardware       0.37      0.51      0.43       392
   comp.sys.mac.hardware       0.78      0.10      0.18       383
          comp.windows.x       0.52      0.35      0.42       390
            misc.forsale       0.60      0.05      0.09       382
               rec.autos       0.69      0.35      0.46       395
         rec.motorcycles       0.32      0.73      0.44       397
      rec.sport.baseball       0.39      0.48      0.43       397
        rec.sport.hockey       0.49      0.58      0.53       399
               sci.crypt       0.38      0.51      0.43       395
         sci.electronics       0.29      0.16      0.21       393
                

## CNN Classifier (Conv1D)

In [8]:
# Convert DTMs to tensors and add channel dimension for Conv1D
train_tensor = tf.expand_dims(tf.convert_to_tensor(train_dtm.values), axis=-1)
test_tensor = tf.expand_dims(tf.convert_to_tensor(test_dtm.values), axis=-1)

# One-hot encode labels (convert 1-indexed to 0-indexed first)
train_labels_one_hot = to_categorical(train_label.values.flatten() - 1, num_classes=20)
test_labels_one_hot = to_categorical(test_label.values.flatten() - 1, num_classes=20)

num_filters = 8
filter_size = 3
pool_size = 2

cnn_model = Sequential([
    Conv1D(num_filters, filter_size, input_shape=(train_dtm.shape[1], 1)),
    MaxPooling1D(pool_size=pool_size),
    Flatten(),
    Dense(20, activation='softmax'),
])

cnn_model.compile('adam', loss='categorical_crossentropy', metrics=['accuracy'])

cnn_model.fit(
    train_tensor,
    train_labels_one_hot,
    epochs=3,
    validation_data=(test_tensor, test_labels_one_hot),
)

Epoch 1/3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 44ms/step - accuracy: 0.6038 - loss: 1.6414 - val_accuracy: 0.6778 - val_loss: 1.7091
Epoch 2/3
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 40ms/step - accuracy: 0.9610 - loss: 0.2364 - val_accuracy: 0.7760 - val_loss: 1.0390
Epoch 3/3
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 48ms/step - accuracy: 0.9903 - loss: 0.0822 - val_accuracy: 0.7676 - val_loss: 1.2493


<keras.src.callbacks.history.History at 0x378cc6750>

In [9]:
CNN_prediction_probs = cnn_model.predict(test_tensor)
y_pred = tf.argmax(CNN_prediction_probs, axis=1).numpy() + 1  # undo 0-indexing

print("CNN metrics:")
print(classification_report(y_test, y_pred, target_names=news_label))

[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step
CNN metrics:
                          precision    recall  f1-score   support

             alt.atheism       0.54      0.85      0.66       318
           comp.graphics       0.58      0.74      0.65       389
 comp.os.ms-windows.misc       0.67      0.68      0.68       391
comp.sys.ibm.pc.hardware       0.67      0.55      0.60       392
   comp.sys.mac.hardware       0.64      0.72      0.68       383
          comp.windows.x       0.86      0.65      0.74       390
            misc.forsale       0.90      0.80      0.84       382
               rec.autos       0.86      0.87      0.86       395
         rec.motorcycles       0.95      0.91      0.93       397
      rec.sport.baseball       0.92      0.87      0.89       397
        rec.sport.hockey       0.94      0.92      0.93       399
               sci.crypt       0.84      0.87      0.85       395
         sci.electronics       0.68      0.68      0

## Document Classification Function

Given a raw text document, preprocess it (lowercase, remove stopwords, lemmatize) and classify it using the trained Naive Bayes model — selected as it performed best of the three approaches.

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

with open("data/vocabulary.txt", 'r') as vocab:
    lines = vocab.readlines()
    lines = [i.strip() for i in lines]
    vocab_df = pd.DataFrame(data=lines, columns=['vocab'])

def count_vectorize(filtered_sentence):
    word_count_dict = dict(Counter(filtered_sentence))
    vector = [0] * len(vocab_df)
    for word in word_count_dict:
        if word in vocab_df['vocab'].values:
            index = vocab_df[vocab_df['vocab'] == word].index[0]
            vector[index] = word_count_dict[word]
    return pd.DataFrame([vector], columns=range(len(vocab_df)))

def preprocess_vectorize(document):
    lemmatizer = WordNetLemmatizer()
    with open(document, "r", encoding="utf-8", errors="replace") as file:
        words = re.findall(r'\b\w+\b', file.read().lower())
        words_str = " ".join(words)
        word_tokens = word_tokenize(words_str)
        filtered_sentence = [w for w in word_tokens if w not in stop_words]
        lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_sentence]
        vectorized_sentence = count_vectorize(lemmatized_words)
    return vectorized_sentence

def NB_classify_document(document, train_dtm, y_train):
    func_dtm = preprocess_vectorize(document)
    aligned_train = train_dtm.reindex(columns=func_dtm.columns, fill_value=0)
    nb = MultinomialNB()
    nb.fit(aligned_train, y_train)
    NB_pred = nb.predict(func_dtm)
    print(f"NB Model predicts this document is in the {news_label[NB_pred[0]-1]} news group")
    return NB_pred

[nltk_data] Downloading package wordnet to /Users/eitan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/eitan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Test with sample documents
NB_classify_document("data/49960", train_dtm, y_train)
NB_classify_document("data/51060", train_dtm, y_train)
NB_classify_document("data/72052", train_dtm, y_train)
NB_classify_document("data/101725", train_dtm, y_train)

NB Model predicts this document is in the alt.atheism news group
NB Model predicts this document is in the alt.atheism news group
NB Model predicts this document is in the alt.atheism news group
NB Model predicts this document is in the rec.motorcycles news group


array([9])