In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [2]:
categories = ['alt.atheism', 'sci.med', 'sci.electronics', 'comp.graphics', 'talk.politics.guns', 'sci.crypt']
newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

# Convert text to TF-IDF feature representation
vectorizer = TfidfVectorizer(max_features=5000)  # Use top 5000 words as features
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
def evaluate_feature_selection(score_func, score_name):
    selector = SelectKBest(score_func, k=200)  # Select 200 best features
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Train a logistic regression classifier
    clf  = LogisticRegression(max_iter=1000, penalty="l2")
    clf.fit(X_train_selected, y_train)

    # Predict and evaluate performance
    y_pred = clf.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Results using {score_name} for feature selection:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:")
    print(classification_report(y_test, y_pred))


evaluate_feature_selection(chi2, "Chi-Square")

evaluate_feature_selection(mutual_info_classif, "Mutual Information")

Results using Chi-Square for feature selection:
Accuracy: 0.7372
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.68      0.74       154
           1       0.84      0.72      0.78       199
           2       0.93      0.73      0.82       211
           3       0.47      0.84      0.60       192
           4       0.80      0.71      0.75       192
           5       0.87      0.73      0.79       182

    accuracy                           0.74      1130
   macro avg       0.79      0.74      0.75      1130
weighted avg       0.79      0.74      0.75      1130





Results using Mutual Information for feature selection:
Accuracy: 0.4894
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.42      0.47       154
           1       0.44      0.52      0.48       199
           2       0.65      0.47      0.55       211
           3       0.47      0.51      0.49       192
           4       0.44      0.53      0.48       192
           5       0.47      0.48      0.48       182

    accuracy                           0.49      1130
   macro avg       0.50      0.49      0.49      1130
weighted avg       0.50      0.49      0.49      1130



