In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load 20NG dataset with selected categories
categories = ['alt.atheism', 'sci.med', 'sci.electronics', 'comp.graphics', 'talk.politics.guns', 'sci.crypt']
newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

# Convert text data to TF-IDF feature matrix
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train L1 - regularized logistic regression model
l1_model = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
l1_model.fit(X_train, y_train)

# Get absolute coefficients and sort them.
coef_abs = np.abs(l1_model.coef_).sum(axis=0)
top_200_indices = np.argsort(coef_abs)[-200:]

#Transform the dataset using only these top 200 features
X_train_top = X_train[:, top_200_indices]
X_test_top = X_test[:, top_200_indices]

# Train a new logistic regression model on the transformed dataset
new_model = LogisticRegression(penalty='l2', solver='liblinear')
new_model.fit(X_train_top, y_train)

# Evaluate the model
y_pred = new_model.predict(X_test_top)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy after L1-based feature selection: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

model = LogisticRegression(penalty='l2', solver='liblinear')
model.fit(X_train, y_train)
y_pred_old = model.predict(X_test)
accuracy_old = accuracy_score(y_test, y_pred_old)
print(f"Accuracy before L1-based feature selection: {accuracy_old:.2f}")
print(classification_report(y_test, y_pred_old))


# Display top 20 selected words
selected_features = np.array(vectorizer.get_feature_names_out())[top_200_indices]
print("Top 20 Selected Features:", selected_features[:20])

Accuracy after L1-based feature selection: 0.55
              precision    recall  f1-score   support

           0       0.81      0.31      0.45       154
           1       0.81      0.58      0.67       199
           2       0.87      0.65      0.74       211
           3       0.29      0.80      0.43       192
           4       0.55      0.43      0.49       192
           5       0.92      0.48      0.63       182

    accuracy                           0.55      1130
   macro avg       0.71      0.54      0.57      1130
weighted avg       0.71      0.55      0.57      1130

Accuracy before L1-based feature selection: 0.84
              precision    recall  f1-score   support

           0       0.89      0.85      0.87       154
           1       0.85      0.88      0.87       199
           2       0.94      0.81      0.87       211
           3       0.69      0.86      0.76       192
           4       0.84      0.84      0.84       192
           5       0.87      0.79  

In [None]:
# selector = SelectKBest(LogisticRegression(penalty='l1', solver='liblinear', C=0.1), k=200)
# X_train_selected = selector.fit_transform(X_train, y_train)
# X_test_selected = selector.transform(X_test)

# m = LogisticRegression(penalty='l2', solver='liblinear')
# m.fit(X_train_selected, y_train)

# # Evaluate the model
# y_pred = m.predict(X_test_selected)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy after L1-based feature selection: {accuracy:.2f}")
# print(classification_report(y_test, y_pred))