In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [2]:
# Select 10 categories
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'sci.electronics',
    'talk.politics.misc'
]

In [3]:
# Import and prepare the data
remove = ('headers', 'footers', 'quotes')
data_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove, shuffle=True, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove, shuffle=True, random_state=42)

In [4]:
# Extract targets
Y_train = data_train.target
Y_test = data_test.target

In [6]:
# Convert text to numerical vectors
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

print("Dataset dimensions:")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}\n")

Dataset dimensions:
Training set: (5668, 65408)
Test set: (3773, 65408)



In [11]:
# Define scoring metrics
scores = ['precision', 'recall']

In [17]:
# Optimize SVC
print("Optimizing Support Vector Classification (SVC)...")
svc_parameters = [
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    {'kernel': ['rbf'], 'C': [1, 10, 100, 1000]},
    {'kernel': ['poly'], 'degree': [5, 10, 15], 'C': [1, 10, 100]},
    {'kernel': ['sigmoid'], 'coef0': [0, 1, 2], 'C': [1, 10, 100]}
]

for score in scores:
    print(f"\nTuning SVC hyper-parameters for {score}")
    svc_clf = GridSearchCV(
        SVC(),
        svc_parameters,
        cv=5,
        scoring=f'{score}_macro'
    )
    svc_clf.fit(X_train, Y_train)
    
    print("\nBest parameters found:")
    print(svc_clf.best_params_)
    
    print("\nGrid scores on training set:")
    means = svc_clf.cv_results_['mean_test_score']
    stds = svc_clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, svc_clf.cv_results_['params']):
        print(f"{mean:0.3f} (+/-{std*2:0.03f}) for {params}")
    
    print("\nDetailed classification report on test set:")
    Y_pred = svc_clf.predict(X_test)
    print(classification_report(Y_test, Y_pred, zero_division=1))


Optimizing Support Vector Classification (SVC)...

Tuning SVC hyper-parameters for precision


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

In [18]:
# Optimize Naive Bayes
print("\nOptimizing Naive Bayes...")
nb_parameters = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]
}

for score in scores:
    print(f"\nTuning NB hyper-parameters for {score}")
    nb_clf = GridSearchCV(
        MultinomialNB(),
        nb_parameters,
        cv=5,
        scoring=f'{score}_macro'
    )
    nb_clf.fit(X_train, Y_train)
    
    print("\nBest parameters found:")
    print(nb_clf.best_params_)
    
    print("\nGrid scores on training set:")
    means = nb_clf.cv_results_['mean_test_score']
    stds = nb_clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, nb_clf.cv_results_['params']):
        print(f"{mean:0.3f} (+/-{std*2:0.03f}) for {params}")
    
    print("\nDetailed classification report on test set:")
    Y_pred = nb_clf.predict(X_test)
    print(classification_report(Y_test, Y_pred))


Optimizing Naive Bayes...

Tuning NB hyper-parameters for precision

Best parameters found:
{'alpha': 0.1}

Grid scores on training set:
0.811 (+/-0.033) for {'alpha': 0.1}
0.805 (+/-0.041) for {'alpha': 0.5}
0.802 (+/-0.035) for {'alpha': 1.0}
0.791 (+/-0.040) for {'alpha': 2.0}
0.773 (+/-0.031) for {'alpha': 5.0}

Detailed classification report on test set:
              precision    recall  f1-score   support

           0       0.79      0.75      0.77       319
           1       0.75      0.80      0.78       389
           2       0.73      0.60      0.66       394
           3       0.65      0.75      0.70       392
           4       0.77      0.71      0.74       385
           5       0.76      0.77      0.77       396
           6       0.63      0.81      0.71       398
           7       0.94      0.89      0.91       397
           8       0.76      0.64      0.70       393
           9       0.77      0.74      0.76       310

    accuracy                           0.