In [16]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.optimize as opt
import sklearn.linear_model
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
RND_STATE = 749

Load the dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Download (or read from the local scikit-learn cache) the training split,
# which is the subset returned by default.
data = fetch_20newsgroups(subset='train')

Use `HashingVectorizer` to encode the text into sparse features:

In [3]:
# Hash every document's tokens into a 2**10-column feature matrix,
# dropping English stop words and keeping only tokens matched by the pattern.
X = HashingVectorizer(
    stop_words='english',
    n_features=2**10,
    binary=True,
    token_pattern=r'\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b',
).fit_transform(data.data)

In [4]:
# Sanity check: expect (number of documents, 2**10 hashed features).
X.shape

(11314, 1024)

In [5]:
# Densify to a plain ndarray. `todense()` would return a `numpy.matrix`,
# which NumPy discourages and recent scikit-learn estimators reject.
X = X.toarray()

In [6]:
# Newsgroup label of each document, taken from the same dataset object as X.
y = data.target

In [7]:
# One label per document: the length should equal the number of rows in X.
y.shape

(11314,)

Use shuffled 5-fold cross-validation (with a fixed random seed) to split the dataset into training and test parts:

In [8]:
def splitter(rnd_st, n_splits=5):
    """Create a shuffled K-fold cross-validation splitter.

    Parameters
    ----------
    rnd_st
        Seed forwarded to ``KFold`` as ``random_state``; it controls the
        shuffling, so the same seed yields the same folds.
    n_splits : int, default 5
        Number of folds. The default matches the value that used to be
        hard-coded, so existing one-argument calls behave as before.

    Returns
    -------
    KFold
        A splitter with ``shuffle=True``.
    """
    return KFold(n_splits=n_splits, shuffle=True, random_state=rnd_st)

In [9]:
def test_model(clf, rnd_st):
    """Cross-validate ``clf`` and print its per-fold scores.

    Reads the notebook-level ``X`` and ``y``. For each train/test split
    yielded by ``splitter(rnd_st)``, the classifier is fitted on the
    training rows and used to predict the held-out rows; the weighted F1
    score and the accuracy of those predictions are printed.

    Parameters
    ----------
    clf
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    rnd_st
        Seed handed to ``splitter`` to build the folds.

    Returns
    -------
    None
    """
    folds = splitter(rnd_st)
    for fit_rows, eval_rows in folds.split(X):
        clf.fit(X[fit_rows], y[fit_rows])
        expected = y[eval_rows]
        predicted = clf.predict(X[eval_rows])
        print('Weighted f1-score: ', f1_score(expected, predicted, average='weighted'))
        print('Accuracy score: ', accuracy_score(expected, predicted))

Experiment with different models (L1, L2, ...)

Logistic regression L1

In [10]:
# The solver is named explicitly: the default solver in current scikit-learn
# releases (lbfgs) does not support the L1 penalty and raises a ValueError,
# whereas liblinear does support it. The seed makes the fit repeatable.
model = LogisticRegression(penalty='l1', solver='liblinear', random_state=RND_STATE)

In [11]:
# Cross-validate the L1-penalised logistic regression on folds seeded with RND_STATE.
test_model(model, RND_STATE)

Weighted f1-score:  0.6451624410096805
Accuracy score:  0.6491383119752541
Weighted f1-score:  0.6442768494590695
Accuracy score:  0.6456031816173221
Weighted f1-score:  0.6414060158737694
Accuracy score:  0.6442775077330977
Weighted f1-score:  0.6397479468825665
Accuracy score:  0.6438356164383562
Weighted f1-score:  0.6913445590539319
Accuracy score:  0.6927497789566756


Logistic regression L2

In [12]:
# Pin the solver to liblinear (it handles both L1 and L2 penalties) so the
# scores do not depend on which default solver the installed scikit-learn
# version ships with. The seed makes the fit repeatable.
model = LogisticRegression(penalty='l2', solver='liblinear', random_state=RND_STATE)

In [13]:
# Cross-validate the L2-penalised logistic regression on folds seeded with RND_STATE.
test_model(model, RND_STATE)

Weighted f1-score:  0.7308255359817943
Accuracy score:  0.7335395492708794
Weighted f1-score:  0.7233230546823648
Accuracy score:  0.726911179849757
Weighted f1-score:  0.7103790118099625
Accuracy score:  0.7127706584180291
Weighted f1-score:  0.7180818200197306
Accuracy score:  0.7211665930181176
Weighted f1-score:  0.7500386302308621
Accuracy score:  0.7519893899204244


SGDClassifier

In [19]:
# SGD shuffles the training data between epochs, so seed it to get
# repeatable scores. tol=None disables early stopping, so training runs for
# the full max_iter epochs.
model = SGDClassifier(max_iter=5, tol=None, random_state=RND_STATE)

In [20]:
# Cross-validate the SGD classifier on folds seeded with RND_STATE.
test_model(model, RND_STATE)

Weighted f1-score:  0.7087200254087408
Accuracy score:  0.7118868758285462
Weighted f1-score:  0.7083936707964598
Accuracy score:  0.7101193106495802
Weighted f1-score:  0.7032068039116197
Accuracy score:  0.7048166151126822
Weighted f1-score:  0.6883558005693009
Accuracy score:  0.6933274414494034
Weighted f1-score:  0.7146397117500843
Accuracy score:  0.7117595048629531


Which model worked best?