In [2]:
import time
import datetime
import numpy as np
import pandas as pd
import numpy as np
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords

In [3]:
from scipy import stats
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.metrics import accuracy_score, f1_score
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

In [4]:
cats = ['sci.electronics', 'sci.space', 'comp.graphics', 'sci.crypt']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)


In [19]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target
X_test = vectorizer.transform(newsgroups_test.data)
y_test = newsgroups_test.target

In [6]:
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=8)
stratified_folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=8)
repeated_folds = RepeatedStratifiedKFold(n_splits=n_fold, n_repeats=20, random_state=8)

In [75]:
%%time
from sklearn.model_selection import GridSearchCV

parameters1 = {'solver':('newton-cg', 'sag', 'lbfgs'),
               'C':[0.001, 0.01, 0.08, 0.1, 0.15, 1.0, 10.0, 100.0],
               'max_iter':[100,150,200]}

clf1 = GridSearchCV(LogisticRegression(penalty='l2', random_state=8), parameters1, cv=folds, n_jobs=-1, verbose = True)
clf1.fit(X_train, y_train)





Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  9.0min finished


CPU times: user 6.06 s, sys: 518 ms, total: 6.58 s
Wall time: 9min 2s


GridSearchCV(cv=KFold(n_splits=5, random_state=8, shuffle=True),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=8, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.08, 0.1, 0.15, 1.0, 10.0, 100.0],
                         'max_iter': [100, 150, 200],
                         'solver': ('newton-cg', 'sag', 'lbfgs')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
            

In [94]:
clf1.best_estimator_
y_pred_LogReg = clf1.best_estimator_.predict(X_train)
train_score_LogReg  = f1_score(y_train, y_pred_LogReg, average='macro')
print(train_score_LogReg)
y_pred_LogReg = clf1.best_estimator_.predict(X_test)
test_score_LogReg  = f1_score(y_test, y_pred_LogReg, average='macro')
print(test_score_LogReg)

1.0
0.898013154198356


In [20]:
%%time
parametersSVC = {'loss':['hinge', 'squared_hinge'],
               'C':[0.001, 0.01, 0.1, 0.15, 1.0, 10.0, 100.0],
                'penalty': ['l2']}

clfSVC = GridSearchCV(LinearSVC(random_state=8), parametersSVC , cv=stratified_folds, n_jobs=-1, verbose = True)
clfSVC.fit(X_train, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    9.0s finished


CPU times: user 483 ms, sys: 98.1 ms, total: 581 ms
Wall time: 9.23 s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=8, shuffle=True),
             error_score='raise-deprecating',
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=8, tol=0.0001, verbose=0),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 0.15, 1.0, 10.0, 100.0],
                         'loss': ['hinge', 'squared_hinge'],
                         'penalty': ['l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=True)

In [8]:
clfSVC.best_estimator_
y_pred_SVC = clfSVC.best_estimator_.predict(X_train)
train_score_SVC  = f1_score(y_train, y_pred_SVC, average='macro')
print(train_score_SVC)
y_pred_SVC = clfSVC.best_estimator_.predict(X_test)
test_score_SVC  = f1_score(y_test, y_pred_SVC, average='macro')
print(test_score_SVC)

1.0
0.9030952606395727


In [220]:
%%time
parametersDT = {'criterion' : ["gini", "entropy"],
               'min_samples_split': (2,4,6)
               }

clfDT = GridSearchCV(DecisionTreeClassifier(), parametersDT , cv=folds, n_jobs=1, verbose = True)
clfDT.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   19.5s finished


CPU times: user 19.9 s, sys: 84.8 ms, total: 20 s
Wall time: 20.3 s


GridSearchCV(cv=KFold(n_splits=5, random_state=8, shuffle=True),
             error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=1,
             param_grid={'criterion': ['gini', 'entropy'],
                     

In [221]:
y_pred_DT = clfDT.best_estimator_.predict(X_train)
train_score_DT  = f1_score(y_train, y_pred_DT, average='macro')
print(train_score_DT)
y_pred_DT = clfDT.best_estimator_.predict(X_test)
test_score_DT  = f1_score(y_test, y_pred_DT, average='macro')
print(test_score_DT)

0.9889870006371541
0.7360699641230508


In [223]:
clfDT.best_estimator_.feature_importances_

array([0., 0., 0., ..., 0., 0., 0.])

In [34]:
print('result on train: {}'.format(f1_score(y_train, train_preds, average='macro')))
print('result on test: {}'.format(f1_score(y_test, test_preds, average='macro')))

result on train: 1.0
result on test: 0.9012210103408893


In [264]:

index_to_word = {v:k for k,v in vectorizer.vocabulary_.items()}
def analyze_features(model_weights, n):
    model_weights = [abs(m) for m in model_weights]
    sorted_dict = vectorizer.get_feature_names()
    res = []
    for weight, word in zip(model_weights, sorted_dict):
        res.append((weight, word))
    return (sorted(res, reverse = True, key = lambda x: x[0])[:n])
    

analyze_features(clfDT.best_estimator_.feature_importances_, 20)
for cls_coeffs in range(3):
    print(analyze_features(clf1.best_estimator_.coef_[cls_coeffs], 20))

[(1.6143824346079305, 'ýé'), (0.9730986919721937, 'ête'), (0.8703027245120281, '00'), (0.7557024631274526, 'érale'), (0.7535769211629939, 'élangea'), (0.6741572053475976, 'égligent'), (0.6659772690571834, 'ée'), (0.6539335903341825, 'ère'), (0.6484032506791605, 'çon'), (0.6425509440601267, 'çait'), (0.6183166511141552, '³ation'), (0.6121350959420025, 'ªl'), (0.6085623916148205, '000'), (0.6033340689007144, '0000'), (0.5931399209337151, 'zzi776'), (0.5795079897858701, 'zzcrm'), (0.5753858050270337, 'zz'), (0.5742684830204396, 'zyxel'), (0.56422909834358, '00000'), (0.5630059308172383, 'zysv')]
[(1.3300799026305965, 'ýé'), (1.0668283597514014, 'ête'), (1.0242396923527033, 'érale'), (0.7602097863859932, 'élangea'), (0.7466430870052031, 'égligent'), (0.6572759181590394, 'ée'), (0.6183677776157928, '00'), (0.5808792859064609, 'ère'), (0.5768924767851057, 'çon'), (0.5744278108398345, 'çait'), (0.5659630434915733, '³ation'), (0.5635174947148193, 'ªl'), (0.5355683279636426, 'zzi776'), (0.53469

In [265]:
for cls_coeffs in range(3):
    print(analyze_features(clf1.best_estimator_.coef_[cls_coeffs], 20))

[(1.6143824346079305, 'ýé'), (0.9730986919721937, 'ête'), (0.8703027245120281, '00'), (0.7557024631274526, 'érale'), (0.7535769211629939, 'élangea'), (0.6741572053475976, 'égligent'), (0.6659772690571834, 'ée'), (0.6539335903341825, 'ère'), (0.6484032506791605, 'çon'), (0.6425509440601267, 'çait'), (0.6183166511141552, '³ation'), (0.6121350959420025, 'ªl'), (0.6085623916148205, '000'), (0.6033340689007144, '0000'), (0.5931399209337151, 'zzi776'), (0.5795079897858701, 'zzcrm'), (0.5753858050270337, 'zz'), (0.5742684830204396, 'zyxel'), (0.56422909834358, '00000'), (0.5630059308172383, 'zysv')]
[(1.3300799026305965, 'ýé'), (1.0668283597514014, 'ête'), (1.0242396923527033, 'érale'), (0.7602097863859932, 'élangea'), (0.7466430870052031, 'égligent'), (0.6572759181590394, 'ée'), (0.6183677776157928, '00'), (0.5808792859064609, 'ère'), (0.5768924767851057, 'çon'), (0.5744278108398345, 'çait'), (0.5659630434915733, '³ation'), (0.5635174947148193, 'ªl'), (0.5355683279636426, 'zzi776'), (0.53469

In [266]:
clf1.best_estimator_.coef_[0]
import eli5
eli5.show_weights(clf1.best_estimator_, top=50)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+1.614,x37132,,
+0.973,x37131,,
+0.756,x37130,,
+0.754,x37129,,
+0.674,x37128,,
+0.666,x37127,,
+0.654,x37126,,
+0.648,x37125,,
+0.643,x37124,,
+0.618,x37123,,

Weight?,Feature
+1.614,x37132
+0.973,x37131
+0.756,x37130
+0.754,x37129
+0.674,x37128
+0.666,x37127
+0.654,x37126
+0.648,x37125
+0.643,x37124
+0.618,x37123

Weight?,Feature
+1.330,x37132
+1.067,x37131
+1.024,x37130
+0.760,x37129
+0.747,x37128
+0.657,x37127
+0.581,x37126
+0.577,x37125
+0.574,x37124
+0.566,x37123

Weight?,Feature
+0.905,x37132
+0.865,x37131
+0.803,x37130
+0.666,x37129
+0.641,x37128
+0.640,x37127
+0.631,x37126
+0.563,x37125
+0.552,x37124
+0.542,x37123

Weight?,Feature
+1.844,x37132
+1.027,x37131
+0.794,x37130
+0.690,x37129
+0.682,x37128
+0.601,x37127
+0.564,x37125
+0.564,x37126
+0.557,x37124
+0.518,x37123


In [267]:
for cls_coeffs in range(3):
    print(analyze_features(clfSVC.best_estimator_.coef_[cls_coeffs], 20))

[(0.30162492271247215, 'graphics'), (0.18188519115547508, 'image'), (0.15681258224084213, 'space'), (0.1420063737578954, '3d'), (0.14100193500742642, 'windows'), (0.12547008444612787, 'vga'), (0.12497960923281591, '3do'), (0.12431092810481544, 'animation'), (0.12153582181412205, '42'), (0.11967191146036604, 'file'), (0.11874610286162389, 'format'), (0.11533776874119255, 'color'), (0.11357867494340199, 'circuit'), (0.11273947077288392, 'power'), (0.10818617212162138, 'polygon'), (0.10787031663904695, 'images'), (0.10499095854537037, 'points'), (0.10487602005611484, 'pov'), (0.10486839239040009, 'files'), (0.10298198626354821, 'tiff')]
[(0.23841902239317908, 'clipper'), (0.19064756100829594, 'encryption'), (0.17954653606893747, 'key'), (0.16318269108777164, 'subject'), (0.15853659602774495, 'lines'), (0.1366460489192695, 'security'), (0.12425025661650681, 'gtoal'), (0.11602836782841593, 'pgp'), (0.10081742408723393, 'steve'), (0.10026114307276071, 'keys'), (0.10014631915978615, 'chip'), 

In [262]:

eli5.show_weights(clfSVC.best_estimator_, top=50)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+0.302,x17234,,
+0.182,x18890,,
+0.142,x3129,,
+0.141,x36297,,
+0.125,x35376,,
+0.125,x3138,,
+0.124,x6668,,
+0.122,x3253,,
+0.120,x15718,,
+0.119,x16119,,

Weight?,Feature
+0.302,x17234
+0.182,x18890
+0.142,x3129
+0.141,x36297
+0.125,x35376
+0.125,x3138
+0.124,x6668
+0.122,x3253
+0.120,x15718
+0.119,x16119

Weight?,Feature
+0.238,x10328
+0.191,x14473
+0.180,x20677
+0.137,x30259
+0.124,x17420
+0.116,x26349
+0.101,x31940
+0.100,x20694
+0.100,x10031
+0.100,x12706

Weight?,Feature
+0.172,x14247
+0.163,x10159
+0.155,x27036
+0.119,x18438
+0.115,x23681
+0.115,x34203
+0.111,x34930
+0.102,x4578
+0.102,x6169
+0.099,x28037

Weight?,Feature
+0.334,x31371
+0.186,x25324
+0.137,x23594
+0.119,x12256
+0.115,x26648
+0.110,x21251
+0.105,x30068
+0.097,x16442
+0.096,x20130
+0.096,x12882


In [23]:
vectorizer = CountVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, stop_words=stopwords.words('english'))
X_train = vectorizer.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target
X_test = vectorizer.transform(newsgroups_test.data)
y_test = newsgroups_test.target

In [24]:
%%time
parametersSVC = {'loss':['hinge', 'squared_hinge'],
               'C':[0.001, 0.01, 0.1, 0.15, 1.0, 10.0, 100.0],
                'penalty': ['l2']}

clfSVC = GridSearchCV(LinearSVC(random_state=8), parametersSVC , cv=stratified_folds, n_jobs=-1, verbose = True)
clfSVC.fit(X_train, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    6.1s finished


CPU times: user 460 ms, sys: 91.4 ms, total: 552 ms
Wall time: 6.35 s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=8, shuffle=True),
             error_score='raise-deprecating',
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=8, tol=0.0001, verbose=0),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 0.15, 1.0, 10.0, 100.0],
                         'loss': ['hinge', 'squared_hinge'],
                         'penalty': ['l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=True)

In [None]:
fit_models()

In [31]:
%%time
def fit_models():
    parametersSVC = {'loss':['hinge', 'squared_hinge'],
                   'C':[0.001, 0.01, 0.1, 0.15, 1.0, 10.0, 100.0],
                    'penalty': ['l2']}

    clfSVC = GridSearchCV(LinearSVC(random_state=8), parametersSVC , cv=stratified_folds, n_jobs=-1, verbose = True)
    clfSVC.fit(X_train, y_train)


    y_pred_SVC = clfSVC.best_estimator_.predict(X_train)
    train_score_SVC  = f1_score(y_train, y_pred_SVC, average='macro')
    print("-------------------")
    print("LinearSVC")
    print("train_score_SVC = ", train_score_SVC)
    y_pred_SVC = clfSVC.best_estimator_.predict(X_test)
    test_score_SVC  = f1_score(y_test, y_pred_SVC, average='macro')
    print("test_score_SVC = ", test_score_SVC)


    parametersDT = {'criterion' : ["gini", "entropy"],
                   'min_samples_split': (2,4,6)
                   }

    clfDT = GridSearchCV(DecisionTreeClassifier(), parametersDT , cv=folds, n_jobs=-1, verbose = True)
    clfDT.fit(X_train, y_train)


    y_pred_DT = clfDT.best_estimator_.predict(X_train)
    train_score_DT  = f1_score(y_train, y_pred_DT, average='macro')
    print("-------------------")
    print("train_score_DT = ", train_score_DT)
    y_pred_DT = clfDT.best_estimator_.predict(X_test)
    test_score_DT  = f1_score(y_test, y_pred_DT, average='macro')
    print("test_score_DT = ", test_score_DT)


    parameters1 = {'solver':('newton-cg', 'sag', 'lbfgs'),
                   'C':[0.001, 0.01, 0.08, 0.1, 0.15, 1.0, 10.0, 100.0],
                   'max_iter':[100,150,200]}

    clf1 = GridSearchCV(LogisticRegression(penalty='l2', random_state=8), parameters1, cv=folds, n_jobs=-1, verbose = True)
    clf1.fit(X_train, y_train)


    clf1.best_estimator_
    y_pred_LogReg = clf1.best_estimator_.predict(X_train)
    train_score_LogReg  = f1_score(y_train, y_pred_LogReg, average='macro')
    print("-------------------")
    print("train_score_LogReg = ", train_score_LogReg)
    y_pred_LogReg = clf1.best_estimator_.predict(X_test)
    test_score_LogReg  = f1_score(y_test, y_pred_LogReg, average='macro')
    print("test_score_LogReg = ", test_score_LogReg)




CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 10 µs


In [32]:
vectorizer = CountVectorizer(ngram_range=(1,5), analyzer='char', min_df=3, max_df=0.85)
X_train = vectorizer.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target
X_test = vectorizer.transform(newsgroups_test.data)
y_test = newsgroups_test.target

KeyboardInterrupt: 

In [None]:
fit_models()

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 37.8min
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed: 50.2min finished


-------------------
LinearSVC
train_score_SVC =  1.0
test_score_SVC =  0.89890992686582
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.4min finished


-------------------
train_score_DT =  1.0
test_score_DT =  0.7557133857886873
Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 55.3min
