In [0]:
!pip install eli5

In [0]:
import eli5
import numpy as np
import pandas as pd

import sklearn
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [0]:
subj = ['sci.electronics', 'comp.sys.ibm.pc.hardware', 'comp.windows.x', 'sci.crypt']

In [0]:
news_train = fetch_20newsgroups(subset='train', categories=subj)

In [0]:
news_test = fetch_20newsgroups(subset='test', categories=subj)

In [0]:
names = news_test.target_names

In [0]:
y_train = news_train.target
y_test = news_test.target

In [0]:
X_train = news_train.data
X_test = news_test.data

**CountVectorizer**

In [0]:
c_vect = CountVectorizer()

In [14]:
c_vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [0]:
X_train_vec = c_vect.transform(X_train)
X_test_vec = c_vect.transform(X_test)

**LogReg, RF, SGD + crossvalidation**

In [0]:
log_reg = LogisticRegression()
rf = RandomForestClassifier()
sgd = SGDClassifier()

In [0]:
def eval_gscv(model, X, y, gs_param_grid, folds=5, verbose=True):
    gridsearch = GridSearchCV(model, param_grid=gs_param_grid,
                             n_jobs=-1,  scoring='f1_macro', cv=folds).fit(X, y)
    if verbose:
        print(f'Best score: {gridsearch.best_score_}')
        print(f'Best parameters: {gridsearch.best_params_}')
    return gridsearch

In [0]:
def validate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f = f1_score(y_test, y_pred, average='macro')
    print(f"Macro f-score is {f:.5}")

**Logistic Regression**

In [20]:
%%time

lr_gridsearch = eval_gscv(log_reg, X_train_vec, y_train,
                                gs_param_grid={'C': [1, 2, 3],
                                               'penalty':['l1', 'l2'],
                                               'intercept_scaling': [1, 2, 5],
                                               'solver': ['liblinear']})

Best score: 0.9172022710508252
Best parameters: {'C': 3, 'intercept_scaling': 2, 'penalty': 'l2', 'solver': 'liblinear'}
CPU times: user 1.78 s, sys: 1.17 s, total: 2.95 s
Wall time: 6min 35s


In [0]:
best_logreg = LogisticRegression(**lr_gridsearch.best_params_)

In [57]:
validate_model(best_logreg, X_train_vec, y_train, X_test_vec, y_test)

Macro f-score is 0.87391


**Random Forest**

In [23]:
%%time

rf_gridsearch = eval_gscv(rf, X_train_vec, y_train,
                               gs_param_grid={'n_estimators':[100, 150, 200],
                                              'max_depth': [10, 15, 20],
                                              'criterion':['gini', 'entropy']})

Best score: 0.9091353635044562
Best parameters: {'criterion': 'gini', 'max_depth': 20, 'n_estimators': 200}
CPU times: user 2.11 s, sys: 23.2 ms, total: 2.13 s
Wall time: 1min 42s


In [0]:
best_rf = RandomForestClassifier(**rf_gridsearch.best_params_)

In [58]:
validate_model(best_rf, X_train_vec, y_train, X_test_vec, y_test)

Macro f-score is 0.85219


**SGD**

In [27]:
%%time

sgd_gridsearch = eval_gscv(sgd, X_train_vec, y_train,
                                gs_param_grid={'penalty': ['l2', 'l1', 'elasticnet'],
                                               'loss': ['log', 'hinge', 'perceptron'],
                                               'alpha': [0.01, 0.001, 0.0001]})

Best score: 0.9150774717666776
Best parameters: {'alpha': 0.01, 'loss': 'log', 'penalty': 'l2'}
CPU times: user 561 ms, sys: 103 ms, total: 664 ms
Wall time: 2min 35s


In [0]:
best_sgd = SGDClassifier(**sgd_gridsearch.best_params_)

In [59]:
validate_model(best_sgd, X_train_vec, y_train, X_test_vec, y_test)

Macro f-score is 0.85553


Результаты моделей на трейне примерно на 4-6 % лучше,чем на тесте

**Функция analyze_features(model, n)**

In [0]:
def get_word_by_index(feature):
    index_to_word = {v:k for k, v in c_vect.vocabulary_.items()}
    feature = int(feature.strip('x'))
    return index_to_word[feature]

In [0]:
def analyze_features(model, n):
    explanation = eli5.formatters.as_dataframe.explain_weights_df(model)
    if 'target' not in explanation.columns:
        explanation['word'] = explanation.feature.apply(get_word_by_index)
        explanation = explanation.nlargest(n, 'weight')
        explanation.drop('std', axis=1, inplace=True)
        explanation.reset_index(drop=True, inplace=True)
        return explanation
    target_features = []
    for target in explanation.target.unique():
        subset = explanation.loc[explanation.target == target]
        subset = subset.nlargest(n, 'weight')
        subset = subset.loc[subset.feature != '<BIAS>']
        subset['word'] = subset.feature.apply(get_word_by_index)
        subset['subject'] = names[target]
        target_features.append(subset)
    result = pd.concat(target_features, axis=0)
    result.reset_index(drop=True, inplace=True)
    return result

**Analyze_features к классификаторам**

In [37]:
analyze_features(best_logreg, 5)

Unnamed: 0,target,feature,weight,word,subject
0,0,x9130,1.048934,card,comp.sys.ibm.pc.hardware
1,0,x21656,1.009142,memory,comp.sys.ibm.pc.hardware
2,0,x13004,0.862116,drive,comp.sys.ibm.pc.hardware
3,0,x22351,0.852234,motherboard,comp.sys.ibm.pc.hardware
4,0,x28357,0.814867,scsi,comp.sys.ibm.pc.hardware
5,1,x22354,1.320162,motif,comp.windows.x
6,1,x28605,1.283296,server,comp.windows.x
7,1,x34220,1.175277,window,comp.windows.x
8,1,x34563,0.996327,x11r5,comp.windows.x
9,1,x20113,0.933125,lcs,comp.windows.x


In [38]:
analyze_features(best_sgd, 5)

Unnamed: 0,target,feature,weight,word,subject
0,0,x28357,0.543787,scsi,comp.sys.ibm.pc.hardware
1,0,x13004,0.542158,drive,comp.sys.ibm.pc.hardware
2,0,x9130,0.424814,card,comp.sys.ibm.pc.hardware
3,0,x25406,0.400781,port,comp.sys.ibm.pc.hardware
4,0,x24044,0.372567,os,comp.sys.ibm.pc.hardware
5,1,x34220,0.984805,window,comp.windows.x
6,1,x22354,0.819903,motif,comp.windows.x
7,1,x28605,0.694651,server,comp.windows.x
8,1,x13955,0.56119,entry,comp.windows.x
9,1,x34153,0.523687,widget,comp.windows.x


In [41]:
analyze_features(best_rf, 20)

Unnamed: 0,feature,weight,word
0,x19535,0.014107,key
1,x34220,0.011945,window
2,x9634,0.011686,chip
3,x9907,0.011252,clipper
4,x9130,0.010531,card
5,x16303,0.009702,government
6,x13803,0.009378,encryption
7,x11219,0.00898,crypto
8,x19569,0.008834,keys
9,x28427,0.007503,secret


**Меняю параметры CountVectorizer**

Результат не улучшился

In [0]:
c_vect = CountVectorizer(ngram_range=(1, 3), max_df=0.8, min_df=4, stop_words='english')

In [71]:
c_vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.8, max_features=None, min_df=5,
                ngram_range=(1, 3), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [0]:
X_train_vec = c_vect.transform(X_train)
X_test_vec = c_vect.transform(X_test)

In [0]:
index_to_word = {v:k for k, v in c_vect.vocabulary_.items()}

In [76]:
%%time

# only liblinear solver supports both l1 and l2 regularization
lr_gridsearch = eval_gscv(log_reg, X_train_vec, y_train,
                                gs_param_grid={'C': [1, 2, 3],
                                               'penalty':['l1', 'l2'],
                                               'intercept_scaling': [1, 2, 5],
                                               'solver': ['liblinear']})

Best score: 0.928795869911087
Best parameters: {'C': 2, 'intercept_scaling': 2, 'penalty': 'l2', 'solver': 'liblinear'}
CPU times: user 1.77 s, sys: 1.08 s, total: 2.85 s
Wall time: 4min 46s


In [78]:
best_logreg = LogisticRegression(**lr_gridsearch.best_params_)
validate_model(best_logreg, X_train_vec, y_train, X_test_vec, y_test)

Macro f-score is 0.8716


**ELI5**

Оставила только признаки, вносящие весомый вклад. В итоге показал результат хуже :(

In [55]:
model = best_logreg
eli5.show_weights(model, top=50)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+1.049,x9130,,
+1.009,x21656,,
+0.862,x13004,,
+0.852,x22351,,
+0.815,x28357,,
+0.789,x7977,,
+0.785,x3362,,
+0.780,x12577,,
+0.777,x25406,,
+0.770,x22244,,

Weight?,Feature
+1.049,x9130
+1.009,x21656
+0.862,x13004
+0.852,x22351
+0.815,x28357
+0.789,x7977
+0.785,x3362
+0.780,x12577
+0.777,x25406
+0.770,x22244

Weight?,Feature
+1.320,x22354
+1.283,x28605
+1.175,x34220
+0.996,x34563
+0.933,x20113
+0.931,x34153
+0.782,x35247
+0.750,x10126
+0.696,x30490
+0.681,x14479

Weight?,Feature
+1.564,x9907
+1.200,x13803
+0.873,x19535
+0.838,x24910
+0.833,x28442
+0.809,x16542
+0.779,x11208
+0.692,x11219
+0.687,x26089
+0.685,x35594

Weight?,Feature
+1.145,x13589
+1.102,x9747
+1.089,x22373
+1.009,x25492
+0.894,x32168
+0.815,x6589
+0.792,x21783
+0.772,x20710
+0.755,x3161
+0.755,x12144


In [60]:
sum(sum(model.coef_ != 0))

143212

In [61]:
eli5.formatters.as_dataframe.explain_weights_df(model)

Unnamed: 0,target,feature,weight
0,0,x9130,1.048934
1,0,x21656,1.009142
2,0,x13004,0.862116
3,0,x22351,0.852234
4,0,x28357,0.814867
...,...,...,...
143211,3,x13803,-0.719491
143212,3,x34229,-0.780067
143213,3,x28605,-0.813618
143214,3,x9907,-0.977844


In [0]:
top_features = [int(i[1:]) for i in eli5.formatters.as_dataframe.explain_weights_df(model).feature if 'BIAS' not in i]

In [0]:
X_train_eli5 = X_train_vec[:,top_features]
X_test_eli5 = X_test_vec[:,top_features]

In [0]:
eli5_model = linear_model.LogisticRegression() #penalty = 'l1'

In [69]:
%%time

lr_gridsearch = eval_gscv(eli5_model, X_train_vec, y_train,
                                gs_param_grid={'C': [1, 2, 3],
                                               'penalty':['l1', 'l2'],
                                               'intercept_scaling': [1, 2, 5],
                                               'solver': ['liblinear']})

Best score: 0.9172022710508252
Best parameters: {'C': 3, 'intercept_scaling': 2, 'penalty': 'l2', 'solver': 'liblinear'}
CPU times: user 1.83 s, sys: 1.05 s, total: 2.87 s
Wall time: 6min 55s


In [68]:
validate_model(eli5_model, X_train_vec, y_train, X_test_vec, y_test)

Macro f-score is 0.86335
