In [20]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

from nltk.corpus import stopwords

In [12]:
# English stop-word list from NLTK. The corpus data is looked up at call time,
# and on a machine where it was never downloaded the lookup raises
# LookupError; fetch it once and retry instead of failing the notebook.
try:
    stopwords_en = stopwords.words("english")
except LookupError:
    import nltk  # local import: only needed on the first-run download path

    nltk.download("stopwords", quiet=True)
    stopwords_en = stopwords.words("english")

In [10]:
# The same four newsgroups are requested for both the train and the test split.
CATEGORIES = ["comp.graphics", "comp.windows.x", "rec.motorcycles", "rec.autos"]

# NOTE(review): no `remove=` argument is passed, so message headers, footers and
# quoted replies stay in the text. Tokens such as "subject x" among the top
# features printed later suggest the models may lean on header metadata --
# consider remove=("headers", "footers", "quotes") if that is unintended.
data_train, data_test = (
    fetch_20newsgroups(subset=split, categories=CATEGORIES)
    for split in ("train", "test")
)

## Обучение без обработки признаков

In [92]:
# Baseline bag-of-words vectorizer with default settings, fitted on the raw
# training documents. The bare name on the last line keeps the cell's repr output.
simple_vectorizer = CountVectorizer().fit(data_train.data)
simple_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [104]:
def evaluate_model(model_class, vectorizer, param_grid):
    """Grid-search a vectorizer + classifier pipeline and report test metrics.

    A two-step pipeline (``"vect"`` -> ``"clf"``) is tuned with a 5-fold
    ``GridSearchCV`` on the module-level ``data_train``; the best pipeline is
    then scored on the module-level ``data_test``, printing a classification
    report followed by the confusion matrix.

    Args:
        model_class: Classifier class; instantiated with no arguments.
        vectorizer: Vectorizer instance used as the ``"vect"`` step.
        param_grid: Parameter grid for ``GridSearchCV``; keys use the
            ``vect__`` / ``clf__`` step prefixes.

    Returns:
        The search's ``best_estimator_`` (the fitted pipeline).
    """
    steps = [("vect", vectorizer), ("clf", model_class())]
    search = GridSearchCV(Pipeline(steps), param_grid, n_jobs=-1, verbose=1, cv=5)
    search.fit(data_train.data, data_train.target)

    best_pipeline = search.best_estimator_
    y_true = data_test.target
    y_pred = best_pipeline.predict(data_test.data)

    print(classification_report(y_true, y_pred))
    print("Conf matrix")
    print(confusion_matrix(y_true, y_pred))
    return best_pipeline

In [93]:
# Logistic-regression grid: 7 values of C x 2 penalties = 14 candidates.
# liblinear is the only solver listed; it handles both the L1 and L2 penalty.
params = {
    "clf__C": [0.1, 0.2, 0.5, 0.8, 1.0, 2, 10],
    "clf__penalty": ["l1", "l2"],
    "clf__solver": ["liblinear"],
}
log_reg = evaluate_model(LogisticRegression, simple_vectorizer, param_grid=params)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  3.0min finished


              precision    recall  f1-score   support

           0       0.79      0.90      0.84       389
           1       0.90      0.76      0.82       395
           2       0.92      0.94      0.93       396
           3       0.95      0.94      0.94       398

    accuracy                           0.88      1578
   macro avg       0.89      0.88      0.88      1578
weighted avg       0.89      0.88      0.88      1578

Conf matrix
[[350  25  10   4]
 [ 83 301   5   6]
 [  9   5 371  11]
 [  2   4  19 373]]


In [94]:
# Linear SVM grid: 7 values of C x 2 loss functions (L2 penalty only) = 14 candidates.
params = {
    "clf__C": [0.1, 0.2, 0.5, 0.8, 1.0, 2, 10],
    "clf__penalty": ["l2"],
    "clf__loss": ["hinge", "squared_hinge"],
}
svm = evaluate_model(LinearSVC, simple_vectorizer, param_grid=params)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:   21.2s finished


              precision    recall  f1-score   support

           0       0.77      0.89      0.83       389
           1       0.88      0.75      0.81       395
           2       0.90      0.90      0.90       396
           3       0.94      0.93      0.93       398

    accuracy                           0.87      1578
   macro avg       0.87      0.87      0.87      1578
weighted avg       0.87      0.87      0.87      1578

Conf matrix
[[347  27  12   3]
 [ 85 298   6   6]
 [ 16   8 358  14]
 [  4   4  21 369]]


In [95]:
# Decision-tree grid: 2 split criteria x 2 splitter strategies = 4 candidates.
params = {
    "clf__criterion": ["gini", "entropy"],
    "clf__splitter": ["best", "random"],
}
tree = evaluate_model(DecisionTreeClassifier, simple_vectorizer, param_grid=params)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.3s finished


              precision    recall  f1-score   support

           0       0.67      0.67      0.67       389
           1       0.70      0.70      0.70       395
           2       0.76      0.79      0.77       396
           3       0.86      0.84      0.85       398

    accuracy                           0.75      1578
   macro avg       0.75      0.75      0.75      1578
weighted avg       0.75      0.75      0.75      1578

Conf matrix
[[260  76  36  17]
 [ 76 276  26  17]
 [ 34  29 311  22]
 [ 17  13  35 333]]


In [99]:
# Invert the fitted vocabulary (term -> column index) into column index -> term
# so positions in a coefficient vector can be mapped back to readable tokens.
index_to_word = {
    column: term for term, column in simple_vectorizer.vocabulary_.items()
}

def analyze_features(coeffs, n, vocab=None):
    """Print the ``n`` vocabulary terms with the largest values in ``coeffs``.

    Args:
        coeffs: Per-feature weights, indexable by the integer feature indices
            that are the keys of ``vocab``.
        n: Number of top-ranked terms to print.
        vocab: Optional ``{feature_index: term}`` mapping. Defaults to the
            module-level ``index_to_word`` so existing two-argument calls
            keep working.
    """
    if vocab is None:
        vocab = index_to_word
    # Stable descending sort: tied weights keep the mapping's iteration order.
    top = sorted(vocab.items(), key=lambda idx_wrd: coeffs[idx_wrd[0]], reverse=True)
    # Plain slicing replaces ``islice``, which is not imported in the
    # notebook's import cell (a fresh kernel would raise NameError here).
    for _, word in top[:n]:
        print(word)

# Pair each fitted classifier's type with its per-feature weights: the linear
# models expose `coef_`, the tree exposes `feature_importances_`.
models = [(type(pipe["clf"]), pipe["clf"].coef_) for pipe in (log_reg, svm)]
models.append((type(tree["clf"]), tree["clf"].feature_importances_))

for model_name, coeff in models:
    # A 1-D weight vector is wrapped in a list so both cases iterate row by row.
    weight_rows = coeff if coeff.ndim > 1 else [coeff]
    for target in weight_rows:
        print(str(model_name))
        analyze_features(target, 10)
        print("______________")

<class 'sklearn.linear_model.logistic.LogisticRegression'>
graphics
images
3d
image
pov
card
files
tiff
vga
cview
______________
<class 'sklearn.linear_model.logistic.LogisticRegression'>
motif
window
server
x11r5
mit
widget
xterm
lcs
an
internet
______________
<class 'sklearn.linear_model.logistic.LogisticRegression'>
car
cars
toyota
auto
automotive
eliot
ford
distribution
dealer
______________
<class 'sklearn.linear_model.logistic.LogisticRegression'>
dod
bike
motorcycle
bikes
motorcycles
ride
riding
bmw
cornell
being
______________
<class 'sklearn.svm.classes.LinearSVC'>
graphics
images
pov
3d
image
card
files
vga
24
algorithm
______________
<class 'sklearn.svm.classes.LinearSVC'>
motif
window
x11r5
server
mit
widget
lcs
xterm
an
terminal
______________
<class 'sklearn.svm.classes.LinearSVC'>
car
cars
toyota
auto
eliot
automotive
distribution
testing
buying
______________
<class 'sklearn.svm.classes.LinearSVC'>
bike
dod
motorcycle
bikes
motorcycles
riding
cornell
ride
tools
being
__

## Обучение с предобработкой и нграммами

In [101]:
# Token pattern: a run of lowercase ASCII letters delimited by word boundaries.
# Unlike CountVectorizer's default pattern it accepts single-letter tokens, and
# because digits and underscores are word characters, a letter run touching
# them (e.g. inside "x11r5") is not matched at all.
word_reg = r"\b[a-z]+\b"

In [102]:
# Preprocessed vectorizer: NLTK stop words removed, letters-only tokens, and
# uni- to tri-grams. The bare name on the last line keeps the cell's repr output.
vectorizer = CountVectorizer(
    stop_words=stopwords_en,
    token_pattern=word_reg,
    ngram_range=(1, 3),
).fit(data_train.data)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 3), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, token_pattern='\\b[a-z]+\\b',
                tokenizer=None, vocabulary=None)

In [105]:
# Logistic-regression grid (7 values of C x 2 penalties = 14 candidates),
# this time on top of the stop-word / n-gram vectorizer.
params = {
    "clf__C": [0.1, 0.2, 0.5, 0.8, 1.0, 2, 10],
    "clf__penalty": ["l1", "l2"],
    "clf__solver": ["liblinear"],
}
log_reg = evaluate_model(LogisticRegression, vectorizer, param_grid=params)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  3.4min finished


              precision    recall  f1-score   support

           0       0.79      0.91      0.85       389
           1       0.92      0.80      0.86       395
           2       0.93      0.93      0.93       396
           3       0.96      0.94      0.95       398

    accuracy                           0.90      1578
   macro avg       0.90      0.90      0.90      1578
weighted avg       0.90      0.90      0.90      1578

Conf matrix
[[353  19  12   5]
 [ 72 317   3   3]
 [ 15   4 369   8]
 [  6   4  14 374]]


In [106]:
# Linear SVM grid (7 values of C x 2 loss functions = 14 candidates),
# this time on top of the stop-word / n-gram vectorizer.
params = {
    "clf__C": [0.1, 0.2, 0.5, 0.8, 1.0, 2, 10],
    "clf__penalty": ["l2"],
    "clf__loss": ["hinge", "squared_hinge"],
}
svm = evaluate_model(LinearSVC, vectorizer, param_grid=params)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  1.3min finished


              precision    recall  f1-score   support

           0       0.80      0.92      0.85       389
           1       0.92      0.80      0.86       395
           2       0.93      0.93      0.93       396
           3       0.97      0.94      0.95       398

    accuracy                           0.90      1578
   macro avg       0.90      0.90      0.90      1578
weighted avg       0.90      0.90      0.90      1578

Conf matrix
[[357  19  11   2]
 [ 71 317   4   3]
 [ 15   4 369   8]
 [  6   4  13 375]]


In [107]:
# Decision-tree grid (2 criteria x 2 splitters = 4 candidates),
# this time on top of the stop-word / n-gram vectorizer.
params = {
    "clf__criterion": ["gini", "entropy"],
    "clf__splitter": ["best", "random"],
}
tree = evaluate_model(DecisionTreeClassifier, vectorizer, param_grid=params)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   29.2s finished


              precision    recall  f1-score   support

           0       0.66      0.72      0.69       389
           1       0.74      0.71      0.72       395
           2       0.79      0.77      0.78       396
           3       0.86      0.83      0.84       398

    accuracy                           0.76      1578
   macro avg       0.76      0.76      0.76      1578
weighted avg       0.76      0.76      0.76      1578

Conf matrix
[[279  74  26  10]
 [ 81 281  20  13]
 [ 41  18 305  32]
 [ 24   9  35 330]]


In [108]:
# Invert the n-gram vocabulary (term -> column index) into column index -> term
# so positions in a coefficient vector can be mapped back to readable tokens.
index_to_word = {
    column: term for term, column in vectorizer.vocabulary_.items()
}

def analyze_features(coeffs, n, vocab=None):
    """Print the ``n`` vocabulary terms with the largest values in ``coeffs``.

    Args:
        coeffs: Per-feature weights, indexable by the integer feature indices
            that are the keys of ``vocab``.
        n: Number of top-ranked terms to print.
        vocab: Optional ``{feature_index: term}`` mapping. Defaults to the
            module-level ``index_to_word`` so existing two-argument calls
            keep working.
    """
    if vocab is None:
        vocab = index_to_word
    # Stable descending sort: tied weights keep the mapping's iteration order.
    top = sorted(vocab.items(), key=lambda idx_wrd: coeffs[idx_wrd[0]], reverse=True)
    # Plain slicing replaces ``islice``, which is not imported in the
    # notebook's import cell (a fresh kernel would raise NameError here).
    for _, word in top[:n]:
        print(word)

# Pair each fitted classifier's type with its per-feature weights: the linear
# models expose `coef_`, the tree exposes `feature_importances_`.
models = [(type(pipe["clf"]), pipe["clf"].coef_) for pipe in (log_reg, svm)]
models.append((type(tree["clf"]), tree["clf"].feature_importances_))

for model_name, coeff in models:
    # A 1-D weight vector is wrapped in a list so both cases iterate row by row.
    weight_rows = coeff if coeff.ndim > 1 else [coeff]
    for target in weight_rows:
        print(str(model_name))
        analyze_features(target, 10)
        print("______________")

<class 'sklearn.linear_model.logistic.LogisticRegression'>
graphics
image
images
files
card
tiff
pov
format
vga
software
______________
<class 'sklearn.linear_model.logistic.LogisticRegression'>
motif
x
window
widget
server
xterm
mit
mit edu
application
set
______________
<class 'sklearn.linear_model.logistic.LogisticRegression'>
car
cars
toyota
auto
ford
engine
dealer
oil
automotive
eliot
______________
<class 'sklearn.linear_model.logistic.LogisticRegression'>
bike
dod
bikes
motorcycle
ride
riding
motorcycles
bmw
sale
helmet
______________
<class 'sklearn.svm.classes.LinearSVC'>
graphics
image
images
card
pov
files
vga
tiff
package
software
______________
<class 'sklearn.svm.classes.LinearSVC'>
motif
x
window
widget
server
xterm
mit
subject x
set
source
______________
<class 'sklearn.svm.classes.LinearSVC'>
car
cars
toyota
auto
ford
mazda
eliot
dealer
automotive
chevrolet
______________
<class 'sklearn.svm.classes.LinearSVC'>
bike
dod
motorcycle
bikes
motorcycles
riding
ride
sale
bmw