In [12]:
from sklearn.datasets import fetch_20newsgroups

In [13]:
remove = ('headers', 'footers', 'quotes')

all_categories = ['comp.windows.x', 'rec.sport.baseball', 'rec.sport.hockey']
train_bunch = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories=all_categories,remove=remove)
test_bunch = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=all_categories, remove=remove)

## Стемминг

In [14]:
import nltk
from nltk.stem import *
from nltk import word_tokenize

nltk.download('punkt')

def stemming(documents: list[str]) -> list[str]:
    porter_stemmer = PorterStemmer()
    stem = []
    for document in documents:
        nltk_tokens = word_tokenize(document)
        line = ''
        for word in nltk_tokens:
            line += ' ' + porter_stemmer.stem(word)
        stem.append(line)
    return stem

train_tokenized = stemming(train_bunch.data)
test_tokenized = stemming(test_bunch.data)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dimon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Методы [RF, MNB, SVM]

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC

In [16]:
stop_words = [None, 'english']
max_features_values = [100, 500, 1000, 5000, 10000]
use_idf = [True, False]

In [60]:
rf_first = range(1, 5, 1)
rf_second = range(5, 100, 20)

rf_tree_max_depth = [*rf_first, *rf_second]

parameters_rf = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf,
    'clf__n_estimators': range(1, 10, 1),
    'clf__criterion': ('gini', 'entropy'),
    'clf__max_depth': rf_tree_max_depth
}

parameters_mnb = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf,
    'clf__alpha': [0.1, 1, 2]
}

parameters_svm_l1 = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf
}

parameters_svm_l2 = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf,
    'clf__loss': ['hinge', 'squared_hinge']
}

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

## RF Без стемминга

In [19]:
text_clf_rf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier())])
gscv_rf = GridSearchCV(text_clf_rf, param_grid=parameters_rf, n_jobs=-1)
gscv_rf.fit(train_bunch.data, train_bunch.target)

## RF С использованием стемминга

In [20]:
text_clf_rf_stem = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', RandomForestClassifier())])
gscv_rf_stem = GridSearchCV(text_clf_rf_stem, param_grid=parameters_rf, n_jobs=-1)
gscv_rf_stem.fit(train_tokenized, train_bunch.target)

## MNB Без стемминга

In [22]:
text_clf_mnb = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
gscv_mnb = GridSearchCV(text_clf_mnb, param_grid=parameters_mnb, n_jobs=-1)
gscv_mnb.fit(train_bunch.data, train_bunch.target)

## MNB С использованием стемминга

In [26]:
text_clf_mnb_stem = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
gscv_mnb_stem = GridSearchCV(text_clf_mnb_stem, param_grid=parameters_mnb, n_jobs=-1)
gscv_mnb_stem.fit(train_tokenized, train_bunch.target)

## SVM L2 Без стемминга

In [56]:
text_clf_svm_l2 = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LinearSVC())])
gscv_svm_l2 = GridSearchCV(text_clf_svm_l2, param_grid=parameters_svm_l2, n_jobs=-1)
gscv_svm_l2.fit(train_bunch.data, train_bunch.target)



## SVM L2 С использованием стемминга

In [57]:
text_clf_svm_stem_l2 = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', LinearSVC())])
gscv_svm_stem_l2 = GridSearchCV(text_clf_svm_stem_l2, param_grid=parameters_svm_l2, n_jobs=-1)
gscv_svm_stem_l2.fit(train_tokenized, train_bunch.target)



## SVM L1 Без стемминга

In [62]:
text_clf_svm_l1 = Pipeline([('vect', CountVectorizer()),
                            ('tfidf', TfidfTransformer()),
                            ('clf', LinearSVC(penalty='l1', dual=False))])
gscv_svm_l1 = GridSearchCV(text_clf_svm_l1, param_grid=parameters_svm_l1, n_jobs=-1)
gscv_svm_l1.fit(train_bunch.data, train_bunch.target)

## SVM L1 С использованием стемминга

In [63]:
text_clf_svm_stem_l1 = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', LinearSVC(penalty='l1', dual=False))])
gscv_svm_stem_l1 = GridSearchCV(text_clf_svm_stem_l1, param_grid=parameters_svm_l1, n_jobs=-1)
gscv_svm_stem_l1.fit(train_tokenized, train_bunch.target)

## Результаты анализа

In [64]:
from sklearn.metrics import classification_report

In [65]:
predicted_rf = gscv_rf.predict(test_bunch.data)
print('Случайный лес (RF) без стемминга\n')
print(classification_report(test_bunch.target, predicted_rf, target_names=all_categories))
print(gscv_rf.best_params_)

Случайный лес (RF) без стемминга

                    precision    recall  f1-score   support

    comp.windows.x       0.90      0.87      0.89       395
rec.sport.baseball       0.68      0.85      0.76       397
  rec.sport.hockey       0.90      0.71      0.79       399

          accuracy                           0.81      1191
         macro avg       0.83      0.81      0.81      1191
      weighted avg       0.83      0.81      0.81      1191

{'clf__criterion': 'gini', 'clf__max_depth': 65, 'clf__n_estimators': 9, 'tfidf__use_idf': False, 'vect__max_features': 5000, 'vect__stop_words': 'english'}


In [66]:
predicted_rf_stem = gscv_rf_stem.predict(test_bunch.data)
print('Случайный лес (RF) со стеммингом\n')
print(classification_report(test_bunch.target, predicted_rf_stem, target_names=all_categories))
print(gscv_rf_stem.best_params_)

Случайный лес (RF) со стеммингом

                    precision    recall  f1-score   support

    comp.windows.x       0.90      0.79      0.85       395
rec.sport.baseball       0.60      0.81      0.69       397
  rec.sport.hockey       0.84      0.64      0.73       399

          accuracy                           0.75      1191
         macro avg       0.78      0.75      0.76      1191
      weighted avg       0.78      0.75      0.76      1191

{'clf__criterion': 'gini', 'clf__max_depth': 65, 'clf__n_estimators': 9, 'tfidf__use_idf': False, 'vect__max_features': 5000, 'vect__stop_words': 'english'}


In [67]:
predicted_mnb = gscv_mnb.predict(test_bunch.data)
print('Мультиномиальный Наивный Байесовский метод (MNB) без стемминга\n')
print(classification_report(test_bunch.target, predicted_mnb, target_names=all_categories))
print(gscv_mnb.best_params_)

Мультиномиальный Наивный Байесовский метод (MNB) без стемминга

                    precision    recall  f1-score   support

    comp.windows.x       0.97      0.95      0.96       395
rec.sport.baseball       0.94      0.88      0.91       397
  rec.sport.hockey       0.89      0.95      0.92       399

          accuracy                           0.93      1191
         macro avg       0.93      0.93      0.93      1191
      weighted avg       0.93      0.93      0.93      1191

{'clf__alpha': 0.1, 'tfidf__use_idf': True, 'vect__max_features': 10000, 'vect__stop_words': 'english'}


In [68]:
predicted_mnb_stem = gscv_mnb.predict(test_bunch.data)
print('Мультиномиальный Наивный Байесовский метод (MNB) со стеммингом\n')
print(classification_report(test_bunch.target, predicted_mnb_stem, target_names=all_categories))
print(gscv_mnb_stem.best_params_)

Мультиномиальный Наивный Байесовский метод (MNB) со стеммингом

                    precision    recall  f1-score   support

    comp.windows.x       0.97      0.95      0.96       395
rec.sport.baseball       0.94      0.88      0.91       397
  rec.sport.hockey       0.89      0.95      0.92       399

          accuracy                           0.93      1191
         macro avg       0.93      0.93      0.93      1191
      weighted avg       0.93      0.93      0.93      1191

{'clf__alpha': 0.1, 'tfidf__use_idf': True, 'vect__max_features': 10000, 'vect__stop_words': 'english'}


In [69]:
predicted_svm_l1 = gscv_svm_l1.predict(test_bunch.data)
print('Метод опорных векторов (SVM) l1 без стемминга\n')
print(classification_report(test_bunch.target, predicted_svm_l1, target_names=all_categories))
print(gscv_svm_l1.best_params_)

Метод опорных векторов (SVM) l1 без стемминга

                    precision    recall  f1-score   support

    comp.windows.x       0.97      0.86      0.91       395
rec.sport.baseball       0.76      0.91      0.83       397
  rec.sport.hockey       0.92      0.83      0.88       399

          accuracy                           0.87      1191
         macro avg       0.88      0.87      0.87      1191
      weighted avg       0.88      0.87      0.87      1191

{'tfidf__use_idf': True, 'vect__max_features': 5000, 'vect__stop_words': 'english'}


In [70]:
predicted_svm_stem_l1 = gscv_svm_stem_l1.predict(test_bunch.data)
print('Метод опорных векторов (SVM) l1 со стеммингом\n')
print(classification_report(test_bunch.target, predicted_svm_stem_l1, target_names=all_categories))
print(gscv_svm_stem_l1.best_params_)

Метод опорных векторов (SVM) l1 со стеммингом

                    precision    recall  f1-score   support

    comp.windows.x       0.96      0.81      0.88       395
rec.sport.baseball       0.69      0.83      0.76       397
  rec.sport.hockey       0.82      0.78      0.80       399

          accuracy                           0.81      1191
         macro avg       0.82      0.81      0.81      1191
      weighted avg       0.82      0.81      0.81      1191

{'tfidf__use_idf': True, 'vect__max_features': 1000, 'vect__stop_words': 'english'}


In [71]:
predicted_svm_l2 = gscv_svm_l2.predict(test_bunch.data)
print('Метод опорных векторов (SVM) l2 без стемминга\n')
print(classification_report(test_bunch.target, predicted_svm_l2, target_names=all_categories))
print(gscv_svm_l2.best_params_)

Метод опорных векторов (SVM) l2 без стемминга

                    precision    recall  f1-score   support

    comp.windows.x       0.98      0.94      0.96       395
rec.sport.baseball       0.86      0.91      0.88       397
  rec.sport.hockey       0.91      0.89      0.90       399

          accuracy                           0.91      1191
         macro avg       0.92      0.91      0.92      1191
      weighted avg       0.92      0.91      0.91      1191

{'clf__loss': 'squared_hinge', 'tfidf__use_idf': True, 'vect__max_features': 10000, 'vect__stop_words': 'english'}


In [72]:
predicted_svm_stem_l2 = gscv_svm_stem_l2.predict(test_bunch.data)
print('Метод опорных векторов (SVM) l2 со стеммингом\n')
print(classification_report(test_bunch.target, predicted_svm_stem_l2, target_names=all_categories))
print(gscv_svm_stem_l2.best_params_)

Метод опорных векторов (SVM) l2 со стеммингом

                    precision    recall  f1-score   support

    comp.windows.x       0.98      0.86      0.92       395
rec.sport.baseball       0.78      0.87      0.82       397
  rec.sport.hockey       0.85      0.85      0.85       399

          accuracy                           0.86      1191
         macro avg       0.87      0.86      0.86      1191
      weighted avg       0.87      0.86      0.86      1191

{'clf__loss': 'squared_hinge', 'tfidf__use_idf': True, 'vect__max_features': 10000, 'vect__stop_words': 'english'}


## Сравнительная таблица

In [73]:
import pandas as pd

In [74]:
df1 = pd.DataFrame(classification_report(predicted_rf, test_bunch.target, output_dict=True))
df1

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.868354,0.848866,0.714286,0.810243,0.810502,0.819151
recall,0.902632,0.68357,0.896226,0.810243,0.827476,0.810243
f1-score,0.885161,0.757303,0.794979,0.810243,0.812481,0.808157
support,380.0,493.0,318.0,0.810243,1191.0,1191.0


In [75]:
df2 = pd.DataFrame(classification_report(predicted_rf_stem, test_bunch.target, output_dict=True))
df2

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.794937,0.813602,0.64411,0.75063,0.750883,0.764459
recall,0.902299,0.602612,0.837134,0.75063,0.780681,0.75063
f1-score,0.845222,0.69239,0.728045,0.75063,0.755219,0.746237
support,348.0,536.0,307.0,0.75063,1191.0,1191.0


In [76]:
df3 = pd.DataFrame(classification_report(predicted_mnb, test_bunch.target, output_dict=True))
df3

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.949367,0.884131,0.954887,0.929471,0.929462,0.93081
recall,0.966495,0.936,0.890187,0.929471,0.930894,0.929471
f1-score,0.957854,0.909326,0.921403,0.929471,0.929528,0.929475
support,388.0,375.0,428.0,0.929471,1191.0,1191.0


In [77]:
df4 = pd.DataFrame(classification_report(predicted_mnb_stem, test_bunch.target, output_dict=True))
df4

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.949367,0.884131,0.954887,0.929471,0.929462,0.93081
recall,0.966495,0.936,0.890187,0.929471,0.930894,0.929471
f1-score,0.957854,0.909326,0.921403,0.929471,0.929528,0.929475
support,388.0,375.0,428.0,0.929471,1191.0,1191.0


In [78]:
df5 = pd.DataFrame(classification_report(predicted_svm_l1, test_bunch.target, output_dict=True))
df5

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.863291,0.914358,0.834586,0.870697,0.870745,0.875019
recall,0.96875,0.761006,0.91989,0.870697,0.883215,0.870697
f1-score,0.912985,0.830664,0.875164,0.870697,0.872938,0.86852
support,352.0,477.0,362.0,0.870697,1191.0,1191.0


In [79]:
df6 = pd.DataFrame(classification_report(predicted_svm_stem_l1, test_bunch.target, output_dict=True))
df6

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.807595,0.831234,0.784461,0.807725,0.807763,0.809564
recall,0.960843,0.694737,0.815104,0.807725,0.823561,0.807725
f1-score,0.877579,0.756881,0.799489,0.807725,0.811316,0.804264
support,332.0,475.0,384.0,0.807725,1191.0,1191.0


In [80]:
df7 = pd.DataFrame(classification_report(predicted_svm_l2, test_bunch.target, output_dict=True))
df7

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.939241,0.911839,0.892231,0.914358,0.914437,0.914138
recall,0.978892,0.85782,0.912821,0.914358,0.916511,0.914358
f1-score,0.958656,0.884005,0.902408,0.914358,0.915023,0.913787
support,379.0,422.0,390.0,0.914358,1191.0,1191.0


In [81]:
df8 = pd.DataFrame(classification_report(predicted_svm_stem_l2, test_bunch.target, output_dict=True))
df8

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.860759,0.874055,0.849624,0.861461,0.86148,0.862008
recall,0.982659,0.778027,0.849624,0.861461,0.870103,0.861461
f1-score,0.917679,0.82325,0.849624,0.861461,0.863518,0.859518
support,346.0,446.0,399.0,0.861461,1191.0,1191.0
