In [5]:
from eli5 import show_weights, explain_prediction
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

import os
import pickle

# Project root directory. Defaults to the original local checkout; set the
# UNN_PARSER_BOT_PATH environment variable to run the notebook on another machine.
PROJ_PATH = os.environ.get('UNN_PARSER_BOT_PATH', r'F:\tmp\data science\UnnParserBot')

# Earlier nine-category list, kept for reference only (not used below):
# ['економіка', 'кримінал', 'культура', 'міжнародні новини', 'позитив', 'політика', 'спорт', 'суспільство', 'технології']

# Categories handled by this notebook. The list is passed positionally as
# `target_names` further down, so it must stay in sorted order: LabelEncoder
# stores its classes sorted, and the names have to line up with those labels.
CLASSES   = ['економіка', 'кримінал', 'міжнародні новини', 'політика', 'спорт', 'суспільство']

## LogisticRegression

In [2]:
# Loading the dataset, collecting the data
SRC_DIR = os.path.join(PROJ_PATH, 'src')


def _load_pickle(file_name):
    """Unpickle and return the object stored in SRC_DIR/file_name.

    NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    only use this on artifacts produced by this project.
    """
    with open(os.path.join(SRC_DIR, file_name), 'rb') as f:
        return pickle.load(f)


def _ordered_vocabulary(vectorizer):
    """Return the terms of a fitted vectorizer ordered by their column index."""
    vocabulary = vectorizer.vocabulary_
    return sorted(vocabulary, key=vocabulary.get)


# Sparse feature matrix, converted to CSR
all_ds = load_npz(os.path.join(SRC_DIR, 'class_data.npz')).tocsr()

# Loading the target
headings = _load_pickle('class_headings.hd')

# Loading vectorizers to obtain feature names
tfidf_w = _load_pickle('class_tfidf_w.vct')
tfidf_ch = _load_pickle('class_tfidf_ch.vct')

# Collecting feature names
# NOTE(review): this assumes the matrix columns are the word-level features
# followed by the char-level features - confirm against the vectorizing step.
tfidf_w_labels = _ordered_vocabulary(tfidf_w)
tfidf_ch_labels = _ordered_vocabulary(tfidf_ch)
orig_features = tfidf_w_labels + tfidf_ch_labels

# Fail early if the loaded artifacts do not describe the same dataset
assert len(headings) == all_ds.shape[0], 'targets and feature matrix differ in number of samples'
assert len(orig_features) == all_ds.shape[1], 'feature names and feature matrix differ in number of columns'

# Stratified 80/20 hold-out split with a fixed seed
train_ds, test_ds, y_train, y_test = train_test_split(all_ds,
                                                      headings,
                                                      test_size=0.2,
                                                      stratify=headings,
                                                      random_state=42)

# Encoder used to map the class labels back to category names
le = LabelEncoder()
le.fit(CLASSES)

LabelEncoder()

In [3]:
def get_prediction_stats(target_test, target_pred=None, label_encoder=None):
    """
    Prints per-class sample counts, optionally with the number of correct predictions.

    Parameters:
        target_test: 1-D sequence (list, array or Series) of encoded true labels.
        target_pred: optional 1-D sequence of encoded predicted labels with the
            same length as target_test. When omitted, only the number of
            samples per class name is printed (sorted by class name).
        label_encoder: object with an `inverse_transform` method that maps
            encoded labels to class names. Defaults to the notebook-level `le`.

    Raises:
        ValueError: if target_pred is given and its length differs from target_test.
    """
    encoder = le if label_encoder is None else label_encoder
    # Work on plain arrays so any 1-D input is accepted and a pandas index on
    # the input cannot affect the element-wise comparison below.
    actual = np.asarray(target_test)

    if target_pred is None:
        print(pd.Series(encoder.inverse_transform(actual)).value_counts().sort_index())
        return

    predicted = np.asarray(target_pred)
    if len(actual) != len(predicted):
        raise ValueError(
            f'target_test and target_pred differ in length: {len(actual)} != {len(predicted)}'
        )

    # Both tallies are computed once instead of once per class
    support = pd.Series(actual).value_counts()
    hits = pd.Series(actual[actual == predicted]).value_counts()

    # Classes are reported in order of first appearance in target_test
    for label in pd.unique(actual):
        total = int(support[label])
        correct = int(hits.get(label, 0))
        percentage = (correct / total) * 100
        print(f'{encoder.inverse_transform([label])[0]:17s}: {total:5d}, correctly predicted {correct:4d} ({percentage:.3f}%)')

In [8]:
# Estimating accuracy with stratified 3-fold cross validation.
# Only this single configuration is scored here; no parameter search is run.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# The saga solver shuffles the data, so its seed is fixed to keep the score reproducible
classifier = LogisticRegression(solver='saga', C=1, random_state=42)
results_skf = cross_val_score(classifier, all_ds, headings, cv=skf, scoring='accuracy')
print(f'Accuracy: {results_skf.mean()*100.0:.2f}%')

Accuracy: 82.66%


In [12]:
# Training the model on the training split.
# random_state is fixed because the saga solver shuffles the data; without it
# the fitted weights differ from run to run.
classifier = LogisticRegression(solver='saga', C=1, random_state=42)
log_reg = classifier.fit(train_ds, y_train)

In [13]:
# Class predictions of the fitted model for the hold-out split
log_predictions = log_reg.predict(X=test_ds)

In [14]:
# Evaluating with the weighted-average F1 score (model trained without class_weight='balanced').
# average="weighted" averages the per-class F1 weighted by class support; it is not the macro average.
train_predictions = log_reg.predict(train_ds)
print(f'Test evaluation: {f1_score(y_test, log_predictions, average="weighted")}')
print(f'Train evaluation: {f1_score(y_train, train_predictions, average="weighted")}')

Test evaluation: 0.8247289726768079
Train evaluation: 0.8952804069375224


In [15]:
# Evaluating with the weighted-average F1 score (with class_weight='balanced').
# The balanced model is trained here, under its own name, so that this cell
# does not depend on manually editing and re-running the training cell and
# does not overwrite `log_reg`, which the following cells keep using.
log_reg_balanced = LogisticRegression(solver='saga',
                                      C=1,
                                      class_weight='balanced',
                                      random_state=42).fit(train_ds, y_train)
print(f'Test evaluation: {f1_score(y_test, log_reg_balanced.predict(test_ds), average="weighted")}')
print(f'Train evaluation: {f1_score(y_train, log_reg_balanced.predict(train_ds), average="weighted")}')

Test evaluation: 0.8195540320305105
Train evaluation: 0.8794511080550723


In [15]:
# Per-class precision / recall / F1 on the hold-out split.
# `labels` and `target_names` both come from the encoder, so every row is named
# after the class it really belongs to, even if a class is missing from the split.
# NOTE(review): assumes y_test holds labels encoded consistently with `le`,
# as get_prediction_stats already does - confirm against the dataset build step.
# zero_division=0 scores an undefined metric (class never predicted) as 0
# instead of reporting a misleading 1.0.
print(classification_report(y_test,
                            log_predictions,
                            labels=le.transform(le.classes_),
                            target_names=le.classes_,
                            zero_division=0))

                   precision    recall  f1-score   support

        економіка       0.78      0.69      0.73       522
         кримінал       0.85      0.88      0.86      1319
міжнародні новини       0.81      0.87      0.84       881
         політика       0.83      0.85      0.84      2076
            спорт       0.99      0.97      0.98       451
      суспільство       0.75      0.66      0.71       951

         accuracy                           0.83      6200
        macro avg       0.83      0.82      0.83      6200
     weighted avg       0.82      0.83      0.82      6200



In [16]:
# Per-class sample counts and number of correct predictions on the hold-out split
y_test_series = pd.Series(y_test)
get_prediction_stats(y_test_series, target_pred=log_predictions)

суспільство      :   951, correctly predicted  632 (66.456%)
політика         :  2076, correctly predicted 1771 (85.308%)
кримінал         :  1319, correctly predicted 1159 (87.870%)
спорт            :   451, correctly predicted  436 (96.674%)
економіка        :   522, correctly predicted  360 (68.966%)
міжнародні новини:   881, correctly predicted  766 (86.947%)


In [21]:
# Highest-weighted features of the fitted model, one table per class
show_weights(
    log_reg,
    feature_names=orig_features,
    target_names=CLASSES,
    top=15,
)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5
+2.543,нбу,,,,
+1.606,ціна,,,,
+1.495,ачс,,,,
+1.428,україна,,,,
+1.350,банк,,,,
+1.310,мхп,,,,
+1.299,курятина,,,,
+1.243,бан,,,,
+1.222,ринок,,,,
+1.212,млн,,,,

Weight?,Feature
+2.543,нбу
+1.606,ціна
+1.495,ачс
+1.428,україна
+1.350,банк
+1.310,мхп
+1.299,курятина
+1.243,бан
+1.222,ринок
+1.212,млн

Weight?,Feature
+1.876,писати унн
+1.745,обшук
+1.581,викрити
+1.524,сбу
+1.519,статися
+1.513,дснс
+1.499,лайнер
+1.433,унн повідомити
+1.369,підозра
+1.352,журналіст унн

Weight?,Feature
+2.282,унн посилання
+2.123,країна
+2.077,писати унн
+1.879,рф
… 9177 more positive …,… 9177 more positive …
… 10809 more negative …,… 10809 more negative …
-1.773,украї
-1.778,ук
-1.806,укра
-1.814,ук

Weight?,Feature
+2.287,зеленський
+2.266,україна
+1.816,читати
+1.594,кримський
+1.512,рад
+1.475,передавати кореспондент
+1.461,вр
+1.432,гончарук
+1.392,вказати
+1.294,29

Weight?,Feature
+2.113,унн посилання
+2.043,клуб
+1.911,примітно
+1.837,уєфа
+1.734,поєдинок
+1.728,матч
+1.681,турнір
+1.613,фк
+1.613,інформувати
+1.604,динамо

Weight?,Feature
+2.850,кий
+2.217,столичний
+1.823,фільм
+1.765,кмда
+1.629,відзначати
+1.525,свято
+1.523,повідомлятися унн
+1.504,управління поліція
+1.418,моз
+1.308,петиція


In [22]:
# Explaining the prediction for a single hold-out document
SAMPLE_IDX = 6  # row of test_ds to explain

# Densify only the selected row; converting the whole sparse test matrix to a
# dense array just to read one row wastes memory.
sample_vector = test_ds[SAMPLE_IDX].toarray().ravel()

explain_prediction(log_reg,
                   sample_vector,
                   target_names=CLASSES,
                   feature_names=orig_features,
                   top=15)

Contribution?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0
Contribution?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Contribution?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Contribution?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Contribution?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
Contribution?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5
+0.124,посилання даний,,,,
+0.113,україна,,,,
+0.092,даний,,,,
+0.082,повідомляти посилання,,,,
+0.069,теплий,,,,
+0.053,унн повідомляти,,,,
+0.049,стан,,,,
+0.047,кінець,,,,
… 532 more positive …,… 532 more positive …,,,,
… 561 more negative …,… 561 more negative …,,,,

Contribution?,Feature
+0.124,посилання даний
+0.113,україна
+0.092,даний
+0.082,повідомляти посилання
+0.069,теплий
+0.053,унн повідомляти
+0.049,стан
+0.047,кінець
… 532 more positive …,… 532 more positive …
… 561 more negative …,… 561 more negative …

Contribution?,Feature
+0.199,вночі
+0.134,область
+0.081,решта
+0.076,територія
+0.074,узбережжя
… 604 more positive …,… 604 more positive …
… 489 more negative …,… 489 more negative …
-0.053,градус тепла
-0.054,південь
-0.057,кінець

Contribution?,Feature
+0.471,<BIAS>
+0.162,країна
+0.158,північний
+0.080,північ
+0.078,південь
+0.067,градус
… 535 more positive …,… 535 more positive …
… 558 more negative …,… 558 more negative …
-0.062,ук
-0.068,ук

Contribution?,Feature
+0.679,<BIAS>
+0.179,україна
+0.120,30
+0.095,північний
+0.063,море
… 383 more positive …,… 383 more positive …
… 710 more negative …,… 710 more negative …
-0.057,вдень
-0.062,посилання
-0.066,північ

Contribution?,Feature
+0.042,посилання
+0.037,місце
+0.032,30
+0.030,емп
… 416 more positive …,… 416 more positive …
… 677 more negative …,… 677 more negative …
-0.029,вде
-0.029,вихідний
-0.029,вден
-0.031,унн повідомляти

Contribution?,Feature
+0.442,<BIAS>
+0.243,градус
+0.218,температура
+0.205,вдень
+0.176,вночі
+0.145,укргідрометцентру
+0.133,°
+0.109,криму
+0.102,опадів
+0.097,тепла


In [24]:
# Persisting the fitted classifier with pickle
# (sklearn models are not JSON serializable, hence the binary format)
model_path = os.path.join(PROJ_PATH, 'src', 'class_model_log.md')
with open(model_path, 'wb') as model_file:
    pickle.dump(log_reg, model_file)