# Модели классического машинного обучения

In [2]:
import pandas as pd
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from pymorphy3 import MorphAnalyzer

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.metrics import cohen_kappa_score

In [3]:
df=pd.read_excel('marked_data.xlsx')
df['labels'] = df['labels'] + 1

In [4]:
df_ec = pd.read_csv('ec_comments.csv')
df_cb = pd.read_csv('comments_with_cb.csv')

## Предобработка текста (Text preprocessing)

In [6]:
nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))
morph = MorphAnalyzer()

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [7]:
def text_preprocessing (text):
    soup = BeautifulSoup(text, "html.parser")
    clean_text = soup.get_text() #чтобы изначально удалить html-теги
    clean_text = re.sub(r'[^А-я\s]',' ',clean_text) #удаление неалфавитных символов (в том числе знаков пунктуации)
    clean_text = clean_text.lower() #приведение к нижнему регистру
    word_tokens = [morph.normal_forms(word)[0] for word in clean_text.split() if word not in stop_words] #удаление стоп-слов и лемматизация
    clean_text = ' '.join(word_tokens)
    return clean_text

In [8]:
%time df["clean_text"] = df["text"].map(text_preprocessing)

CPU times: total: 3.78 s
Wall time: 8.69 s


In [9]:
df_train = df[:7000]
df_val = df[7000:8000]
df_test = df[8000:]

### Naive Bayes

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
%%time
vectorizer = CountVectorizer()
X_train_NB = vectorizer.fit_transform(df_train['clean_text'])
y_train_NB = df_train['labels']

X_val_NB = vectorizer.transform(df_test['clean_text'])
y_val_NB = df_train['labels']

X_test_NB = vectorizer.transform(df_test['clean_text'])
y_test_NB = df_test['labels']

CPU times: total: 0 ns
Wall time: 80.2 ms


In [13]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
%time mnb.fit(X_train_NB, y_train_NB)

CPU times: total: 0 ns
Wall time: 2.36 ms


In [14]:
%%time
df_test_mnb = df_test.copy()
df_test_mnb['predict'] = mnb.predict(X_test_NB)
y_pred_test_proba_mnb = mnb.predict_proba(X_test_NB)
df_test_mnb['Pneg'] = list(map(lambda x: x[0], y_pred_test_proba_mnb))
df_test_mnb['Pneutral'] = list(map(lambda x: x[1], y_pred_test_proba_mnb))
df_test_mnb['Ppos'] = list(map(lambda x: x[2], y_pred_test_proba_mnb))
df_test_mnb['Ppos-Pneg'] = df_test_mnb['Ppos'] - df_test_mnb['Pneg']

CPU times: total: 0 ns
Wall time: 3.48 ms


In [15]:
print('accuracy', accuracy_score(df_test_mnb['labels'], df_test_mnb['predict']))
print('macro_precision', precision_score(df_test_mnb['labels'], df_test_mnb['predict'], average='macro'))
print('macro_recall', recall_score(df_test_mnb['labels'], df_test_mnb['predict'], average='macro'))
print('macro_f1', f1_score(df_test_mnb['labels'], df_test_mnb['predict'], average='macro'))
print('RMSE for Ppos-Pneg', root_mean_squared_error(df_test_mnb['labels']-1, df_test_mnb['Ppos-Pneg']))
print('MAE for Ppos-Pneg', mean_absolute_error(df_test_mnb['labels']-1, df_test_mnb['Ppos-Pneg']))

accuracy 0.4875
macro_precision 0.44237391621542765
macro_recall 0.3669832010387421
macro_f1 0.3317671069276428
RMSE for Ppos-Pneg 0.829511611686629
MAE for Ppos-Pneg 0.6129509306042501


In [16]:
print('precision by class', precision_score(df_test_mnb['labels'], df_test_mnb['predict'], average=None))
print('recall by class', recall_score(df_test_mnb['labels'], df_test_mnb['predict'], average=None))
print('f1 by class', f1_score(df_test_mnb['labels'], df_test_mnb['predict'], average=None))

precision by class [0.50382128 0.36585366 0.45744681]
recall by class [0.87182096 0.12135922 0.10776942]
f1 by class [0.63859911 0.18226002 0.17444219]


In [17]:
confusion_matrix(df_test_mnb['labels'], df_test_mnb['predict'], labels=[0,1,2])

array([[857,  90,  36],
       [528,  75,  15],
       [316,  40,  43]], dtype=int64)

In [18]:
cohen_kappa_score(df_test_mnb['labels'], df_test_mnb['predict'], labels=None, weights= 'quadratic', sample_weight=None)

0.07036459934099071

In [19]:
df_test_mnb.to_excel('Test_mnb.xlsx')

## Векторизация для других классчиеских ML моделей

In [21]:
%%time
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(df_train['clean_text'])
y_train = df_train['labels']

X_val = tfidf_vectorizer.transform(df_val['clean_text'])
y_val = df_val['labels']

X_test = tfidf_vectorizer.transform(df_test['clean_text'])
y_test = df_test['labels']

CPU times: total: 31.2 ms
Wall time: 86.7 ms


### Logistic Regression Classifier

In [23]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=42)
%time lr.fit(X_train, y_train)

CPU times: total: 0 ns
Wall time: 241 ms


In [24]:
lr = LogisticRegression(random_state=42)
%time lr.fit(X_train, y_train)

df_test_lr = df_test.copy()
df_test_lr['predict'] = lr.predict(X_test)
y_pred_test_proba_lr = lr.predict_proba(X_test)
df_test_lr['Pneg'] = list(map(lambda x: x[0], y_pred_test_proba_lr))
df_test_lr['Pneutral'] = list(map(lambda x: x[1], y_pred_test_proba_lr))
df_test_lr['Ppos'] = list(map(lambda x: x[2], y_pred_test_proba_lr))
df_test_lr['Ppos-Pneg'] = df_test_lr['Ppos'] - df_test_lr['Pneg']

CPU times: total: 0 ns
Wall time: 204 ms


In [25]:
print('accuracy', accuracy_score(df_test_lr['labels'], df_test_lr['predict']))
print('macro_precision', precision_score(df_test_lr['labels'], df_test_lr['predict'], average='macro'))
print('macro_recall', recall_score(df_test_lr['labels'], df_test_lr['predict'], average='macro'))
print('macro_f1', f1_score(df_test_lr['labels'], df_test_lr['predict'], average='macro'))
print('RMSE for Ppos-Pneg', root_mean_squared_error(df_test_lr['labels']-1, df_test_lr['Ppos-Pneg']))
print('MAE for Ppos-Pneg', mean_absolute_error(df_test_lr['labels']-1, df_test_lr['Ppos-Pneg']))

accuracy 0.538
macro_precision 0.5319936012351312
macro_recall 0.43839615714347685
macro_f1 0.43476161323829793
RMSE for Ppos-Pneg 0.7362317838378271
MAE for Ppos-Pneg 0.6293217202479642


In [26]:
print('precision by class', precision_score(df_test_lr['labels'], df_test_lr['predict'], average=None))
print('recall by class', recall_score(df_test_lr['labels'], df_test_lr['predict'], average=None))
print('f1 by class', f1_score(df_test_lr['labels'], df_test_lr['predict'], average=None))

precision by class [0.55849582 0.46475771 0.57272727]
recall by class [0.81586979 0.34142395 0.15789474]
f1 by class [0.66308392 0.39365672 0.2475442 ]


In [27]:
confusion_matrix(df_test_lr['labels'], df_test_lr['predict'], labels=[0,1,2])

array([[802, 161,  20],
       [380, 211,  27],
       [254,  82,  63]], dtype=int64)

In [28]:
cohen_kappa_score(df_test_lr['labels'], df_test_lr['predict'], labels=None, weights= 'quadratic', sample_weight=None)

0.18927051062027767

In [29]:
df_test_lr.to_excel('Test_lr.xlsx')

### SVC

In [31]:
from sklearn.svm import SVC

svc = SVC(random_state=42, probability=True, kernel = 'linear')
%time svc.fit(X_train, y_train)

CPU times: total: 5.09 s
Wall time: 17.9 s


In [32]:
%%time
df_test_svc = df_test.copy()
df_test_svc['predict'] = svc.predict(X_test)
y_pred_test_proba_svc = svc.predict_proba(X_test)
df_test_svc['Pneg'] = list(map(lambda x: x[0], y_pred_test_proba_svc))
df_test_svc['Pneutral'] = list(map(lambda x: x[1], y_pred_test_proba_svc))
df_test_svc['Ppos'] = list(map(lambda x: x[2], y_pred_test_proba_svc))
df_test_svc['Ppos-Pneg'] = df_test_svc['Ppos'] - df_test_svc['Pneg']

CPU times: total: 328 ms
Wall time: 1.26 s


In [33]:
print('accuracy', accuracy_score(df_test_svc['labels'], df_test_svc['predict']))
print('macro_precision', precision_score(df_test_svc['labels'], df_test_svc['predict'], average='macro'))
print('macro_recall', recall_score(df_test_svc['labels'], df_test_svc['predict'], average='macro'))
print('macro_f1', f1_score(df_test_svc['labels'], df_test_svc['predict'], average='macro'))
print('RMSE for Ppos-Pneg', root_mean_squared_error(df_test_svc['labels']-1, df_test_svc['Ppos-Pneg']))
print('MAE for Ppos-Pneg', mean_absolute_error(df_test_svc['labels']-1, df_test_svc['Ppos-Pneg']))

accuracy 0.537
macro_precision 0.5294213721202442
macro_recall 0.4442834106438321
macro_f1 0.44494951954417566
RMSE for Ppos-Pneg 0.7396651279320561
MAE for Ppos-Pneg 0.6332129654269467


In [34]:
print('precision by class', precision_score(df_test_svc['labels'], df_test_svc['predict'], average=None))
print('recall by class', recall_score(df_test_svc['labels'], df_test_svc['predict'], average=None))
print('f1 by class', f1_score(df_test_svc['labels'], df_test_svc['predict'], average=None))

precision by class [0.55937053 0.46300211 0.56589147]
recall by class [0.79552391 0.35436893 0.18295739]
f1 by class [0.65686686 0.40146654 0.27651515]


In [35]:
confusion_matrix(df_test_svc['labels'], df_test_svc['predict'], labels=[0,1,2])

array([[782, 170,  31],
       [374, 219,  25],
       [242,  84,  73]], dtype=int64)

In [36]:
cohen_kappa_score(df_test_svc['labels'], df_test_svc['predict'], labels=None, weights= 'quadratic', sample_weight=None)

0.19507505867418484

In [37]:
df_test_svc.to_excel('Test_svc.xlsx')

### Random forest

In [39]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1, min_samples_split = 2)
%time rf.fit(X_train, y_train)

CPU times: total: 1min 31s
Wall time: 5.4 s


In [40]:
%%time
df_test_rf = df_test.copy()
df_test_rf['predict'] = rf.predict(X_test)
y_pred_test_proba_rf = rf.predict_proba(X_test)
df_test_rf['Pneg'] = y_pred_test_proba_rf[:, 0]
df_test_rf['Pneutral'] = y_pred_test_proba_rf[:, 1]
df_test_rf['Ppos'] = y_pred_test_proba_rf[:, 2]
df_test_rf['Ppos-Pneg'] = df_test_rf['Ppos'] - df_test_rf['Pneg']

CPU times: total: 219 ms
Wall time: 372 ms


In [41]:
print('accuracy', accuracy_score(df_test_rf['labels'], df_test_rf['predict']))
print('macro_precision', precision_score(df_test_rf['labels'], df_test_rf['predict'], average='macro'))
print('macro_recall', recall_score(df_test_rf['labels'], df_test_rf['predict'], average='macro'))
print('macro_f1', f1_score(df_test_rf['labels'], df_test_rf['predict'], average='macro'))
print('RMSE for Ppos-Pneg', root_mean_squared_error(df_test_rf['labels']-1, df_test_rf['Ppos-Pneg']))
print('MAE for Ppos-Pneg', mean_absolute_error(df_test_rf['labels']-1, df_test_rf['Ppos-Pneg']))

accuracy 0.5415
macro_precision 0.5226986291387822
macro_recall 0.45533791300871
macro_f1 0.44716363327765646
RMSE for Ppos-Pneg 0.7440814179503622
MAE for Ppos-Pneg 0.6204618951434312


In [42]:
print('precision by class', precision_score(df_test_rf['labels'], df_test_rf['predict'], average=None))
print('recall by class', recall_score(df_test_rf['labels'], df_test_rf['predict'], average=None))
print('f1 by class', f1_score(df_test_rf['labels'], df_test_rf['predict'], average=None))

precision by class [0.58871627 0.45937962 0.52      ]
recall by class [0.73245168 0.50323625 0.13032581]
f1 by class [0.65276519 0.48030888 0.20841683]


In [43]:
confusion_matrix(df_test_rf['labels'], df_test_rf['predict'], labels=[0,1,2])

array([[720, 245,  18],
       [277, 311,  30],
       [226, 121,  52]], dtype=int64)

In [44]:
cohen_kappa_score(df_test_rf['labels'], df_test_rf['predict'], labels=None, weights= 'quadratic', sample_weight=None)

0.19528315882348335

In [45]:
df_test_rf.to_excel('Test_rf.xlsx')

### XGBoost

In [47]:
from xgboost import XGBClassifier

In [48]:
xgb_cl = XGBClassifier(n_estimators = 800, learning_rate = 0.15, max_depth = 7)
%time xgb_cl.fit(X_train, y_train)

CPU times: total: 4min 10s
Wall time: 15.1 s


In [49]:
%%time
df_test_xgb = df_test.copy()
df_test_xgb['predict'] = xgb_cl.predict(X_test)
y_pred_test_proba_xgb = xgb_cl.predict_proba(X_test)
df_test_xgb['Pneg'] = y_pred_test_proba_xgb[:, 0]
df_test_xgb['Pneutral'] = y_pred_test_proba_xgb[:, 1]
df_test_xgb['Ppos'] = y_pred_test_proba_xgb[:, 2]
df_test_xgb['Ppos-Pneg'] = df_test_xgb['Ppos'] - df_test_xgb['Pneg']

CPU times: total: 844 ms
Wall time: 77.7 ms


In [50]:
print('accuracy', accuracy_score(df_test_xgb['labels'], df_test_xgb['predict']))
print('macro_precision', precision_score(df_test_xgb['labels'], df_test_xgb['predict'], average='macro'))
print('macro_recall', recall_score(df_test_xgb['labels'], df_test_xgb['predict'], average='macro'))
print('macro_f1', f1_score(df_test_xgb['labels'], df_test_xgb['predict'], average='macro'))
print('RMSE for Ppos-Pneg', root_mean_squared_error(df_test_xgb['labels']-1, df_test_xgb['Ppos-Pneg']))
print('MAE for Ppos-Pneg', mean_absolute_error(df_test_xgb['labels']-1, df_test_xgb['Ppos-Pneg']))

accuracy 0.54
macro_precision 0.5030925731883156
macro_recall 0.47170942282602907
macro_f1 0.4717059349799652
RMSE for Ppos-Pneg 0.7487475389685869
MAE for Ppos-Pneg 0.5956262961197645


In [51]:
print('precision by class', precision_score(df_test_xgb['labels'], df_test_xgb['predict'], average=None))
print('recall by class', recall_score(df_test_xgb['labels'], df_test_xgb['predict'], average=None))
print('f1 by class', f1_score(df_test_xgb['labels'], df_test_xgb['predict'], average=None))

precision by class [0.6101083  0.45531915 0.44385027]
recall by class [0.68769074 0.51941748 0.20802005]
f1 by class [0.64658058 0.48526077 0.28327645]


In [52]:
confusion_matrix(df_test_xgb['labels'], df_test_xgb['predict'], labels=[0,1,2])

array([[676, 256,  51],
       [244, 321,  53],
       [188, 128,  83]], dtype=int64)

In [53]:
cohen_kappa_score(df_test_xgb['labels'], df_test_xgb['predict'], labels=None, weights= 'quadratic', sample_weight=None)

0.23473776328386786

In [54]:
df_test_xgb.to_excel('Test_xgb.xlsx')

## Предсказания моделей на данных по всем комментариям

In [56]:
df_ec["clean_text"] = df_ec["text"].map(text_preprocessing)
df_cb["clean_text"] = df_cb["text"].map(text_preprocessing)

In [57]:
ec_mnb = vectorizer.transform(df_ec['clean_text'])
cb_mnb = vectorizer.transform(df_cb['clean_text'])

In [58]:
ec = df_ec.copy()
cb = df_cb.copy()

ec['predict'] = mnb.predict(ec_mnb)
y_pred_proba_ec = mnb.predict_proba(ec_mnb)
ec['Pneg'] = list(map(lambda x: x[0], y_pred_proba_ec))
ec['Pneutral'] = list(map(lambda x: x[1], y_pred_proba_ec))
ec['Ppos'] = list(map(lambda x: x[2], y_pred_proba_ec))
ec['Ppos-Pneg'] = ec['Ppos'] - ec['Pneg']
ec.to_csv(f'ec_mnb.csv')

cb['predict'] = mnb.predict(cb_mnb)
y_pred_proba_cb = mnb.predict_proba(cb_mnb)
cb['Pneg'] = list(map(lambda x: x[0], y_pred_proba_cb))
cb['Pneutral'] = list(map(lambda x: x[1], y_pred_proba_cb))
cb['Ppos'] = list(map(lambda x: x[2], y_pred_proba_cb))
cb['Ppos-Pneg'] = cb['Ppos'] - cb['Pneg']
cb.to_csv(f'cb_mnb.csv')

In [59]:
ec_tfidf = tfidf_vectorizer.transform(df_ec['clean_text'])
cb_tfidf = tfidf_vectorizer.transform(df_cb['clean_text'])

In [60]:
models = {'lr': lr, 'svc': svc, 'rf': rf, 'xgb': xgb_cl}

In [61]:
for model_name in models.keys():
    model = models[model_name]
    ec = df_ec.copy()
    cb = df_cb.copy()
    
    ec['predict'] = model.predict(ec_tfidf)
    y_pred_proba_ec = model.predict_proba(ec_tfidf)
    ec['Pneg'] = list(map(lambda x: x[0], y_pred_proba_ec))
    ec['Pneutral'] = list(map(lambda x: x[1], y_pred_proba_ec))
    ec['Ppos'] = list(map(lambda x: x[2], y_pred_proba_ec))
    ec['Ppos-Pneg'] = ec['Ppos'] - ec['Pneg']
    path = f'ec_{model_name}.csv'
    ec.to_csv(path)

    cb['predict'] = model.predict(cb_tfidf)
    y_pred_proba_cb = model.predict_proba(cb_tfidf)
    cb['Pneg'] = list(map(lambda x: x[0], y_pred_proba_cb))
    cb['Pneutral'] = list(map(lambda x: x[1], y_pred_proba_cb))
    cb['Ppos'] = list(map(lambda x: x[2], y_pred_proba_cb))
    cb['Ppos-Pneg'] = cb['Ppos'] - cb['Pneg']
    path = f'cb_{model_name}.csv'
    cb.to_csv(path)
    
    print(model_name)

lr
svc
rf
xgb
