# Классические нейронные сети

In [None]:
#pip install pymorphy3
#pip install tensorflow==2.17.0

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from pymorphy3 import MorphAnalyzer

import warnings
warnings.filterwarnings("ignore")

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Bidirectional, Dense

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.metrics import cohen_kappa_score

In [None]:
# Load the annotated comments and shift every label up by one.
# NOTE(review): presumably the raw labels are -1/0/1 and become 0/1/2 here -
# confirm against the spreadsheet.
df = pd.read_excel('marked_data.xlsx')
df['labels'] = df['labels'].add(1)

# **Preprocessing**

In [None]:
# Morphological analyser; text_preprocessing uses it to obtain normal forms.
morph = MorphAnalyzer()

# Fetch the NLTK stop-word corpus and keep the Russian list as a set so that
# membership checks during preprocessing are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('russian')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def text_preprocessing(text):
    """Clean a raw comment and return its lemmatised tokens joined by spaces.

    Steps: strip HTML markup, replace every character that is not a Cyrillic
    letter or whitespace with a space, lower-case the result, drop stop words
    and replace each remaining word with its first normal form from ``morph``.

    Args:
        text: Raw comment text, possibly containing HTML. Non-string values
            (for example NaN coming from an empty cell) are treated as empty.

    Returns:
        The cleaned text as one string; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        # BeautifulSoup cannot parse non-string input such as float NaN.
        return ''

    # Drop HTML tags first so that tag names do not leak into the tokens.
    clean_text = BeautifulSoup(text, "html.parser").get_text()

    # "Ё"/"ё" sit outside the contiguous А-я code-point range, so they must be
    # listed explicitly; otherwise they are replaced by a space and split the
    # word they belong to.
    clean_text = re.sub(r'[^А-Яа-яЁё\s]', ' ', clean_text).lower()

    lemmas = []
    for word in clean_text.split():
        # The "ё"-less spelling is checked as well.
        # NOTE(review): assumes the stop-word list spells such words with "е"
        # (e.g. "еще") - confirm against stop_words.
        if word in stop_words or word.replace('ё', 'е') in stop_words:
            continue
        lemmas.append(morph.normal_forms(word)[0])
    return ' '.join(lemmas)

In [None]:
# Run the cleaning pipeline over every comment and store the result alongside
# the raw text.
df["clean_text"] = df["text"].apply(text_preprocessing)

# **Vectorization**

In [None]:
# Positional split: first 7000 rows for training, the next 1000 for
# validation, everything after that for testing.
# NOTE(review): rows are taken in file order without shuffling - confirm the
# spreadsheet is not sorted by label or date.
df_train, df_val, df_test = df.iloc[:7000], df.iloc[7000:8000], df.iloc[8000:]

In [None]:
# Build the word -> id vocabulary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train["clean_text"])
word_index = tokenizer.word_index

# One extra slot on top of the number of known words.
vocab_size = 1 + len(word_index)
print(vocab_size)

13901


In [None]:
# Number of whitespace-separated tokens in each cleaned comment (single column 0).
len_comments = pd.DataFrame([len(text.split()) for text in df.clean_text])

In [None]:
# Median comment length in tokens.
len_comments.median().loc[0]

6.0

In [None]:
# Share of comments shorter than 30 tokens.
print(len_comments.lt(30).mean().loc[0])

0.9503


In [None]:
# Share of comments shorter than 100 tokens.
print(len_comments.lt(100).mean().loc[0])

0.9949


In [None]:
# Convert each split to sequences of token ids, padded or truncated to
# 30 positions.
x_train, x_val, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(part.clean_text), maxlen=30)
    for part in (df_train, df_val, df_test)
)

In [None]:
# Fit the label encoder on the training labels only, then encode every split
# and reshape the targets into column vectors of shape (n, 1).
encoder = LabelEncoder().fit(df_train.labels.to_list())
y_train = encoder.transform(df_train.labels.to_list()).reshape(-1, 1)
y_val = encoder.transform(df_val.labels.to_list()).reshape(-1, 1)
y_test = encoder.transform(df_test.labels.to_list()).reshape(-1, 1)

## **GRU**

In [None]:
# GRU classifier: word embeddings -> one GRU layer -> 3-way softmax.
# `input_length` is intentionally not passed to Embedding: the previous value
# (200) contradicted the sequences fed to the model, which are padded to 30,
# and the argument is deprecated in Keras 3. The sequence length is inferred
# from the data on the first call.
embedding_dim = 32
model_GRU = Sequential([
    Embedding(vocab_size, embedding_dim),
    GRU(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),
])

In [None]:
# Train the model, then report its accuracy on the validation split.
model_GRU.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model_GRU.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=2,
    batch_size=16,
)
loss, accuracy = model_GRU.evaluate(x_val, y_val)
print(f'Accuracy: {accuracy:.2f}')

Epoch 1/2
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 47ms/step - accuracy: 0.6406 - loss: 0.8051 - val_accuracy: 0.5490 - val_loss: 1.0110
Epoch 2/2
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 43ms/step - accuracy: 0.7848 - loss: 0.5233 - val_accuracy: 0.5380 - val_loss: 1.2136
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5482 - loss: 1.1878
Accuracy: 0.54


In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_GRU.summary()

None


In [None]:
# Class probabilities for the test split, one column per output unit.
predict_test = model_GRU.predict(x_test, verbose=0)

# Copy of the test frame extended with the arg-max class and the per-class
# probabilities.
# NOTE(review): the Pneg/Pneutral/Ppos names assume output units 0/1/2 are
# negative/neutral/positive - confirm against the label encoding.
df_test_GRU = df_test.assign(
    predict=predict_test.argmax(axis=1).tolist(),
    Pneg=predict_test[:, 0],
    Pneutral=predict_test[:, 1],
    Ppos=predict_test[:, 2],
)
df_test_GRU['Ppos-Pneg'] = df_test_GRU['Ppos'] - df_test_GRU['Pneg']

In [None]:
# Test-set quality: accuracy plus macro-averaged precision, recall and F1.
print('accuracy', accuracy_score(
    y_true=df_test_GRU['labels'], y_pred=df_test_GRU['predict']))
print('macro_precision', precision_score(
    y_true=df_test_GRU['labels'], y_pred=df_test_GRU['predict'], average='macro'))
print('macro_recall', recall_score(
    y_true=df_test_GRU['labels'], y_pred=df_test_GRU['predict'], average='macro'))
print('macro_f1', f1_score(
    y_true=df_test_GRU['labels'], y_pred=df_test_GRU['predict'], average='macro'))

# Regression-style errors: `labels - 1` is compared with the Ppos - Pneg score.
print('RMSE for Ppos-Pneg', root_mean_squared_error(
    y_true=df_test_GRU['labels'] - 1, y_pred=df_test_GRU['Ppos-Pneg']))
print('MAE for Ppos-Pneg', mean_absolute_error(
    y_true=df_test_GRU['labels'] - 1, y_pred=df_test_GRU['Ppos-Pneg']))

accuracy 0.532
macro_precision 0.4891426008664343
macro_recall 0.48380306413992
macro_f1 0.4855911483870467
RMSE for Ppos-Pneg 0.7953473329544067
MAE for Ppos-Pneg 0.5971473455429077


In [None]:
# Per-class precision, recall and F1 (average=None returns one value per class).
print('precision by class', precision_score(
    y_true=df_test_GRU['labels'], y_pred=df_test_GRU['predict'], average=None))
print('recall by class', recall_score(
    y_true=df_test_GRU['labels'], y_pred=df_test_GRU['predict'], average=None))
print('f1 by class', f1_score(
    y_true=df_test_GRU['labels'], y_pred=df_test_GRU['predict'], average=None))

precision by class [0.63376111 0.45131376 0.38235294]
recall by class [0.65310275 0.47249191 0.32581454]
f1 by class [0.64328657 0.46166008 0.35182679]


In [None]:
# Confusion matrix with rows/columns in the fixed class order 0, 1, 2.
confusion_matrix(
    y_true=df_test_GRU['labels'],
    y_pred=df_test_GRU['predict'],
    labels=[0, 1, 2],
)

array([[642, 238, 103],
       [219, 292, 107],
       [152, 117, 130]])

In [None]:
# Quadratic-weighted Cohen's kappa between true labels and predictions
# (labels and sample_weight are left at their None defaults).
cohen_kappa_score(df_test_GRU['labels'], df_test_GRU['predict'], weights='quadratic')

0.2736877702855034

In [None]:
# Save the test frame together with the GRU predictions.
df_test_GRU.to_excel(excel_writer='Test_GRU.xlsx')

## **LSTM**

In [None]:
# LSTM classifier: word embeddings -> one LSTM layer -> 3-way softmax.
# `input_length` is intentionally not passed to Embedding: the previous value
# (200) contradicted the sequences fed to the model, which are padded to 30,
# and the argument is deprecated in Keras 3. The sequence length is inferred
# from the data on the first call.
embedding_dim = 32
model_LSTM = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),
])

In [None]:
# Train the model, then report its accuracy on the validation split.
model_LSTM.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model_LSTM.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=2,
    batch_size=16,
)
loss, accuracy = model_LSTM.evaluate(x_val, y_val)
print(f'Accuracy: {accuracy:.2f}')

Epoch 1/2
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 52ms/step - accuracy: 0.7876 - loss: 0.5288 - val_accuracy: 0.5240 - val_loss: 1.1180
Epoch 2/2
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 50ms/step - accuracy: 0.8533 - loss: 0.3808 - val_accuracy: 0.5290 - val_loss: 1.3103
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5295 - loss: 1.3002
Accuracy: 0.53


In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_LSTM.summary()

None


In [None]:
# Class probabilities for the test split, one column per output unit.
predict_test = model_LSTM.predict(x_test, verbose=0)

# Copy of the test frame extended with the arg-max class and the per-class
# probabilities.
# NOTE(review): the Pneg/Pneutral/Ppos names assume output units 0/1/2 are
# negative/neutral/positive - confirm against the label encoding.
df_test_LSTM = df_test.assign(
    predict=predict_test.argmax(axis=1).tolist(),
    Pneg=predict_test[:, 0],
    Pneutral=predict_test[:, 1],
    Ppos=predict_test[:, 2],
)
df_test_LSTM['Ppos-Pneg'] = df_test_LSTM['Ppos'] - df_test_LSTM['Pneg']

In [None]:
# Test-set quality: accuracy plus macro-averaged precision, recall and F1.
print('accuracy', accuracy_score(
    y_true=df_test_LSTM['labels'], y_pred=df_test_LSTM['predict']))
print('macro_precision', precision_score(
    y_true=df_test_LSTM['labels'], y_pred=df_test_LSTM['predict'], average='macro'))
print('macro_recall', recall_score(
    y_true=df_test_LSTM['labels'], y_pred=df_test_LSTM['predict'], average='macro'))
print('macro_f1', f1_score(
    y_true=df_test_LSTM['labels'], y_pred=df_test_LSTM['predict'], average='macro'))

# Regression-style errors: `labels - 1` is compared with the Ppos - Pneg score.
print('RMSE for Ppos-Pneg', root_mean_squared_error(
    y_true=df_test_LSTM['labels'] - 1, y_pred=df_test_LSTM['Ppos-Pneg']))
print('MAE for Ppos-Pneg', mean_absolute_error(
    y_true=df_test_LSTM['labels'] - 1, y_pred=df_test_LSTM['Ppos-Pneg']))

accuracy 0.537
macro_precision 0.4896299390723107
macro_recall 0.4670013717474095
macro_f1 0.4700105915861204
RMSE for Ppos-Pneg 0.8050621151924133
MAE for Ppos-Pneg 0.5858951807022095


In [None]:
# Per-class precision, recall and F1 (average=None returns one value per class).
print('precision by class', precision_score(
    y_true=df_test_LSTM['labels'], y_pred=df_test_LSTM['predict'], average=None))
print('recall by class', recall_score(
    y_true=df_test_LSTM['labels'], y_pred=df_test_LSTM['predict'], average=None))
print('f1 by class', f1_score(
    y_true=df_test_LSTM['labels'], y_pred=df_test_LSTM['predict'], average=None))

precision by class [0.60479042 0.45500849 0.40909091]
recall by class [0.71922686 0.43365696 0.2481203 ]
f1 by class [0.6570632  0.44407622 0.30889236]


In [None]:
# Confusion matrix with rows/columns in the fixed class order 0, 1, 2.
confusion_matrix(
    y_true=df_test_LSTM['labels'],
    y_pred=df_test_LSTM['predict'],
    labels=[0, 1, 2],
)

array([[707, 205,  71],
       [278, 268,  72],
       [184, 116,  99]])

In [None]:
# Quadratic-weighted Cohen's kappa between true labels and predictions
# (labels and sample_weight are left at their None defaults).
cohen_kappa_score(df_test_LSTM['labels'], df_test_LSTM['predict'], weights='quadratic')

0.24898917762760508

In [None]:
# Save the test frame together with the LSTM predictions.
df_test_LSTM.to_excel(excel_writer='Test_LSTM.xlsx')

## CNN

In [None]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

In [None]:
# 1-D CNN classifier: word embeddings -> Conv1D over windows of 3 tokens ->
# global max pooling -> 3-way softmax.
# `input_length` is intentionally not passed to Embedding: the previous value
# (200) contradicted the sequences fed to the model, which are padded to 30,
# and the argument is deprecated in Keras 3. The stray trailing commas after
# two of the add() calls (which turned those statements into throw-away
# tuples) are gone as well.
embedding_dim = 32
model_CNN = Sequential([
    Embedding(vocab_size, embedding_dim),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(3, activation='softmax'),
])

In [None]:
# Train the model, then report its accuracy on the validation split.
model_CNN.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model_CNN.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=2,
    batch_size=16,
)
loss, accuracy = model_CNN.evaluate(x_val, y_val)
print(f'Accuracy: {accuracy:.2f}')

Epoch 1/2
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.5042 - loss: 1.0133 - val_accuracy: 0.5470 - val_loss: 0.9565
Epoch 2/2
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6488 - loss: 0.8134 - val_accuracy: 0.5630 - val_loss: 0.9439
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5470 - loss: 0.9593
Accuracy: 0.56


In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_CNN.summary()

None


In [None]:
# Class probabilities for the test split, one column per output unit.
predict_test = model_CNN.predict(x_test, verbose=0)

# Copy of the test frame extended with the arg-max class and the per-class
# probabilities.
# NOTE(review): the Pneg/Pneutral/Ppos names assume output units 0/1/2 are
# negative/neutral/positive - confirm against the label encoding.
df_test_CNN = df_test.assign(
    predict=predict_test.argmax(axis=1).tolist(),
    Pneg=predict_test[:, 0],
    Pneutral=predict_test[:, 1],
    Ppos=predict_test[:, 2],
)
df_test_CNN['Ppos-Pneg'] = df_test_CNN['Ppos'] - df_test_CNN['Pneg']

In [None]:
# Test-set quality: accuracy plus macro-averaged precision, recall and F1.
print('accuracy', accuracy_score(
    y_true=df_test_CNN['labels'], y_pred=df_test_CNN['predict']))
print('macro_precision', precision_score(
    y_true=df_test_CNN['labels'], y_pred=df_test_CNN['predict'], average='macro'))
print('macro_recall', recall_score(
    y_true=df_test_CNN['labels'], y_pred=df_test_CNN['predict'], average='macro'))
print('macro_f1', f1_score(
    y_true=df_test_CNN['labels'], y_pred=df_test_CNN['predict'], average='macro'))

# Regression-style errors: `labels - 1` is compared with the Ppos - Pneg score.
print('RMSE for Ppos-Pneg', root_mean_squared_error(
    y_true=df_test_CNN['labels'] - 1, y_pred=df_test_CNN['Ppos-Pneg']))
print('MAE for Ppos-Pneg', mean_absolute_error(
    y_true=df_test_CNN['labels'] - 1, y_pred=df_test_CNN['Ppos-Pneg']))

accuracy 0.5515
macro_precision 0.516973278102664
macro_recall 0.480066137683029
macro_f1 0.4747059124989174
RMSE for Ppos-Pneg 0.7388778328895569
MAE for Ppos-Pneg 0.5910289287567139


In [None]:
# Per-class precision, recall and F1 (average=None returns one value per class).
print('precision by class', precision_score(
    y_true=df_test_CNN['labels'], y_pred=df_test_CNN['predict'], average=None))
print('recall by class', recall_score(
    y_true=df_test_CNN['labels'], y_pred=df_test_CNN['predict'], average=None))
print('f1 by class', f1_score(
    y_true=df_test_CNN['labels'], y_pred=df_test_CNN['predict'], average=None))

precision by class [0.62685185 0.46354167 0.46052632]
recall by class [0.68870804 0.57605178 0.1754386 ]
f1 by class [0.65632574 0.51370851 0.25408348]


In [None]:
# Confusion matrix with rows/columns in the fixed class order 0, 1, 2.
confusion_matrix(
    y_true=df_test_CNN['labels'],
    y_pred=df_test_CNN['predict'],
    labels=[0, 1, 2],
)

array([[677, 263,  43],
       [223, 356,  39],
       [180, 149,  70]])

In [None]:
# Quadratic-weighted Cohen's kappa between true labels and predictions
# (labels and sample_weight are left at their None defaults).
cohen_kappa_score(df_test_CNN['labels'], df_test_CNN['predict'], weights='quadratic')

0.24422600248642878

In [None]:
# Save the test frame together with the CNN predictions.
df_test_CNN.to_excel(excel_writer='Test_CNN.xlsx')

# Предсказания моделей на данных по всем комментариям

In [None]:
# Load the two comment collections that the trained models will score.
df_ec, df_cb = (
    pd.read_csv('ec_comments.csv'),
    pd.read_csv('comments_with_cb.csv'),
)

In [None]:
# Apply the same cleaning pipeline that was used for the training data.
df_ec["clean_text"] = df_ec["text"].apply(text_preprocessing)
df_cb["clean_text"] = df_cb["text"].apply(text_preprocessing)

In [None]:
# Encode both collections with the tokenizer fitted on the training data and
# pad or truncate every sequence to 30 positions.
ec_vec, cb_vec = (
    pad_sequences(tokenizer.texts_to_sequences(frame.clean_text), maxlen=30)
    for frame in (df_ec, df_cb)
)

In [None]:
# Trained models keyed by the short name used in the output file names.
models = dict(GRU=model_GRU, LSTM=model_LSTM, CNN=model_CNN)

In [None]:
def _predict_to_csv(model, frame, sequences, path):
    """Score ``sequences`` with ``model`` and write ``frame`` plus predictions.

    Args:
        model: Object whose ``predict(sequences)`` returns an array with one
            row per sample and three probability columns.
        frame: DataFrame with one row per sequence; it is not modified.
        sequences: Padded token-id sequences for the rows of ``frame``.
        path: Destination CSV path (the index is written too).
    """
    probabilities = model.predict(sequences)
    # NOTE(review): the Pneg/Pneutral/Ppos names assume output units 0/1/2 are
    # negative/neutral/positive - confirm against the label encoding.
    scored = frame.assign(
        predict=probabilities.argmax(axis=1).tolist(),
        Pneg=probabilities[:, 0],
        Pneutral=probabilities[:, 1],
        Ppos=probabilities[:, 2],
    )
    scored['Ppos-Pneg'] = scored['Ppos'] - scored['Pneg']
    scored.to_csv(path)


# Score both comment collections with every model. Each collection is copied,
# scored and written one at a time, so only one extended copy is alive at once.
for model_name, m in models.items():
    _predict_to_csv(m, df_ec, ec_vec, f'ec_{model_name}.csv')
    _predict_to_csv(m, df_cb, cb_vec, f'cb_{model_name}.csv')
    print(model_name)

[1m6265/6265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 7ms/step
[1m2330/2330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step
GRU
[1m6265/6265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 8ms/step
[1m2330/2330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 8ms/step
LSTM
[1m6265/6265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step
[1m2330/2330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
CNN
