In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import time
import re

from sklearn.metrics import classification_report

from bs4 import BeautifulSoup             


from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Tools for creating ngrams and vectorizing input data
from gensim.models import Word2Vec, Phrases

# Configs
pd.options.display.float_format = '{:,.4f}'.format
sns.set(style="whitegrid")
seed = 42
np.random.seed(seed)

In [16]:
df = pd.read_csv("task-3-dataset.csv") # Считываем данные

In [17]:
import pymorphy3
morph = pymorphy3.MorphAnalyzer()
# Функция загрузки стопслов
def downloads_():
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
# Функция обработки текта
def foo(review, morph):
    # Обработка текста отзыва. Оставляем только буквы, приводим к нижнему регистру
    review = re.sub('\[[^]]*\]', ' ', review)
    review = re.sub('[^а-яА-Я]', ' ', review)
    review = review.lower()
    # Отделяем слова
    review = review.split()
    # Избавляемся от стоп-слов(предлоги,союзы, частицы, не несущие семантической нагрузки)
    review = [word for word in review if not word in set(stopwords.words('russian'))]
    # Лемматизируем(для русского языка в явном виде нет, но пока тк)
    # В явном виде лемматизации нет для русского языка,  SNOWBALL STEMMER как вариант
    lemmatized_words = [morph.normal_forms(word)[0] for word in review]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

In [18]:
# Разделим данные, пока нет тестовых
from sklearn.model_selection import train_test_split
dataset_train, dataset_test, train_data_label, test_data_label = train_test_split(df['отзывы'], df['разметка'], test_size=0.2, random_state=42)
train_data_label

150    -
96     +
200    +
68     +
156    -
      ..
106    -
14     -
92     -
179    -
102    +
Name: разметка, Length: 168, dtype: object

In [19]:
# Сформируем тестовый и трейновый словари
corpus_train = []
corpus_test  = []

downloads_()
for i in range(dataset_train.shape[0]):
    review = dataset_train.iloc[i]
    review = foo(review, morph)
    corpus_train.append(review)

for j in range(dataset_test.shape[0]):
    review = dataset_test.iloc[j]
    review = foo(review, morph)
    corpus_test.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fayne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Векторизуем с помощью TF-IDF 
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(ngram_range=(1, 3))

tfidf_vec_train = tfidf_vec.fit_transform(corpus_train)
tfidf_vec_test = tfidf_vec.transform(corpus_test)


In [21]:
# Обучаем
from sklearn.svm import LinearSVC

linear_svc = LinearSVC(C=0.5, random_state=42)
linear_svc.fit(tfidf_vec_train, train_data_label)

predict = linear_svc.predict(tfidf_vec_test)

In [22]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print("Classification Report: \n", classification_report(test_data_label, predict,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_data_label, predict))
print("Accuracy: \n", accuracy_score(test_data_label, predict))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.93      0.93      0.93        28
    Positive       0.86      0.86      0.86        14

    accuracy                           0.90        42
   macro avg       0.89      0.89      0.89        42
weighted avg       0.90      0.90      0.90        42

Confusion Matrix: 
 [[26  2]
 [ 2 12]]
Accuracy: 
 0.9047619047619048


In [23]:
# Векторизуем с помощью другого векторизатора   
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer(ngram_range=(1, 3), binary=False)
count_vec_train = count_vec.fit_transform(corpus_train)
count_vec_test = count_vec.transform(corpus_test)

linear_svc_count = LinearSVC(C=0.5, random_state=42, max_iter=5000)
linear_svc_count.fit(count_vec_train, train_data_label)
predict_count = linear_svc_count.predict(count_vec_test)

print("Classification Report: \n", classification_report(test_data_label, predict_count,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_data_label, predict_count))
print("Accuracy: \n", accuracy_score(test_data_label, predict_count))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.90      0.68      0.78        28
    Positive       0.57      0.86      0.69        14

    accuracy                           0.74        42
   macro avg       0.74      0.77      0.73        42
weighted avg       0.79      0.74      0.75        42

Confusion Matrix: 
 [[19  9]
 [ 2 12]]
Accuracy: 
 0.7380952380952381


In [24]:
# И снова другой векторизатор
ind_vec = CountVectorizer(ngram_range=(1, 3), binary=True)
ind_vec_train = ind_vec.fit_transform(corpus_train)
ind_vec_test = ind_vec.transform(corpus_test)

linear_svc_ind = LinearSVC(C=0.5, random_state=42)
linear_svc_ind.fit(ind_vec_train, train_data_label)
predict_ind = linear_svc_ind.predict(ind_vec_test)

print("Classification Report: \n", classification_report(test_data_label, predict_ind,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_data_label, predict_ind))
print("Accuracy: \n", accuracy_score(test_data_label, predict_ind))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.90      0.68      0.78        28
    Positive       0.57      0.86      0.69        14

    accuracy                           0.74        42
   macro avg       0.74      0.77      0.73        42
weighted avg       0.79      0.74      0.75        42

Confusion Matrix: 
 [[19  9]
 [ 2 12]]
Accuracy: 
 0.7380952380952381


In [25]:
# TF_IDF дал лучший результат, используем его, добаим наивный байесовский классификатор
tfidf_vec_NB = TfidfVectorizer(ngram_range=(1, 1))
tfidf_vec_train_NB = tfidf_vec_NB.fit_transform(corpus_train)

tfidf_vec_test_NB = tfidf_vec_NB.transform(corpus_test)

print(tfidf_vec_train_NB.toarray().shape, tfidf_vec_test_NB.toarray().shape)

(168, 762) (42, 762)


In [26]:
from sklearn.feature_selection import SelectKBest, chi2

ch2 = SelectKBest(chi2, k=50000)
tfidf_vec_train_NB = ch2.fit_transform(tfidf_vec_train_NB, train_data_label)
tfidf_vec_test_NB  = ch2.transform(tfidf_vec_test_NB)



In [27]:
feature_names = tfidf_vec_NB.get_feature_names_out()
feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
feature_names = np.asarray(feature_names)

from sklearn.naive_bayes import MultinomialNB
multi_clf = MultinomialNB()
multi_clf.fit(tfidf_vec_train_NB, train_data_label)
predict_NB = multi_clf.predict(tfidf_vec_test_NB)

print("Classification Report: \n", classification_report(test_data_label, predict_NB,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_data_label, predict_NB))
print("Accuracy: \n", accuracy_score(test_data_label, predict_NB))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.93      0.96      0.95        28
    Positive       0.92      0.86      0.89        14

    accuracy                           0.93        42
   macro avg       0.93      0.91      0.92        42
weighted avg       0.93      0.93      0.93        42

Confusion Matrix: 
 [[27  1]
 [ 2 12]]
Accuracy: 
 0.9285714285714286


In [28]:
count_vec_NB = CountVectorizer(ngram_range=(1, 3), binary=False)
count_vec_train_NB = count_vec_NB.fit_transform(corpus_train)
count_vec_test_NB = count_vec_NB.transform(corpus_test)

multi_clf_count = MultinomialNB()
multi_clf_count.fit(count_vec_train_NB, train_data_label)
predict_NB_count = multi_clf_count.predict(count_vec_test_NB)

print("Classification Report: \n", classification_report(test_data_label, predict_NB_count,target_names=['Negative','Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_data_label, predict_NB_count))
print("Accuracy: \n", accuracy_score(test_data_label, predict_NB_count))

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.93      0.89      0.91        28
    Positive       0.80      0.86      0.83        14

    accuracy                           0.88        42
   macro avg       0.86      0.88      0.87        42
weighted avg       0.88      0.88      0.88        42

Confusion Matrix: 
 [[25  3]
 [ 2 12]]
Accuracy: 
 0.8809523809523809


In [29]:
# Попробуем теперь LSTM НАХУЙ
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Masking, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tfds

# ***ВАРИАНТ РЕШЕНИЯ С LSTM***

In [52]:
max_features = 20000
maxlen = 200
tokenizer = Tokenizer(num_words=max_features)
df = pd.read_csv('task-3-dataset.csv')
df["разметка"].loc[df["разметка"]=="+"]=1
df["разметка"].loc[df["разметка"]=="-"]=0
X_data = df.drop(['разметка'],axis=1)
y_data = df['отзывы']
train, test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)
train.columns = ['отзывы']
test.columns = ['отзывы']

In [53]:
tokenizer.fit_on_texts(train['отзывы'])
X_train_token = tokenizer.texts_to_sequences(train['отзывы'])
tokenizer.fit_on_texts(test['отзывы'])
X_test_token = tokenizer.texts_to_sequences(test['отзывы'])

In [54]:
X_train = pad_sequences(X_train_token, maxlen=maxlen, padding='post')
X_test  = pad_sequences(X_test_token, maxlen=maxlen, padding='post')

In [55]:
y_train = train_data_label.copy()
y_train = y_train.replace('-',0)
y_train = y_train.replace('+',1)
y_test  = test_data_label.copy()
y_test = y_test.replace('-',0)
y_test = y_test.replace('+',1)

In [56]:
model = Sequential([Embedding(max_features, 64, mask_zero=True),
                    Bidirectional(LSTM(64, dropout=0.2)),
                    Dense(64, activation='sigmoid'),
                    Dense(1)])

In [57]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [62]:
history = model.fit(X_train, y_train,
                    batch_size=50,
                    epochs=10,
                    validation_data=(X_test, y_test))

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step - accuracy: 0.4426 - loss: 8.9847 - val_accuracy: 0.3333 - val_loss: 10.7454
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.4392 - loss: 9.0384 - val_accuracy: 0.3333 - val_loss: 10.7454
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - accuracy: 0.4312 - loss: 9.1674 - val_accuracy: 0.3333 - val_loss: 10.7454
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - accuracy: 0.4426 - loss: 8.9847 - val_accuracy: 0.3333 - val_loss: 10.7454
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - accuracy: 0.4586 - loss: 8.7268 - val_accuracy: 0.3333 - val_loss: 10.7454
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - accuracy: 0.4599 - loss: 8.7053 - val_accuracy: 0.3333 - val_loss: 10.7454
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━