In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

import pymorphy2

import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier


from gensim.models import Word2Vec
from gensim import corpora

In [2]:
data = pd.read_csv("dictionary_documents/text_rating_final.csv", delimiter=";", 
                   header=None, index_col=False, usecols=[0,1])
data = data.rename(columns = {0: "text", 1: "rating"})
data.dropna(inplace=True)
data = data[data['rating'] != ' совершенный']
data['rating'] = data['rating'].astype("Int8")

In [3]:
data.head(5)

Unnamed: 0,text,rating
0,"Не рациональная системность, а интуитивный поз...",0
1,"Когда возникнут трудности, они тебе не помогут...",0
2,Кривая национализация это политический компром...,-1
3,Такой вид биологического оружия не действует н...,-2
4,В Эль-Кусейре /к западу от Хомса/ сирийские по...,0


In [4]:
x_train, x_test, y_train, y_test = train_test_split(data['text'].iloc[:1000], data['rating'][:1000], test_size=0.15)

In [5]:
def tokenize(string):
    tokens = RegexpTokenizer(r'\w+').tokenize(string)
    tokens = [word.lower() for word in tokens]
    tokens = [word for word in tokens if word not in stopwords.words("russian")]
    return tokens


dictionary = corpora.Dictionary(texts)
print(dictionary)

corpus = [dictionary.doc2bow(text) for text in texts] 

In [6]:

words = data['text'].iloc[:1000].apply(tokenize)

model = Word2Vec(words, vector_size=100, window=5, min_count=1, workers=4)


In [65]:
min_text_len = min(words.apply(lambda x: len(x)))
min_text_len

21

In [66]:
texts_vec = words.apply(lambda x: np.array([model.wv[word] for word in x[:min_text_len]]).flatten())

In [67]:
texts_vec

0       [0.004977012, 0.007866973, 0.00787337, -0.0040...
1       [0.008886005, 0.0016663035, 0.0033848323, 0.00...
2       [0.00588346, -0.0020739017, 0.0050357673, 0.00...
3       [0.0014542402, 0.010155585, 0.0011152382, -0.0...
4       [0.0032784594, 0.009981124, 0.008115054, -0.00...
                              ...                        
1004    [0.0020395524, 0.00018774561, 0.0030498386, -0...
1005    [-0.0044261706, 0.0076282057, -0.0025058396, 0...
1006    [-0.00946207, 0.008142503, 0.009441727, 0.0012...
1007    [-0.008628812, 0.007820256, -0.005420964, -0.0...
1008    [0.0015959925, -0.0038259947, 0.010128955, -0....
Name: text, Length: 1000, dtype: object

In [68]:
w2v_x_train, w2v_x_test, w2v_y_train, w2v_y_test = train_test_split(texts_vec, data['rating'][:1000], test_size=0.15)

In [74]:
w2v_x_train = np.vstack(w2v_x_train)


In [75]:
clf = SGDClassifier().fit(w2v_x_train, w2v_y_train)

In [76]:
predicted = clf.predict(np.vstack(w2v_x_test))
score = (predicted == w2v_y_test).sum()/len(predicted)
print(score)

0.35333333333333333


In [None]:
# Токенизация и векторизация
count_vect = CountVectorizer(lowercase=True, stop_words=stopwords.words("russian"))
x_train_tokens = count_vect.fit_transform(x_train)
x_test_tokens = count_vect.transform(x_test)
#tfidf_transformer = TfidfTransformer()
#x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
#x_test_tfidf = tfidf_transformer.transform(x_test_counts)

In [30]:
classifiers = {"Наивный байесовский классификатор": MultinomialNB(), "Метод опорных векторов":SGDClassifier(), 
               "Дерево решений": DecisionTreeClassifier(), "Ансамблевый метод(Бустинг)": GradientBoostingClassifier()}
vectorizers = {"Мешок слов": CountVectorizer(lowercase=True, stop_words=stopwords.words("russian")), 
               "TF-IDF": TfidfVectorizer(lowercase=True, stop_words=stopwords.words("russian"))}
scores = {}
for v_name, vectorizer in vectorizers.items():
    train_transformed = vectorizer.fit_transform(x_train)
    print(train_transformed)
    test_transformed = vectorizer.transform(x_test)
    scores[v_name] = {}
    for c_name, classifier in classifiers.items():
        classifier.fit(train_transformed, y_train)
        predicted = classifier.predict(test_transformed)
        score = (predicted == y_test).sum()/len(predicted)
        scores[v_name][c_name] = score
        print(f"Метод векторизации - {v_name}. Классификатор - {c_name}, Score - {score}")
print(scores)

  (0, 27942)	2
  (0, 23673)	1
  (0, 16369)	1
  (0, 27524)	1
  (0, 22121)	1
  (0, 11458)	1
  (0, 192)	1
  (0, 5334)	3
  (0, 15191)	1
  (0, 14645)	1
  (0, 22283)	1
  (0, 9678)	1
  (0, 22733)	1
  (0, 17405)	1
  (0, 1556)	1
  (0, 5237)	1
  (0, 14448)	1
  (0, 19448)	1
  (0, 24802)	1
  (0, 19163)	1
  (0, 4556)	1
  (0, 11806)	1
  (0, 17225)	1
  (0, 4434)	1
  (0, 6557)	1
  :	:
  (849, 26621)	1
  (849, 24231)	3
  (849, 8682)	1
  (849, 6683)	1
  (849, 6638)	1
  (849, 23580)	1
  (849, 19307)	1
  (849, 4156)	1
  (849, 213)	2
  (849, 25506)	1
  (849, 16929)	1
  (849, 21107)	1
  (849, 13791)	1
  (849, 41)	1
  (849, 393)	1
  (849, 8750)	1
  (849, 342)	1
  (849, 23614)	1
  (849, 20815)	1
  (849, 23641)	1
  (849, 6433)	1
  (849, 2562)	1
  (849, 18680)	1
  (849, 1199)	1
  (849, 417)	1
Метод векторизации - Мешок слов. Классификатор - Наивный байесовский классификатор, Score - 0.4866666666666667
Метод векторизации - Мешок слов. Классификатор - Метод опорных векторов, Score - 0.47333333333333333
Метод вект


KeyboardInterrupt



In [None]:
fig, ax = plt.subplots()
fig.set_figwidth(10)
a = np.arange(len(scores['TF-IDF']))
width = 0.1

names = classifiers.keys()

ax.bar(a - width/2, scores['TF-IDF'].values(), label = 'TF-IDF')
ax.bar(a + width/2, scores['Мешок слов'].values(), label = 'Мешок слов')
ax.set_xticklabels(names)
ax.set_xticks(a)
ax.legend()

Тональный словарь

In [None]:
words = pd.read_csv("dictionary_documents/words_all_full_rating.csv", delimiter=";")
words.head(5)
words.set_index("Words")

In [None]:
texts = x_train.iloc[0].apply(tokenize)
