In [2]:
import numpy as np
import pandas as pd
import re

In [3]:
revs = pd.DataFrame.from_csv("./res.csv", encoding="cp1251")

initial_revs = revs.copy()
revs

Unnamed: 0,rating,content
0,5,"Интересный сюжет, затягивает. Много времени ух..."
1,4,Плюсы: \r\n1. Игра интересная 2. Образы учител...
2,3,"Огорчил тот факт , что суть игры заключается в..."
3,5,"Была постоянная ошибка с подключением, почти н..."
4,1,... сделайте русскую озвучку и будет 5 звёзд :...
5,3,Почему нет русского языка?
6,1,!
7,1,Постоянные проблемы с подключением!!!
8,5,Все идеально но отсутствия русского языка даёт...
9,1,Где великий и могучий??? Сделайте русский


Удаляем неподходящие отзывы: с рейтингов 3-4, с английским текстом, пустые

In [4]:
    #drop eng reviews
revs.drop(revs[revs.content.str.contains ("[A-Za-z]")].index, inplace=True)
    #drop reviews with rating 3-4
revs.drop(revs[(revs.rating == 3) | (revs.rating == 4)].index, inplace=True)
    #drop empty reviews
revs.drop(revs[revs.content == ""].index, inplace=True)

revs

Unnamed: 0,rating,content
0,5,"Интересный сюжет, затягивает. Много времени ух..."
4,1,... сделайте русскую озвучку и будет 5 звёзд :...
6,1,!
7,1,Постоянные проблемы с подключением!!!
8,5,Все идеально но отсутствия русского языка даёт...
9,1,Где великий и могучий??? Сделайте русский
11,2,Добавьте русский язык пожалуйста(
16,1,"В описании приложения, написали что есть русск..."
17,1,Пожалуйста сделайте русский язык!!!!!
18,2,"Зашла в игру, поиграла ровно минуту и всё, эне..."


Заменяем оценку 5 на класс "1" - позитивные отзывы, оценки 1-2 - на класс "0" - негативные отзывы

In [5]:
revs['rating'][(revs.rating == 1) | (revs.rating == 2)] = 0
revs['rating'][revs.rating == 5] = 1

revs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,rating,content
0,1,"Интересный сюжет, затягивает. Много времени ух..."
4,0,... сделайте русскую озвучку и будет 5 звёзд :...
6,0,!
7,0,Постоянные проблемы с подключением!!!
8,1,Все идеально но отсутствия русского языка даёт...
9,0,Где великий и могучий??? Сделайте русский
11,0,Добавьте русский язык пожалуйста(
16,0,"В описании приложения, написали что есть русск..."
17,0,Пожалуйста сделайте русский язык!!!!!
18,0,"Зашла в игру, поиграла ровно минуту и всё, эне..."


Загружаем морфологический анализатор pymorphy2, стоп-слова из списка библиотеки nltk

In [6]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
stop_words = {"быть", "вдруг" "вы", "всегда", "все", "другой", "если", "есть", "здесь", "зачем", "иногда", "конечно", "между", "наконец", "раз", "разве", "только", "тот", "уже", "хоть", "чуть", "этот", "это", "я", "ты", "он", "оно", "каждый", "весь", "это", "ваш", "пока", "свой", "вообще"}
#stop_words = set(stopwords.words('russian'))

Лемматизация

In [7]:
def strtonormalform(line, should_delete = {'PREP', 'CONJ', 'PRCL'}):
    ans = ''
    words = re.findall(r"[\w]+[\w|-]*", line)
    for word in words:
        lexInfo = morph.parse(word)[0]
        lex = lexInfo.normal_form
        if (lexInfo.tag.POS not in should_delete) and (re.match(r"[-+]?\d+$", lex) is None):#союзы, предлоги и т.п. не сохраняем
            ans = ans + ' ' + lex
    return ans

#revs_norm = revs.copy()
#revs_norm['content'] = revs_norm['content'].apply(strtonormalform)
    #drop empty reviews
#revs_norm.drop(revs_norm[revs_norm.content == ""].index, inplace=True)

#revs_norm



In [8]:
#revs_norm.to_csv("normalized_revs.csv")
revs_norm = pd.DataFrame.from_csv("./normalized_revs.csv", encoding="cp1251")
revs_norm

Unnamed: 0,rating,content
0,1,интересный сюжет затягивать много время уходи...
4,0,сделать русский озвучка быть звезда спасибо п...
7,0,постоянный проблема подключение
8,1,весь идеально отсутствие русский язык давать ...
9,0,где великий могучий сделать русский
11,0,добавить русский язык
16,0,описание приложение написать есть русский язы...
17,0,сделать русский язык
18,0,зайти игра поиграть ровно минута весь энергия...
22,1,почему игра запускаться перекидывать страница...


In [9]:
def strtonormalform_with_neg(line, should_delete = {'PREP', 'CONJ', 'PRCL'}):
    ans = ''
    words = re.findall(r"[\w]+[\w|-]*", line)
    flag = False
    for word in words:
        if (word in ["не", "Не", "без", "Без"]):
            flag = True
        lexInfo = morph.parse(word)[0]
        lex = lexInfo.normal_form
        if (lexInfo.tag.POS not in should_delete) and (re.match(r"[-+]?\d+$", lex) is None):#союзы, предлоги и т.п. не сохраняем
            if (flag):
                flag = False
                lex = "не_" + lex
            ans = ans + ' ' + lex
    return ans

#revs_norm_neg = revs.copy()
#revs_norm_neg['content'] = revs['content'].apply(strtonormalform_with_neg)
    #drop empty reviews
#revs_norm_neg.drop(revs_norm_neg[revs_norm_neg.content == ""].index, inplace=True)

#revs_norm_neg



In [10]:
#revs_norm_neg.to_csv("normalized_revs_with_neg.csv")
revs_norm_neg = pd.DataFrame.from_csv("./normalized_revs_with_neg.csv", encoding="cp1251")
revs_norm_neg

Unnamed: 0,rating,content
0,1,интересный сюжет затягивать много время уходи...
4,0,сделать русский озвучка быть звезда спасибо п...
7,0,постоянный проблема подключение
8,1,весь идеально отсутствие русский язык давать ...
9,0,где великий могучий сделать русский
11,0,добавить русский язык
16,0,описание приложение написать есть русский язы...
17,0,сделать русский язык
18,0,зайти игра поиграть ровно минута весь энергия...
22,1,почему игра не_запускаться перекидывать стран...


In [11]:
def strtonormalform_with_neg_stop(line, should_delete = {'PREP', 'CONJ', 'PRCL'}):
    ans = ''
    words = re.findall(r"[\w]+[\w|-]*", line)
    flag = False
    for word in words:
        if (word in ["не", "Не", "без", "Без"]):
            flag = True
        lexInfo = morph.parse(word)[0]
        lex = lexInfo.normal_form
        if (lexInfo.tag.POS not in should_delete) and (re.match(r"[-+]?\d+$", lex) is None) and (lex not in stop_words):#союзы, предлоги и т.п. не сохраняем
            if (flag):
                flag = False
                lex = "не_" + lex
            ans = ans + ' ' + lex
        if (lexInfo.tag.POS is should_delete) or (lex in stop_words):
            flag = False
    return ans

#revs_norm_neg_stop = revs.copy()
#revs_norm_neg_stop['content'] = revs['content'].apply(strtonormalform_with_neg_stop)
    #drop empty reviews
#revs_norm_neg_stop.drop(revs_norm_neg_stop[revs_norm_neg_stop.content == ""].index, inplace=True)

#revs_norm_neg_stop



In [12]:
#revs_norm_neg_stop.to_csv("normalized_revs_with_neg_stop.csv")
#revs_norm_neg_stop = pd.DataFrame.from_csv("./normalized_revs_with_neg_stop.csv", encoding="cp1251")
#revs_norm_neg_stop

In [13]:
def strtonormalform_with_neg_stop(line, should_delete = {'PREP', 'CONJ', 'PRCL'}):
    ans = ''
    words = re.findall(r"[\w]+[\w|-]*", line)
    flag = False
    for word in words:
        lexInfo = morph.parse(word)[0]
        lex = lexInfo.normal_form
        if (lex not in stop_words):#союзы, предлоги и т.п. не сохраняем
            ans = ans + ' ' + lex
    return ans

revs_norm_neg_stop = revs_norm_neg.copy()
revs_norm_neg_stop['content'] = revs_norm_neg['content'].apply(strtonormalform_with_neg_stop)
    #drop empty reviews
revs_norm_neg_stop.drop(revs_norm_neg_stop[revs_norm_neg_stop.content == ""].index, inplace=True)

revs_norm_neg_stop



Unnamed: 0,rating,content
0,1,интересный сюжет затягивать много время уходи...
4,0,сделать русский озвучка звезда спасибо понимание
7,0,постоянный проблема подключение
8,1,идеально отсутствие русский язык давать себя ...
9,0,где великий могучий сделать русский
11,0,добавить русский язык
16,0,описание приложение написать русский язык фак...
17,0,сделать русский язык
18,0,зайти игра поиграть ровно минута энергия конч...
22,1,почему игра не_запускаться перекидывать стран...


In [14]:
revs_norm_neg_stop.rating.describe()

count    15557.000000
mean         0.515009
std          0.499791
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: rating, dtype: float64

In [15]:
print ('Количество положительных отзывов составляет {}% от всей выборки'.format(round(float(sum(revs_norm_neg_stop.rating)) / len(revs_norm_neg_stop.rating) * 100, 2)))
print ('Количество положительных отзывов составляет {}% от всей выборки'.format(round(float(len(revs_norm_neg_stop.rating )- sum(revs_norm_neg_stop.rating)) / len(revs_norm_neg_stop.rating) * 100, 2)))
print ('Размер обучающей выборки: {}'.format(revs_norm_neg_stop.shape[0]))

Количество положительных отзывов составляет 51.5% от всей выборки
Количество положительных отзывов составляет 48.5% от всей выборки
Размер обучающей выборки: 15557


In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.model_selection import RandomizedSearchCV
from nltk.corpus import stopwords
from sklearn.model_selection import cross_validate 
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import string



In [17]:
import sklearn.metrics

def trainTestClf(clf, X_train, y_train, X_test):
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

def get_metrics(y_test, y_pred):
    accuracy = sklearn.metrics.accuracy_score(y_true = y_test, y_pred = y_pred)
    precision = sklearn.metrics.precision_score(y_true = y_test, y_pred = y_pred)
    recall = sklearn.metrics.recall_score(y_true = y_test, y_pred = y_pred)
    f1_score = sklearn.metrics.f1_score(y_true = y_test, y_pred = y_pred)
    
    return {
        "accuracy" : accuracy,
        "precision" : precision,
        "recall" : recall,
        "f1_score" : f1_score
    }


def print_metrics(metrics, vType ='-', cType='-'):
    print("Vectorizing: ", vType, ". Classifier: ", cType, sep='')

    print("accuracy = %.3f" % metrics["accuracy"])
    print("precision = %.3f" % metrics["precision"])
    print("recall = %.3f" % metrics["recall"])
    print("F-мера = %.3f" % metrics["f1_score"])
    
    

In [18]:
clf_svm = LinearSVC() 
clf_nb = MultinomialNB()
clf_rf = RandomForestClassifier()
clf_lr = LogisticRegression(C=1e5)

scoring = ['accuracy', 'precision', 'recall', 'f1']

In [19]:
norm_texts = revs_norm.content.copy()
norm_texts_splitted = revs_norm.content.str.split().copy()

norm_texts_neg = revs_norm_neg.content.copy()
norm_texts_neg_splitted = revs_norm_neg.content.str.split().copy()

norm_texts_neg_stop = revs_norm_neg_stop.content.copy()
norm_texts_neg_stop_splitted = revs_norm_neg_stop.content.str.split().copy()

In [20]:
#from sklearn.feature_extraction.text import TfidfVectorizer

#tiv = TfidfVectorizer()
#nwd = tiv.fit_transform(norm_texts)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer_uni = CountVectorizer(binary = True, ngram_range=(1,1), max_features= 10000)
count_vectorizer_bi = CountVectorizer(binary = True, ngram_range=(2,2), max_features= 10000)
count_vectorizer_tri = CountVectorizer(binary = True, ngram_range=(3,3), max_features= 10000)
count_vectorizer_uni_bi = CountVectorizer(binary = True, ngram_range=(1,2), max_features= 10000)
count_vectorizer_uni_bi_tri = CountVectorizer(binary = True, ngram_range=(1,3), max_features= 10000)

cvect_uni = count_vectorizer_uni.fit_transform(norm_texts)
cvect_bi = count_vectorizer_bi.fit_transform(norm_texts)
cvect_tri = count_vectorizer_tri.fit_transform(norm_texts)
cvect_uni_bi = count_vectorizer_uni_bi.fit_transform(norm_texts)
cvect_uni_bi_tri = count_vectorizer_uni_bi_tri.fit_transform(norm_texts)

In [22]:
# без доп признаков

# униграммы
scores1_uni = pd.DataFrame(cross_validate(clf_svm, cvect_uni, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_uni = pd.DataFrame(cross_validate(clf_rf, cvect_uni, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_uni = pd.DataFrame(cross_validate(clf_lr, cvect_uni, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_uni = pd.DataFrame(cross_validate(clf_nb, cvect_uni, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))

# биграммы
scores1_bi = pd.DataFrame(cross_validate(clf_svm, cvect_bi, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_bi = pd.DataFrame(cross_validate(clf_rf, cvect_bi, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_bi = pd.DataFrame(cross_validate(clf_lr, cvect_bi, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_bi = pd.DataFrame(cross_validate(clf_nb, cvect_bi, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))

# триграммы
scores1_tri = pd.DataFrame(cross_validate(clf_svm, cvect_tri, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_tri = pd.DataFrame(cross_validate(clf_rf, cvect_tri, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_tri = pd.DataFrame(cross_validate(clf_lr, cvect_tri, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_tri = pd.DataFrame(cross_validate(clf_nb, cvect_tri, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))

# уни- и биграммы
scores1_uni_bi = pd.DataFrame(cross_validate(clf_svm, cvect_uni_bi, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_uni_bi = pd.DataFrame(cross_validate(clf_rf, cvect_uni_bi, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_uni_bi = pd.DataFrame(cross_validate(clf_lr, cvect_uni_bi, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_uni_bi = pd.DataFrame(cross_validate(clf_nb, cvect_uni_bi, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))

# уни-, би-, и триграммы
scores1_uni_bi_tri = pd.DataFrame(cross_validate(clf_svm, cvect_uni_bi_tri, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_uni_bi_tri = pd.DataFrame(cross_validate(clf_rf, cvect_uni_bi_tri, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_uni_bi_tri = pd.DataFrame(cross_validate(clf_lr, cvect_uni_bi_tri, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_uni_bi_tri = pd.DataFrame(cross_validate(clf_nb, cvect_uni_bi_tri, revs_norm.rating, scoring=scoring, cv=5, return_train_score=False))

In [23]:
# без доп признаков

# униграммы
#X_train, X_test, y_train, y_test = train_test_split(cvect_uni, revs.rating, test_size=0.1, random_state=42)
#metr1_uni = get_metrics(y_test, trainTestClf(clf_svm, X_train, y_train, X_test))
#metr2_uni = get_metrics(y_test, trainTestClf(clf_rf, X_train, y_train, X_test))
#metr3_uni = get_metrics(y_test, trainTestClf(clf_lr, X_train, y_train, X_test))


In [24]:
class_res_svm = pd.DataFrame({
    "униграммы" : scores1_uni.mean(),
    "биграммы" : scores1_bi.mean(),
    "триграммы" : scores1_tri.mean(),
    "уни- и биграммы" : scores1_uni_bi.mean(),
    "уни-, би- и триграммы" : scores1_uni_bi_tri.mean()
})

print("Результаты для svm")
class_res_svm

Результаты для svm


Unnamed: 0,биграммы,триграммы,уни- и биграммы,"уни-, би- и триграммы",униграммы
fit_time,0.244209,0.543112,0.789617,0.479634,0.531864
score_time,0.007998,0.005017,0.006003,0.007005,0.029662
test_accuracy,0.753115,0.59752,0.830914,0.82995,0.826802
test_f1,0.78488,0.710565,0.838606,0.837511,0.834596
test_precision,0.712318,0.564395,0.824917,0.824644,0.821275
test_recall,0.874645,0.959086,0.854442,0.852196,0.849825


In [25]:
class_res_rf = pd.DataFrame({
    "униграммы" : scores2_uni.mean(),
    "биграммы" : scores2_bi.mean(),
    "триграммы" : scores2_tri.mean(),
    "уни- и биграммы" : scores2_uni_bi.mean(),
    "уни-, би- и триграммы" : scores2_uni_bi_tri.mean()
})
print("Результаты для random forest")
class_res_rf

Результаты для random forest


Unnamed: 0,биграммы,триграммы,уни- и биграммы,"уни-, би- и триграммы",униграммы
fit_time,4.0372,4.356153,3.054008,2.715535,3.385429
score_time,0.528569,0.784167,0.106501,0.09366,0.104117
test_accuracy,0.742386,0.590453,0.820635,0.826482,0.820442
test_f1,0.778224,0.707593,0.818175,0.824112,0.819672
test_precision,0.699435,0.559676,0.855275,0.860986,0.847863
test_recall,0.878015,0.961955,0.784712,0.790949,0.793943


In [26]:
class_res_lr = pd.DataFrame({
    "униграммы" : scores3_uni.mean(),
    "биграммы" : scores3_bi.mean(),
    "триграммы" : scores3_tri.mean(),
    "уни- и биграммы" : scores3_uni_bi.mean(),
    "уни-, би- и триграммы" : scores3_uni_bi_tri.mean()
})


print("Результаты для linear regression")
class_res_lr

Результаты для linear regression


Unnamed: 0,биграммы,триграммы,уни- и биграммы,"уни-, би- и триграммы",униграммы
fit_time,0.667465,0.173455,1.373482,1.314812,4.504603
score_time,0.00843,0.00864,0.009668,0.005213,0.009
test_accuracy,0.73956,0.601889,0.788707,0.788642,0.760182
test_f1,0.769004,0.711756,0.79948,0.799798,0.77137
test_precision,0.707826,0.567608,0.783207,0.781412,0.758143
test_recall,0.84259,0.954221,0.818644,0.820639,0.786461


In [27]:
class_res_nb = pd.DataFrame({
    "униграммы" : scores4_uni.mean(),
    "биграммы" : scores4_bi.mean(),
    "триграммы" : scores4_tri.mean(),
    "уни- и биграммы" : scores4_uni_bi.mean(),
    "уни-, би- и триграммы" : scores4_uni_bi_tri.mean()
})


print("Результаты для naive bayes")
class_res_nb

Результаты для naive bayes


Unnamed: 0,биграммы,триграммы,уни- и биграммы,"уни-, би- и триграммы",униграммы
fit_time,0.015411,0.010001,0.014658,0.013245,0.02069
score_time,0.009407,0.010001,0.008424,0.009041,0.005225
test_accuracy,0.78543,0.588655,0.86053,0.860402,0.847745
test_f1,0.809759,0.708639,0.86216,0.861669,0.849848
test_precision,0.745056,0.557955,0.876102,0.878092,0.861947
test_recall,0.887491,0.97106,0.8497,0.846832,0.839346


In [28]:
# результаты для всех классификаторов
scores_uni_bi = pd.DataFrame({
    "SVM" : scores1_uni_bi.mean(),
    "Random forest" : scores2_uni_bi.mean(),
    "Linear regression" : scores3_uni_bi.mean(),
    "Naive Bayes" : scores4_uni_bi.mean()
})

scores_uni_bi

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,1.373482,0.014658,3.054008,0.789617
score_time,0.009668,0.008424,0.106501,0.006003
test_accuracy,0.788707,0.86053,0.820635,0.830914
test_f1,0.79948,0.86216,0.818175,0.838606
test_precision,0.783207,0.876102,0.855275,0.824917
test_recall,0.818644,0.8497,0.784712,0.854442


Сочетание униграмм и биграм показывает лучшие результаты, будем использовать данный способ векторизации

Посмотрим на метрики классификации в текстах с обработкой отрицаний

In [29]:
cvect_uni_bi_neg = count_vectorizer_uni_bi.fit_transform(norm_texts_neg)


In [30]:


# уни- и биграммы
scores1_uni_bi_neg = pd.DataFrame(cross_validate(clf_svm, cvect_uni_bi_neg, revs_norm_neg.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_uni_bi_neg = pd.DataFrame(cross_validate(clf_rf, cvect_uni_bi_neg, revs_norm_neg.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_uni_bi_neg = pd.DataFrame(cross_validate(clf_lr, cvect_uni_bi_neg, revs_norm_neg.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_uni_bi_neg = pd.DataFrame(cross_validate(clf_nb, cvect_uni_bi_neg, revs_norm_neg.rating, scoring=scoring, cv=5, return_train_score=False))


In [31]:
scores_uni_bi_neg = pd.DataFrame({
    "SVM" : scores1_uni_bi_neg.mean(),
    "Random forest" : scores2_uni_bi_neg.mean(),
    "Linear regression" : scores3_uni_bi_neg.mean(),
    "Naive Bayes" : scores4_uni_bi_neg.mean()
})

scores_uni_bi_neg

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,1.373652,0.016804,2.672456,0.477643
score_time,0.007997,0.007019,0.094701,0.003838
test_accuracy,0.792689,0.872351,0.835154,0.840871
test_f1,0.802789,0.872937,0.833355,0.848583
test_precision,0.788065,0.893557,0.868201,0.832206
test_recall,0.819389,0.854316,0.801928,0.866789


In [32]:
from scipy import sparse

Удалим стоп-слова

In [33]:
cvect_uni_bi_neg_stop = count_vectorizer_uni_bi.fit_transform(norm_texts_neg_stop)


# уни- и биграммы
scores1_uni_bi_neg_stop = pd.DataFrame(cross_validate(clf_svm, cvect_uni_bi_neg_stop, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_uni_bi_neg_stop = pd.DataFrame(cross_validate(clf_rf, cvect_uni_bi_neg_stop, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_uni_bi_neg_stop = pd.DataFrame(cross_validate(clf_lr, cvect_uni_bi_neg_stop, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_uni_bi_neg_stop = pd.DataFrame(cross_validate(clf_nb, cvect_uni_bi_neg_stop, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))


In [34]:
scores_uni_bi_neg_stop = pd.DataFrame({
    "SVM" : scores1_uni_bi_neg_stop.mean(),
    "Random forest" : scores2_uni_bi_neg_stop.mean(),
    "Linear regression" : scores3_uni_bi_neg_stop.mean(),
    "Naive Bayes" : scores4_uni_bi_neg_stop.mean()
})

scores_uni_bi_neg_stop

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,1.388208,0.016621,2.733623,0.38452
score_time,0.0072,0.004422,0.100525,0.004472
test_accuracy,0.789033,0.865271,0.835574,0.838594
test_f1,0.799947,0.866454,0.835286,0.846284
test_precision,0.782154,0.883463,0.861383,0.830912
test_recall,0.820524,0.851104,0.811665,0.863585


In [35]:
revs_norm_neg_stop = revs_norm_neg
cvect_uni_bi_neg_stop = cvect_uni_bi_neg
norm_texts_neg_stop = norm_texts_neg
norm_texts_neg_stop_splitted = norm_texts_neg_splitted

In [36]:
# Нахождение неверно классифицированных отзывов

def print_wrong_class(y_test, y_pred):
    inds = y_test[y_test.values != y_pred].index
    i = 0
    for idx, rev in revs_norm_neg_stop.iterrows():
        i+=1
        if (idx in inds):
            print (idx, "\tclass=", rev.rating, '\n', rev.content, '\n\n', initial_revs.ix[idx].content, "\n")
            

In [37]:
cvect_uni_bi_neg_stop.shape

(15566, 10000)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(cvect_uni_bi_neg_stop, revs_norm_neg_stop.rating, test_size=0.1, random_state=42)
classifier = clf_svm.fit(X=X_train, y=y_train)
y_pred = classifier.predict(X_test)

print_wrong_class (y_test, y_pred)

18 	class= 0 
  зайти игра поиграть ровно минута весь энергия кончиться нужно ждать несколько часы пока восстановиться шикарно 

 Зашла в игру, поиграла ровно минуту и всё, энергия кончилась, нужно ждать несколько часов пока восстановится. Шикарно ? 

58 	class= 0 
  описание игра русский язык указанный фактически таковой игра нет добавить игра убрать описание 

 В описании к игре Русский язык указан, фактически такового в игре нет
Добавьте в игру или уберите из описания 

63 	class= 1 
  крутяга ещё русский добавить сложно постоянно нужно вспоминать английский 

 Крутяг
Ещё бы русский добавили
А то сложно ,постоянно нужно вспоминать английский ))) 

76 	class= 1 
  добавить русский язык игра супер график сожаление я не_мочь она играть потому не_понимать они я французский знать английски плохо 

 Добавьте русский язык пожалуйста, игра просто супер и графика! Но к сожалению я не могу в неё играть потому что не понимаю что они говорят((( Я французский знаю, а английски плохо((( 

161 	cl

Считываем словарь тональностей из файла

In [39]:
sent = pd.read_csv('words_sent.csv', sep=';', header=None, names =['word', 'sent', '1', '2', '3'])

In [40]:
Dict = sent.groupby('word')['sent'].mean().to_dict()

In [149]:
tone = np.zeros((len(norm_texts_neg_stop), 4))

In [42]:
Dict.get('ужасный', 0)

-1.25

Определяем общее значение тональности по словарю

In [150]:
# нормальзованное значение тональности
for i, text in enumerate(norm_texts_neg_stop_splitted):
    count = 0
    for j, word in enumerate(text):        
        neg = False
        if (word.startswith("не_")):
            s = Dict.get(word[3:], 0)
            neg = True
        else:
            s = Dict.get(word, 0)

        if s != 0:
            count += 1
            if (neg):
                tone[i, 0] += -s
            else:
                tone[i, 0] += s
        if (s == 0 and neg):
            tone[i, 0] += -0.3
    if count:
        tone[i, 0] = tone[i, 0]/count

In [151]:
for i in range(tone.shape[1]):
    tone[:, i] = tone[:, i] - tone[:, i].min()
    print (i, tone[:, i].max())
    tone[:, i] = tone[:, i] /tone[:, i].max()

0 4.16666666667
1 0.0
2 0.0
3 0.0




In [45]:
train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, 0].reshape(tone[:, 0].shape[0], 1))))

In [46]:
scores1_tone_dict = pd.DataFrame(cross_validate(clf_svm, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_tone_dict = pd.DataFrame(cross_validate(clf_rf, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_tone_dict = pd.DataFrame(cross_validate(clf_lr, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_tone_dict = pd.DataFrame(cross_validate(clf_nb, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))


In [47]:
scores_tone_dict = pd.DataFrame({
    "SVM" : scores1_tone_dict.mean(),
    "Random forest" : scores2_tone_dict.mean(),
    "Linear regression" : scores3_tone_dict.mean(),
    "Naive Bayes" : scores4_tone_dict.mean()
})

scores_tone_dict

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,1.27133,0.019102,3.215584,0.628848
score_time,0.009064,0.003401,0.116686,0.006821
test_accuracy,0.802132,0.874022,0.843186,0.84145
test_f1,0.810888,0.875742,0.83942,0.848292
test_precision,0.799683,0.888172,0.885489,0.836271
test_recall,0.824005,0.864544,0.798938,0.86205


Ненормализованная разметка тональности работает лучше

In [48]:
# количество слов

for i, text in enumerate(norm_texts_neg_stop_splitted):
    tone[i, 1] = len(text)

for i in range(tone.shape[1]):
    tone[:, i] = tone[:, i] - tone[:, i].min()
    tone[:, i] = tone[:, i] /tone[:, i].max()

In [49]:
#train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, 0:1])))
train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, 1].reshape(tone[:, 1].shape[0], 1))))

scores1_words_num = pd.DataFrame(cross_validate(clf_svm, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_words_num = pd.DataFrame(cross_validate(clf_rf, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_words_num = pd.DataFrame(cross_validate(clf_lr, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_words_num = pd.DataFrame(cross_validate(clf_nb, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))


In [50]:
scores_words_num = pd.DataFrame({
    "SVM" : scores1_words_num.mean(),
    "Random forest" : scores2_words_num.mean(),
    "Linear regression" : scores3_words_num.mean(),
    "Naive Bayes" : scores4_words_num.mean()
})

scores_words_num

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,2.411731,0.017053,3.039663,0.606985
score_time,0.024215,0.0064,0.10324,0.006709
test_accuracy,0.794359,0.872351,0.834384,0.840807
test_f1,0.803998,0.872937,0.82958,0.848531
test_precision,0.790651,0.893557,0.881212,0.832095
test_recall,0.819265,0.854316,0.784466,0.866789


Признак ухудшает качество классификации для метода Случайного леса, для остальных не меняет - плохой признак, не будем включать

In [51]:
#class_result_svm = class_result_svm.append(pd.Series(metr1, name="+ количество слов"))
#class_result_rf = class_result_rf.append(pd.Series(metr2, name="+ количество слов"))
#class_result_lr = class_result_lr.append(pd.Series(metr3, name="+ количество слов"))

#print("---svm---", class_result_svm)
#print("---lr---", class_result_rf)
#print("---rf---", class_result_lr)

Считаем количество восклицательных знаков

In [152]:
initial_left_revs = initial_revs.drop(initial_revs.index.difference(revs_norm_neg_stop.index))
initial_left_revs['rating'] = revs_norm_neg_stop['rating']

for i, text in enumerate(initial_left_revs.content):
    num = 0
    for e in text:
        if e == '!':
            num += 1
            #tone[i, 1] = 1
    tone[i, 1] = num

In [153]:
for i in range(tone.shape[1]):
    tone[:, i] = tone[:, i] - tone[:, i].min()
    print(i, tone[:, i].max())
    tone[:, i] = tone[:, i] /tone[:, i].max()

0 1.0
1 262.0
2 nan
3 nan


In [54]:
#train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, 0:1])))
train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, 1].reshape(tone[:, 1].shape[0], 1))))

scores1_excl_num = pd.DataFrame(cross_validate(clf_svm, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_excl_num = pd.DataFrame(cross_validate(clf_rf, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_excl_num = pd.DataFrame(cross_validate(clf_lr, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_excl_num = pd.DataFrame(cross_validate(clf_nb, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))


In [55]:
scores_excl_num = pd.DataFrame({
    "SVM" : scores1_excl_num.mean(),
    "Random forest" : scores2_excl_num.mean(),
    "Linear regression" : scores3_excl_num.mean(),
    "Naive Bayes" : scores4_excl_num.mean()
})

scores_excl_num

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,1.702181,0.019076,3.034394,0.82794
score_time,0.006456,0.005763,0.1241,0.005403
test_accuracy,0.793203,0.872416,0.834448,0.840293
test_f1,0.803341,0.873012,0.832831,0.848176
test_precision,0.788294,0.893572,0.866008,0.831115
test_recall,0.820262,0.85444,0.802926,0.867163


норм, оставим

Считаем количество глаголов в повелительном наклонении (встречаются часто в отрицательных отзывах)

In [154]:
def getImperatives(line, should_delete = {'PREP', 'CONJ', 'PRCL', 'INTJ'}):
    ans = 0
    words = re.findall(r"[\w]+[\w|-]*", line)
    for word in words:
        lexInfo = morph.parse(word)[0]
        if (lexInfo.tag.POS == 'VERB'):
            if (lexInfo.tag.mood == 'impr'):
                ans += 1
    if (ans > 0):
        return ans
    else:
        return 0
    return ans

initial_left_revs['imps'] = [getImperatives(line) for line in initial_left_revs.content]


In [57]:
initial_revs

Unnamed: 0,rating,content
0,5,"Интересный сюжет, затягивает. Много времени ух..."
1,4,Плюсы: \r\n1. Игра интересная 2. Образы учител...
2,3,"Огорчил тот факт , что суть игры заключается в..."
3,5,"Была постоянная ошибка с подключением, почти н..."
4,1,... сделайте русскую озвучку и будет 5 звёзд :...
5,3,Почему нет русского языка?
6,1,!
7,1,Постоянные проблемы с подключением!!!
8,5,Все идеально но отсутствия русского языка даёт...
9,1,Где великий и могучий??? Сделайте русский


In [155]:
tone[:,2] = list(initial_left_revs['imps'])

for i in range(tone.shape[1]):
    tone[:, i] = tone[:, i] - tone[:, i].min()
    print(i, tone[:, i].max())
    tone[:, i] = tone[:, i] /tone[:, i].max()

0 1.0
1 1.0
2 7.0
3 nan


In [59]:
#train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, 0:2])))
train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, 2].reshape(tone[:, 2].shape[0], 1))))

scores1_imper_num = pd.DataFrame(cross_validate(clf_svm, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_imper_num = pd.DataFrame(cross_validate(clf_rf, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_imper_num = pd.DataFrame(cross_validate(clf_lr, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_imper_num = pd.DataFrame(cross_validate(clf_nb, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))


In [60]:
scores_imper_num = pd.DataFrame({
    "SVM" : scores1_imper_num.mean(),
    "Random forest" : scores2_imper_num.mean(),
    "Linear regression" : scores3_imper_num.mean(),
    "Naive Bayes" : scores4_imper_num.mean()
})

scores_imper_num

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,1.718228,0.02522,3.192849,0.69275
score_time,0.006605,0.018412,0.11299,0.040882
test_accuracy,0.793717,0.87248,0.834769,0.840807
test_f1,0.803886,0.873069,0.833148,0.848601
test_precision,0.788676,0.893784,0.866199,0.83195
test_recall,0.821011,0.854316,0.803301,0.867163


вроде норм

Считаем количество вопросительных слов. Встречаются в негативных отзывах ("Пропала кнопка вызова. Где её найти?!?!")

In [61]:
revs_for_interrog_words = revs_norm_neg.drop(revs_norm_neg.index.difference(revs_norm_neg_stop.index))
split_texts_for_interrog = revs_for_interrog_words.content.str.split()

In [62]:
revs.ix[0].content

'Интересный сюжет, затягивает. Много времени уходит на уроки, которые можно бы было сделать по разнообразнее. И зачем продолжать полеты на метле до 3-4 курса?? Чему там можно учиться. Насколько я помню, в книгах они быстренько закончили этот курс. \r\nХотелось бы иногда больше свободы действий. Например пропускать какие-то задания, выучить что-то самостоятельно, контактировать с другими персонажами или что-то подобное.\r\nДля начала очень неплохая игра'

In [63]:
tone[:, 3]

array([ nan,  nan,  nan, ...,  nan,  nan,  nan])

In [156]:
interrogative_words = ["где", "как", "зачем", "почему", "какой"]

for i, text in enumerate(split_texts_for_interrog):
    count = 0
    for j, word in enumerate(text):        
        if (word in interrogative_words):
            count += 1
    tone[i, 3] = count
        
for i, text in enumerate(initial_left_revs.content):
    num = 0
    for e in text:
        if e == '?':
            num += 1
            #tone[i, 1] = 1
    tone[i, 3] += num

In [157]:
for i in range(tone.shape[1]):
    tone[:, i] = tone[:, i] - tone[:, i].min()
    print (i, tone[:, i].max())
    tone[:, i] = tone[:, i] /tone[:, i].max()

0 1.0
1 1.0
2 1.0
3 115.0


In [66]:
#train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, 0:3])))
train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, 3].reshape(tone[:, 3].shape[0], 1))))

scores1_interr_num = pd.DataFrame(cross_validate(clf_svm, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_interr_num = pd.DataFrame(cross_validate(clf_rf, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_interr_num = pd.DataFrame(cross_validate(clf_lr, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_interr_num = pd.DataFrame(cross_validate(clf_nb, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))

In [67]:
scores_interr_num = pd.DataFrame({
    "SVM" : scores1_interr_num.mean(),
    "Random forest" : scores2_interr_num.mean(),
    "Linear regression" : scores3_interr_num.mean(),
    "Naive Bayes" : scores4_interr_num.mean()
})

scores_interr_num

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,1.909753,0.020615,3.568472,0.540784
score_time,0.009207,0.010807,0.121684,0.006008
test_accuracy,0.793588,0.872608,0.835988,0.840743
test_f1,0.803828,0.873237,0.834917,0.848508
test_precision,0.788525,0.893612,0.865656,0.831863
test_recall,0.821136,0.854815,0.807292,0.867038


In [68]:
len(initial_left_revs)

15566

In [69]:
initial_left_revs.content[0]

'Интересный сюжет, затягивает. Много времени уходит на уроки, которые можно бы было сделать по разнообразнее. И зачем продолжать полеты на метле до 3-4 курса?? Чему там можно учиться. Насколько я помню, в книгах они быстренько закончили этот курс. \r\nХотелось бы иногда больше свободы действий. Например пропускать какие-то задания, выучить что-то самостоятельно, контактировать с другими персонажами или что-то подобное.\r\nДля начала очень неплохая игра'

In [158]:
def count_nums(line):
    count = 0
    for letter in line:
        if letter >= "0" and letter <= "9":
            count += 1
    return count

digits_num = np.zeros((len (initial_left_revs), 1))
i = 0
for line in initial_left_revs.content:
    digits_num[i] = count_nums(line)
    i += 1
    
digits_num = digits_num - digits_num.min()
print(digits_num.max())
digits_num = digits_num /digits_num.max()

71.0


In [71]:
train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), digits_num)))

scores1_digits_num = pd.DataFrame(cross_validate(clf_svm, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_digits_num = pd.DataFrame(cross_validate(clf_rf, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_digits_num = pd.DataFrame(cross_validate(clf_lr, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_digits_num = pd.DataFrame(cross_validate(clf_nb, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))

In [72]:
scores_digits_num = pd.DataFrame({
    "SVM" : scores1_digits_num.mean(),
    "Random forest" : scores2_digits_num.mean(),
    "Linear regression" : scores3_digits_num.mean(),
    "Naive Bayes" : scores4_digits_num.mean()
})

scores_digits_num


Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,1.54049,0.015011,3.07574,0.811639
score_time,0.006604,0.011408,0.102881,0.006004
test_accuracy,0.793396,0.872544,0.833805,0.840807
test_f1,0.803455,0.873126,0.831278,0.848569
test_precision,0.788806,0.893801,0.869901,0.832021
test_recall,0.820138,0.85444,0.796688,0.867038


In [73]:
def count_capital_letters(line):
    count = 0
    for letter in line:
        if letter >= "А" and letter <= "Я":
            count += 1
    return count

capital_letters_num = np.zeros((len (initial_left_revs), 1))
i = 0
for line in initial_left_revs.content:
    capital_letters_num[i] = count_capital_letters(line)
    i += 1
    
capital_letters_num = capital_letters_num - capital_letters_num.min()
capital_letters_num = capital_letters_num /capital_letters_num.max()

In [74]:
train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), capital_letters_num)))

scores1_capital_letters_num = pd.DataFrame(cross_validate(clf_svm, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_capital_letters_num = pd.DataFrame(cross_validate(clf_rf, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_capital_letters_num = pd.DataFrame(cross_validate(clf_lr, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_capital_letters_num = pd.DataFrame(cross_validate(clf_nb, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))

In [75]:
scores_capital_letters_num = pd.DataFrame({
    "SVM" : scores1_capital_letters_num.mean(),
    "Random forest" : scores2_capital_letters_num.mean(),
    "Linear regression" : scores3_capital_letters_num.mean(),
    "Naive Bayes" : scores4_capital_letters_num.mean()
})

scores_capital_letters_num

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,1.670184,0.018614,2.847939,0.600626
score_time,0.006805,0.008204,0.096253,0.007205
test_accuracy,0.79346,0.872351,0.830722,0.840743
test_f1,0.803518,0.872937,0.829243,0.848458
test_precision,0.788827,0.893557,0.861192,0.832078
test_recall,0.820138,0.854316,0.800433,0.866664


In [76]:
def find_elongated_words(word):
    if (word == ""):
        return False
    
    prev_letter = word[0]
    flag = 0
    count = 0
    for letter in word[1:]:
        if letter == prev_letter:
            count += 1
        else:
            count = 0;
            prev_letter = letter
        if count >= 2:
            flag = 1
    return flag

elongated_words_presense = np.zeros((len (initial_left_revs), 1))
i = 0
for line in initial_left_revs.content:
    elongated_words_presense[i] = find_elongated_words(line)
    i += 1

In [77]:
train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), elongated_words_presense)))

scores1_elongated_words_presense = pd.DataFrame(cross_validate(clf_svm, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_elongated_words_presense = pd.DataFrame(cross_validate(clf_rf, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_elongated_words_presense = pd.DataFrame(cross_validate(clf_lr, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_elongated_words_presense = pd.DataFrame(cross_validate(clf_nb, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))

In [78]:
scores_elongated_words_presense = pd.DataFrame({
    "SVM" : scores1_elongated_words_presense.mean(),
    "Random forest" : scores2_elongated_words_presense.mean(),
    "Linear regression" : scores3_elongated_words_presense.mean(),
    "Naive Bayes" : scores4_elongated_words_presense.mean()
})

scores_elongated_words_presense

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,1.619009,0.016015,2.94381,0.725452
score_time,0.006204,0.007001,0.099476,0.006004
test_accuracy,0.794809,0.872416,0.83117,0.839908
test_f1,0.804523,0.873024,0.830016,0.847714
test_precision,0.790847,0.893564,0.860856,0.831011
test_recall,0.820013,0.85444,0.802051,0.86629


In [79]:
def getPastTenseVerbs(line, should_delete = {'PREP', 'CONJ', 'PRCL', 'INTJ'}):
    ans = 0
    words = re.findall(r"[\w]+[\w|-]*", line)
    for word in words:
        lexInfo = morph.parse(word)[0]
        if (lexInfo.tag.POS == 'VERB'):
            if (lexInfo.tag.tense == 'past'):
                ans += 1
    if (ans > 0):
        return ans
    else:
        return 0
    return ans


past_tense_words_num = np.zeros((len (initial_left_revs), 1))
i = 0
for line in initial_left_revs.content:
    past_tense_words_num[i] = getPastTenseVerbs(line)
    i += 1
    
past_tense_words_num = past_tense_words_num - past_tense_words_num.min()
past_tense_words_num = past_tense_words_num /past_tense_words_num.max() 

In [80]:
train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), past_tense_words_num)))

scores1_past_tense_words_num = pd.DataFrame(cross_validate(clf_svm, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_past_tense_words_num = pd.DataFrame(cross_validate(clf_rf, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_past_tense_words_num = pd.DataFrame(cross_validate(clf_lr, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_past_tense_words_num = pd.DataFrame(cross_validate(clf_nb, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))

In [81]:
scores_past_tense_words_num = pd.DataFrame({
    "SVM" : scores1_past_tense_words_num.mean(),
    "Random forest" : scores2_past_tense_words_num.mean(),
    "Linear regression" : scores3_past_tense_words_num.mean(),
    "Naive Bayes" : scores4_past_tense_words_num.mean()
})

scores_past_tense_words_num

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,1.504487,0.02147,3.389095,0.621799
score_time,0.008587,0.011675,0.114791,0.030906
test_accuracy,0.794038,0.872287,0.829886,0.840422
test_f1,0.804224,0.872852,0.826911,0.848176
test_precision,0.788948,0.893641,0.867464,0.831641
test_recall,0.821385,0.854066,0.790575,0.866539


In [82]:
train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone, elongated_words_presense, digits_num)))


In [83]:

scores1_all_features = pd.DataFrame(cross_validate(clf_svm, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores2_all_features = pd.DataFrame(cross_validate(clf_rf, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores3_all_features = pd.DataFrame(cross_validate(clf_lr, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores4_all_features = pd.DataFrame(cross_validate(clf_nb, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))

In [84]:
train.shape

(15566, 10006)

In [85]:
scores_all_features = pd.DataFrame({
    "SVM" : scores1_all_features.mean(),
    "Random forest" : scores2_all_features.mean(),
    "Linear regression" : scores3_all_features.mean(),
    "Naive Bayes" : scores4_all_features.mean()
})

scores_all_features

Unnamed: 0,Linear regression,Naive Bayes,Random forest,SVM
fit_time,2.001776,0.015236,2.913022,0.759014
score_time,0.003425,0.008001,0.097154,0.005013
test_accuracy,0.802968,0.874022,0.842801,0.840679
test_f1,0.811094,0.875745,0.838028,0.847477
test_precision,0.802193,0.888199,0.890637,0.835928
test_recall,0.821885,0.864544,0.792076,0.860802


In [86]:
result_metrics_svm = pd.DataFrame({
    "00 униграммы и биграммы" : scores_uni_bi['SVM'],
    "01 + обработка отрицаний" : scores_uni_bi_neg['SVM'],
    "02 + удаление стоп слов" : scores_uni_bi_neg_stop['SVM'],
    "03 + словарь тональности" : scores_tone_dict['SVM'],
    "04 + количество восклицательных знаков" : scores_excl_num['SVM'],
    "05 + количество глаголов в повел. накл." : scores_imper_num['SVM'],
    "06 + количество вопросительных слов" : scores_interr_num['SVM'],
    "07 + количество слов" : scores_words_num['SVM'], 
    "08 + наличие слов с повторяющимися буквами" : scores_elongated_words_presense['SVM'],
    "09 + количество заглавных букв" : scores_capital_letters_num['SVM'],
    "10 + количство глаголов прошедшего времени" : scores_past_tense_words_num['SVM'],
    "11 + количество цифр" : scores_digits_num['SVM'],
    "12 все признаки" : scores_all_features['SVM']
    
})

result_metrics_svm

Unnamed: 0,00 униграммы и биграммы,01 + обработка отрицаний,02 + удаление стоп слов,03 + словарь тональности,04 + количество восклицательных знаков,05 + количество глаголов в повел. накл.,06 + количество вопросительных слов,07 + количество слов,08 + наличие слов с повторяющимися буквами,09 + количество заглавных букв,10 + количство глаголов прошедшего времени,11 + количество цифр,12 все признаки
fit_time,0.789617,0.477643,0.38452,0.628848,0.82794,0.69275,0.540784,0.606985,0.725452,0.600626,0.621799,0.811639,0.759014
score_time,0.006003,0.003838,0.004472,0.006821,0.005403,0.040882,0.006008,0.006709,0.006004,0.007205,0.030906,0.006004,0.005013
test_accuracy,0.830914,0.840871,0.838594,0.84145,0.840293,0.840807,0.840743,0.840807,0.839908,0.840743,0.840422,0.840807,0.840679
test_f1,0.838606,0.848583,0.846284,0.848292,0.848176,0.848601,0.848508,0.848531,0.847714,0.848458,0.848176,0.848569,0.847477
test_precision,0.824917,0.832206,0.830912,0.836271,0.831115,0.83195,0.831863,0.832095,0.831011,0.832078,0.831641,0.832021,0.835928
test_recall,0.854442,0.866789,0.863585,0.86205,0.867163,0.867163,0.867038,0.866789,0.86629,0.866664,0.866539,0.867038,0.860802


In [87]:
result_metrics_nb = pd.DataFrame({
    "00 униграммы и биграммы" : scores_uni_bi['Naive Bayes'],
    "01 + обработка отрицаний" : scores_uni_bi_neg['Naive Bayes'],
    "02 + удаление стоп слов" : scores_uni_bi_neg_stop['Naive Bayes'],
    "03 + словарь тональности" : scores_tone_dict['Naive Bayes'],
    "04 + количество восклицательных знаков" : scores_excl_num['Naive Bayes'],
    "05 + количество глаголов в повел. накл." : scores_imper_num['Naive Bayes'],
    "06 + количество вопросительных слов" : scores_interr_num['Naive Bayes'],
    "07 + количество слов" : scores_words_num['Naive Bayes'],
    "08 + наличие слов с повторяющимися буквами" : scores_elongated_words_presense['Naive Bayes'],
    "09 + количество заглавных букв" : scores_capital_letters_num['Naive Bayes'],
    "10 + количство глаголов прошедшего времени" : scores_past_tense_words_num['Naive Bayes'],
    "11 + количество цифр" : scores_digits_num['Naive Bayes'],
    "12 все признаки" : scores_all_features['Naive Bayes']

})

result_metrics_nb

Unnamed: 0,00 униграммы и биграммы,01 + обработка отрицаний,02 + удаление стоп слов,03 + словарь тональности,04 + количество восклицательных знаков,05 + количество глаголов в повел. накл.,06 + количество вопросительных слов,07 + количество слов,08 + наличие слов с повторяющимися буквами,09 + количество заглавных букв,10 + количство глаголов прошедшего времени,11 + количество цифр,12 все признаки
fit_time,0.014658,0.016804,0.016621,0.019102,0.019076,0.02522,0.020615,0.017053,0.016015,0.018614,0.02147,0.015011,0.015236
score_time,0.008424,0.007019,0.004422,0.003401,0.005763,0.018412,0.010807,0.0064,0.007001,0.008204,0.011675,0.011408,0.008001
test_accuracy,0.86053,0.872351,0.865271,0.874022,0.872416,0.87248,0.872608,0.872351,0.872416,0.872351,0.872287,0.872544,0.874022
test_f1,0.86216,0.872937,0.866454,0.875742,0.873012,0.873069,0.873237,0.872937,0.873024,0.872937,0.872852,0.873126,0.875745
test_precision,0.876102,0.893557,0.883463,0.888172,0.893572,0.893784,0.893612,0.893557,0.893564,0.893557,0.893641,0.893801,0.888199
test_recall,0.8497,0.854316,0.851104,0.864544,0.85444,0.854316,0.854815,0.854316,0.85444,0.854316,0.854066,0.85444,0.864544


In [88]:
result_metrics_rf = pd.DataFrame({
    "00 униграммы и биграммы" : scores_uni_bi['Random forest'],
    "01 + обработка отрицаний" : scores_uni_bi_neg['Random forest'],
    "02 + удаление стоп слов" : scores_uni_bi_neg_stop['Random forest'],
    "03 + словарь тональности" : scores_tone_dict['Random forest'],
    "04 + количество восклицательных знаков" : scores_excl_num['Random forest'],
    "05 + количество глаголов в повел. накл." : scores_imper_num['Random forest'],
    "06 + количество вопросительных слов" : scores_interr_num['Random forest'],
    "07 + количество слов" : scores_words_num['Random forest'],
    "08 + наличие слов с повторяющимися буквами" : scores_elongated_words_presense['Random forest'],
    "09 + количество заглавных букв" : scores_capital_letters_num['Random forest'],
    "10 + количство глаголов прошедшего времени" : scores_past_tense_words_num['Random forest'],
    "11 + количество цифр" : scores_digits_num['Random forest'],
    "12 все признаки" : scores_all_features['Random forest']

})

result_metrics_rf

Unnamed: 0,00 униграммы и биграммы,01 + обработка отрицаний,02 + удаление стоп слов,03 + словарь тональности,04 + количество восклицательных знаков,05 + количество глаголов в повел. накл.,06 + количество вопросительных слов,07 + количество слов,08 + наличие слов с повторяющимися буквами,09 + количество заглавных букв,10 + количство глаголов прошедшего времени,11 + количество цифр,12 все признаки
fit_time,3.054008,2.672456,2.733623,3.215584,3.034394,3.192849,3.568472,3.039663,2.94381,2.847939,3.389095,3.07574,2.913022
score_time,0.106501,0.094701,0.100525,0.116686,0.1241,0.11299,0.121684,0.10324,0.099476,0.096253,0.114791,0.102881,0.097154
test_accuracy,0.820635,0.835154,0.835574,0.843186,0.834448,0.834769,0.835988,0.834384,0.83117,0.830722,0.829886,0.833805,0.842801
test_f1,0.818175,0.833355,0.835286,0.83942,0.832831,0.833148,0.834917,0.82958,0.830016,0.829243,0.826911,0.831278,0.838028
test_precision,0.855275,0.868201,0.861383,0.885489,0.866008,0.866199,0.865656,0.881212,0.860856,0.861192,0.867464,0.869901,0.890637
test_recall,0.784712,0.801928,0.811665,0.798938,0.802926,0.803301,0.807292,0.784466,0.802051,0.800433,0.790575,0.796688,0.792076


In [89]:
result_metrics_lr = pd.DataFrame({
    "00 униграммы и биграммы" : scores_uni_bi['Linear regression'],
    "01 + обработка отрицаний" : scores_uni_bi_neg['Linear regression'],
    "02 + удаление стоп слов" : scores_uni_bi_neg_stop['Linear regression'],
    "03 + словарь тональности" : scores_tone_dict['Linear regression'],
    "04 + количество восклицательных знаков" : scores_excl_num['Linear regression'],
    "05 + количество глаголов в повел. накл." : scores_imper_num['Linear regression'],
    "06 + количество вопросительных слов" : scores_interr_num['Linear regression'],
    "07 + количество слов" : scores_words_num['Linear regression'],
    "08 + наличие слов с повторяющимися буквами" : scores_elongated_words_presense['Linear regression'],
    "09 + количество заглавных букв" : scores_capital_letters_num['Linear regression'],
    "10 + количство глаголов прошедшего времени" : scores_past_tense_words_num['Linear regression'],
    "11 + количество цифр" : scores_digits_num['Linear regression'],
    "12 все признаки" : scores_all_features['Linear regression']

})

result_metrics_lr

Unnamed: 0,00 униграммы и биграммы,01 + обработка отрицаний,02 + удаление стоп слов,03 + словарь тональности,04 + количество восклицательных знаков,05 + количество глаголов в повел. накл.,06 + количество вопросительных слов,07 + количество слов,08 + наличие слов с повторяющимися буквами,09 + количество заглавных букв,10 + количство глаголов прошедшего времени,11 + количество цифр,12 все признаки
fit_time,1.373482,1.373652,1.388208,1.27133,1.702181,1.718228,1.909753,2.411731,1.619009,1.670184,1.504487,1.54049,2.001776
score_time,0.009668,0.007997,0.0072,0.009064,0.006456,0.006605,0.009207,0.024215,0.006204,0.006805,0.008587,0.006604,0.003425
test_accuracy,0.788707,0.792689,0.789033,0.802132,0.793203,0.793717,0.793588,0.794359,0.794809,0.79346,0.794038,0.793396,0.802968
test_f1,0.79948,0.802789,0.799947,0.810888,0.803341,0.803886,0.803828,0.803998,0.804523,0.803518,0.804224,0.803455,0.811094
test_precision,0.783207,0.788065,0.782154,0.799683,0.788294,0.788676,0.788525,0.790651,0.790847,0.788827,0.788948,0.788806,0.802193
test_recall,0.818644,0.819389,0.820524,0.824005,0.820262,0.821011,0.821136,0.819265,0.820013,0.820138,0.821385,0.820138,0.821885


In [96]:
X_train, X_test, y_train, y_test = train_test_split(train, revs_norm_neg_stop.rating, test_size=0.1) #random_state = 42
classifier = clf_nb.fit(X=X_train, y=y_train)
y_pred = classifier.predict(X_test)

import pickle

with open('dumped_nb_classifier.pkl', 'wb') as fid:
    pickle.dump(classifier, fid) 

count_vectorizer_uni_bi = CountVectorizer(binary = True, ngram_range=(1,2), max_features= 10000)
cvect_uni_bi_neg = count_vectorizer_uni_bi.fit(norm_texts_neg)
    
    
with open('dumped_count_vectorizer.pkl', 'wb') as fid:
    pickle.dump(cvect_uni_bi_neg, fid)
    
print_wrong_class (y_test, y_pred)

53 	class= 1 
  добавить русский язык 

 Добавьте русский язык пожалуйста! 

125 	class= 1 
  нет русский язык исправт 

 Нет русского языка. Исправте пожалуйста. 

170 	class= 1 
  почему написать русский есть он нет 

 Почему написано что русский есть когда его нет?!! 

247 	class= 1 
  добавить русский язык 

 Добавьте пожалуйста русский язык ? 

267 	class= 1 
  добавить русский язык 

 Добавьте пожалуйста русский язык ? 

333 	class= 0 
  не_русский язык игра не_интересный 

 без русского языка игра не интересна 

462 	class= 0 
  русский добавить 

 Русский добавьте 

829 	class= 0 
  весь интересно она иероглиф 

 Все интересно только она на иероглифах. 

1182 	class= 0 
  невозможно пройти зомби вирус сложный уровень вирус мутировать заразить миллиард весь болезнь обнаружить уже делать лекарство распространение слишком слабый пройти скрытно симптом становиться невозможно не_хватать очки днк симптом 

 Невозможно пройти зомби вирус на сложном уровне, вирус просто мутирует когда 

In [121]:
digits_num.ndim

2

In [143]:
train = np.hstack((tone, elongated_words_presense, digits_num, capital_letters_num, past_tense_words_num))

In [144]:
train.shape

(15566, 8)

In [147]:

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

clf = MultinomialNB()

efs1 = EFS(clf, 
           min_features=1,
           max_features=8,
           scoring='f1',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(train, revs_norm_neg_stop.rating)

Features: 255/255

In [148]:
print('Best f1 score: %.2f' % efs1.best_score_)
print('Best subset:', efs1.best_idx_)

Best f1 score: 0.69
Best subset: (0, 1, 3, 5, 6, 7)


In [136]:
train = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, 0:1], digits_num)))
scores_selected_features = pd.DataFrame(cross_validate(clf_nb, train, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))

In [142]:
scores_selected_features.mean()

fit_time          0.043944
score_time        0.021010
test_accuracy     0.874086
test_f1           0.875799
test_precision    0.888304
test_recall       0.864544
dtype: float64

In [None]:
importances_num = pd.DataFrame(importances)
importances_num = importances_num.drop(importances_num[importances_num[0] > 0.001].index)
importances_num

In [None]:
new_matr = pd.DataFrame(train.todense())

In [None]:
new_matr = new_matr.drop(importances_num.index, axis =1)
new_matr

In [None]:
train_clean = sparse.csr_matrix(new_matr)

scores_nb_clean = pd.DataFrame(cross_validate(clf_nb, train_clean, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))

In [None]:
scores_nb_clean.mean()

In [None]:
train01 = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, [0,1]])))
train02 = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, [0,2]])))
train03 = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, [0,3]])))
train12 = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, [1,2]])))
train13 = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, [1,3]])))
train23 = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, [2,3]])))

train012 = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, [0,1,2]])))
train013 = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, [0,1,3]])))
train123 = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, [1,2,3]])))
train023 = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone[:, [0,2,3]])))

train0123 = sparse.csr_matrix(np.hstack((cvect_uni_bi_neg_stop.todense(), tone)))

In [None]:
scores_nb01 = pd.DataFrame(cross_validate(clf_nb, train01, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores_nb02 = pd.DataFrame(cross_validate(clf_nb, train02, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores_nb03 = pd.DataFrame(cross_validate(clf_nb, train03, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores_nb12 = pd.DataFrame(cross_validate(clf_nb, train12, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores_nb13 = pd.DataFrame(cross_validate(clf_nb, train13, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores_nb23 = pd.DataFrame(cross_validate(clf_nb, train23, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))

scores_nb012 = pd.DataFrame(cross_validate(clf_nb, train012, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores_nb013 = pd.DataFrame(cross_validate(clf_nb, train013, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores_nb123 = pd.DataFrame(cross_validate(clf_nb, train123, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))
scores_nb023 = pd.DataFrame(cross_validate(clf_nb, train023, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))

scores_nb0123 = pd.DataFrame(cross_validate(clf_nb, train0123, revs_norm_neg_stop.rating, scoring=scoring, cv=5, return_train_score=False))

In [None]:
feat_comb_results = pd.DataFrame({
    "01" : scores_nb01.mean(),
    "02" : scores_nb02.mean(),
    "03" : scores_nb03.mean(),
    "12" : scores_nb12.mean(),
    "13" : scores_nb13.mean(),
    "23" : scores_nb23.mean(),
    
    "012" : scores_nb012.mean(),
    "013" : scores_nb013.mean(),
    "123" : scores_nb123.mean(),
    "023" : scores_nb023.mean(),
    
    "0123" : scores_nb0123.mean(),
})

feat_comb_results