# Loading modules

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Reading data

In [2]:
# Load the three per-sentiment CSV files; the first CSV column becomes the index.
neg, neu, pos = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/negative.csv', '../data/neutral.csv', '../data/positive.csv')
)

# Preprocessing

In [None]:
# Lemmatizer instance shared by every preprocess_text() call.
mystem = Mystem()

# NLTK's Russian stop words extended with extra pronouns, particles and
# single letters.
swords = stopwords.words("russian") + [
    "еще", "ещё", "меж", "зато", "пусть", "ага", "этот", "это", "почему",
    "весь", "ты", "он", "она", "они", "оно", "мы", "вы", "кто", "что",
    "сам", "сама", "само", "свой", "наш", "ваш", "их", "тот", "та", "те",
    "то", "раз", "твой", "мой", "кой", "кое", "все", "весь", "всё", "быть", "тот",
    "таки", "такой", "какой", "каждый", "который", "и", "а", "в", "б", "д",
    "е", "ж", "з", "к", "л", "м", "н", "о", "п", "р", "с", "у", "ф", "ч",
    "ц", "ш", "щ", "ь", "ъ", "э", "ю", "я",
]
# Take these words back out of the stop list so they survive preprocessing
# (presumably because they carry sentiment -- TODO confirm).
swords = [word for word in swords if word not in ['хорошо', 'лучше', 'может', 'никогда', 'нельзя', 'всегда']]

# Regex patterns, one per punctuation character to delete. Raw strings keep the
# backslashes without relying on Python passing unknown escapes such as '\#'
# through (which emits a warning); the pattern values are unchanged.
# '!' and '?' are not listed, so those characters are kept in the text.
# NOTE: this rebinds the name `punctuation` imported from `string` above.
punctuation = ['"', r'\#', r'\$', r'\%', r'\&', "'", r'\(', r'\)', r'\*', r'\+', r'\,', r'\-', r'\.', r'\/', r'\:', r'\;',
               r'\<', r'\=', r'\>', r'\@', r'\[', r'\\', r'\]', r'\^', r'\_', r'\`', r'\{', r'\|', r'\}', r'\~']

def preprocess_text(text):
    """Normalise a raw document into a space-separated string of lemmas.

    Steps, in order: lower-case the text, delete digit runs, collapse
    whitespace runs to a single space, delete everything matched by the
    module-level ``punctuation`` regex patterns, lemmatise with the
    module-level ``mystem`` and drop tokens found in ``swords``.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    text = re.sub(r"\d+", "", text.lower())
    text = re.sub(r"\s+", " ", text)
    # Every pattern deletes its matches, so one alternation gives the same
    # result as one re.sub pass per pattern while scanning the text only once.
    text = re.sub("|".join(punctuation), "", text)
    # Deleting punctuation can leave adjacent spaces, and the lemmatizer may
    # return whitespace-only tokens other than " " (e.g. a trailing newline --
    # TODO confirm against pymystem3). Strip each token and drop the empty
    # ones so the result never contains runs of spaces.
    lemmas = (token.strip() for token in mystem.lemmatize(text))
    return " ".join(lemma for lemma in lemmas if lemma and lemma not in swords)

In [None]:
# Stack the 'preprocessed_text' columns in the order negative, neutral,
# positive, printing the running size after each step.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
p_texts = neg['preprocessed_text']
print(p_texts.shape)
p_texts = pd.concat([p_texts, neu['preprocessed_text']], ignore_index=True)
print(p_texts.shape)
p_texts = pd.concat([p_texts, pos['preprocessed_text']], ignore_index=True)
print(p_texts.shape)

# Merging in a dataframe

In [8]:
# Stack the raw 'text' columns in the order negative, neutral, positive,
# printing the running size after each step.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
texts = neg['text']
print(texts.shape)
texts = pd.concat([texts, neu['text']], ignore_index=True)
print(texts.shape)
texts = pd.concat([texts, pos['text']], ignore_index=True)
print(texts.shape)

# One label per stacked row, in the same order: -1 negative, 0 neutral, 1 positive.
labels = [-1] * len(neg) + [0] * len(neu) + [1] * len(pos)

(19974,)
(124235,)
(167879,)


In [9]:
# One row per document: raw text, preprocessed text and the numeric label.
# The label column is created under the misspelled name 'lables'; a later cell
# renames it to 'labels'.
X = pd.DataFrame(texts).assign(preprocessed_text=p_texts, lables=labels)

# Choosing vectorizer and model

## Scorer

In [3]:
def getScores(estimator, x, y):
    """Predict ``x`` with ``estimator`` and score the predictions against ``y``.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)``; precision, recall
        and F1 are macro-averaged.
    """
    predicted = estimator.predict(x)
    macro_scores = [
        metric(y, predicted, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    ]
    return (accuracy_score(y, predicted), *macro_scores)

def my_scorer(estimator, x, y):
    """Scorer callable with the ``(estimator, x, y)`` signature.

    Prints the four metrics computed by ``getScores`` on one line and returns
    their sum as the single score.
    """
    scores = getScores(estimator, x, y)
    print('Accuracy: %.3f  Precision: %.3f  Recall: %.3f  F1: %.3f' % scores)
    return sum(scores)

## Raw (unpreprocessed) texts

In [47]:
# Compare every vectorizer/classifier pair with default settings on the raw texts.
print('unprocessed results:', '\n')
vectorizers = [CountVectorizer(), TfidfVectorizer()]
vnames = ['CountVectorizer', 'TfidfVectorizer']
models = [LinearSVC(), LogisticRegression()]
mnames = ['LinearSVC', 'LogisticRegression']

for vname, vectorizer in zip(vnames, vectorizers):
    for mname, model in zip(mnames, models):
        pline = Pipeline(steps=[("vectorizer", vectorizer), ("classifier", model)])
        print(vname, ' and ', mname, ':\n')
        # my_scorer prints the per-fold metrics and returns their sum, so
        # `res` is the mean over the 3 folds of accuracy+precision+recall+F1.
        fold_sums = cross_val_score(pline, texts, labels, scoring=my_scorer, cv=3)
        res = fold_sums.mean()
        print('\n Mean Sum: ', res, '\n-----------\n')

unprocessed results: 

CountVectorizer  and  LinearSVC :





Accuracy: 0.750  Precision: 0.705  Recall: 0.706  F1: 0.705
Accuracy: 0.749  Precision: 0.706  Recall: 0.703  F1: 0.704
Accuracy: 0.741  Precision: 0.697  Recall: 0.691  F1: 0.694

 Mean Sum:  2.85037813000717 
-----------

CountVectorizer  and  LogisticRegression :





Accuracy: 0.785  Precision: 0.769  Recall: 0.701  F1: 0.729
Accuracy: 0.758  Precision: 0.754  Recall: 0.648  F1: 0.684
Accuracy: 0.747  Precision: 0.745  Recall: 0.619  F1: 0.660

 Mean Sum:  2.8663524053025937 
-----------

TfidfVectorizer  and  LinearSVC :

Accuracy: 0.799  Precision: 0.778  Recall: 0.736  F1: 0.754
Accuracy: 0.797  Precision: 0.773  Recall: 0.733  F1: 0.751
Accuracy: 0.788  Precision: 0.768  Recall: 0.721  F1: 0.742

 Mean Sum:  3.0466409919748596 
-----------

TfidfVectorizer  and  LogisticRegression :





Accuracy: 0.796  Precision: 0.803  Recall: 0.693  F1: 0.733
Accuracy: 0.787  Precision: 0.783  Recall: 0.684  F1: 0.720
Accuracy: 0.776  Precision: 0.779  Recall: 0.667  F1: 0.707

 Mean Sum:  2.9755213625159542 
-----------



In [6]:
# Same comparison on the raw texts, now with word n-gram TF-IDF features and
# fixed C values. The display names in `vnames`/`mnames` are written by hand,
# so keep them in sync with the parameters actually passed.
print('unprocessed results:', '\n')
vectorizers = [
    TfidfVectorizer(analyzer='word', ngram_range=(1, 2)),
    TfidfVectorizer(analyzer='word', ngram_range=(1, 3)),
]
vnames = ['TfidfVectorizer ngram 1-2', 'TfidfVectorizer ngram 1-3']
models = [LinearSVC(C=0.5), LogisticRegression(C=2.3)]
mnames = ['LinearSVC c=0.5', 'LogisticRegression c=2.3']

for vname, vectorizer in zip(vnames, vectorizers):
    for mname, model in zip(mnames, models):
        pline = Pipeline(steps=[("vectorizer", vectorizer), ("classifier", model)])
        print(vname, ' and ', mname, ':\n')
        # Mean over the 3 folds of the metric sum returned by my_scorer.
        fold_sums = cross_val_score(pline, texts, labels, scoring=my_scorer, cv=3)
        res = fold_sums.mean()
        print('\n Mean Sum: ', res, '\n-----------\n')

unprocessed results: 

TfidfVectorizer ngram 1-2  and  LinearSVC c=2.3 :

Accuracy: 0.805  Precision: 0.787  Recall: 0.738  F1: 0.759
Accuracy: 0.803  Precision: 0.784  Recall: 0.735  F1: 0.756
Accuracy: 0.796  Precision: 0.780  Recall: 0.724  F1: 0.747

 Mean Sum:  3.071447187019318 
-----------

TfidfVectorizer ngram 1-2  and  LogisticRegression c=2.3 :





Accuracy: 0.806  Precision: 0.801  Recall: 0.723  F1: 0.754
Accuracy: 0.801  Precision: 0.791  Recall: 0.715  F1: 0.746
Accuracy: 0.793  Precision: 0.788  Recall: 0.703  F1: 0.737

 Mean Sum:  3.052607039764009 
-----------

TfidfVectorizer ngram 1-3  and  LinearSVC c=2.3 :

Accuracy: 0.802  Precision: 0.782  Recall: 0.737  F1: 0.757
Accuracy: 0.801  Precision: 0.780  Recall: 0.735  F1: 0.754
Accuracy: 0.794  Precision: 0.775  Recall: 0.722  F1: 0.745

 Mean Sum:  3.061695134271835 
-----------

TfidfVectorizer ngram 1-3  and  LogisticRegression c=2.3 :





Accuracy: 0.805  Precision: 0.799  Recall: 0.722  F1: 0.752
Accuracy: 0.800  Precision: 0.789  Recall: 0.715  F1: 0.745
Accuracy: 0.793  Precision: 0.787  Recall: 0.702  F1: 0.735

 Mean Sum:  3.0475140661681315 
-----------



## Preprocessed

In [25]:
# TF-IDF features (word uni- and bigrams) for the preprocessed texts, used by
# the grid searches below.
# NOTE(review): the vectorizer is fitted on all of p_texts before
# cross-validation, so each held-out fold contributes to the vocabulary/IDF.
p_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))
p_vecs = p_vectorizer.fit_transform(p_texts)

In [23]:
# Coarse search over the regularisation strength C for LinearSVC (3-fold CV).
# No `scoring` is passed, so the estimator's own score method is used.
param_grid = {'C': [0.5, 1.0, 1.5, 2.3]}
grid = GridSearchCV(LinearSVC(), param_grid, cv=3).fit(p_vecs, labels)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)

Best cross-validation score: 0.79
Best parameters:  {'C': 0.5}
Best estimator:  LinearSVC(C=0.5, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


In [24]:
# Same search for LogisticRegression; reuses whatever `param_grid` is
# currently defined (set by the preceding grid-search cell).
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=3)
grid = grid.fit(p_vecs, labels)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)



Best cross-validation score: 0.79
Best parameters:  {'C': 2.3}
Best estimator:  LogisticRegression(C=2.3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


In [27]:
# Refined LinearSVC search over a wider set of C values (3-fold CV).
param_grid = {'C': [0.1, 0.3, 0.5, 0.7, 1., 2.]}
grid = GridSearchCV(LinearSVC(), param_grid, cv=3).fit(p_vecs, labels)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)

Best cross-validation score: 0.80
Best parameters:  {'C': 0.3}
Best estimator:  LinearSVC(C=0.3, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


In [33]:
# Refined LogisticRegression search over three C values (3-fold CV).
param_grid = {'C': [2., 2.3, 2.5]}
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=3)
grid = grid.fit(p_vecs, labels)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)



Best cross-validation score: 0.80
Best parameters:  {'C': 2.5}
Best estimator:  LogisticRegression(C=2.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


In [34]:
# Cross-validate both classifiers on the preprocessed texts with fixed C
# values. Unlike the grid searches, the vectorizer sits inside the pipeline
# here, so it is refitted for every fold.
print('processed results:', '\n')
vectorizers = [TfidfVectorizer(analyzer='word', ngram_range=(1, 2))]
vnames = ['TfidfVectorizer ngram 1-2']
models = [LinearSVC(C=0.3), LogisticRegression(C=2.5)]
mnames = ['LinearSVC c=0.3', 'LogisticRegression c=2.5']

for vname, vectorizer in zip(vnames, vectorizers):
    for mname, model in zip(mnames, models):
        pline = Pipeline(steps=[("vectorizer", vectorizer), ("classifier", model)])
        print(vname, ' and ', mname, ':\n')
        # Mean over the 3 folds of the metric sum returned by my_scorer.
        fold_sums = cross_val_score(pline, p_texts, labels, scoring=my_scorer, cv=3)
        res = fold_sums.mean()
        print('\n Mean Sum: ', res, '\n-----------\n')

processed results: 

TfidfVectorizer ngram 1-2  and  LinearSVC c=0.3 :

Accuracy: 0.807  Precision: 0.795  Recall: 0.732  F1: 0.758
Accuracy: 0.803  Precision: 0.789  Recall: 0.727  F1: 0.753
Accuracy: 0.797  Precision: 0.786  Recall: 0.717  F1: 0.746

 Mean Sum:  3.070137837304214 
-----------

TfidfVectorizer ngram 1-2  and  LogisticRegression c=2.5 :





Accuracy: 0.806  Precision: 0.797  Recall: 0.726  F1: 0.755
Accuracy: 0.801  Precision: 0.789  Recall: 0.720  F1: 0.748
Accuracy: 0.794  Precision: 0.786  Recall: 0.709  F1: 0.740

 Mean Sum:  3.0574733140651844 
-----------



# Train Test Split

In [42]:
# Fixed seed so the train/val/test assignment is the same on every run.
SPLIT_SEED = 42

# Positional row numbers are split alongside X so the rows of each subset can
# be marked in X afterwards.
indices = np.arange(X.shape[0])
# 10% of all rows are held out as the validation set ...
X_x, X_val, id_x, id_val = train_test_split(
    X, indices, test_size=0.1, random_state=SPLIT_SEED)
# ... and 29% of the remaining 90% (about 26% of all rows) become the test set.
X_train, X_test, id_train, id_test = train_test_split(
    X_x, id_x, test_size=0.29, random_state=SPLIT_SEED)

In [83]:
# Every row starts out as 'train'; the val/test rows are overwritten below.
X.loc[:, 'class'] = 'train'

In [84]:
# Mark the validation rows. id_val holds positional row numbers, hence iloc;
# the column is looked up by name instead of the hard-coded position 3, so the
# assignment cannot silently hit another column if the layout changes.
X.iloc[id_val, X.columns.get_loc('class')] = 'val'

In [85]:
# Mark the test rows. id_test holds positional row numbers, hence iloc; the
# column is looked up by name instead of the hard-coded position 3, so the
# assignment cannot silently hit another column if the layout changes.
X.iloc[id_test, X.columns.get_loc('class')] = 'test'

In [98]:
# Correct the misspelled label column name.
X = X.rename(columns={'lables': 'labels'})

Unnamed: 0,text,preprocessed_text,labels,class
0,Секретная пресс-конференция ЕНПФ: отставки Аки...,секретный прессконференция енпф отставка акише...,-1,train
1,Почему суд продлил арест бывших руководителей ...,суд продлять арест бывший руководитель енпф не...,-1,val
2,"«Сумма премий работникам ""дочки"" МИР РК в 2014...",« сумма премия работник дочка мир рк год пр...,-1,train
3,Руководители крупных супермаркетов Алматы заяв...,руководитель крупный супермаркет алматы заявля...,-1,train
4,Экс-главе ЕНПФ Руслану Ерденаеву продлили арес...,эксглава енпф руслан ерденаев продлять арест э...,-1,train
5,Арест бывшего главы ЕНПФ Руслана Ерденаева про...,арест бывший глава енпф руслан ерденаев продля...,-1,train
6,Экс-главе ЕНПФ Руслану Ерденаеву продлили арес...,эксглава енпф руслан ерденаев продлять арест с...,-1,test
7,Суд продлил арест экс-главе ЕНПФ Руслану Ерден...,суд продлять арест эксглава енпф руслан ердена...,-1,val
8,Суд продлил арест экс-главе ЕНПФ Руслану Ерден...,суд продлять арест эксглава енпф руслан ердена...,-1,test
9,Суд продлил арест экс-главе ЕНПФ Руслану Ерден...,суд продлять арест эксглава енпф руслан ердена...,-1,test


# Checking

In [11]:
# Final feature extractor: TF-IDF over word uni- and bigrams.
Vectorizer = TfidfVectorizer(ngram_range=(1, 2), analyzer='word')

In [12]:
# Fit the vectorizer on, and transform, the preprocessed text of every row of X.
# NOTE(review): this covers the rows marked 'test' and 'val' as well, not only
# 'train' -- confirm that fitting on the full data set is intended.
train_vecs = Vectorizer.fit_transform(X['preprocessed_text'])

In [90]:
# Features for the rows marked 'test', using the already-fitted vectorizer.
is_test = X['class'] == 'test'
test_vecs = Vectorizer.transform(X.loc[is_test, 'preprocessed_text'])

In [91]:
# Features for the rows marked 'val', using the already-fitted vectorizer.
is_val = X['class'] == 'val'
val_vecs = Vectorizer.transform(X.loc[is_val, 'preprocessed_text'])

In [100]:
# Predict labels for the test rows.
# NOTE(review): the fitted LinearSVC is only assigned to `model` in a later
# cell of this notebook; run top to bottom, `model` here is still the loop
# variable left over from the last cross-validation cell -- confirm cell order.
test_pred = model.predict(test_vecs)

In [101]:
# True labels of the test rows, then accuracy plus macro-averaged
# precision/recall/F1 of the test predictions.
test_y = X.loc[X['class'] == 'test', 'labels']
test_scores = (
    accuracy_score(test_y, test_pred),
    *(metric(test_y, test_pred, average='macro')
      for metric in (precision_score, recall_score, f1_score)),
)
print('Accuracy: %.3f  Precision: %.3f  Recall: %.3f  F1: %.3f' % test_scores)

Accuracy: 0.807  Precision: 0.797  Recall: 0.731  F1: 0.759


In [102]:
# Predict labels for the validation rows.
# NOTE(review): the fitted LinearSVC is only assigned to `model` in a later
# cell of this notebook -- confirm cell order before trusting this result.
val_pred = model.predict(val_vecs)

In [103]:
# True labels of the validation rows, then accuracy plus macro-averaged
# precision/recall/F1 of the validation predictions.
val_y = X.loc[X['class'] == 'val', 'labels']
val_scores = (
    accuracy_score(val_y, val_pred),
    *(metric(val_y, val_pred, average='macro')
      for metric in (precision_score, recall_score, f1_score)),
)
print('Accuracy: %.3f  Precision: %.3f  Recall: %.3f  F1: %.3f' % val_scores)

Accuracy: 0.805  Precision: 0.798  Recall: 0.725  F1: 0.755


In [15]:
# Fit the final LinearSVC on the features of every row of X.
# The label column is read as 'labels': the rename cell above has already
# replaced the misspelled 'lables', so `X.lables` no longer exists when the
# notebook is run top to bottom.
# NOTE(review): train_vecs covers all rows of X, including those marked
# 'test'/'val', and this cell sits after the cells that call model.predict --
# confirm the intended order and training subset.
model = LinearSVC(C=0.3).fit(train_vecs, X['labels'])

# Save

In [16]:
# scikit-learn no longer ships `sklearn.externals.joblib`; import the
# standalone joblib package and fall back to the old location only where the
# direct import is unavailable.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted classifier; dump() returns the list of written file names.
joblib.dump(model, 'sentiment_model.pkl')

['sentiment_model.pkl']

In [18]:
import pickle

In [19]:
# Persist the fitted vectorizer next to the model so new texts can be
# transformed with the same vocabulary.
with open('vectorizer_TfIdf.pkl', 'wb') as vectorizer_file:
    pickle.dump(Vectorizer, vectorizer_file)

In [21]:
# Save the full table (texts, labels and split assignment); the row index is
# written as the first, unnamed column.
X.to_csv('data_full.csv', index=True)