### Sentiment Analysis Using Scikit-learn

In [49]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

import os
import sys

# Add the parent of the current working directory to the import path
# (presumably so the `src` package imported below can be resolved).
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

import src.utils as ps
import re

import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally, then load the
# Russian stop-word list.
nltk.download('stopwords')
russian_stopwords = stopwords.words("russian")

from pymorphy2 import MorphAnalyzer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sych_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Widen pandas' text display so long review strings stay readable in previews.
# Full option names are used: abbreviated patterns such as 'max_colwidth' only
# work while they match exactly one option and may break in later pandas versions.
pd.set_option('display.max_colwidth', 120)
pd.set_option('display.width', 500)

In [42]:
# Load the tab-separated, UTF-8 encoded review dataset and preview the first rows.
df = pd.read_csv(
    'data/women-clothing-accessories.3-class.balanced.csv',
    sep='\t',
    encoding='utf-8',
)
df.head()

Unnamed: 0,review,sentiment
0,качество плохое пошив ужасный (горловина напер...,negative
1,"Товар отдали другому человеку, я не получила п...",negative
2,"Ужасная синтетика! Тонкая, ничего общего с пре...",negative
3,"товар не пришел, продавец продлил защиту без м...",negative
4,"Кофточка голая синтетика, носить не возможно.",negative


In [43]:
# Class balance of the loaded frame (number of reviews per sentiment label).
df['sentiment'].value_counts()

positive    30000
neautral    30000
negative    30000
Name: sentiment, dtype: int64

In [52]:
_MORPH_ANALYZER = None  # shared pymorphy2 analyzer, created on first use


def _get_morph_analyzer():
    """Return the shared MorphAnalyzer instance, creating it on first use."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = MorphAnalyzer()
    return _MORPH_ANALYZER


def to_lemmatize2(text):
    """Lemmatize a sequence of word tokens with pymorphy2.

    Parameters
    ----------
    text : iterable of str
        Word tokens to lemmatize (``clean`` passes a list of tokens).

    Returns
    -------
    tuple
        ``(lemmatized, unique_words)``: ``lemmatized`` is a single string with
        the lemma of every input word, in input order, joined by spaces;
        ``unique_words`` is the array returned by ``pd.Series(...).unique()``
        over the whitespace-split input words.
    """
    all_word_list = " ".join(text).split()
    all_unique_word = pd.Series(all_word_list).unique()

    lemmatized_word_dict = {}
    if len(all_unique_word):
        # Constructing a MorphAnalyzer is expensive, so one shared instance is
        # reused across calls instead of building a new one per review.
        lemmatizer = _get_morph_analyzer()
        for word in all_unique_word:
            # The first normal form returned by the analyzer is used as the lemma.
            lemmatized_word_dict[word] = lemmatizer.normal_forms(word)[0]

    # Join over the split words (not the raw tokens) so that every looked-up
    # key is guaranteed to be in the dictionary, even if a token was empty or
    # contained whitespace.
    text = ' '.join(lemmatized_word_dict[word] for word in all_word_list)
    return text, all_unique_word
        
# Symbols that are replaced by a space before tokenization. A raw string is
# used so every backslash is an explicit regex escape; a non-raw literal would
# depend on the invalid string escape "\]".
_DELETED_SYMBOLS_RE = re.compile(r'''[\\'\[\]!"$%&()*+,\-./:;<=>?@^_`{|}~«»\n]''')


def clean(text):
    """Normalize a raw review string for vectorization.

    Steps, in order: replace the listed punctuation symbols and newlines with
    spaces, drop space-separated chunks that consist only of digits, lowercase,
    remove tokens found in ``russian_stopwords``, and lemmatize the remaining
    tokens with ``to_lemmatize2``.

    Parameters
    ----------
    text : str
        Raw review text. Non-string input raises ``TypeError`` from the regex
        substitution.

    Returns
    -------
    str
        Lemmatized tokens joined by single spaces.
    """
    text = _DELETED_SYMBOLS_RE.sub(' ', text)

    # Drop purely numeric chunks; this runs before lowercasing and re-splitting.
    text = ' '.join(elem for elem in text.split(' ') if not elem.isdigit())

    # NOTE(review): stop words are removed before lemmatization and the list
    # includes the negation "не" (it disappears in the cleaned previews) --
    # confirm that dropping negations is acceptable for sentiment features.
    tokens = [token for token in text.lower().split() if token not in russian_stopwords]

    lemmatized, _ = to_lemmatize2(tokens)
    return lemmatized

In [53]:
# Replace each raw review with its cleaned, lemmatized form and preview the result.
df['review'] = df['review'].apply(clean)
df.head()

Unnamed: 0,review,sentiment
0,качество плохой пошив ужасный горловина напере...,negative
1,товар отдать другой человек получить посылка л...,negative
2,ужасный синтетик тонкий общий представить карт...,negative
3,товар прийти продавец продлить защита мой согл...,negative
4,кофточка голый синтетик носить возможно,negative


In [54]:
# Turn the cleaned reviews into TF-IDF features, keeping at most 5000 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so its vocabulary and IDF weights also reflect the held-out reviews.
tfidf = TfidfVectorizer(max_features=5000)
y = df['sentiment']
X = tfidf.fit_transform(df['review'])
X

<90000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1080610 stored elements in Compressed Sparse Row format>

In [55]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [56]:
# Train a linear SVM on the TF-IDF features and predict the held-out rows.
# random_state is fixed so repeated runs yield the same model, in line with
# the seeded train/test split.
clf = LinearSVC(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [57]:
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    neautral       0.61      0.61      0.61      6050
    negative       0.70      0.71      0.70      5890
    positive       0.83      0.84      0.83      6060

    accuracy                           0.72     18000
   macro avg       0.72      0.72      0.72     18000
weighted avg       0.72      0.72      0.72     18000



In [62]:
# Run one hand-written review through the same cleaning and the already
# fitted vectorizer (transform only, no refitting).
x = clean('не работает этот сервис, я уже устала от этого, кошмар')
vec = tfidf.transform([x])

In [63]:
# Sanity check: a single row whose width matches the fitted TF-IDF feature count.
vec.shape

(1, 5000)

In [64]:
# Predict the sentiment label for the vectorized sample review.
clf.predict(vec)

array(['negative'], dtype=object)

In [58]:
import pickle

# Persist the fitted classifier and vectorizer. Context managers ensure each
# file is flushed and closed; the target directory is created when missing.
os.makedirs('model', exist_ok=True)
with open('model/my_model', 'wb') as model_file:
    pickle.dump(clf, model_file)
with open('model/my_tfidf', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)