In [1]:
import pandas as pd
import pymorphy2

from tqdm import tqdm
tqdm.pandas()

from nltk.tokenize import word_tokenize 
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import eli5

In [2]:
morph = pymorphy2.MorphAnalyzer()

In [3]:
df=pd.read_csv('women-clothing-accessories.3-class.balanced.csv', encoding='utf8', sep='\t')

In [4]:
df['sentiment'].value_counts()

negative    30000
neautral    30000
positive    30000
Name: sentiment, dtype: int64

In [5]:
df=df[df['sentiment']!='neautral']

In [6]:
df.iloc[0]['review']

'качество плохое пошив ужасный (горловина наперекос) Фото не соответствует Ткань ужасная рисунок блеклый маленький рукав не такой УЖАС!!!!! не стоит за такие деньги г.......'

In [7]:
df['review_processed'] = df['review'].apply(lambda x: re.sub(r'[^\w\s]','',x)).values

In [8]:
df['review_processed'] = df['review_processed'].progress_apply(lambda x:word_tokenize(x))

100%|██████████████████████████████████████████████████████████████████████████| 60000/60000 [00:10<00:00, 5791.17it/s]


In [9]:
df['review_processed'].iloc[0]

['качество',
 'плохое',
 'пошив',
 'ужасный',
 'горловина',
 'наперекос',
 'Фото',
 'не',
 'соответствует',
 'Ткань',
 'ужасная',
 'рисунок',
 'блеклый',
 'маленький',
 'рукав',
 'не',
 'такой',
 'УЖАС',
 'не',
 'стоит',
 'за',
 'такие',
 'деньги',
 'г']

In [10]:
df['review_lemmatized'] = df['review_processed'].progress_apply(lambda x: [morph.parse(w)[0].normal_form for w in x])

100%|███████████████████████████████████████████████████████████████████████████| 60000/60000 [06:03<00:00, 165.15it/s]


In [11]:
df['review_lemmatized'].iloc[2]

['ужасный',
 'синтетик',
 'тонкий',
 'ничего',
 'общий',
 'с',
 'представить',
 'картинка',
 'не',
 'яркий',
 'рисунок',
 'растянутый',
 'и',
 'тусклый',
 'впрочем',
 'как',
 'и',
 'сам',
 'кофта',
 'мешок',
 'на',
 'картинка',
 'казаться',
 'приталенный',
 'на',
 'сам',
 'дело',
 'нет',
 'не',
 'рекомендовать']

In [20]:
vectorizer=TfidfVectorizer(ngram_range=(1,2))
X=vectorizer.fit_transform(df['review_lemmatized'].apply(lambda x: ' '.join(x)))
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'],test_size=0.3, random_state=42)

In [26]:
X.shape

(60000, 396100)

In [21]:
n_components = list(range(1,X.shape[1]+1,1))
parameters = dict(pca__n_components=n_components)

logit=LogisticRegression()

#grid = GridSearchCV(logit,parameters)

logit.fit(X_train, y_train)

In [22]:
y_pred = logit.predict_proba(X_test)[:,1]

In [23]:
roc_auc_score(y_test,y_pred)

0.9777779791573166

In [24]:
eli5.show_weights(estimator=logit,
                feature_names= list(vectorizer.get_feature_names_out()),
                top=(50, 50))

Weight?,Feature
+10.460,отличный
+10.075,хороший
+7.917,супер
+7.339,спасибо
+7.282,немного
+6.904,хорошо
+6.754,отлично
+6.230,приятный
+6.177,классный
+6.061,довольный


In [36]:
from gensim.models import Phrases

bigram = Phrases(df['review_lemmatized'].values, min_count=2, threshold=2)
print(bigram[df['review_lemmatized'].iloc[0]])

['качество_плохой', 'пошив_ужасный', 'горловина', 'наперекос', 'фото', 'не_соответствовать', 'ткань_ужасный', 'рисунок', 'блёклый', 'маленький', 'рукав', 'не', 'такой', 'ужас', 'не_стоить', 'за_такой', 'деньга', 'г']


In [40]:
df['review_lemmatized_coll'] = df['review_lemmatized'].progress_apply(lambda x: bigram[x])

100%|█████████████████████████████████████████████████████████████████████████| 60000/60000 [00:03<00:00, 15580.46it/s]


In [43]:
df.head()

Unnamed: 0,review,sentiment,review_processed,review_lemmatized,review_lemmatized_coll
0,качество плохое пошив ужасный (горловина напер...,negative,"[качество, плохое, пошив, ужасный, горловина, ...","[качество, плохой, пошив, ужасный, горловина, ...","[качество_плохой, пошив_ужасный, горловина, на..."
1,"Товар отдали другому человеку, я не получила п...",negative,"[Товар, отдали, другому, человеку, я, не, полу...","[товар, отдать, другой, человек, я, не, получи...","[товар, отдать, другой_человек, я, не_получить..."
2,"Ужасная синтетика! Тонкая, ничего общего с пре...",negative,"[Ужасная, синтетика, Тонкая, ничего, общего, с...","[ужасный, синтетик, тонкий, ничего, общий, с, ...","[ужасный_синтетик, тонкий, ничего_общий, с, пр..."
3,"товар не пришел, продавец продлил защиту без м...",negative,"[товар, не, пришел, продавец, продлил, защиту,...","[товар, не, прийти, продавец, продлить, защита...","[товар_не, прийти, продавец_продлить, защита, ..."
4,"Кофточка голая синтетика, носить не возможно.",negative,"[Кофточка, голая, синтетика, носить, не, возмо...","[кофточка, голый, синтетик, носить, не, возможно]","[кофточка, голый_синтетик, носить, не_возможно]"


In [45]:
vectorizer = TfidfVectorizer(ngram_range=(1,1))
X = vectorizer.fit_transform(df['review_lemmatized_coll'].apply(lambda x: ' '.join(x)))

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'],test_size=0.3, random_state=42)

In [47]:
logit=LogisticRegression()

logit.fit(X_train, y_train)

In [48]:
y_pred = logit.predict_proba(X_test)[:,1]

In [49]:
roc_auc_score(y_test,y_pred)

0.9724951449972188

In [50]:
eli5.show_weights(estimator=logit,
                feature_names= list(vectorizer.get_feature_names_out()),
                top=(50, 50))

Weight?,Feature
+5.807,отличный
+5.590,спасибо_продавец
+5.572,супер
+5.549,рекомендовать
+5.489,хороший
+5.356,отлично
+5.266,очень_довольный
+5.083,немного
+4.871,качество_отличный
+4.856,спасибо


In [57]:
parameters = {'C':[0.5, 1, 10, 20, 100],
             'max_iter': [50, 100, 300, 500]}

In [58]:
clf = GridSearchCV(logit, parameters)

In [59]:
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [61]:
clf.best_params_

{'C': 10, 'max_iter': 50}

In [62]:
y_pred = logit.predict_proba(X_test)[:,1]

In [63]:
roc_auc_score(y_test,y_pred)

0.9724951449972188