# Imports

In [1]:
from time import time

import numpy as np
import pandas as pd

import spacy

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [4]:
QTY = 5000

In [5]:
X_str = df['review'][:QTY]
y = df['sentiment'][:QTY].replace({'negative': 0, 'positive': 1})

In [6]:
nlp = spacy.load('en_core_web_md')

In [7]:
# def get_vector(string: str) -> np.ndarray:
#     doc = nlp(string)
#     return doc.vector.reshape(1, -1)

In [8]:
start = time()

X_doc = X_str.apply(nlp)

print(f'Time: {time() - start:.0f} sec')

Time: 170 sec


In [9]:
X_doc

0       (One, of, the, other, reviewers, has, mentione...
1       (A, wonderful, little, production, ., <, br, /...
2       (I, thought, this, was, a, wonderful, way, to,...
3       (Basically, there, 's, a, family, where, a, li...
4       (Petter, Mattei, 's, ", Love, in, the, Time, o...
                              ...                        
4995    (An, interesting, slasher, film, with, multipl...
4996    (i, watched, this, series, when, it, first, ca...
4997    (Once, again, Jet, Li, brings, his, charismati...
4998    (I, rented, this, movie, ,, after, hearing, Ch...
4999    (This, was, a, big, disappointment, for, me, ....
Name: review, Length: 5000, dtype: object

In [10]:
X = np.zeros((1, 300))  # инициализируем матрицу векторов нулевым вектором-строкой
for doc in X_doc:
    vector = doc.vector.reshape(1, -1)
    X = np.vstack((X, vector))
X = np.delete(arr=X, obj=0, axis=0)  # удаляем первую нулевую строку в полученной матрице

In [11]:
X

array([[-1.75048316,  0.70223123, -2.26069546, ..., -0.75643164,
        -2.84037662,  1.16227973],
       [-1.72586012,  0.40414372, -0.70506632, ..., -1.227211  ,
        -3.30375242,  0.76335639],
       [-2.07742572,  1.27698457, -1.43668723, ...,  0.01261111,
        -3.35728073,  1.43638575],
       ...,
       [-1.26115525,  1.3370682 , -2.53299332, ..., -0.9334181 ,
        -3.5286839 ,  1.42038274],
       [-1.79231286,  0.87925476, -2.40937471, ..., -0.49132967,
        -3.41382909,  0.77604485],
       [-1.44713509,  1.85639131, -2.43208957, ...,  0.91516876,
        -3.60654378,  1.49083591]])

In [12]:
y.value_counts()

sentiment
0    2532
1    2468
Name: count, dtype: int64

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
model = RandomForestClassifier(random_state=42)

model_cv_score = np.mean(
    cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy", n_jobs=-1)
)
print(f"Model cross validation score: {model_cv_score}\n")

Model cross validation score: 0.7254285714285714


In [15]:
model.fit(X_train, y_train)

In [16]:
y_predicted = model.predict(X_test)
model_test_score = accuracy_score(y_true=y_test, y_pred=y_predicted)

print(f"Model test set score: {model_test_score}")
print("\n", classification_report(y_true=y_test, y_pred=y_predicted))

Model test set score: 0.7273333333333334

               precision    recall  f1-score   support

           0       0.75      0.72      0.73       783
           1       0.71      0.74      0.72       717

    accuracy                           0.73      1500
   macro avg       0.73      0.73      0.73      1500
weighted avg       0.73      0.73      0.73      1500


In [44]:
X_doc_adj = [[token for token in doc if token.pos_ == "ADJ"] for doc in X_doc]
X_doc_adj

[[other,
  right,
  first,
  faint,
  hearted,
  timid,
  hardcore,
  classic,
  experimental,
  high,
  many,
  more,
  dodgy,
  shady,
  main,
  due,
  other,
  pretty,
  mainstream,
  first,
  nasty,
  surreal,
  ready,
  more,
  accustomed,
  high,
  graphic,
  crooked,
  mannered,
  middle,
  comfortable,
  uncomfortable,
  darker],
 [wonderful,
  little,
  unassuming-,
  old,
  entire,
  seamless,
  diary,
  worth,
  masterful,
  great,
  little,
  traditional,
  solid,
  flat],
 [wonderful,
  hot,
  light,
  hearted,
  simplistic,
  witty,
  likable,
  well,
  serial,
  many,
  most,
  impressed,
  sexy,
  young,
  wittier,
  interesting,
  great],
 [little, slower, watchable, divorcing, real, similar, meaningless, well],
 [stunning,
  vivid,
  human,
  different,
  same,
  present,
  different,
  next,
  previous,
  sophisticated,
  luxurious,
  own,
  different,
  big,
  best,
  human,
  sincere,
  most,
  good,
  talented,
  good,
  next],
 [favorite,
  noble,
  preachy,
  bo

In [55]:
X = np.zeros((1, 300))  # инициализируем матрицу векторов нулевым вектором-строкой

for review in X_doc_adj:
    vector = np.zeros((1, 300))
    for token in review:
        vector = vector + token.vector.reshape(1, -1)
    X = np.vstack((X, vector))
    
X = np.delete(arr=X, obj=0, axis=0)  # удаляем первую нулевую строку в полученной матрице

In [56]:
X

array([[ -17.4989627 ,    8.74101136,  -47.69689089, ...,   -4.8504659 ,
        -108.93192048,   27.42148976],
       [  -5.95481987,    1.24768019,  -35.45866981, ...,   23.77379212,
         -48.0553599 ,    2.11634983],
       [   3.21682034,    4.5065867 ,   -8.11926958, ...,   16.51653592,
         -81.03919047,   24.66795141],
       ...,
       [ -20.90796971,   -4.79205184,  -43.53521023, ...,  -16.50632   ,
         -84.73561037,   21.80960377],
       [  -8.72691394,  -36.47566926,  -52.34998294, ...,    3.59758867,
        -195.93839289,   38.54316453],
       [   9.45129979,    6.86559963,  -10.93554008, ...,    8.46449983,
          -9.4950999 ,    6.55409992]])

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model.fit(X_train, y_train)

y_predicted = model.predict(X_test)
model_test_score = accuracy_score(y_true=y_test, y_pred=y_predicted)

print(f"Model test set score: {model_test_score}")
print("\n", classification_report(y_true=y_test, y_pred=y_predicted))

Model test set score: 0.7713333333333333

               precision    recall  f1-score   support

           0       0.79      0.77      0.78       783
           1       0.75      0.77      0.76       717

    accuracy                           0.77      1500
   macro avg       0.77      0.77      0.77      1500
weighted avg       0.77      0.77      0.77      1500
