In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pyprind

In [3]:
init_dir = 'aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
reviews = pd.DataFrame()

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        dir = os.path.join(init_dir, s, l)
        for item in sorted(os.listdir(dir)):
            with open(os.path.join(dir, item), 'r', encoding='utf-8') as file:
                txt = file.read()
                reviews = pd.concat([reviews, pd.DataFrame({'review': [txt], 'label': [labels[l]]})], ignore_index=True)
                pbar.update()
reviews.columns = ['review', 'label']
print(reviews)

0% [##############################] 100% | ETA: 00:00:00

                                                  review  label
0      I went and saw this movie last night after bei...      1
1      Actor turned director Bill Paxton follows up h...      1
2      As a recreational golfer with some knowledge o...      1
3      I saw this film in a sneak preview, and it is ...      1
4      Bill Paxton has taken the true story of the 19...      1
...                                                  ...    ...
49995  Towards the end of the movie, I felt it was to...      0
49996  This is the kind of movie that my enemies cont...      0
49997  I saw 'Descent' last night at the Stockholm Fi...      0
49998  Some films that you pick up for a pound turn o...      0
49999  This is one of the dumbest films, I've ever se...      0

[50000 rows x 2 columns]



Total time elapsed: 00:03:27


Первые 3 рецензии

In [9]:
reviews.head(3)

Unnamed: 0,review,label
11841,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
19602,OK... so... I really like Kris Kristofferson a...,0
45519,"***SPOILER*** Do not read this, if you think a...",0


Последние 3 рецензии

In [8]:
reviews.tail(3)

Unnamed: 0,review,label
49997,I saw 'Descent' last night at the Stockholm Fi...,0
49998,Some films that you pick up for a pound turn o...,0
49999,"This is one of the dumbest films, I've ever se...",0


Перемешивание DataFrame с рецензиями

In [4]:
np.random.seed(0)
reviews = reviews.reindex(np.random.permutation(reviews.index))

In [5]:
reviews.shape

(50000, 2)

Первые 10 рецензий

In [27]:
reviews.head(10)

Unnamed: 0,review,label
11841,1974 teenager martha moxley maggie grace moves...,1
19602,ok really like kris kristofferson usual easy g...,0
45519,spoiler read think watching movie although wou...,0
25747,hi people seen wonderful movie im sure thet wo...,1
42642,recently bought dvd forgetting much hated movi...,0
31902,leave braik put good show finally zorak living...,1
30346,nathan detroit frank sinatra manager new york ...,1
12363,understand crash course right context must und...,1
32490,impressed chavez stance globalisation sometime...,1
26128,movie directed renny harlin finnish miracle st...,1


Последние 10 рецензий

In [26]:
reviews.tail(10)

Unnamed: 0,review,label
46884,ok think tv show kind cute always kind lesson ...,0
20757,big disappointment clash night much talky stag...,0
41993,cassidy kacia brady puts gun mouth blowing bac...,0
32103,rapid intercutting scenes insane people asylum...,1
30403,girlfight came reviews praised get around seei...,1
21243,ok lets start best building although hard beli...,0
45891,british heritage film industry control nothing...,0
42613,even know begin one family worst line dialogue...,0
43567,richard tyler little boy scared everything lik...,0
2732,waited long watch movie also like bruce willis...,1


In [8]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emotes = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emotes).replace('-', ''))
    st_words = stopwords.words('english')
    text = ' '.join([w for w in text.split() if w.lower() not in st_words])
    return text

  emotes = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emotes).replace('-', ''))
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
reviews['review'] = reviews['review'].apply(preprocessor)

In [37]:
x = reviews['review']
y = reviews['label']

Разделение данных на обучающий и тестовый наборы

In [38]:
from sklearn.model_selection import train_test_split, cross_val_score

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.5)

In [21]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

vectorizer = TfidfVectorizer(preprocessor=None)

snowball = SnowballStemmer('english')
porter = PorterStemmer()
lancaster = LancasterStemmer()

def snowball_tokenizer(text):
    return [snowball.stem(w) for w in text.split()]

def porter_tokenizer(text):
    return [porter.stem(w) for w in text.split()]

def lancaster_tokenizer(text):
    return [lancaster.stem(w) for w in text.split()]

param_grid = [{'vect__ngram_range': [(1,1)],
                'vect__tokenizer': [snowball_tokenizer, porter_tokenizer, lancaster_tokenizer],
                'clf__penalty': ['l1', 'l2', 'elasticnet'],
                'clf__C': [1.0, 10.0, 100.0]}]

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

log_res_tfidf = Pipeline([('vect', vectorizer),
                          ('clf', LogisticRegression(random_state=0, solver='liblinear'))])
grid_search1 = GridSearchCV(log_res_tfidf, param_grid, scoring='accuracy', cv=5, verbose=2, n_jobs=-1)
grid_search1.fit(x_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\super\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\super\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\super\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\super\anaconda3\Lib\site-packages\sklearn\base.py", l

In [15]:
print('Лучшие параметры: %s ' % grid_search1.best_params_)
print('Лучшие параметры: %s ' % grid_search1.best_score_)

Лучшие параметры: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__tokenizer': <function snowball_tokenizer at 0x000001AF1FB60180>} 
Лучшие параметры: 0.88764 


In [43]:
vectorizer = TfidfVectorizer(tokenizer=snowball_tokenizer)
x_tfidf = vectorizer.fit_transform(x)



In [44]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size = 0.25)

Обучение логистической регрессионной модели

In [45]:
from sklearn.linear_model import LogisticRegression

log_res_model = LogisticRegression(random_state=0, solver='liblinear', penalty='l2', C=10.0)
log_res_model.fit(x_train, y_train)

Использование метода кросс-валидации

In [46]:
cv_score = cross_val_score(log_res_model, x_train, y_train)
print(f"Кросс-валидация: {cv_score}")
print(f"Среднее кросс-валидации: {cv_score.mean()}")

Кросс-валидация: [0.9004     0.88773333 0.90066667 0.88813333 0.88893333]
Среднее кросс-валидации: 0.8931733333333334


Предсказание на тестовом множестве с оценкой точности

In [47]:
from sklearn.metrics import accuracy_score

pred = log_res_model.predict(x_test)
acc = accuracy_score(y_test, pred)
print(f"Точность модели: {acc}")


Точность модели: 0.89544
