In [195]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pyprind
import pickle

In [196]:
def load_reviews(init_dir):
    labels = {'pos': 1, 'neg': 0}
    pbar = pyprind.ProgBar(25000)
    df = pd.DataFrame()
    for l in ('pos', 'neg'):
        dir = os.path.join(init_dir, l)
        for item in sorted(os.listdir(dir)):
            with open(os.path.join(dir, item), 'r', encoding='utf-8') as file:
                txt = file.read()
                df = pd.concat([df, pd.DataFrame({'review': [txt], 'label': [labels[l]]})], ignore_index=True)               
                pbar.update()
    df.columns = ['review', 'label']
    return df

df_train = load_reviews('aclImdb/train')
df_test = load_reviews('aclImdb/test')

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:15
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:15


Первые 3 рецензии тренировочного набора

In [197]:
df_train.head(3)

Unnamed: 0,review,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1


Первые 3 рецензии тестового набора

In [198]:
df_test.head(3)

Unnamed: 0,review,label
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [199]:
df_train.info

<bound method DataFrame.info of                                                   review  label
0      Bromwell High is a cartoon comedy. It ran at t...      1
1      Homelessness (or Houselessness as George Carli...      1
2      Brilliant over-acting by Lesley Ann Warren. Be...      1
3      This is easily the most underrated film inn th...      1
4      This is not the typical Mel Brooks film. It wa...      1
...                                                  ...    ...
24995  Towards the end of the movie, I felt it was to...      0
24996  This is the kind of movie that my enemies cont...      0
24997  I saw 'Descent' last night at the Stockholm Fi...      0
24998  Some films that you pick up for a pound turn o...      0
24999  This is one of the dumbest films, I've ever se...      0

[25000 rows x 2 columns]>

In [200]:
df_test.info

<bound method DataFrame.info of                                                   review  label
0      I went and saw this movie last night after bei...      1
1      Actor turned director Bill Paxton follows up h...      1
2      As a recreational golfer with some knowledge o...      1
3      I saw this film in a sneak preview, and it is ...      1
4      Bill Paxton has taken the true story of the 19...      1
...                                                  ...    ...
24995  I occasionally let my kids watch this garbage ...      0
24996  When all we have anymore is pretty much realit...      0
24997  The basic genre is a thriller intercut with an...      0
24998  Four things intrigued me as to this film - fir...      0
24999  David Bryce's comments nearby are exceptionall...      0

[25000 rows x 2 columns]>

Перемешивание DataFrame с рецензиями

In [201]:
np.random.seed(0)
df_train = df_train.reindex(np.random.permutation(df_train.index))
df_test = df_test.reindex(np.random.permutation(df_test.index))

Первые 10 рецензий тренировочного набора

In [202]:
df_train.head(10)

Unnamed: 0,review,label
14149,Forbidden Siren is based upon the Siren 2 Play...,0
8946,You better see this episode from the beginning...,1
22378,This is one of those movies that's trying to b...,0
12162,I have seen this film numerous times and for t...,1
4879,Let's face it: the final season (#8) was one o...,1
12710,There is an excellent reason Edison went strai...,0
24595,"Boy, this was one lousy movie! While I haven't...",0
308,"""House Of Games"" is definitely not without its...",1
4343,"A series of random, seemingly insignificant th...",1
18230,There's enough star power in THE HOUSE OF SPIR...,0


Последние 10 рецензий тренировочного набора

In [203]:
df_train.tail(10)

Unnamed: 0,review,label
22258,The film had some likable aspects. Perhaps too...,0
20757,This movie makes Peter an elf in Robin Hood co...,0
24275,This film was terrible. I have given it the hi...,0
9225,"Don't mistake ""War Inc."" for a sharply chisele...",1
21243,Okay. So I just got back. Before I start my re...,0
13123,"I saw this movie in its own time period, when ...",0
19648,I expected to enjoy a romantic comedy featurin...,0
9845,"""I have looked into the eye of this island, an...",1
10799,A team of archaeologists uncover a real treasu...,1
2732,Whenever I see most reviews it's called 'a mis...,1


In [194]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

st_words = stopwords.words('english')

snowball = SnowballStemmer('english')

def preprocessor(text):
    text = re.sub(r'<[^>]*>', '', text)
    emotes = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub(r'[\W]+', ' ', text.lower())) + ' '.join(emotes).replace('-', '')
    st_words = stopwords.words('english')
    text = ' '.join([snowball.stem(w) for w in text.split() if w not in st_words])
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [162]:
df_train['review'] = df_train['review'].apply(preprocessor)
df_test['review'] = df_test['review'].apply(preprocessor)

In [163]:
x_train = df_train['review']
y_train = df_train['label']
x_test = df_test['review']
y_test = df_test['label']

In [205]:
vectorizer = TfidfVectorizer(preprocessor=None, strip_accents=None, lowercase=False, ngram_range=(1,1))

x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [223]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid1 = [{'penalty': ['l1', 'l2'],
                'C': [0.1, 1.0, 10.0, 100.0]}]

CV_log_res = GridSearchCV(estimator=LogisticRegression(solver='liblinear'), param_grid=param_grid1, scoring='accuracy', cv=5, verbose=2, n_jobs=-1)
CV_log_res.fit(x_train_tfidf, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [224]:
print('Лучший набор параметров: %s ' % CV_log_res.best_params_)
print('Правильность при перекрестной проверке: %.3f ' % CV_log_res.best_score_)
log_res_model = CV_log_res.best_estimator_
print('Правильность при испытании: %.3f ' % log_res_model.score(x_test_tfidf, y_test))

Лучший набор параметров: {'C': 10.0, 'penalty': 'l2'} 
Правильность при перекрестной проверке: 0.894 
Правильность при испытании: 0.876 


In [211]:
from sklearn.metrics import classification_report

In [225]:
log_res_pred = log_res_model.predict(x_test_tfidf)
print(classification_report(log_res_pred, y_test))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88     12771
           1       0.87      0.88      0.87     12229

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [241]:
pickle.dump(log_res_model, open('log_res_model.sav', 'wb'))

In [242]:
from sklearn.linear_model import SGDClassifier

param_grid2 = {
    'alpha': [0.000001,0.00001,0.0001,0.001,0.01],
    'penalty': ['l1','l2'],
    'loss': ['modified_huber','log_loss']
}

CV_sgd = GridSearchCV(estimator=SGDClassifier(), param_grid=param_grid2, scoring='accuracy', cv=5, verbose=2, n_jobs=-1)
CV_sgd.fit(x_train_tfidf, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [227]:
print('Лучший набор параметров: %s ' % CV_sgd.best_params_)
print('Правильность при перекрестной проверке: %.3f ' % CV_sgd.best_score_)
sgd_model = CV_sgd.best_estimator_
print('Правильность при испытании: %.3f ' % sgd_model.score(x_test_tfidf, y_test))

Лучший набор параметров: {'alpha': 1e-05, 'loss': 'log_loss', 'penalty': 'l2'} 
Правильность при перекрестной проверке: 0.894 
Правильность при испытании: 0.881 


In [228]:
sgd_pred = sgd_model.predict(x_test_tfidf)
print(classification_report(sgd_pred, y_test))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88     12737
           1       0.87      0.89      0.88     12263

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [229]:
pickle.dump(sgd_model, open('sgd_model.sav', 'wb'))
pickle.dump(vectorizer, open('vectorizer.sav', 'wb'))

In [None]:
from sklearn.ensemble import RandomForestClassifier

param_grid3 = {
    'n_estimators': [100,200,300],
    'max_depth': [None,5,15,30],
    'min_samples_split': [2,5,10]
}

CV_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid3, scoring='accuracy', cv=5, verbose=2, n_jobs=-1)
CV_rf.fit(x_train_tfidf, y_train)

In [None]:
print('Лучший набор параметров: %s ' % CV_rf.best_params_)
print('Правильность при перекрестной проверке: %.3f ' % CV_rf.best_score_)
rf_model = CV_rf.best_estimator_
print('Правильность при испытании: %.3f ' % rf_model.score(x_test_tfidf, y_test))

In [None]:
rf_pred = rf_model.predict(x_test_tfidf)
print(classification_report(rf_pred, y_test))