In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, StratifiedKFold, StratifiedShuffleSplit, GridSearchCV
from sklearn import grid_search
from sklearn.pipeline import Pipeline
from scipy import sparse

  from collections import Sequence
  from numpy.core.umath_tests import inner1d
  from collections import Mapping, namedtuple, Sized


Загрузим выборки:

In [2]:
# Training set: tab-separated, column names are supplied explicitly via `names`.
train = pd.read_csv('products_sentiment_train.tsv', sep='\t', names=['reviews', 'label'])
# Test set: tab-separated, the 'Id' column is used as the row index.
test = pd.read_csv('products_sentiment_test.tsv', sep='\t', index_col=['Id'])

In [3]:
# Preview the first rows of the training set.
train.head()

Unnamed: 0,reviews,label
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [4]:
# Preview the first rows of the test set (indexed by Id).
test.head()

Unnamed: 0_level_0,text
Id,Unnamed: 1_level_1
0,"so , why the small digital elph , rather than ..."
1,3/4 way through the first disk we played on it...
2,better for the zen micro is outlook compatibil...
3,6 . play gameboy color games on it with goboy .
4,"likewise , i 've heard norton 2004 professiona..."


Посмотрим размер обучающей выборки и долю положительных отзывов.

In [5]:
# Number of labelled reviews available for training.
len(train)

2000

In [6]:
# Share of positive reviews: labels are 0/1, so their sum counts the positives.
train.label.values.sum() / train.shape[0]

0.637

Видим, что положительных отзывов чуть больше.

## Подбор модели

Переберем несколько методов извлечения признаков из текстов и классификаторов, выберем несколько наиболее удачных пайплайнов.

In [7]:
def review_classifier(name_method_list):
    """Assemble a scikit-learn ``Pipeline`` from ``(name, estimator)`` pairs.

    Parameters
    ----------
    name_method_list : iterable of 2-item sequences
        Pipeline steps in execution order; every item is unpacked into a
        step name and an estimator/transformer and stored as a tuple.

    Returns
    -------
    Pipeline
        An unfitted pipeline made of the given steps.
    """
    steps = []
    for step_name, step in name_method_list:
        steps.append((step_name, step))
    return Pipeline(steps)

In [8]:
# Candidates are kept as (name, estimator) pairs so that a label can never drift
# out of sync with the object it describes.
vectorizers = [('CountVectorizer', CountVectorizer()),
               ('TfidfVectorizer', TfidfVectorizer())]
classifiers = [('LogisticRegression', LogisticRegression(random_state=0)),
               ('LinearSVC', LinearSVC(random_state=0)),
               ('SGDClassifier', SGDClassifier(max_iter=1000, random_state=0)),
               ('MultinomialNB', MultinomialNB()),
               ('RandomForestClassifier', RandomForestClassifier(random_state=0)),
               ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=0))]

# (pipeline description, mean CV accuracy) for every vectorizer/classifier combination.
scores = []
for v_name, vct in vectorizers:
    for c_name, clf in classifiers:
        # cv is pinned to 3 folds: the implicit default changed from 3 to 5 in
        # scikit-learn 0.22, which would silently change the reported scores.
        pipeline = review_classifier([(v_name, vct), (c_name, clf)])
        score = cross_val_score(pipeline, train.reviews, train.label, cv=3).mean()
        scores.append((f'{v_name} + {c_name}', score))
        print(f'vectorizer: {v_name}, classifier: {c_name}')
        print(score)

vectorizer: CountVectorizer, classifier: LogisticRegression
0.7740071405738572
vectorizer: CountVectorizer, classifier: LinearSVC
0.7505076290683487
vectorizer: CountVectorizer, classifier: SGDClassifier
0.7350101225663445
vectorizer: CountVectorizer, classifier: MultinomialNB
0.7795006400703551
vectorizer: CountVectorizer, classifier: RandomForestClassifier
0.7194953574263919
vectorizer: CountVectorizer, classifier: GradientBoostingClassifier
0.7320033676855267
vectorizer: TfidfVectorizer, classifier: LogisticRegression
0.7575056315686001
vectorizer: TfidfVectorizer, classifier: LinearSVC
0.7685001343172257
vectorizer: TfidfVectorizer, classifier: SGDClassifier
0.7540021280650966
vectorizer: TfidfVectorizer, classifier: MultinomialNB
0.7005026015520768
vectorizer: TfidfVectorizer, classifier: RandomForestClassifier
0.7115038576807692
vectorizer: TfidfVectorizer, classifier: GradientBoostingClassifier
0.7185048616832725


In [9]:
%%time
# Same sweep as above, with an intermediate transformer between vectorizer and classifier.
# Candidates are (name, estimator) pairs so labels and objects cannot get out of sync.
vectorizers = [('CountVectorizer', CountVectorizer()),
               ('TfidfVectorizer', TfidfVectorizer())]
# NMF and TruncatedSVD are randomised; seed them like the classifiers so reruns agree.
transformers = [('TfidfTransformer', TfidfTransformer()),
                ('NMF', NMF(n_components=100, random_state=0)),
                ('TruncatedSVD', TruncatedSVD(n_components=100, random_state=0))]
classifiers = [('LogisticRegression', LogisticRegression(random_state=0)),
               ('LinearSVC', LinearSVC(random_state=0)),
               ('SGDClassifier', SGDClassifier(max_iter=1000, random_state=0)),
               ('RandomForestClassifier', RandomForestClassifier(random_state=0)),
               ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=0))]

for v_name, vct in vectorizers:
    for t_name, trf in transformers:
        for c_name, clf in classifiers:
            # cv is pinned to 3 folds: the implicit default changed from 3 to 5 in
            # scikit-learn 0.22, which would silently change the reported scores.
            pipeline = review_classifier([(v_name, vct), (t_name, trf), (c_name, clf)])
            score = cross_val_score(pipeline, train.reviews, train.label, cv=3).mean()
            # NOTE: results are appended to the `scores` list created by the previous
            # sweep, so re-running only this cell adds duplicate entries.
            scores.append((f'{v_name} + {t_name} + {c_name}', score))
            print(f'vectorizer: {v_name}, transformer: {t_name}, classifier: {c_name}')
            print(score)

vectorizer: CountVectorizer, transformer: TfidfTransformer, classifier: LogisticRegression
0.7575056315686001
vectorizer: CountVectorizer, transformer: TfidfTransformer, classifier: LinearSVC
0.7685001343172257
vectorizer: CountVectorizer, transformer: TfidfTransformer, classifier: SGDClassifier
0.7540021280650966
vectorizer: CountVectorizer, transformer: TfidfTransformer, classifier: RandomForestClassifier
0.7115038576807692
vectorizer: CountVectorizer, transformer: TfidfTransformer, classifier: GradientBoostingClassifier
0.7185048616832725
vectorizer: CountVectorizer, transformer: NMF, classifier: LogisticRegression
0.6644943294118707
vectorizer: CountVectorizer, transformer: NMF, classifier: LinearSVC
0.6985021003012007
vectorizer: CountVectorizer, transformer: NMF, classifier: SGDClassifier
0.698495346921134
vectorizer: CountVectorizer, transformer: NMF, classifier: RandomForestClassifier
0.6925208566887727
vectorizer: CountVectorizer, transformer: NMF, classifier: GradientBoosting

In [10]:
# Rank every evaluated pipeline by its mean CV score, best first (sorts `scores` in place).
scores.sort(key=lambda x: x[1], reverse=True)
scores

[('CountVectorizer + MultinomialNB', 0.7795006400703551),
 ('CountVectorizer + LogisticRegression', 0.7740071405738572),
 ('TfidfVectorizer + LinearSVC', 0.7685001343172257),
 ('CountVectorizer + TfidfTransformer + LinearSVC', 0.7685001343172257),
 ('TfidfVectorizer + TfidfTransformer + LinearSVC', 0.7614976295635966),
 ('TfidfVectorizer + LogisticRegression', 0.7575056315686001),
 ('CountVectorizer + TfidfTransformer + LogisticRegression',
  0.7575056315686001),
 ('TfidfVectorizer + TruncatedSVD + SGDClassifier', 0.7549993771882827),
 ('TfidfVectorizer + SGDClassifier', 0.7540021280650966),
 ('CountVectorizer + TfidfTransformer + SGDClassifier', 0.7540021280650966),
 ('TfidfVectorizer + TruncatedSVD + LinearSVC', 0.7520001260630945),
 ('CountVectorizer + TruncatedSVD + LinearSVC', 0.7519956238097167),
 ('CountVectorizer + TruncatedSVD + LogisticRegression', 0.7514988751870311),
 ('CountVectorizer + LinearSVC', 0.7505076290683487),
 ('CountVectorizer + TruncatedSVD + SGDClassifier', 0.

Стоит рассмотреть следующие пайплайны:

* CountVectorizer + MultinomialNB
* CountVectorizer + LogisticRegression
* TfidfVectorizer + LinearSVC
* TfidfVectorizer + TruncatedSVD + SGDClassifier
* TfidfVectorizer + TfidfTransformer + LinearSVC
* TfidfVectorizer + LogisticRegression

Применим поиск параметров по сетке.

In [11]:
def search_by_grid(X, y, model, parameters_grid, cv, scoring='accuracy', verbose=2):
    """Fit an exhaustive grid search for one model and return the fitted searcher.

    Parameters
    ----------
    X, y : training samples and their targets, passed straight to ``fit``.
    model : estimator or pipeline whose hyper-parameters are tuned.
    parameters_grid : parameter grid handed to ``GridSearchCV``.
    cv : cross-validation strategy handed to ``GridSearchCV``.
    scoring : metric used to rank candidates (``'accuracy'`` by default).
    verbose : verbosity level handed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The searcher after ``fit(X, y)`` has been called on it.
    """
    searcher = GridSearchCV(
        model,
        parameters_grid,
        scoring=scoring,
        cv=cv,
        verbose=verbose,
    )
    searcher.fit(X, y)
    return searcher

In [12]:
def search_by_models(X, y, grid_cv_params):
    """Run ``search_by_grid`` once per entry of ``grid_cv_params``.

    Each entry is unpacked positionally after ``X`` and ``y``, i.e. it supplies
    ``model, parameters_grid, cv`` and optionally ``scoring, verbose``.

    Returns
    -------
    list
        The fitted searchers, in the same order as ``grid_cv_params``.
    """
    return [search_by_grid(X, y, *search_args) for search_args in grid_cv_params]

In [13]:
# Hyper-parameter search space: one entry per pipeline, each laid out as the
# positional arguments search_by_grid takes after X and y:
# [pipeline, param_grid, cv, scoring, verbose].
#
# The grids share most of their vectorizer and classifier settings, so the common
# parts are defined once and merged per pipeline instead of being copy-pasted.
#
# NOTE(review): scikit-learn applies 'stop_words' only when analyzer='word', so every
# 'char'/'char_wb' candidate is fitted twice with identical results. A list-of-dicts
# grid would skip those duplicates and shorten the (very long) search.
ngram_ranges = [(lo, hi) for lo in range(1, 4) for hi in range(lo, 4)]
regularization_strengths = [0.5, 0.7, 1.0, 10, 100, 1000]

count_vectorizer_grid = {
    'vectorizer__analyzer': ['word', 'char', 'char_wb'],
    'vectorizer__stop_words': [None, 'english'],
    'vectorizer__ngram_range': ngram_ranges,
    'vectorizer__max_df': [0.3, 0.5, 0.8, 1.0],
}
tfidf_vectorizer_grid = {
    **count_vectorizer_grid,
    'vectorizer__norm': ['l1', 'l2', None],
    'vectorizer__use_idf': [True, False],
}
linear_svc_grid = {'classifier__C': regularization_strengths}
logreg_liblinear_grid = {
    'classifier__C': regularization_strengths,
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear'],
}
logreg_l2_solvers_grid = {
    'classifier__C': regularization_strengths,
    'classifier__penalty': ['l2'],
    'classifier__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
}


def _search_spec(steps, *grid_parts):
    """Build one ``[pipeline, param_grid, cv, scoring, verbose]`` search entry.

    ``grid_parts`` are merged left to right into a fresh dict, so the shared grid
    fragments above are never modified.
    """
    param_grid = {}
    for part in grid_parts:
        param_grid.update(part)
    # n_splits is pinned: the StratifiedKFold default changed from 3 to 5 in
    # scikit-learn 0.22, while the recorded search used 3 folds.
    return [Pipeline(steps), param_grid, StratifiedKFold(n_splits=3), 'accuracy', 1]


grid_cv_params = [
    _search_spec([('vectorizer', CountVectorizer()),
                  ('classifier', MultinomialNB())],
                 count_vectorizer_grid,
                 {'classifier__alpha': [0.5, 0.7, 1.0, 10, 100],
                  'classifier__fit_prior': [True, False]}),
    # LogisticRegression is searched twice because 'l1' is only paired with liblinear here.
    # random_state is set on every LogisticRegression for consistency (sag/saga are stochastic).
    _search_spec([('vectorizer', CountVectorizer()),
                  ('classifier', LogisticRegression(random_state=0))],
                 count_vectorizer_grid, logreg_liblinear_grid),
    _search_spec([('vectorizer', CountVectorizer()),
                  ('classifier', LogisticRegression(random_state=0, n_jobs=-1))],
                 count_vectorizer_grid, logreg_l2_solvers_grid),
    _search_spec([('vectorizer', TfidfVectorizer()),
                  ('classifier', LinearSVC(random_state=0))],
                 tfidf_vectorizer_grid, linear_svc_grid),
    # The analyzer is deliberately left at its default for the SVD pipeline
    # (it was commented out of the original grid).
    _search_spec([('vectorizer', TfidfVectorizer()),
                  ('transformer', TruncatedSVD(random_state=0)),
                  ('classifier', SGDClassifier(max_iter=1000, random_state=0, n_jobs=-1))],
                 {key: values for key, values in tfidf_vectorizer_grid.items()
                  if key != 'vectorizer__analyzer'},
                 {'transformer__n_components': [1, 2, 10, 100, 200],
                  'classifier__penalty': ['l1', 'l2', None, 'elasticnet']}),
    _search_spec([('vectorizer', TfidfVectorizer()),
                  ('transformer', TfidfTransformer()),
                  ('classifier', LinearSVC(random_state=0))],
                 tfidf_vectorizer_grid,
                 {'transformer__norm': ['l1', 'l2', None],
                  'transformer__use_idf': [True, False]},
                 linear_svc_grid),
    _search_spec([('vectorizer', TfidfVectorizer()),
                  ('classifier', LogisticRegression(random_state=0))],
                 tfidf_vectorizer_grid, logreg_liblinear_grid),
    _search_spec([('vectorizer', TfidfVectorizer()),
                  ('classifier', LogisticRegression(random_state=0, n_jobs=-1))],
                 tfidf_vectorizer_grid, logreg_l2_solvers_grid),
]

ОСТОРОЖНО! Выполнение следующей ячейки требует очень много времени. У меня она выполнялась 1 день, 17 часов и 41 минуту!

In [None]:
%%time
# Fit one grid search per entry of grid_cv_params; grid_cvs keeps the fitted searchers
# in the same order, which the later cells rely on when pairing them with names.
grid_cvs = search_by_models(train.reviews, train.label, grid_cv_params)

Fitting 3 folds for each of 1440 candidates, totalling 4320 fits


Лучшие модели:

### CountVectorizer + MultinomialNB:

Параметры

* 'classifier__alpha': 1.0, 
* 'classifier__fit_prior': True, 
* 'vectorizer__analyzer': 'char_wb', 
* 'vectorizer__max_df': 1.0, 
* 'vectorizer__ngram_range': (3, 3), 
* 'vectorizer__stop_words': None

Score: 0.784

Score на тесте: 0.76888

### CountVectorizer + LogisticRegression:

* 'classifier__C': 1.0, 
* 'classifier__penalty': 'l2', 
* 'classifier__solver': 'liblinear', 
* 'vectorizer__analyzer': 'word', 
* 'vectorizer__max_df': 0.8, 
* 'vectorizer__ngram_range': (1, 1), 
* 'vectorizer__stop_words': None

Score: 0.774

### CountVectorizer + LogisticRegression:

* 'classifier__C': 0.5, 
* 'classifier__penalty': 'l2', 
* 'classifier__solver': 'sag', 
* 'vectorizer__analyzer': 'word', 
* 'vectorizer__max_df': 0.8, 
* 'vectorizer__ngram_range': (1, 2), 
* 'vectorizer__stop_words': None

Score: 0.7755

Score на тесте: 0.81333

### TfidfVectorizer + LinearSVC:

* 'classifier__C': 100, 
* 'vectorizer__analyzer': 'word', 
* 'vectorizer__max_df': 0.3, 
* 'vectorizer__ngram_range': (1, 3), 
* 'vectorizer__norm': 'l1', 
* 'vectorizer__stop_words': None, 
* 'vectorizer__use_idf': True

Score: 0.79

#### Score на тесте: 0.81777

### TfidfVectorizer + TruncatedSVD + SGDClassifier:

* 'classifier__penalty': 'l2', 
* 'transformer__n_components': 200, 
* 'vectorizer__max_df': 0.5, 
* 'vectorizer__ngram_range': (1, 2), 
* 'vectorizer__norm': 'l2', 
* 'vectorizer__stop_words': None, 
* 'vectorizer__use_idf': True 

Score: 0.769

Score на тесте: 0.76222

### TfidfVectorizer + TfidfTransformer + LinearSVC:

* 'classifier__C': 10, 
* 'transformer__norm': None, 
* 'transformer__use_idf': True, 
* 'vectorizer__analyzer': 'word', 
* 'vectorizer__max_df': 0.3, 
* 'vectorizer__ngram_range': (1, 3), 
* 'vectorizer__norm': 'l1', 
* 'vectorizer__stop_words': None, 
* 'vectorizer__use_idf': False 

Score: 0.792

Score на тесте: 0.80888

### TfidfVectorizer + LogisticRegression:

* 'classifier__C': 1000, 
* 'classifier__penalty': 'l2', 
* 'classifier__solver': 'liblinear', 
* 'vectorizer__analyzer': 'word', 
* 'vectorizer__max_df': 0.3, 
* 'vectorizer__ngram_range': (1, 3), 
* 'vectorizer__norm': 'l2', 
* 'vectorizer__stop_words': None, 
* 'vectorizer__use_idf': True

Score: 0.786

#### Score на тесте: 0.82000

### TfidfVectorizer + LogisticRegression:

* 'classifier__C': 1000, 
* 'classifier__penalty': 'l2', 
* 'classifier__solver': 'sag', 
* 'vectorizer__analyzer': 'word', 
* 'vectorizer__max_df': 0.3, 
* 'vectorizer__ngram_range': (1, 3), 
* 'vectorizer__norm': 'l2', 
* 'vectorizer__stop_words': None, 
* 'vectorizer__use_idf': True 

Score: 0.786

Score на тесте: 0.80888


Сохраним все посчитанные модели в файл. Выполнение первой ячейки также занимает существенное время.

In [359]:
%%time
# Human-readable pipeline names, in the same order as the fitted searches in grid_cvs.
pipeline_names = ['CountVectorizer + MultinomialNB',
                  'CountVectorizer + LogisticRegression',
                  'CountVectorizer + LogisticRegression',
                  'TfidfVectorizer + LinearSVC',
                  'TfidfVectorizer + TruncatedSVD + SGDClassifier',
                  'TfidfVectorizer + TfidfTransformer + LinearSVC',
                  'TfidfVectorizer + LogisticRegression',
                  'TfidfVectorizer + LogisticRegression']

# Collect the per-candidate frames in a list and concatenate once: growing a
# DataFrame with .append() inside the loop is quadratic (and .append() no longer
# exists in pandas 2.x).
candidate_frames = []
for pipe, grid in zip(pipeline_names, grid_cvs):
    # cv_results_ replaces grid_scores_, which was removed in scikit-learn 0.20.
    results = grid.cv_results_
    ranked = sorted(zip(results['params'], results['mean_test_score']),
                    key=lambda x: x[1], reverse=True)
    for params, mean_score in ranked:
        # The tuple-valued 'vectorizer__ngram_range' makes pandas emit two rows
        # (index 0 and 1) per candidate; this layout is kept on purpose because the
        # following cells stitch the two halves back together.
        candidate_frames.append(pd.DataFrame({'pileline': pipe, 'mean': mean_score, **params}))

all_models = pd.concat(candidate_frames, sort=False) if candidate_frames else pd.DataFrame()



Wall time: 58min 34s


In [389]:
# Preview the collected candidates (note the repeated 0/1 index per candidate).
all_models.head()

Unnamed: 0,pileline,mean,classifier__alpha,classifier__fit_prior,vectorizer__analyzer,vectorizer__max_df,vectorizer__ngram_range,vectorizer__stop_words,classifier__C,classifier__penalty,classifier__solver,vectorizer__norm,vectorizer__use_idf,transformer__n_components,transformer__norm,transformer__use_idf
0,CountVectorizer + MultinomialNB,0.784,1.0,True,char_wb,1.0,3,,,,,,,,,
1,CountVectorizer + MultinomialNB,0.784,1.0,True,char_wb,1.0,3,,,,,,,,,
0,CountVectorizer + MultinomialNB,0.784,1.0,True,char_wb,1.0,3,english,,,,,,,,
1,CountVectorizer + MultinomialNB,0.784,1.0,True,char_wb,1.0,3,english,,,,,,,,
0,CountVectorizer + MultinomialNB,0.782,1.0,True,char_wb,0.5,3,,,,,,,,,


In [369]:
# Persist the raw table; the (repeating 0/1) index is written as the first column.
all_models.to_csv('all_models.csv', index=True)

In [380]:
# Keep the first of the two rows stored per candidate. The explicit .copy() makes
# this an independent frame, so assigning a column to it later does not raise
# SettingWithCopyWarning or risk writing into a temporary.
all_models_upd = all_models.iloc[all_models.index == 0, :].copy()

In [381]:
# Each candidate occupies two rows of all_models (index 0 and index 1) holding the two
# bounds of its n-gram range; zip the halves back into one (min_n, max_n) tuple per candidate.
all_models_upd['vectorizer__ngram_range'] = list(zip(all_models[all_models.index == 0].vectorizer__ngram_range, all_models[all_models.index == 1].vectorizer__ngram_range))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [386]:
# Replace the leftover all-zero index with a fresh 0..n-1 range.
all_models_upd = all_models_upd.reset_index(drop=True)

In [388]:
# Preview the cleaned table: one row per candidate, n-gram range restored as a tuple.
all_models_upd.head()

Unnamed: 0,pileline,mean,classifier__alpha,classifier__fit_prior,vectorizer__analyzer,vectorizer__max_df,vectorizer__ngram_range,vectorizer__stop_words,classifier__C,classifier__penalty,classifier__solver,vectorizer__norm,vectorizer__use_idf,transformer__n_components,transformer__norm,transformer__use_idf
0,CountVectorizer + MultinomialNB,0.784,1.0,True,char_wb,1.0,"(3, 3)",,,,,,,,,
1,CountVectorizer + MultinomialNB,0.784,1.0,True,char_wb,1.0,"(3, 3)",english,,,,,,,,
2,CountVectorizer + MultinomialNB,0.782,1.0,True,char_wb,0.5,"(3, 3)",,,,,,,,,
3,CountVectorizer + MultinomialNB,0.782,1.0,True,char_wb,0.5,"(3, 3)",english,,,,,,,,
4,CountVectorizer + MultinomialNB,0.782,1.0,True,char_wb,0.8,"(3, 3)",,,,,,,,,


Файл с моделями приложу к ноутбуку.

In [393]:
# Save the cleaned table without the index column.
all_models_upd.to_csv('all_models_upd.csv', index=False)

Построим предсказания для лучших моделей.

In [403]:
%%time
# File-name labels for the fitted searches, in the same order as grid_cvs.
submission_names = [
    'CountVectorizer + MultinomialNB',
    'CountVectorizer + LogisticRegression(liblinear)',
    'CountVectorizer + LogisticRegression',
    'TfidfVectorizer + LinearSVC',
    'TfidfVectorizer + TruncatedSVD + SGDClassifier',
    'TfidfVectorizer + TfidfTransformer + LinearSVC',
    'TfidfVectorizer + LogisticRegression(liblinear)',
    'TfidfVectorizer + LogisticRegression',
]

# Predict the test set with the best estimator of every search and write one
# submission file per pipeline.
for pipe, grid in zip(submission_names, grid_cvs):
    predictions = grid.best_estimator_.predict(test.text)
    answer = pd.DataFrame({'Id': test.index, 'y': predictions})
    answer.to_csv(f'submition {pipe}.csv', index=False)

Wall time: 302 ms


Лучший результат получился у модели TfidfVectorizer + LogisticRegression:

* 'classifier__C': 1000,
* 'classifier__penalty': 'l2',
* 'classifier__solver': 'liblinear',
* 'vectorizer__analyzer': 'word',
* 'vectorizer__max_df': 0.3,
* 'vectorizer__ngram_range': (1, 3),
* 'vectorizer__norm': 'l2',
* 'vectorizer__stop_words': None,
* 'vectorizer__use_idf': True

Score: 0.786

Score на тесте: 0.82000

По состоянию на 09.09.2019 с этой моделью получилось подняться на 10-ю строчку:

![title](score.jpg)

## Больше данных!

Идея: можно попробовать добавить к нашим сэмплам несколько случайно выбранных сэмплов из той же выборки с некоторым шумом в признаках. Возможно так получится улучшить качество модели.

In [86]:
def noise(matrix, labels, pos, neg, feature_noise='count', feature_range=range(1, 11), 
          val_range_count=range(1, 3), val_range_tfidf=np.arange(0.0001, 0.01, 0.001),
          random_state=None):
    """Augment a feature matrix with noisy copies of randomly chosen rows.

    ``pos`` rows with label 1 and ``neg`` rows with label 0 are drawn with
    replacement. In each copy a random number of randomly chosen features
    (also drawn with replacement) is increased by a random increment.

    Parameters
    ----------
    matrix : sparse matrix or array-like, shape (n_samples, n_features)
        Original features; converted to CSR internally and never modified.
    labels : sequence
        Labels of the rows of ``matrix``; only the values 1 and 0 are sampled.
        The sequence is copied, the caller's object is left untouched.
    pos, neg : int
        Number of noisy copies to draw from label-1 and label-0 rows.
    feature_noise : {'count', 'tfidf'}
        Selects the increment pool: ``val_range_count`` or ``val_range_tfidf``.
    feature_range : sequence of int
        Pool from which the number of perturbed features per copy is drawn.
    val_range_count, val_range_tfidf : sequence of numbers
        Increment pools for the two noise modes.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the global
        NumPy RNG is used, so ``np.random.seed`` still controls the result.

    Returns
    -------
    (scipy.sparse.csr_matrix, list)
        The original rows followed by the noisy copies (ordered by source row
        index), and the matching labels.

    Raises
    ------
    ValueError
        If ``feature_noise`` is not ``'count'`` or ``'tfidf'``.
    """
    if feature_noise == 'count':
        increments = val_range_count
    elif feature_noise == 'tfidf':
        increments = val_range_tfidf
    else:
        # Previously an unknown mode silently produced exact duplicates.
        raise ValueError(f"feature_noise must be 'count' or 'tfidf', got {feature_noise!r}")

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    matrix = sparse.csr_matrix(matrix)
    labels = list(labels)  # copy: never mutate the caller's labels
    n_features = matrix.shape[1]
    # Upcast so float increments are not truncated when the matrix holds integers.
    out_dtype = np.result_type(matrix.dtype, np.asarray(increments).dtype)

    pos_indices = [i for i, value in enumerate(labels) if value == 1]
    neg_indices = [i for i, value in enumerate(labels) if value == 0]

    def draw(pool, count):
        # An explicit integer array for count == 0 keeps the combined index array
        # integral even when one class contributes no samples.
        if count == 0:
            return np.empty(0, dtype=int)
        return rng.choice(pool, count)

    sampled = np.sort(np.concatenate([draw(pos_indices, pos), draw(neg_indices, neg)]))

    new_rows = []
    new_labels = []
    for idx in sampled:
        new_labels.append(labels[idx])
        row = matrix[idx].toarray()[0].astype(out_dtype, copy=False)
        n_changed = rng.choice(feature_range)
        for col in sorted(rng.choice(n_features, n_changed)):
            row[col] += rng.choice(increments)
        new_rows.append(sparse.csr_matrix(row[np.newaxis, :]))

    # Stack once at the end; stacking inside the loop copies the whole matrix per row.
    augmented = sparse.vstack([matrix] + new_rows, format='csr')
    return sparse.csr_matrix(augmented), labels + new_labels

Проверим две разные более-менее удачные модели:

TfidfVectorizer + LogisticRegression:

* 'classifier__C': 1000,
* 'classifier__penalty': 'l2',
* 'classifier__solver': 'liblinear',
* 'vectorizer__analyzer': 'word',
* 'vectorizer__max_df': 0.3,
* 'vectorizer__ngram_range': (1, 3),
* 'vectorizer__norm': 'l2',
* 'vectorizer__stop_words': None,
* 'vectorizer__use_idf': True

Score: 0.786

Score на тесте: 0.82000

Score на тесте +2548: 0.81777

CountVectorizer + LogisticRegression:

* 'classifier__C': 1.0, 
* 'classifier__penalty': 'l2', 
* 'classifier__solver': 'liblinear', 
* 'vectorizer__analyzer': 'word', 
* 'vectorizer__max_df': 0.8, 
* 'vectorizer__ngram_range': (1, 1), 
* 'vectorizer__stop_words': None

Score: 0.774

Score на тесте: 0.81333

Score на тесте +2548: 0.78888

In [135]:
def pipeliner(vect_name, vectorizer, clf_name, classifier, train_X, train_y, test_X, pos, neg, feature_noise='count', 
              feature_range=range(1, 11), val_range_count=range(1, 3), val_range_tfidf=np.arange(0.0001, 0.01, 0.001)):
    """Train on noise-augmented features, write a submission file, return a CV score.

    Steps: fit ``vectorizer`` on ``train_X``, extend the training matrix with
    ``pos`` + ``neg`` noisy copies produced by ``noise``, fit ``classifier`` on the
    extended data, predict ``test_X`` and save the predictions to
    ``'submition {vect_name} + {clf_name}(+{pos + neg}).csv'``.

    Parameters
    ----------
    vect_name, clf_name : str
        Labels used only in the output file name.
    vectorizer, classifier : estimators
        Unfitted feature extractor and classifier; both are fitted in place.
    train_X, train_y : training texts and their labels.
    test_X : test texts. When it is a pandas object its index supplies the ``Id``
        column; otherwise ids are ``0..n-1``.
    pos, neg, feature_noise, feature_range, val_range_count, val_range_tfidf :
        Forwarded unchanged to ``noise``.

    Returns
    -------
    float
        Mean 3-fold cross-validation score of ``classifier`` on the augmented data.

    NOTE(review): the cross-validation runs on the already augmented matrix, so
    noisy copies of one review can land in both the training and the validation
    fold. The returned score is therefore optimistic and should not be compared
    with the test score.
    """
    train_features = vectorizer.fit_transform(train_X, train_y)
    test_features = vectorizer.transform(test_X)
    matrix, labels = noise(train_features, list(train_y), pos, neg, feature_noise=feature_noise, 
                           feature_range=feature_range, val_range_count=val_range_count, 
                           val_range_tfidf=val_range_tfidf)
    classifier.fit(matrix, labels)

    # Take the ids from the data that is actually predicted instead of the
    # notebook-global `test` frame. isinstance is used because plain lists also
    # expose an `.index` attribute (the list method).
    if isinstance(test_X, (pd.Series, pd.DataFrame)):
        ids = test_X.index
    else:
        ids = np.arange(test_features.shape[0])
    answer = pd.DataFrame({'Id': ids, 'y': classifier.predict(test_features)})
    answer.to_csv(f'submition {vect_name} + {clf_name}(+{pos + neg}).csv', index=False)

    # cv is pinned: the implicit default changed from 3 to 5 folds in scikit-learn 0.22.
    return cross_val_score(classifier, matrix, labels, cv=3).mean()

Помним, что выборка не сбалансирована, поэтому добавим разное количество позитивных и негативных отзывов.

In [136]:
# Seed NumPy's global RNG so the randomly synthesised rows are reproducible.
np.random.seed(0)
# The solver is stated explicitly to match the tuned model described above; the
# LogisticRegression default changed from 'liblinear' to 'lbfgs' in scikit-learn 0.22.
pipeliner('TfidfVectorizer', TfidfVectorizer(max_df=0.3, ngram_range=(1, 3)), 
          'LogisticRegression', LogisticRegression(C=1000, penalty='l2', solver='liblinear', random_state=0), 
          train.reviews, train.label, test.text, 1000, 1548, feature_noise='tfidf')

0.9696569920844328

In [138]:
# Seed NumPy's global RNG so the randomly synthesised rows are reproducible.
np.random.seed(0)
# The solver is stated explicitly to match the tuned model described above; the
# LogisticRegression default changed from 'liblinear' to 'lbfgs' in scikit-learn 0.22.
pipeliner('CountVectorizer', CountVectorizer(max_df=0.8, ngram_range=(1, 1)), 
          'LogisticRegression', LogisticRegression(C=1.0, penalty='l2', solver='liblinear', random_state=0), 
          train.reviews, train.label, test.text, 1000, 1548, feature_noise='count')

0.8733509234828496

Модель с TfidfVectorizer использует униграммы, биграммы и триграммы, поэтому признаков у неё заметно больше. Увеличим для неё диапазон количества изменяемых признаков.

In [140]:
# Seed NumPy's global RNG so the randomly synthesised rows are reproducible.
np.random.seed(0)
# Same model as before, but 10-99 features are perturbed per synthetic row.
# The '@' suffix only keeps the submission file name distinct from the earlier run.
# The solver is stated explicitly: the LogisticRegression default changed from
# 'liblinear' to 'lbfgs' in scikit-learn 0.22.
pipeliner('TfidfVectorizer', TfidfVectorizer(max_df=0.3, ngram_range=(1, 3)), 
          'LogisticRegression@', LogisticRegression(C=1000, penalty='l2', solver='liblinear', random_state=0), 
          train.reviews, train.label, test.text, 1000, 1548, feature_noise='tfidf', feature_range=range(10, 100))

0.9696569920844328

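Видим, что на кросс-валидации получаем многообещающее качество. К сожалению, на тесте качество падает; выбор другого диапазона изменяемых признаков также снизил качество модели до 0.81555. Видимо, произошло переобучение. Кроме того, оценка на кросс-валидации завышена: зашумлённые копии одних и тех же отзывов попадают и в обучающие, и в проверочные фолды.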

## Выводы:

1. Очень хорошо показали себя линейные классификаторы, в то время как ансамбли и Наивный Байесовский классификатор выдали качество хуже.
2. Интересно, что в лучших моделях стоп-слова не использовались.
3. Синтезированные данные не дали улучшения качества на тесте, хотя на обучении качество росло. Очевидно, это эффект переобучения.