In [41]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.neural_network import MLPClassifier
import gensim
import numpy as np
import pickle
!pip install imblearn


Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.8.0 imblearn-0.0


Выгружаем данные

In [42]:
data_train = pd.read_csv('products_sentiment_train.tsv', sep='\t', header=None, names=['text', 'Id'])
data_test = pd.read_csv('products_sentiment_test.tsv', sep='\t', index_col='Id')
data_id = pd.read_csv('products_sentiment_sample_submission.csv', sep=',', index_col='Id')


In [43]:
data_train.groupby(by='Id')['text'].count()

Id
0     726
1    1274
Name: text, dtype: int64

In [44]:
def cls_pipeline(vect, cls):
    return Pipeline([
                   ('vectorizer', vect),
                   ('classifier', cls)])

In [45]:
train_data = data_train['text']
train_label = data_train['Id']

Baseline признаки на частотах слов и логистическая регрессия

In [46]:
cv_result_bl = cross_val_score(cls_pipeline(CountVectorizer(), LogisticRegression(class_weight='balanced')),train_data, train_label, scoring='accuracy', cv=5)

In [47]:
print(f"Mean accuracy baseline: {cv_result_bl.mean()}")

Mean accuracy baseline: 0.7645000000000001


Удалим стоп слова 

In [48]:
cv_result_sw = cross_val_score(cls_pipeline(CountVectorizer(stop_words='english'), 
                                            LogisticRegression()),
                                            train_data, train_label, scoring='accuracy', cv=5)

In [49]:
print(f"Mean accuracy without stop words: {cv_result_sw.mean()}")

Mean accuracy without stop words: 0.748


Качество не изменилось

Используем tfidf

In [50]:
cv_result_tfidf = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                              LogisticRegression()),
                                              train_data, train_label, scoring='accuracy', cv=5)

In [51]:
print(f"Mean accuracy tfidf :{cv_result_tfidf.mean()}")

Mean accuracy tfidf :0.766


Попробуем использовать LinearSVC, SGDClassifier 

In [52]:
linear_model_list = [SGDClassifier(), LinearSVC()]

In [53]:
cv_other_linear_models = []
for model in linear_model_list:
  cv_result = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                          model),
                                          train_data, train_label, scoring='accuracy', cv=5)
  cv_other_linear_models.append(cv_result.mean())

In [54]:
print(f"Mean accuracy SGDClassifier: {cv_other_linear_models[0]}")
print(f"Mean accuracy LinearSVC: {cv_other_linear_models[1]}")

Mean accuracy SGDClassifier: 0.753
Mean accuracy LinearSVC: 0.7689999999999999


Решающие деревья

In [55]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [56]:
cv_forests = []
for forest in [RandomForestClassifier(), GradientBoostingClassifier()]:
  cv_result = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                          forest),
                                          train_data, train_label, scoring='accuracy', cv=5)
  cv_forests.append(cv_result.mean())


In [57]:
print(f"Mean accuracy RFC: {cv_forests[0]}")
print(f"Mean accuracy XGB: {cv_forests[1]}")

Mean accuracy RFC: 0.7314999999999999
Mean accuracy XGB: 0.7304999999999999


Как видно все модели по умолчанию выдают примерно одинаковое качесвто около 0.75
Возьмем одну из них, например, логистическую регрессиб и поработаем с признаковым пространством.

Ипользуем n_gramm

In [58]:
cv_result_ngram = cross_val_score(cls_pipeline(CountVectorizer(ngram_range=(1,3)), 
                                            LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [59]:
print(f"mean accuracy with ngram for 1 to 3: {cv_result_ngram.mean()}")

mean accuracy with ngram for 1 to 3: 0.7645


Буквенные n_gramm

In [60]:
cv_result_ngramchar = cross_val_score(cls_pipeline(CountVectorizer(ngram_range=(3,6), analyzer='char'), 
                                            LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [61]:
print(f"mean accuracy with ngram char for 1 to 3: {cv_result_ngramchar.mean()}")

mean accuracy with ngram char for 1 to 3: 0.771


Дополнительные преобразования после векторизации


In [62]:
def cls_pipeline_transform(vect, trans, cls):
  return Pipeline([
                   ('vectorizer', vect),
                   ('transformer', trans),
                   ('classifier', cls)
                  ])

In [63]:
vect_data = CountVectorizer().fit_transform(train_data)
vect_data.shape

(2000, 3973)

Всего получается около 4000 признаков попробуем провести понижение размерности признакового пространства.

In [64]:
cv_result_trans = cross_val_score(cls_pipeline_transform(CountVectorizer(),
                                                 TruncatedSVD(n_components=1000),
                                                 LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [65]:
cv_result_trans.mean()

0.7659999999999999

Попробуем построение частотных n-грамм с tfidf преобразованием и LinearSVC

In [66]:
cv_result_countt = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          train_data, train_label, scoring='accuracy', cv=5)

In [67]:
cv_result_countt.mean()

0.79

Получилось наилучшее качество из всех рассмотренных вариантов

Далее будем использовать этот pipeline кроме отдельных случаев


Попробуем трюк с добавление частицы не в начало слова, след. функкция реализует это.

In [68]:
def add_neg(review):
  neg_chars = {"dont", "nt", "n't", "doesnt", "does'nt", "'t", "not", "no"}
  words = review.split(' ')
  new_review = []
  i = 0
  while i < len(words):
    if words[i] in neg_chars:
      new_review.append(words[i] + "_" + words[i+1] )
      i+= 2
    else:
      new_review.append(words[i])
      i+= 1
  return ' '.join(new_review)


In [69]:
train_neg = train_data.apply(add_neg)

In [70]:
cv_result_addneg = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          train_neg, train_label, scoring='accuracy', cv=5)

In [71]:
print(cv_result_addneg.mean())

0.7825


Представим текст в виде векторов word2vec


In [72]:
all_stopwords = gensim.parsing.preprocessing.STOPWORDS # стоп слова 

In [73]:
list_words = train_data.apply(lambda x: [word for word in x.split() if word not in all_stopwords]) #токенизация предложений и фильтрация по стоп словам
list_words

0                    [2, ., 10,000, 640x480, pictures, .]
1       [downloaded, trial, version, associates, ez, f...
2       [wrt54g, plus, hga7t, perfect, solution, need,...
3       [dont, especially, like, music, files, unstruc...
4       [cheapie, pail, ..., worked, ok, opening, devi...
                              ...                        
1995    [speaker, phone, quality, good, ,, poping, 512...
1996                        [", movies, ", 5, seconds, .]
1997                                   [overall, like, .]
1998    [began, taking, pics, soon, got, camera, amaze...
1999      [reading, instructions, ,, 's, hard, figure, .]
Name: text, Length: 2000, dtype: object

In [74]:
# построение и обучение модели векторизации слов
model_w2v = gensim.models.Word2Vec(size=1000, min_count=10)
model_w2v.build_vocab(list_words)
model_w2v.train(list_words,total_examples=model_w2v.corpus_count, epochs=1)

(7819, 20540)

In [75]:
# функция которая переводит весь текст в один вектор по векторам слов усредняя их
def creat_vect_text(model, words_list):
  all_vect = np.array([model.wv[word] for word in words_list if word in model.wv])
  return all_vect.mean(axis=0)

In [76]:
train_vect_data = [creat_vect_text(model_w2v, text) for text in list_words]
data_text_vect = pd.DataFrame(train_vect_data)
data_text_vect.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,-7.7e-05,-0.0004,0.000388,0.000795,0.000295,0.000162,-0.000147,0.001262,0.000962,-0.00158,...,-0.000127,-0.001365,-0.000443,-0.000681,-0.001418,0.000743,-0.000876,-0.000801,0.001088,-0.00064
1,-9.2e-05,-7.5e-05,0.000143,0.000579,0.000256,0.000326,-9.3e-05,0.000803,0.000308,-0.000757,...,-5.3e-05,-0.000741,-0.000365,-0.000226,-0.000846,0.000288,-0.000431,-0.000497,0.000632,-0.000442
2,-0.000214,-0.000152,0.000354,0.000209,0.000174,0.00024,-0.000242,0.000681,0.000287,-0.000794,...,-0.000172,-0.000856,-0.000203,-8.8e-05,-0.000825,0.000122,-0.000466,-0.000314,0.000599,-9.9e-05
3,-0.000114,-1.2e-05,0.000222,0.000565,6.6e-05,0.000438,-0.000172,0.000888,0.000568,-0.001373,...,-6.4e-05,-0.001482,-0.000499,-0.000365,-0.001117,0.000317,-0.00109,-0.000652,0.00113,-0.000472
4,8.6e-05,-6.7e-05,4.8e-05,0.000432,0.00036,0.000137,1.4e-05,0.000715,0.000148,-0.00081,...,-8.6e-05,-0.00073,-0.000119,-0.000201,-0.000676,0.000413,-0.000401,-0.000228,0.00061,-0.000222


In [77]:
cv_result_vect_lr = cross_val_score(LogisticRegression(),data_text_vect, train_label, scoring='accuracy', cv=5)

In [78]:
cv_result_vect_lr.mean()

0.6369999999999999

Балансировка классов будстрапом

In [81]:
from imblearn.over_sampling import RandomOverSampler
X = train_data.to_numpy().reshape(-1,1)
X_b, y_b = RandomOverSampler().fit_resample(X, train_label)
print(np.sum(y_b==1))
print(np.sum(y_b==0))

1274
1274


In [82]:
cv_result_balance = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          X_b.reshape(1,X_b.shape[0])[0], y_b, scoring='accuracy', cv=5)

cv_result_balance.mean()

0.8724742863746677

Лучший вариант с качеством выше 0.8. Обучим pipeline на всех данных и посмотрим на качество тестовых данных


In [83]:
model_cls = cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                   TfidfTransformer(),
                                   LinearSVC())
model_cls.fit(X_b.reshape(1,X_b.shape[0])[0], y_b)

Pipeline(steps=[('vectorizer', CountVectorizer(ngram_range=(1, 3))),
                ('transformer', TfidfTransformer()),
                ('classifier', LinearSVC())])

In [84]:
print(f"Mean accuracy: {metrics.accuracy_score(y_b, model_cls.predict(X_b.reshape(1,X_b.shape[0])[0]))}")

Mean accuracy: 1.0


Прогноз на тестовой выборке




In [85]:
predicted = model_cls.predict(data_test['text'])
data_id['y'] = predicted
data_id.head()

Unnamed: 0_level_0,y
Id,Unnamed: 1_level_1
0,1
1,0
2,1
3,1
4,0


In [86]:
data_id.to_csv('sample_sabmission.csv')

Сохранение модели

In [88]:
vectorizer = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
                      ('trans', TfidfTransformer())])

In [None]:
vectorizer