In [89]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.neural_network import MLPClassifier
import gensim
import numpy as np

Выгружаем данные

In [2]:
data_train = pd.read_csv('products_sentiment_train.tsv', sep='\t', header=None, names=['text', 'Id'])
data_test = pd.read_csv('products_sentiment_test.tsv', sep='\t', index_col='Id')
data_id = pd.read_csv('products_sentiment_sample_submission.csv', sep=',', index_col='Id')


In [3]:
data_train.groupby(by='Id')['text'].count()

Id
0     726
1    1274
Name: text, dtype: int64

In [4]:
def cls_pipeline(vect, cls):
  return Pipeline([
                   ('vectorizer', vect),
                   ('classifier', cls)])

In [5]:
train_data = data_train['text']
train_label = data_train['Id']

Baseline признаки на частотах слов и логистическая регрессия

In [6]:
cv_result_bl = cross_val_score(cls_pipeline(CountVectorizer(), LogisticRegression(class_weight='balanced')),train_data, train_label, scoring='accuracy', cv=5)

In [7]:
print(f"Mean accuracy baseline: {cv_result_bl.mean()}")

Mean accuracy baseline: 0.7645000000000001


Удалим стоп слова 

In [8]:
cv_result_sw = cross_val_score(cls_pipeline(CountVectorizer(stop_words='english'), 
                                            LogisticRegression()),
                                            train_data, train_label, scoring='accuracy', cv=5)

In [9]:
print(f"Mean accuracy without stop words: {cv_result_sw.mean()}")

Mean accuracy without stop words: 0.748


Качество не изменилось

Используем tfidf

In [10]:
cv_result_tfidf = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                              LogisticRegression()),
                                              train_data, train_label, scoring='accuracy', cv=5)

In [11]:
print(f"Mean accuracy tfidf :{cv_result_tfidf.mean()}")

Mean accuracy tfidf :0.766


Попробуем использовать LinearSVC, SGDClassifier 

In [12]:
linear_model_list = [SGDClassifier(), LinearSVC()]

In [13]:
cv_other_linear_models = []
for model in linear_model_list:
  cv_result = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                          model),
                                          train_data, train_label, scoring='accuracy', cv=5)
  cv_other_linear_models.append(cv_result.mean())

In [14]:
print(f"Mean accuracy SGDClassifier: {cv_other_linear_models[0]}")
print(f"Mean accuracy LinearSVC: {cv_other_linear_models[1]}")

Mean accuracy SGDClassifier: 0.7555000000000001
Mean accuracy LinearSVC: 0.7689999999999999


Решающие деревья

In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [16]:
cv_forests = []
for forest in [RandomForestClassifier(), GradientBoostingClassifier()]:
  cv_result = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                          forest),
                                          train_data, train_label, scoring='accuracy', cv=5)
  cv_forests.append(cv_result.mean())


In [17]:
print(f"Mean accuracy RFC: {cv_forests[0]}")
print(f"Mean accuracy XGB: {cv_forests[1]}")

Mean accuracy RFC: 0.7335
Mean accuracy XGB: 0.7275


Как видно все модели по умолчанию выдают примерно одинаковое качесвто около 0.75
Возьмем одну из них, например, логистическую регрессиб и поработаем с признаковым пространством.

Ипользуем n_gramm

In [18]:
cv_result_ngram = cross_val_score(cls_pipeline(CountVectorizer(ngram_range=(1,3)), 
                                            LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [19]:
print(f"mean accuracy with ngram for 1 to 3: {cv_result_ngram.mean()}")

mean accuracy with ngram for 1 to 3: 0.7645


Буквенные n_gramm

In [20]:
cv_result_ngramchar = cross_val_score(cls_pipeline(CountVectorizer(ngram_range=(3,6), analyzer='char'), 
                                            LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [21]:
print(f"mean accuracy with ngram char for 1 to 3: {cv_result_ngramchar.mean()}")

mean accuracy with ngram char for 1 to 3: 0.771


Дополнительные преобразования после векторизации


In [22]:
def cls_pipeline_transform(vect, trans, cls):
  return Pipeline([
                   ('vectorizer', vect),
                   ('transformer', trans),
                   ('classifier', cls)
                  ])

In [23]:
vect_data = CountVectorizer().fit_transform(train_data)
vect_data.shape

(2000, 3973)

Всего получается около 4000 признаков попробуем провести понижение размерности признакового пространства.

In [45]:
cv_result_trans = cross_val_score(cls_pipeline_transform(CountVectorizer(),
                                                 TruncatedSVD(n_components=1000),
                                                 LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [46]:
cv_result_trans.mean()

0.7670000000000001

Попробуем построение частотных n-грамм с tfidf преобразованием и LinearSVC

In [66]:
cv_result_countt = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          train_data, train_label, scoring='accuracy', cv=5)

In [67]:
cv_result_countt.mean()

0.79

Получилось наилучшее качество из всех рассмотренных вариантов

Далее будем использовать этот pipeline кроме отдельных случаев


Попробуем трюк с добавление частицы не в начало слова, след. функкция реализует это.

In [28]:
def add_neg(review):
  neg_chars = {"dont", "nt", "n't", "doesnt", "does'nt", "'t", "not", "no"}
  words = review.split(' ')
  new_review = []
  i = 0
  while i < len(words):
    if words[i] in neg_chars:
      new_review.append(words[i] + "_" + words[i+1] )
      i+= 2
    else:
      new_review.append(words[i])
      i+= 1
  return ' '.join(new_review)


In [29]:
train_neg = train_data.apply(add_neg)

In [68]:
cv_result_addneg = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          train_neg, train_label, scoring='accuracy', cv=5)

In [69]:
print(cv_result_addneg.mean())

0.7825


Представим текст в виде векторов word2vec


In [32]:
all_stopwords = gensim.parsing.preprocessing.STOPWORDS # стоп слова 

In [33]:
list_words = train_data.apply(lambda x: [word for word in x.split() if word not in all_stopwords]) #токенизация предложений и фильтрация по стоп словам
list_words

0                    [2, ., 10,000, 640x480, pictures, .]
1       [downloaded, trial, version, associates, ez, f...
2       [wrt54g, plus, hga7t, perfect, solution, need,...
3       [dont, especially, like, music, files, unstruc...
4       [cheapie, pail, ..., worked, ok, opening, devi...
                              ...                        
1995    [speaker, phone, quality, good, ,, poping, 512...
1996                        [", movies, ", 5, seconds, .]
1997                                   [overall, like, .]
1998    [began, taking, pics, soon, got, camera, amaze...
1999      [reading, instructions, ,, 's, hard, figure, .]
Name: text, Length: 2000, dtype: object

In [34]:
# построение и обучение модели векторизации слов
model_w2v = gensim.models.Word2Vec(size=1000, min_count=10)
model_w2v.build_vocab(list_words)
model_w2v.train(list_words,total_examples=model_w2v.corpus_count, epochs=1)

(7814, 20540)

In [35]:
# функция которая переводит весь текст в один вектор по векторам слов усредняя их
def creat_vect_text(model, words_list):
  all_vect = np.array([model.wv[word] for word in words_list if word in model.wv])
  return all_vect.mean(axis=0)

In [36]:
train_vect_data = [creat_vect_text(model_w2v, text) for text in list_words]
data_text_vect = pd.DataFrame(train_vect_data)
data_text_vect.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999
0,0.000143,-0.000556,-0.000825,-0.001271,0.000543,0.000903,0.000597,0.00171,-0.000749,0.000313,0.001023,0.000488,-0.000802,-0.000451,-0.000274,0.000339,-7.2e-05,-0.001603,-0.000403,-0.000285,0.000512,0.000136,0.001031,-0.002059,0.000345,-0.000119,0.00043,0.001456,-0.000832,0.000407,0.001535,-0.001456,-0.001026,0.000367,-0.000161,-0.00027,-0.001063,-0.000693,0.001183,-0.000808,...,-0.000228,-0.000433,-0.000919,-0.001015,0.001263,-0.00153,-0.001866,0.000812,-0.000949,-0.000261,0.000859,0.00013,0.000629,-0.000622,0.00012,0.001433,-0.00151,-0.000643,-0.001335,0.001903,-0.001689,-0.000137,0.000247,0.000756,-0.001151,0.000633,-0.000951,-0.001266,-0.000386,-0.001024,0.00034,0.001061,0.000601,-0.001858,0.00044,0.001541,0.00041,-0.001011,-0.000126,-0.000147
1,0.000122,-0.000319,-0.000234,-0.000619,0.000118,0.000401,0.000477,0.001094,-0.000398,0.000205,0.000418,0.000208,-0.000436,-6.7e-05,-0.000385,4.1e-05,-1.2e-05,-0.000854,-0.000113,-0.000152,0.000376,4.4e-05,0.000598,-0.001046,0.000242,3.4e-05,0.000185,0.000713,-0.000385,0.000467,0.000837,-0.000994,-0.000808,1.3e-05,-0.000185,-0.000333,-0.000723,-0.00028,0.000837,-0.000493,...,-0.000266,-0.000231,-0.000679,-0.000869,0.000536,-0.000546,-0.000882,0.000696,-0.000434,-0.000152,0.000385,3.1e-05,0.000361,-0.000318,0.000182,0.000918,-0.000648,-0.000336,-0.000666,0.000937,-0.000942,-0.000164,7e-06,0.000443,-0.000645,0.000574,-0.00069,-0.000713,-0.000188,-0.000475,0.000192,0.000789,0.000262,-0.001095,0.000262,0.000735,9e-05,-0.000653,-0.00012,-2e-05
2,0.000192,-0.000339,-0.00018,-0.000739,0.00013,0.00057,5.1e-05,0.000781,-0.000271,-2e-06,0.000553,0.00022,-0.00032,-0.000219,-0.000363,0.000115,-0.000168,-0.000786,-0.000326,-0.00024,0.000235,-3.4e-05,0.000282,-0.000807,-5.4e-05,-7.1e-05,0.000254,0.00059,-0.000603,0.00045,0.000815,-0.000712,-0.000686,0.000147,5.7e-05,-0.000158,-0.000481,-0.000108,0.000687,-0.000533,...,-0.000323,-0.000276,-0.000464,-0.000496,0.00056,-0.000744,-0.000967,0.000385,-0.000308,-0.000136,0.000165,4.1e-05,0.000299,-0.000305,0.000102,0.000668,-0.000839,-0.000312,-0.0006,0.000909,-0.000804,-0.000173,0.000219,0.000601,-0.000347,0.000446,-0.000472,-0.000589,-0.000424,-0.00045,0.000369,0.000394,0.00016,-0.000946,0.000152,0.000865,0.000222,-0.00042,-6.4e-05,-2e-05
3,0.000352,-0.000756,-0.000471,-0.001123,0.000366,0.000758,0.000549,0.001469,-0.000527,0.00032,0.000596,9.2e-05,-0.000465,-0.000444,-0.000228,-0.000124,-0.000106,-0.00127,-0.000399,-0.000378,0.000186,3e-05,0.000566,-0.001609,0.000189,-0.000142,0.000281,0.001182,-0.001013,0.000453,0.001295,-0.001445,-0.000944,-4.3e-05,0.00015,-3.9e-05,-0.00103,-0.000338,0.000975,-0.000865,...,-0.000425,-0.000602,-0.00083,-0.000917,0.000882,-0.001116,-0.001386,0.000539,-0.000539,-7.3e-05,0.000495,0.000117,0.000311,-0.00043,0.000387,0.001559,-0.001047,-0.000323,-0.000979,0.001599,-0.001427,-0.000364,8.4e-05,0.000887,-0.000758,0.000463,-0.000487,-0.001065,-0.000598,-0.000755,0.000417,0.00093,0.000347,-0.001714,0.000417,0.001526,0.000329,-0.001004,-0.000124,0.000205
4,0.00016,-0.000145,-0.000333,-0.00052,0.000244,0.000302,0.000188,0.000753,-0.000247,0.000144,0.000508,0.000102,-0.000512,-0.000133,3e-05,0.000147,-0.000118,-0.000698,-0.000231,-0.000173,-3e-06,-1.7e-05,0.000245,-0.000814,0.000187,-6.6e-05,0.000257,0.000741,-0.000559,0.000501,0.000421,-0.000839,-0.000478,0.000112,-1.1e-05,8.4e-05,-0.000518,-5.5e-05,0.000423,-0.000529,...,-0.000176,-0.000236,-0.000342,-0.000374,0.000467,-0.000671,-0.000789,0.000351,-0.000289,-0.000124,0.000382,0.000177,0.00027,-0.000352,0.000137,0.000594,-0.000618,-0.000392,-0.00061,0.000837,-0.000814,-0.000233,0.000223,0.000512,-0.000517,0.000328,-0.000445,-0.000517,-0.000119,-0.000556,0.000214,0.00044,0.000109,-0.000925,0.000145,0.000736,1.1e-05,-0.000438,-0.000164,-2.1e-05


In [64]:
cv_result_vect_lr = cross_val_score(LogisticRegression(),data_text_vect, train_label, scoring='accuracy', cv=5)

In [65]:
cv_result_vect_lr.mean()

0.6369999999999999

Балансировка классов будстрапом

In [95]:
from imblearn.over_sampling import RandomOverSampler
X = train_data.to_numpy().reshape(-1,1)
X_b, y_b = RandomOverSampler().fit_sample(X, train_label)
print(np.sum(y_b==1))
print(np.sum(y_b==0))

1274
1274




In [87]:
cv_result_balance = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          X_b.reshape(1,X_b.shape[0])[0], y_b, scoring='accuracy', cv=5)

cv_result_balance.mean()

0.8771817096190146

Лучший вариант с качеством выше 0.8. Обучим pipeline на всех данных и посмотрим на качество тестовых данных


In [97]:
model_cls = cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                   TfidfTransformer(),
                                   LinearSVC())
model_cls.fit(X_b.reshape(1,X_b.shape[0])[0], y_b)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('transformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
  

In [99]:
print(f"Mean accuracy: {metrics.accuracy_score(y_b, model_cls.predict(X_b.reshape(1,X_b.shape[0])[0]))}")

mean accuracy: 1.0


In [100]:
data_test

Unnamed: 0_level_0,text
Id,Unnamed: 1_level_1
0,"so , why the small digital elph , rather than ..."
1,3/4 way through the first disk we played on it...
2,better for the zen micro is outlook compatibil...
3,6 . play gameboy color games on it with goboy .
4,"likewise , i 've heard norton 2004 professiona..."
...,...
495,i took perfect care of this player and still i...
496,it 's a very intuitive program .
497,the only drawback is the viewfinder is slightl...
498,"it films 10 second video , for crying out loud ."


In [101]:
data_id

Unnamed: 0_level_0,y
Id,Unnamed: 1_level_1
0,0
1,1
2,0
3,1
4,0
...,...
495,1
496,0
497,1
498,0
