In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.neural_network import MLPClassifier
import gensim
import numpy as np

Выгружаем данные

In [2]:
data_train = pd.read_csv('products_sentiment_train.tsv', sep='\t', header=None, names=['text', 'Id'])
data_test = pd.read_csv('products_sentiment_test.tsv', sep='\t', index_col='Id')
data_id = pd.read_csv('products_sentiment_sample_submission.csv', sep=',', index_col='Id')


In [3]:
data_train.groupby(by='Id')['text'].count()

Id
0     726
1    1274
Name: text, dtype: int64

In [4]:
def cls_pipeline(vect, cls):
  return Pipeline([
                   ('vectorizer', vect),
                   ('classifier', cls)])

In [5]:
train_data = data_train['text']
train_label = data_train['Id']

Baseline признаки на частотах слов и логистическая регрессия

In [6]:
cv_result_bl = cross_val_score(cls_pipeline(CountVectorizer(), LogisticRegression(class_weight='balanced')),train_data, train_label, scoring='accuracy', cv=5)

In [7]:
print(f"Mean accuracy baseline: {cv_result_bl.mean()}")

Mean accuracy baseline: 0.7645000000000001


Удалим стоп слова 

In [8]:
cv_result_sw = cross_val_score(cls_pipeline(CountVectorizer(stop_words='english'), 
                                            LogisticRegression()),
                                            train_data, train_label, scoring='accuracy', cv=5)

In [9]:
print(f"Mean accuracy without stop words: {cv_result_sw.mean()}")

Mean accuracy without stop words: 0.748


Качество не изменилось

Используем tfidf

In [10]:
cv_result_tfidf = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                              LogisticRegression()),
                                              train_data, train_label, scoring='accuracy', cv=5)

In [11]:
print(f"Mean accuracy tfidf :{cv_result_tfidf.mean()}")

Mean accuracy tfidf :0.766


Попробуем использовать LinearSVC, SGDClassifier 

In [12]:
linear_model_list = [SGDClassifier(), LinearSVC()]

In [13]:
cv_other_linear_models = []
for model in linear_model_list:
  cv_result = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                          model),
                                          train_data, train_label, scoring='accuracy', cv=5)
  cv_other_linear_models.append(cv_result.mean())

In [14]:
print(f"Mean accuracy SGDClassifier: {cv_other_linear_models[0]}")
print(f"Mean accuracy LinearSVC: {cv_other_linear_models[1]}")

Mean accuracy SGDClassifier: 0.758
Mean accuracy LinearSVC: 0.7689999999999999


Решающие деревья

In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [16]:
cv_forests = []
for forest in [RandomForestClassifier(), GradientBoostingClassifier()]:
  cv_result = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                          forest),
                                          train_data, train_label, scoring='accuracy', cv=5)
  cv_forests.append(cv_result.mean())


In [17]:
print(f"Mean accuracy RFC: {cv_forests[0]}")
print(f"Mean accuracy XGB: {cv_forests[1]}")

Mean accuracy RFC: 0.7304999999999999
Mean accuracy XGB: 0.725


Как видно все модели по умолчанию выдают примерно одинаковое качесвто около 0.75
Возьмем одну из них, например, логистическую регрессиб и поработаем с признаковым пространством.

Ипользуем n_gramm

In [18]:
cv_result_ngram = cross_val_score(cls_pipeline(CountVectorizer(ngram_range=(1,3)), 
                                            LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [19]:
print(f"mean accuracy with ngram for 1 to 3: {cv_result_ngram.mean()}")

mean accuracy with ngram for 1 to 3: 0.7645


Буквенные n_gramm

In [20]:
cv_result_ngramchar = cross_val_score(cls_pipeline(CountVectorizer(ngram_range=(3,6), analyzer='char'), 
                                            LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [21]:
print(f"mean accuracy with ngram char for 1 to 3: {cv_result_ngramchar.mean()}")

mean accuracy with ngram char for 1 to 3: 0.771


Дополнительные преобразования после векторизации


In [22]:
def cls_pipeline_transform(vect, trans, cls):
  return Pipeline([
                   ('vectorizer', vect),
                   ('transformer', trans),
                   ('classifier', cls)
                  ])

In [23]:
vect_data = CountVectorizer().fit_transform(train_data)
vect_data.shape

(2000, 3973)

Всего получается около 4000 признаков попробуем провести понижение размерности признакового пространства.

In [24]:
cv_result_trans = cross_val_score(cls_pipeline_transform(CountVectorizer(),
                                                 TruncatedSVD(n_components=1000),
                                                 LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [25]:
cv_result_trans.mean()

0.769

Попробуем построение частотных n-грамм с tfidf преобразованием и LinearSVC

In [26]:
cv_result_countt = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          train_data, train_label, scoring='accuracy', cv=5)

In [27]:
cv_result_countt.mean()

0.79

Получилось наилучшее качество из всех рассмотренных вариантов

Далее будем использовать этот pipeline кроме отдельных случаев


Попробуем трюк с добавление частицы не в начало слова, след. функкция реализует это.

In [28]:
def add_neg(review):
  neg_chars = {"dont", "nt", "n't", "doesnt", "does'nt", "'t", "not", "no"}
  words = review.split(' ')
  new_review = []
  i = 0
  while i < len(words):
    if words[i] in neg_chars:
      new_review.append(words[i] + "_" + words[i+1] )
      i+= 2
    else:
      new_review.append(words[i])
      i+= 1
  return ' '.join(new_review)


In [29]:
train_neg = train_data.apply(add_neg)

In [30]:
cv_result_addneg = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          train_neg, train_label, scoring='accuracy', cv=5)

In [31]:
print(cv_result_addneg.mean())

0.7825


Представим текст в виде векторов word2vec


In [32]:
all_stopwords = gensim.parsing.preprocessing.STOPWORDS # стоп слова 

In [33]:
list_words = train_data.apply(lambda x: [word for word in x.split() if word not in all_stopwords]) #токенизация предложений и фильтрация по стоп словам
list_words

0                    [2, ., 10,000, 640x480, pictures, .]
1       [downloaded, trial, version, associates, ez, f...
2       [wrt54g, plus, hga7t, perfect, solution, need,...
3       [dont, especially, like, music, files, unstruc...
4       [cheapie, pail, ..., worked, ok, opening, devi...
                              ...                        
1995    [speaker, phone, quality, good, ,, poping, 512...
1996                        [", movies, ", 5, seconds, .]
1997                                   [overall, like, .]
1998    [began, taking, pics, soon, got, camera, amaze...
1999      [reading, instructions, ,, 's, hard, figure, .]
Name: text, Length: 2000, dtype: object

In [34]:
# построение и обучение модели векторизации слов
model_w2v = gensim.models.Word2Vec(size=1000, min_count=10)
model_w2v.build_vocab(list_words)
model_w2v.train(list_words,total_examples=model_w2v.corpus_count, epochs=1)

(7814, 20540)

In [35]:
# функция которая переводит весь текст в один вектор по векторам слов усредняя их
def creat_vect_text(model, words_list):
  all_vect = np.array([model.wv[word] for word in words_list if word in model.wv])
  return all_vect.mean(axis=0)

In [36]:
train_vect_data = [creat_vect_text(model_w2v, text) for text in list_words]
data_text_vect = pd.DataFrame(train_vect_data)
data_text_vect.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999
0,0.000391,0.00011,-0.001264,0.000268,-0.0016,0.001407,-0.001316,0.000354,0.00115,3e-05,-0.000219,3.4e-05,-0.000422,-0.000118,-0.000122,0.001001,-0.000471,-0.000318,0.000874,3.7e-05,-0.000478,0.00041,0.001212,-4.5e-05,0.001248,0.000828,-7.8e-05,6e-06,8.473467e-07,0.001938,0.002119,-0.001585,0.000483,1.8e-05,0.000124,0.001527,0.000712,-0.000475,-0.001329,7.1e-05,...,-0.001136,-0.000208,0.00025,-0.000264,-0.000143,1e-05,0.000397,0.000657,-0.000401,0.000479,0.000821,0.001037,-0.00079,-0.00028,0.000373,0.000167,-0.0007,0.000962,-0.000791,0.000211,-0.000404,0.000798,-0.000737,0.000958,-0.000952,0.000849,-0.000924,-0.0001627725,0.00062,9e-05,-7.5e-05,-0.001668,0.001115,0.00052,0.000794,0.000625,-2.1e-05,0.000665,-0.001099,-0.00027
1,0.000367,0.00019,-0.000832,2e-05,-0.000748,0.000763,-0.000625,0.000356,0.000549,-8.5e-05,-0.000202,-8.3e-05,-0.000228,-5.2e-05,-0.000167,0.000595,-5.8e-05,-7.3e-05,0.00058,0.000216,-0.00051,0.000217,0.000678,-0.00016,0.000598,0.000575,0.000166,-0.000312,0.0002544051,0.001178,0.001081,-0.00081,8.5e-05,-0.000105,3e-05,0.000807,0.000241,-0.000347,-0.000488,2.2e-05,...,-0.000568,-0.000167,0.000366,-6.7e-05,-0.000324,-0.000141,0.000184,0.000388,-0.000198,0.000213,0.000526,0.000615,-0.000544,-0.000124,8.1e-05,0.000141,-0.000495,0.000611,-0.000752,0.000124,-0.000109,0.000266,-0.000263,0.000597,-0.000476,0.000692,-0.000594,-0.000257886,0.000165,0.000306,0.000291,-0.00106,0.00069,0.000402,0.000603,0.000555,-5.9e-05,0.000467,-0.000725,-0.0003
2,0.000186,-6.7e-05,-0.000666,9e-05,-0.000775,0.000595,-0.000707,9.9e-05,0.000477,-1.5e-05,2.8e-05,3.9e-05,-0.000181,-0.000188,-0.000257,0.000613,-0.000269,-8.2e-05,0.000207,4.6e-05,-0.000208,0.00022,0.000522,-8.5e-05,0.000539,0.000554,-9e-06,-0.000132,9.989855e-05,0.000925,0.001107,-0.000704,0.00014,-0.000131,-7.7e-05,0.000809,0.000173,-6.9e-05,-0.000494,0.00012,...,-0.000559,-1.8e-05,0.000207,-8.1e-05,-0.000237,0.000107,6.4e-05,0.000339,-0.000444,0.000136,0.00038,0.000359,-0.000547,-0.000169,-0.000105,6.5e-05,-0.000227,0.000464,-0.000414,0.000137,-0.000449,0.000209,-0.00016,0.000595,-0.00061,0.00028,-0.000248,3.672254e-05,0.000303,0.000282,0.00015,-0.000712,0.000495,0.000227,0.000246,0.000246,-1.8e-05,0.00025,-0.000548,-0.000202
3,0.000399,0.000112,-0.001106,0.000239,-0.001239,0.001154,-0.000742,0.000289,0.001197,0.000204,-0.000164,0.000103,-0.000277,6.8e-05,-0.000101,0.000878,-0.000154,-0.000169,0.000642,0.000249,-0.000429,0.000628,0.000897,7e-06,0.00109,0.000836,-0.000118,3.7e-05,0.0001134719,0.001561,0.001941,-0.001316,0.000386,-0.00031,3e-06,0.001255,0.000184,-0.000271,-0.000687,0.000235,...,-0.001059,-0.00025,-5.2e-05,0.0001,-0.000193,-0.0001,0.000364,0.000264,-0.000323,0.000181,0.000569,0.00073,-0.001092,-0.000539,0.000219,0.000123,-0.000769,0.000557,-0.000913,1.8e-05,-0.00045,0.000461,-0.000519,0.000827,-0.00076,0.00038,-0.000922,-0.0003744143,0.000568,0.00025,0.000224,-0.001494,0.00105,0.000638,0.000452,0.000449,-7.2e-05,0.000499,-0.000846,-0.000108
4,0.000219,-4.6e-05,-0.000538,0.000136,-0.000641,0.000543,-0.000653,0.00018,0.000443,0.000151,5.4e-05,5.4e-05,-0.000219,0.000257,3.5e-05,0.000468,-0.000155,-0.000132,0.000279,0.000242,-0.000235,0.000268,0.000604,8.7e-05,0.000611,0.000551,-0.000118,-0.000102,-9.673397e-05,0.000691,0.000966,-0.000664,0.00028,9.1e-05,2e-05,0.000775,0.000183,-0.000441,-0.000463,1e-05,...,-0.000514,-0.000213,8.3e-05,-5.7e-05,-0.000287,-3.9e-05,0.000327,0.000216,-0.000247,2.7e-05,0.000295,0.000404,-0.000482,-0.000242,0.000104,5.5e-05,-0.000419,0.000585,-0.000529,6.4e-05,-0.000214,0.000112,-0.000382,0.000324,-0.000398,0.000368,-0.000549,-6.035989e-07,0.000294,0.000298,0.000187,-0.000721,0.000539,0.000445,0.000339,0.000361,-7.5e-05,0.000221,-0.000541,-1e-05


In [37]:
cv_result_vect_lr = cross_val_score(LogisticRegression(),data_text_vect, train_label, scoring='accuracy', cv=5)

In [38]:
cv_result_vect_lr.mean()

0.6369999999999999

Балансировка классов будстрапом

In [39]:
from imblearn.over_sampling import RandomOverSampler
X = train_data.to_numpy().reshape(-1,1)
X_b, y_b = RandomOverSampler().fit_sample(X, train_label)
print(np.sum(y_b==1))
print(np.sum(y_b==0))

1274
1274




In [40]:
cv_result_balance = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          X_b.reshape(1,X_b.shape[0])[0], y_b, scoring='accuracy', cv=5)

cv_result_balance.mean()

0.8791417234870373

Лучший вариант с качеством выше 0.8. Обучим pipeline на всех данных и посмотрим на качество тестовых данных


In [41]:
model_cls = cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                   TfidfTransformer(),
                                   LinearSVC())
model_cls.fit(X_b.reshape(1,X_b.shape[0])[0], y_b)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('transformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
  

In [42]:
print(f"Mean accuracy: {metrics.accuracy_score(y_b, model_cls.predict(X_b.reshape(1,X_b.shape[0])[0]))}")

Mean accuracy: 1.0


Прогноз на тестовой выборке




In [43]:
predicted = model_cls.predict(data_test['text'])
data_id['y'] = predicted
data_id.head()

Unnamed: 0_level_0,y
Id,Unnamed: 1_level_1
0,1
1,0
2,1
3,1
4,0


In [44]:
data_id.to_csv('sample_sabmission.csv')