In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.neural_network import MLPClassifier
import gensim
import numpy as np
import pickle
!pip install imblearn




Выгружаем данные

In [2]:
data_train = pd.read_csv('products_sentiment_train.tsv', sep='\t', header=None, names=['text', 'Id'])
data_test = pd.read_csv('products_sentiment_test.tsv', sep='\t', index_col='Id')
data_id = pd.read_csv('products_sentiment_sample_submission.csv', sep=',', index_col='Id')


In [3]:
data_train.groupby(by='Id')['text'].count()

Id
0     726
1    1274
Name: text, dtype: int64

In [4]:
def cls_pipeline(vect, cls):
    return Pipeline([
                   ('vectorizer', vect),
                   ('classifier', cls)])

In [3]:
train_data = data_train['text']
train_label = data_train['Id']

Baseline признаки на частотах слов и логистическая регрессия

In [6]:
cv_result_bl = cross_val_score(cls_pipeline(CountVectorizer(), LogisticRegression(class_weight='balanced')),train_data, train_label, scoring='accuracy', cv=5)

In [7]:
print(f"Mean accuracy baseline: {cv_result_bl.mean()}")

Mean accuracy baseline: 0.7645000000000001


Удалим стоп слова 

In [8]:
cv_result_sw = cross_val_score(cls_pipeline(CountVectorizer(stop_words='english'), 
                                            LogisticRegression()),
                                            train_data, train_label, scoring='accuracy', cv=5)

In [9]:
print(f"Mean accuracy without stop words: {cv_result_sw.mean()}")

Mean accuracy without stop words: 0.748


Качество не изменилось

Используем tfidf

In [10]:
cv_result_tfidf = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                              LogisticRegression()),
                                              train_data, train_label, scoring='accuracy', cv=5)

In [11]:
print(f"Mean accuracy tfidf :{cv_result_tfidf.mean()}")

Mean accuracy tfidf :0.766


Попробуем использовать LinearSVC, SGDClassifier 

In [12]:
linear_model_list = [SGDClassifier(), LinearSVC()]

In [13]:
cv_other_linear_models = []
for model in linear_model_list:
  cv_result = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                          model),
                                          train_data, train_label, scoring='accuracy', cv=5)
  cv_other_linear_models.append(cv_result.mean())

In [14]:
print(f"Mean accuracy SGDClassifier: {cv_other_linear_models[0]}")
print(f"Mean accuracy LinearSVC: {cv_other_linear_models[1]}")

Mean accuracy SGDClassifier: 0.7550000000000001
Mean accuracy LinearSVC: 0.7689999999999999


Решающие деревья

In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [16]:
cv_forests = []
for forest in [RandomForestClassifier(), GradientBoostingClassifier()]:
  cv_result = cross_val_score(cls_pipeline(TfidfVectorizer(), 
                                          forest),
                                          train_data, train_label, scoring='accuracy', cv=5)
  cv_forests.append(cv_result.mean())


In [17]:
print(f"Mean accuracy RFC: {cv_forests[0]}")
print(f"Mean accuracy XGB: {cv_forests[1]}")

Mean accuracy RFC: 0.734
Mean accuracy XGB: 0.7300000000000001


Как видно все модели по умолчанию выдают примерно одинаковое качесвто около 0.75
Возьмем одну из них, например, логистическую регрессиб и поработаем с признаковым пространством.

Ипользуем n_gramm

In [18]:
cv_result_ngram = cross_val_score(cls_pipeline(CountVectorizer(ngram_range=(1,3)), 
                                            LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [19]:
print(f"mean accuracy with ngram for 1 to 3: {cv_result_ngram.mean()}")

mean accuracy with ngram for 1 to 3: 0.7645


Буквенные n_gramm

In [20]:
cv_result_ngramchar = cross_val_score(cls_pipeline(CountVectorizer(ngram_range=(3,6), analyzer='char'), 
                                            LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [21]:
print(f"mean accuracy with ngram char for 1 to 3: {cv_result_ngramchar.mean()}")

mean accuracy with ngram char for 1 to 3: 0.771


Дополнительные преобразования после векторизации


In [7]:
def cls_pipeline_transform(vect, trans, cls):
  return Pipeline([
                   ('vectorizer', vect),
                   ('transformer', trans),
                   ('classifier', cls)
                  ])

In [23]:
vect_data = CountVectorizer().fit_transform(train_data)
vect_data.shape

(2000, 3973)

Всего получается около 4000 признаков попробуем провести понижение размерности признакового пространства.

In [24]:
cv_result_trans = cross_val_score(cls_pipeline_transform(CountVectorizer(),
                                                 TruncatedSVD(n_components=1000),
                                                 LogisticRegression()),
                               train_data, train_label, scoring='accuracy', cv=5)

In [25]:
cv_result_trans.mean()

0.7665

Попробуем построение частотных n-грамм с tfidf преобразованием и LinearSVC

In [26]:
cv_result_countt = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          train_data, train_label, scoring='accuracy', cv=5)

In [27]:
cv_result_countt.mean()

0.79

Получилось наилучшее качество из всех рассмотренных вариантов

Далее будем использовать этот pipeline кроме отдельных случаев


Попробуем трюк с добавление частицы не в начало слова, след. функкция реализует это.

In [28]:
def add_neg(review):
  neg_chars = {"dont", "nt", "n't", "doesnt", "does'nt", "'t", "not", "no"}
  words = review.split(' ')
  new_review = []
  i = 0
  while i < len(words):
    if words[i] in neg_chars:
      new_review.append(words[i] + "_" + words[i+1] )
      i+= 2
    else:
      new_review.append(words[i])
      i+= 1
  return ' '.join(new_review)


In [29]:
train_neg = train_data.apply(add_neg)

In [30]:
cv_result_addneg = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          train_neg, train_label, scoring='accuracy', cv=5)

In [31]:
print(cv_result_addneg.mean())

0.7825


Представим текст в виде векторов word2vec


In [32]:
all_stopwords = gensim.parsing.preprocessing.STOPWORDS # стоп слова 

In [33]:
list_words = train_data.apply(lambda x: [word for word in x.split() if word not in all_stopwords]) #токенизация предложений и фильтрация по стоп словам
list_words

0                    [2, ., 10,000, 640x480, pictures, .]
1       [downloaded, trial, version, associates, ez, f...
2       [wrt54g, plus, hga7t, perfect, solution, need,...
3       [dont, especially, like, music, files, unstruc...
4       [cheapie, pail, ..., worked, ok, opening, devi...
                              ...                        
1995    [speaker, phone, quality, good, ,, poping, 512...
1996                        [", movies, ", 5, seconds, .]
1997                                   [overall, like, .]
1998    [began, taking, pics, soon, got, camera, amaze...
1999      [reading, instructions, ,, 's, hard, figure, .]
Name: text, Length: 2000, dtype: object

In [34]:
# построение и обучение модели векторизации слов
model_w2v = gensim.models.Word2Vec(size=1000, min_count=10)
model_w2v.build_vocab(list_words)
model_w2v.train(list_words,total_examples=model_w2v.corpus_count, epochs=1)

(7814, 20540)

In [35]:
# функция которая переводит весь текст в один вектор по векторам слов усредняя их
def creat_vect_text(model, words_list):
  all_vect = np.array([model.wv[word] for word in words_list if word in model.wv])
  return all_vect.mean(axis=0)

In [36]:
train_vect_data = [creat_vect_text(model_w2v, text) for text in list_words]
data_text_vect = pd.DataFrame(train_vect_data)
data_text_vect.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,-0.000134,0.000442,-0.001102,-0.000732,-0.000536,-0.00137,0.000627,-0.001292,-0.000213,0.000704,...,-0.000103,0.001355,-0.000457,-0.000328,-0.00026,0.000199,-0.000713,-0.00043,-0.001315,4.3e-05
1,1e-05,0.000281,-0.000536,-0.00042,-0.000404,-0.000602,0.000495,-0.000542,-8.1e-05,0.00023,...,-7e-06,0.000682,-0.000361,4.2e-05,-0.000279,8.3e-05,-0.000382,-0.000305,-0.000483,1.3e-05
2,-0.000203,0.000464,-0.000541,-0.000588,-0.000259,-0.000636,0.000168,-0.000608,-3.9e-05,0.000461,...,-0.000113,0.000701,-0.000149,1.7e-05,-0.000177,0.000132,-0.000343,-0.00036,-0.000506,0.00011
3,0.000206,0.000583,-0.000734,-0.000745,-0.000608,-0.000939,0.00077,-0.001223,-0.00013,0.000601,...,-0.000289,0.000894,-0.000271,8.6e-05,-0.000523,0.000245,-0.000573,-0.00056,-0.001123,8e-06
4,-1e-05,0.000391,-0.000517,-0.000371,-0.000464,-0.000615,0.000151,-0.000683,-0.000145,0.000298,...,-3.1e-05,0.000697,-0.000176,-0.000294,-0.000227,-0.000109,-0.000431,-0.000139,-0.000506,0.000224


In [37]:
cv_result_vect_lr = cross_val_score(LogisticRegression(),data_text_vect, train_label, scoring='accuracy', cv=5)

In [38]:
cv_result_vect_lr.mean()

0.6369999999999999

Балансировка классов будстрапом

In [5]:
from imblearn.over_sampling import RandomOverSampler
X = train_data.to_numpy().reshape(-1,1)
print(X)
X_b, y_b = RandomOverSampler().fit_resample(X, train_label)
print(np.sum(y_b==1))
print(np.sum(y_b==0))

[['2 . take around 10,000 640x480 pictures .']
 ['i downloaded a trial version of computer associates ez firewall and antivirus and fell in love with a computer security system all over again .']
 ['the wrt54g plus the hga7t is a perfect solution if you need wireless coverage in a wider area or for a hard-walled house as was my case .']
 ...
 ['overall i like it . ']
 ['i began taking pics as soon as i got this camera and am amazed at the quality of photos i have took simply by using the auto mode . ']
 ["even after reading some of the instructions , it 's still hard to figure out . "]]
1274
1274


In [8]:
cv_result_balance = cross_val_score(cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                                          TfidfTransformer(),
                                                          LinearSVC()),
                                                          X_b.reshape(1,X_b.shape[0])[0], y_b, scoring='accuracy', cv=5)

cv_result_balance.mean()

0.868555799530028

Лучший вариант с качеством выше 0.8. Обучим pipeline на всех данных и посмотрим на качество тестовых данных


In [41]:
model_cls = cls_pipeline_transform(CountVectorizer(ngram_range=(1,3)),
                                   TfidfTransformer(),
                                   LinearSVC())
model_cls.fit(X_b.reshape(1,X_b.shape[0])[0], y_b)

Pipeline(steps=[('vectorizer', CountVectorizer(ngram_range=(1, 3))),
                ('transformer', TfidfTransformer()),
                ('classifier', LinearSVC())])

In [42]:
print(f"Mean accuracy: {metrics.accuracy_score(y_b, model_cls.predict(X_b.reshape(1,X_b.shape[0])[0]))}")

Mean accuracy: 1.0


Прогноз на тестовой выборке




In [43]:
predicted = model_cls.predict(data_test['text'])
data_id['y'] = predicted
data_id.head()

Unnamed: 0_level_0,y
Id,Unnamed: 1_level_1
0,1
1,0
2,1
3,1
4,0


In [44]:
data_id.to_csv('sample_sabmission.csv')

Сохранение модели

In [45]:
vectorizer = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
                      ('trans', TfidfTransformer())])

In [46]:
vectorizer.fit(X_b.reshape(1,X_b.shape[0])[0])

Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 3))),
                ('trans', TfidfTransformer())])

In [47]:
transform_features = vectorizer.transform(X_b.reshape(1,X_b.shape[0])[0])

In [48]:
model = LogisticRegression().fit(transform_features, y_b)
print(f"Accuracy: {metrics.accuracy_score(y_b, model.predict(transform_features))}")

Accuracy: 0.9945054945054945


In [51]:
with open (r"simple_demo/vectorizer.pkl", "w+b") as f:
    pickle.dump(vectorizer, f)

with open (r"simple_demo/model_cls.pkl", "w+b") as f:
    pickle.dump(model, f)

In [114]:
vect = vectorizer.transform(["Это отличный банк, просто чудесный"])
vect

<1x48615 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [115]:
model.predict_proba(vect)

array([[0.46584354, 0.53415646]])