# Embeddings with word2vec

In [3]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
%cd drive/My\ Drive/textaug/notebooks/Word2Vec

/content/drive/My Drive/textaug/notebooks/Word2Vec


In [0]:
import numpy as np
import pandas as pd
import re

In [0]:
# Read the two semicolon-separated dumps (no header row) and tag each with its class label.
df_pos = pd.read_csv('../../data/twitts/positive.csv', sep=';', header=None).assign(positive=1)
df_neg = pd.read_csv('../../data/twitts/negative.csv', sep=';', header=None).assign(positive=0)

# Column 3 is the one renamed to 'tweet'. DataFrame.append was removed in
# pandas 2.0, so the two parts are stacked with pd.concat instead.
tweets = pd.concat(
    [df_pos[[3, 'positive']], df_neg[[3, 'positive']]],
    ignore_index=True,
).rename({3: 'tweet'}, axis=1)

In [0]:
# Normalise the tweet text:
#   1. every run of non-word characters, Latin letters, underscores and digits -> one space
#   2. collapse any remaining whitespace runs
#   3. lowercase and trim
# Raw-string patterns avoid the invalid-escape warnings that '\W', '\d' and '\s'
# trigger in ordinary string literals; regex=True is explicit because the
# default of Series.str.replace differs between pandas versions.
tweets['tweet'] = (
    tweets['tweet']
    .str.replace(r'[\Wa-zA-Z_\d]+', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.lower()
    .str.strip()
)

In [8]:
# Preview the first five cleaned tweets (head() defaults to 5 rows).
tweets.head()

Unnamed: 0,tweet,positive
0,хоть я и школота но поверь у нас то же самое о...,1
1,да все таки он немного похож на него но мой ма...,1
2,ну ты идиотка я испугалась за тебя,1
3,кто то в углу сидит и погибает от голода а мы ...,1
4,вот что значит страшилка но блин посмотрев все...,1


In [0]:
# Features are the cleaned texts, the target is the binary sentiment flag.
X, y = tweets['tweet'], tweets['positive']

In [0]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the tweets for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## Augmentation part

In [0]:
!pip install --upgrade gensim -q
!pip install annoy -q

In [0]:
from gensim.test.utils import datapath
from gensim import utils
import gensim.models

In [0]:
# If the model file has not been downloaded yet, uncomment and run:
# !wget -O mipt_vecs.w2v http://panchenko.me/data/dsl-backup/w2v-ru/all.norm-sz100-w10-cb0-it1-min100.w2v

In [0]:
# Load the pre-trained vectors stored in binary word2vec format;
# bytes that cannot be decoded are ignored instead of raising.
mipt_model = gensim.models.KeyedVectors.load_word2vec_format(
    'mipt_vecs.w2v',
    binary=True,
    unicode_errors='ignore',
)

In [14]:
# The install cell upgrades gensim to the latest release, and gensim 4.x moved
# AnnoyIndexer to gensim.similarities.annoy. Try the new location first and
# fall back to the pre-4.0 module so the cell works with either version.
try:
    from gensim.similarities.annoy import AnnoyIndexer
except ImportError:
    from gensim.similarities.index import AnnoyIndexer

# Original author's open question: will this work?
annoy_index = AnnoyIndexer(mipt_model, num_trees=10)

  index = AnnoyIndex(num_features)


In [0]:
# Alias under which the augmentation helpers below look up the embedding model.
model = mipt_model
import random
def _nearest_other_word(word, w2v, indexer):
    """Return the closest hit for `word` that is not `word` itself, or None.

    The previous code always took hit #1, assuming hit #0 is the query word.
    Approximate search does not guarantee that ordering, nor that two hits
    come back, so scan the hits and skip the query wherever it appears.
    """
    for candidate, _similarity in w2v.most_similar(word, topn=2, indexer=indexer):
        if candidate != word:
            return candidate
    return None


def sen_aug_all_changes_annoy(sen, w2v=None, indexer=None):
    """Replace every in-vocabulary word of `sen` with its nearest neighbour.

    Args:
        sen: sentence whose words are separated by single spaces.
        w2v: embedding model supporting `in` and `most_similar`;
            defaults to the module-level `model`.
        indexer: similarity indexer passed to `most_similar`;
            defaults to the module-level `annoy_index`.

    Returns:
        The sentence with each known word swapped for its neighbour. Words
        that are out of vocabulary, or have no distinct neighbour, are kept.
    """
    w2v = model if w2v is None else w2v
    indexer = annoy_index if indexer is None else indexer
    words = sen.split(' ')
    for i, word in enumerate(words):
        if word in w2v:
            replacement = _nearest_other_word(word, w2v, indexer)
            if replacement is not None:
                words[i] = replacement
    return ' '.join(words)


def sen_aug_one_change_annoy(sen, w2v=None, indexer=None):
    """Replace one randomly chosen in-vocabulary word of `sen` with its nearest neighbour.

    Positions are visited in random order (global `random` state); the first
    word that is in the vocabulary and has a distinct neighbour is replaced.

    Args:
        sen: sentence whose words are separated by single spaces.
        w2v: embedding model; defaults to the module-level `model`.
        indexer: similarity indexer; defaults to the module-level `annoy_index`.

    Returns:
        The sentence with exactly one word replaced, or `sen` unchanged when
        no word can be replaced.
    """
    w2v = model if w2v is None else w2v
    indexer = annoy_index if indexer is None else indexer
    words = sen.split(' ')
    indexes = list(range(len(words)))
    random.shuffle(indexes)
    for i in indexes:
        if words[i] not in w2v:
            continue
        replacement = _nearest_other_word(words[i], w2v, indexer)
        if replacement is not None:
            words[i] = replacement
            return ' '.join(words)
    return sen  # no replacement was possible

In [0]:
def augment_w2v(X_train, y_train, augment_fn=None):
  """Double the training set by adding an augmented copy of every sentence.

  Args:
    X_train: pandas Series of sentences.
    y_train: pandas Series of labels, aligned with X_train.
    augment_fn: callable mapping one sentence to its augmented version;
      defaults to sen_aug_all_changes_annoy.

  Returns:
    Tuple (X_train_aug, y_train_aug): the augmented sentences followed by the
    originals, and the labels repeated twice, both re-indexed from 0.
  """
  if augment_fn is None:
    augment_fn = sen_aug_all_changes_annoy
  # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
  X_train_aug = pd.concat([X_train.apply(augment_fn), X_train], ignore_index=True)
  y_train_aug = pd.concat([y_train, y_train], ignore_index=True)
  return (X_train_aug, y_train_aug)

In [0]:
# Build the doubled training set: augmented sentences plus the originals.
X_train_aug, y_train_aug = augment_w2v(X_train=X_train, y_train=y_train)

In [0]:
# Re-apply the character filter to the augmented texts (presumably because the
# substituted neighbours are not guaranteed to be clean - TODO confirm).
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would silently turn this cleanup into a no-op.
X_train_aug = X_train_aug.str.replace(r'[\Wa-zA-Z_\d]+', ' ', regex=True)

In [47]:
# Size of the original (non-augmented) training split.
X_train.shape

(170125,)

## Sentence vectorization



In [0]:
# Dimensionality of the word vectors in the loaded model.
n = mipt_model.vector_size
def sen_to_vec(sentence, model=None):
  """Embed a sentence as the mean of its word vectors.

  Out-of-vocabulary words contribute a zero vector but still count towards
  the mean, as before.

  Args:
    sentence: whitespace-separated text.
    model: embedding lookup supporting `in`, indexing by word and
      `vector_size`; defaults to the module-level `mipt_model`.

  Returns:
    1-D numpy array of length `vector_size`; all zeros for a sentence with
    no tokens.
  """
  kv = mipt_model if model is None else model
  dim = kv.vector_size
  # split() without an argument drops the empty tokens that repeated, leading
  # or trailing spaces produce; split(' ') counted them in the denominator.
  words = sentence.split()
  if not words:
    return np.zeros(dim)
  vec = np.zeros(dim)
  for w in words:
    if w in kv:
      vec += kv[w]
  return vec / len(words)

In [34]:
# Micro-benchmark: time embedding a single short sentence.
%timeit sen_to_vec('я пришёл в дом')

The slowest run took 87.16 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 16.1 µs per loop


In [0]:
def X_embed(X_train):
  """Turn a Series of sentences into a DataFrame with one embedding per row.

  Each sentence is embedded with sen_to_vec; the resulting frame has one
  column per vector component and a fresh positional index.
  """
  sentence_vectors = [sen_to_vec(text) for text in X_train]
  return pd.DataFrame(sentence_vectors)

Сравним результаты аугментированного набора (`X_train_aug`) и неаугментированного (`X_train`)

In [0]:
# Embed the augmented train set, the original train set and the test set (in that order).
X_train_aug_w2v, X_train_w2v, X_test_w2v = (
    X_embed(part) for part in (X_train_aug, X_train, X_test)
)

In [46]:
# Report the row count of every embedded set.
for label, frame in (
    ('Augmeted set size', X_train_aug_w2v),
    ('Original set size', X_train_w2v),
    ('Test set size', X_test_w2v),
):
    print(label, len(frame))

# Sanity check: augmentation must exactly double the training set.
print('Check 1:', 2 * len(X_train_w2v) == len(X_train_aug_w2v))

Augmeted set size 340250
Original set size 170125
Test set size 56709
Check 1: True


In [48]:
# Peek at the first five embedded sentences of the augmented set.
X_train_aug_w2v.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,-0.054382,0.137205,-0.095994,-0.013288,-0.041406,0.31605,0.050567,0.05492,-0.208947,0.225267,0.326322,0.012957,-0.079552,-0.169117,-0.012694,-0.097051,0.152679,-0.127781,0.266471,-0.126551,0.105117,-0.110404,0.350891,-0.155198,-0.17544,-0.002162,0.062627,0.210187,-0.041145,0.20087,-0.018458,-0.023818,-0.029906,0.051719,-0.050898,-0.18429,0.090957,-0.273502,-0.127483,0.102612,...,-0.176105,0.111134,-0.051869,-0.294914,0.159276,0.119495,-0.168799,0.015283,-0.023362,0.125394,-0.456479,0.146428,-0.113375,-0.008224,-0.269178,0.130128,0.366519,-0.475991,0.026033,-0.081269,-0.102639,0.020546,-0.005222,-0.267833,-0.043674,-0.012851,-0.184007,0.203215,-0.354229,-0.030137,-0.051663,-0.144443,-0.123709,0.066022,0.006383,-0.045658,-0.130894,0.094428,0.065843,-0.043937
1,0.015855,0.063667,-0.127643,0.003043,0.001961,0.133128,0.032458,0.123032,-0.122892,0.325165,0.232079,0.133627,-0.152434,-0.336229,0.136132,-0.182791,0.098464,-0.09246,-0.023784,0.092582,0.176943,0.062776,0.130953,-0.239896,-0.136592,0.000762,0.01895,0.085557,-0.016827,0.28333,0.015452,0.027671,-0.014082,-0.006724,0.154101,-0.089213,0.27502,-0.091069,-0.061771,0.005584,...,-0.060967,0.207508,-0.100097,-0.132338,0.194298,0.18907,-0.186489,0.141448,0.033049,0.1679,-0.383343,0.145207,-0.292,-0.053738,-0.139198,-0.046173,0.231929,-0.428968,0.112941,-0.164064,0.177837,0.038983,-0.034802,-0.261586,-0.176205,0.251477,-0.189818,0.212188,-0.334171,-0.142482,-0.169157,-0.021112,-0.256351,-0.109043,-0.272024,0.036886,-0.051598,0.312954,0.121376,-0.094094
2,-0.139771,-0.034629,-0.242599,0.085468,0.028503,0.133812,-0.009247,-0.040932,-0.403582,-0.227077,0.138229,0.076506,-0.044473,-0.276694,0.202749,0.025476,0.071257,0.165876,0.405992,-0.159416,0.246751,0.014865,0.4512,0.061227,0.284285,-0.026132,0.127577,-0.009157,-0.067277,0.436827,-0.112517,-0.03741,0.012913,0.139174,-0.025233,0.132042,0.141451,-0.179879,-0.050161,0.006221,...,-0.133779,0.290632,0.089101,0.102,0.174014,0.22519,-0.023794,-0.27903,-0.069197,-0.013639,-0.131231,0.356143,-0.351375,0.178171,-0.190211,-0.11803,0.452199,-0.25953,0.12079,-0.126241,0.097388,0.401848,0.066724,-0.279208,-0.215351,0.085668,-0.162523,0.121388,-0.348972,0.013811,0.133845,-0.207767,-0.216514,0.207942,-0.21666,-0.081285,-0.231968,0.188551,-0.186161,0.019781
3,-0.214696,-0.126272,-0.046008,-0.037966,-0.025587,0.171238,-0.078571,0.035044,-0.051151,0.006033,0.230141,0.146917,0.023249,-0.185369,0.135464,0.019819,0.208784,0.134188,0.163393,-0.110884,0.268554,-0.124817,0.32839,-0.080765,-0.043472,0.034561,-0.055825,0.119039,0.116862,0.263969,-0.080987,-0.003776,0.061436,0.110685,-0.149986,0.063042,-0.017059,-0.457316,0.035171,0.024352,...,-0.068439,0.1557,0.180127,-0.074007,0.106125,0.083232,-0.261495,-0.053703,0.054144,0.057652,-0.372415,0.279088,-0.136056,-0.069525,-0.059865,-0.016758,0.376248,-0.343518,0.18714,0.219307,0.104605,0.15777,-0.104408,-0.135363,-0.253176,0.257309,-0.171109,0.366355,-0.190735,0.112855,-0.104014,-0.160146,-0.104787,0.235695,-0.133469,-0.100217,-0.151043,0.067397,0.117738,-0.154609
4,-0.018042,-0.077487,-0.109702,0.007798,0.128635,0.225823,0.02496,-0.008666,0.016691,0.130218,0.239337,0.171117,-0.04048,-0.227639,0.084034,-0.206949,0.174016,0.006328,0.209202,-0.145578,0.142494,0.026721,0.203722,-0.11586,-0.188641,0.059174,-0.005062,0.156399,-0.056875,0.324173,-0.231026,-0.005655,0.005738,0.009941,-0.03948,-0.018487,0.009659,-0.263334,0.005425,0.146189,...,-0.084726,0.133946,0.012461,0.013319,0.04587,0.096941,-0.13531,0.023395,0.029517,0.155049,-0.253747,0.269839,-0.12321,0.066026,0.021313,-0.111989,0.138128,-0.25376,0.157458,-0.08653,0.037991,0.027645,0.00598,-0.295395,-0.203334,0.218928,-0.151598,0.259333,-0.26416,0.02703,-0.014929,-0.052527,0.023353,0.030499,-0.163133,-0.011232,-0.161799,0.215018,-0.063733,-0.19714


## Го учиться

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

Original set

In [55]:
# Baseline: logistic regression fitted on the embeddings of the original training set.
clf_origin = LogisticRegression(max_iter=400).fit(X_train_w2v, y_train)
clf_origin

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=400,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Test-set predictions of the baseline model.
y_pred = clf_origin.predict(X=X_test_w2v)

In [59]:
# scikit-learn's signature is accuracy_score(y_true, y_pred). Accuracy is
# symmetric, so the value is unchanged, but the conventional order stays
# correct if an asymmetric metric is swapped in later.
accuracy_score(y_test, y_pred)

0.6718862967077536

In [0]:
def bootstrap_toy(y_pred, y_test, num=1000, seed=None):
  """Print a bootstrap estimate of accuracy with a +/- 2*std band.

  Resamples the per-example correctness flags with replacement `num` times
  and prints the mean of the resampled accuracies together with twice their
  standard deviation.

  Args:
    y_pred: array-like of predicted labels.
    y_test: array-like of true labels (pandas Series or anything array-like).
    num: number of bootstrap resamples.
    seed: optional int making the resampling reproducible; when None the
      global numpy random state is used, as before.

  Raises:
    ValueError: if the inputs differ in shape or are empty.
  """
  # Plain arrays guarantee positional indexing below, whatever the inputs' index is.
  predicted = np.asarray(y_pred)
  actual = np.asarray(y_test)
  if predicted.shape != actual.shape:
    raise ValueError(
        f'y_pred and y_test must have the same shape, got {predicted.shape} and {actual.shape}')
  answers = (predicted == actual)
  samples_num = len(answers)
  if samples_num == 0:
    raise ValueError('cannot bootstrap an empty set of predictions')

  rng = np.random if seed is None else np.random.RandomState(seed)
  means = np.array([
      answers[rng.randint(samples_num, size=samples_num)].mean()
      for _ in range(num)
  ])
  print(f'Mean: {means.mean():.3f} +/- {means.std()*2:.3f} (95% conf.)')

In [62]:
# Bootstrap the accuracy of the baseline predictions.
bootstrap_toy(y_pred=y_pred, y_test=y_test)

Mean: 0.672 +/- 0.004 (95% conf.)


Augmented set

In [64]:
# Same classifier, fitted on the embeddings of the augmented training set.
clf_aug = LogisticRegression(max_iter=400).fit(X_train_aug_w2v, y_train_aug)
clf_aug

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=400,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Test-set predictions of the model trained on augmented data.
y_pred = clf_aug.predict(X=X_test_w2v)

In [70]:
# scikit-learn's signature is accuracy_score(y_true, y_pred). Accuracy is
# symmetric, so the value is unchanged, but the conventional order stays
# correct if an asymmetric metric is swapped in later.
accuracy_score(y_test, y_pred)

0.6678481369800208

In [71]:
# Bootstrap the accuracy of the augmented-model predictions.
bootstrap_toy(y_pred=y_pred, y_test=y_test)

Mean: 0.668 +/- 0.004 (95% conf.)


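??? **Open question:** augmentation did not help. Accuracy went from 0.672 (original set) to 0.668 (augmented set), and with a bootstrap spread of ±0.004 on each the two intervals overlap. Why does doubling the training data with word2vec neighbours give no gain?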

In [105]:
# Shape of the embedded augmented training set (rows, vector components).
X_train_aug_w2v.shape

(340250, 100)

In [106]:
# Shape of the embedded original training set (rows, vector components).
X_train_w2v.shape

(170125, 100)