# word2vec
word2vec is a shallow, two-layer neural network that takes a text corpus as input and returns a set of vectors (also known as embeddings); each vector is a numeric representation of a given word.

In [1]:
import gensim.downloader as api
# Load the pre-trained 'glove-wiki-gigaword-100' embeddings through gensim's
# downloader API so they can be compared with the model trained further down.
# NOTE(review): the model name indicates GloVe vectors (presumably 100-d,
# Wikipedia + Gigaword) rather than vectors trained with word2vec - confirm.
wiki_embeddings = api.load('glove-wiki-gigaword-100')

In [2]:
# Look up the raw embedding for 'king': a dense array of floats.
wiki_embeddings['king']

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [3]:
# List the nearest neighbours of 'king' in the pre-trained embedding space.
# Each result pairs a word with a similarity score
# (presumably cosine similarity - confirm against the gensim docs).
wiki_embeddings.most_similar('king')

[('prince', 0.7682329416275024),
 ('queen', 0.7507690191268921),
 ('son', 0.7020887136459351),
 ('brother', 0.6985775232315063),
 ('monarch', 0.6977890729904175),
 ('throne', 0.6919990181922913),
 ('kingdom', 0.6811410188674927),
 ('father', 0.680202841758728),
 ('emperor', 0.6712858080863953),
 ('ii', 0.6676074266433716)]

In [4]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Let pandas show up to 100 characters per cell so message text stays readable.
pd.set_option('display.max_colwidth', 100)

# Read the SMS dataset, discard the three 'Unnamed' columns, and give the two
# remaining columns descriptive names.
messages = pd.read_csv('spam.csv', encoding='latin-1').drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
)
messages.columns = ['label', 'text']
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [5]:
# Tokenise every message with gensim's simple_preprocess. As the displayed
# rows show, tokens come back lower-cased and single-character / numeric
# tokens (e.g. "u", "2") are dropped.
messages['clean_text'] = messages['text'].apply(gensim.utils.simple_preprocess)
messages.head()

Unnamed: 0,label,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [6]:
# Hold out 20% of the messages for testing. The random_state is fixed so the
# split - and therefore the trained vocabulary and every output below - is the
# same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    messages['clean_text'],
    messages['label'],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Train word2vec on the tokenised training messages only:
#   size=100     -> length of each learned word vector
#   window=5     -> context window setting passed to the model
#   min_count=2  -> minimum-frequency setting passed to the model
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 reportedly renamed
# it to `vector_size`) - confirm against the pinned gensim version.
w2v_model = gensim.models.Word2Vec(
    sentences=X_train,
    size=100,
    window=5,
    min_count=2,
)

In [8]:
# Same nearest-neighbour query, now against the model trained on the SMS data.
# The neighbours shown are unrelated words with near-identical scores (~0.997),
# which suggests this small corpus did not produce well-separated embeddings.
# NOTE(review): presumably fails if 'king' is missing from the trained
# vocabulary for a given train/test split - confirm.
w2v_model.wv.most_similar('king')

[('watching', 0.9976290464401245),
 ('didn', 0.9974445700645447),
 ('quite', 0.9974321126937866),
 ('put', 0.9974194765090942),
 ('lor', 0.9974073767662048),
 ('ok', 0.9974033236503601),
 ('dad', 0.9974023699760437),
 ('probably', 0.9973987340927124),
 ('ì_', 0.9973981380462646),
 ('ever', 0.997397780418396)]

In [9]:
# Print every 50th word among the first 500 entries of the learned vocabulary.
# NOTE(review): index2word looks ordered by descending frequency ('to' comes
# first) - confirm.
print(w2v_model.wv.index2word[0:500:50])

['to', 'then', 'week', 'im', 'wait', 'live', 'house', 'ill', 'plz', 'working']


In [10]:
# For each tokenised message in X_test, collect the learned vector of every
# word the model knows; words that are not in the vocabulary are skipped.
# Each element of w2v_vect is therefore an array with one row per known word
# (or an empty array when no word of the message was learned).
#
# The vocabulary is copied into a set once so each membership test is O(1)
# instead of a scan over the whole index2word list.
known_words = set(w2v_model.wv.index2word)

# Messages contain different numbers of known words, so the result is ragged.
# Fill a pre-allocated 1-D object array explicitly rather than relying on
# np.array() to build one implicitly from ragged input (newer NumPy versions
# reject that).
w2v_vect = np.empty(len(X_test), dtype=object)
for pos, tokens in enumerate(X_test):
    w2v_vect[pos] = np.array([w2v_model.wv[tok] for tok in tokens if tok in known_words])

In [15]:
# Sanity check on the first five test messages: number of tokens in the
# message vs. number of word vectors that were found for it.
for tokens, vectors in zip(X_test.iloc[:5], w2v_vect[:5]):
    print(len(tokens), len(vectors))

14 12
25 21
12 12
23 22
9 9


In [31]:
# Collapse each message's stack of word vectors into one fixed-length vector.
# Each non-empty entry of w2v_vect has one row per known word and one column
# per embedding dimension; mean(axis=0) averages over the rows (the words),
# leaving one value per dimension. That is why every averaged vector has the
# model's vector size (100 here) no matter how many words the message has.
# Messages with no known words get an all-zero vector of the same length.
# The length is read from the model instead of being hard-coded, so it stays
# correct if the training `size` is changed.
embedding_dim = w2v_model.wv.vector_size
w2v_vect_avg = [
    vect.mean(axis=0) if len(vect) != 0 else np.zeros(embedding_dim)
    for vect in w2v_vect
]

In [32]:
# Sanity check on the first five test messages: number of tokens in the
# message vs. length of its averaged vector.
for tokens, avg_vector in zip(X_test.iloc[:5], w2v_vect_avg[:5]):
    print(len(tokens), len(avg_vector))

14 100
25 100
12 100
23 100
9 100
