# word2vec: How To Implement word2vec

### Explore Pre-trained Embeddings

Some other options:
- `glove-twitter-{25/50/100/200}`
- `glove-wiki-gigaword-{50/200/300}`
- `word2vec-google-news-300`
- `word2vec-ruscorpora-news-300`

In [1]:
# Install gensim
!pip install -U gensim

Requirement already up-to-date: gensim in c:\users\archi\anaconda3\lib\site-packages (3.8.3)


You should consider upgrading via the 'c:\users\archi\anaconda3\python.exe -m pip install --upgrade pip' command.


In [2]:
# Load pretrained word vectors using gensim
import gensim.downloader as api

wiki_embeddings = api.load('glove-wiki-gigaword-100')

In [3]:
# Explore the word vector for "king"
wiki_embeddings['king']

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [4]:
# Find the words most similar to king based on the trained word vectors
wiki_embeddings.most_similar('king')

[('prince', 0.7682329416275024),
 ('queen', 0.7507690191268921),
 ('son', 0.7020887136459351),
 ('brother', 0.6985775232315063),
 ('monarch', 0.6977890729904175),
 ('throne', 0.6919990181922913),
 ('kingdom', 0.6811410188674927),
 ('father', 0.6802029013633728),
 ('emperor', 0.6712857484817505),
 ('ii', 0.6676074266433716)]

### Train Our Own Model

In [5]:
# Read in the data and clean up column names
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv('spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [6]:
# Clean data using the built in cleaner in gensim
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
messages.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [7]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2)

In [8]:
# Train the word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

In [9]:
# Explore the word vector for "king" base on our trained model
w2v_model.wv['king']

array([-0.01143144, -0.04015462,  0.04308384, -0.03775743, -0.06211508,
       -0.05555646, -0.10239975,  0.02284142,  0.0696946 , -0.00443605,
       -0.0740453 ,  0.03986885,  0.00818436,  0.11965823, -0.02451775,
       -0.12746537,  0.09749493,  0.1402857 ,  0.03513321,  0.05450424,
        0.09904657, -0.05291641,  0.1813988 , -0.01301383, -0.03586939,
       -0.12604521, -0.07296988,  0.06289582,  0.00608337, -0.00747485,
       -0.01584062,  0.04860228,  0.09789248,  0.08013952, -0.01947483,
       -0.03901044,  0.02100712,  0.02659484, -0.04629436,  0.08243837,
       -0.00642518, -0.07426196,  0.10372696, -0.00329005,  0.02359793,
        0.04137665,  0.0867987 ,  0.01835357, -0.05258048, -0.04628946,
       -0.01936685,  0.04456032, -0.0342486 , -0.02140936,  0.05341113,
       -0.03789661,  0.04132101, -0.11633381, -0.18068919,  0.03370852,
        0.0214502 ,  0.06464013,  0.01136789,  0.03267915,  0.0638374 ,
        0.10388783,  0.01967398, -0.04917517,  0.05304422,  0.07

In [10]:
# Find the most similar words to "king" based on word vectors from our trained model
w2v_model.wv.most_similar('king')

[('haf', 0.9991196393966675),
 ('or', 0.9991163611412048),
 ('back', 0.9990954399108887),
 ('told', 0.9990910291671753),
 ('more', 0.9990885257720947),
 ('call', 0.9990877509117126),
 ('im', 0.9990752935409546),
 ('half', 0.9990749359130859),
 ('face', 0.9990745782852173),
 ('work', 0.9990737438201904)]

# word2vec: How To Prep Word Vectors For Modeling

In [11]:
# Generate a list of words the word2vec model learned word vectors for
w2v_model.wv.index2word

['to',
 'you',
 'the',
 'and',
 'in',
 'is',
 'me',
 'it',
 'my',
 'for',
 'your',
 'of',
 'call',
 'that',
 'have',
 'on',
 'are',
 'so',
 'now',
 'can',
 'but',
 'we',
 'not',
 'or',
 'get',
 'do',
 'will',
 'at',
 'if',
 'ur',
 'be',
 'with',
 'just',
 'no',
 'this',
 'up',
 'how',
 'gt',
 'when',
 'ok',
 'lt',
 'go',
 'from',
 'what',
 'free',
 'all',
 'out',
 'll',
 'know',
 'got',
 'good',
 'then',
 'like',
 'day',
 'was',
 'its',
 'am',
 'he',
 'there',
 'come',
 'time',
 'only',
 'send',
 'love',
 'want',
 'text',
 'one',
 'as',
 'txt',
 'need',
 'about',
 'by',
 'going',
 'see',
 'home',
 'don',
 'back',
 'lor',
 'today',
 'stop',
 'she',
 'sorry',
 'reply',
 'da',
 'our',
 'hi',
 'take',
 'still',
 'new',
 'later',
 'they',
 'her',
 'mobile',
 'dont',
 'pls',
 'any',
 'been',
 'has',
 'some',
 'please',
 'think',
 'phone',
 'an',
 'tell',
 'here',
 'much',
 'dear',
 'ì_',
 'msg',
 'week',
 'did',
 'who',
 'great',
 'too',
 'where',
 'hope',
 'well',
 'night',
 're',
 'oh',
 '

In [12]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence
w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index2word])
                     for ls in X_test])

  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(w2v_vect):
    print(len(X_test.iloc[i]), len(v))

1 1
152 144
5 5
5 4
23 19
6 5
11 10
25 25
8 7
16 12
13 13
28 23
2 2
13 12
6 6
4 4
8 6
21 19
23 21
10 10
13 12
12 12
8 7
17 16
29 29
10 10
20 20
5 5
20 15
5 5
18 18
21 18
17 13
15 15
5 5
8 7
30 28
14 13
14 12
12 12
26 25
18 17
23 19
33 25
34 30
12 12
22 22
5 3
31 29
14 12
20 19
1 1
13 13
3 3
14 14
16 15
7 6
12 12
12 10
23 23
9 9
8 8
12 10
5 5
11 11
21 21
7 7
22 22
9 5
11 11
27 27
22 20
17 14
23 22
7 5
4 4
7 6
6 6
32 32
19 19
4 4
9 9
24 24
9 5
8 8
5 5
10 10
14 11
4 4
10 8
21 21
22 19
9 9
11 11
6 6
22 18
7 6
21 21
11 10
10 8
24 17
10 10
4 3
3 3
9 8
6 6
15 13
7 2
31 31
8 7
12 12
28 18
5 4
7 5
2 2
16 13
24 21
10 10
8 8
5 2
8 7
15 14
29 28
14 12
21 14
19 19
11 11
24 22
2 2
67 66
8 6
26 26
11 11
1 1
15 12
28 25
7 7
4 4
16 16
27 27
3 2
11 11
5 5
4 3
4 2
9 7
16 14
5 5
17 17
30 28
23 21
17 17
6 6
31 31
1 1
10 9
41 34
24 23
10 10
19 15
6 5
13 12
23 22
21 19
3 3
19 19
14 13
6 5
13 12
12 11
2 2
15 13
28 25
6 6
9 9
12 11
10 6
18 17
101 101
9 1
21 20
31 31
15 13
19 16
23 20
26 26
13 13
30 29
8 7
1 1


In [14]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
w2v_vect_avg = []

for vect in w2v_vect:
    if len(vect)!=0:
        w2v_vect_avg.append(vect.mean(axis=0))
    else:
        w2v_vect_avg.append(np.zeros(100))

In [15]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(w2v_vect_avg):
    print(len(X_test.iloc[i]), len(v))

1 100
152 100
5 100
5 100
23 100
6 100
11 100
25 100
8 100
16 100
13 100
28 100
2 100
13 100
6 100
4 100
8 100
21 100
23 100
10 100
13 100
12 100
8 100
17 100
29 100
10 100
20 100
5 100
20 100
5 100
18 100
21 100
17 100
15 100
5 100
8 100
30 100
14 100
14 100
12 100
26 100
18 100
23 100
33 100
34 100
12 100
22 100
5 100
31 100
14 100
20 100
1 100
13 100
3 100
14 100
16 100
7 100
12 100
12 100
23 100
9 100
8 100
12 100
5 100
11 100
21 100
7 100
22 100
9 100
11 100
27 100
22 100
17 100
23 100
7 100
4 100
7 100
6 100
32 100
19 100
4 100
9 100
24 100
9 100
8 100
5 100
10 100
14 100
4 100
10 100
21 100
22 100
9 100
11 100
6 100
22 100
7 100
21 100
11 100
10 100
24 100
10 100
4 100
3 100
9 100
6 100
15 100
7 100
31 100
8 100
12 100
28 100
5 100
7 100
2 100
16 100
24 100
10 100
8 100
5 100
8 100
15 100
29 100
14 100
21 100
19 100
11 100
24 100
2 100
67 100
8 100
26 100
11 100
1 100
15 100
28 100
7 100
4 100
16 100
27 100
3 100
11 100
5 100
4 100
4 100
9 100
16 100
5 100
17 100
30 100
23 100
1