# Training Your Own Word2Vec Model

Word2Vec model can learn embeddings from any text corpus!
- Continuous Bag of Words Model
- Skip Gram Model

`Algorithm looks at window of target word(Y) to provide context word(X), the model is trained on (X,Y) pairs in a superwised manner.` The algorithm was developed by Tomas Mikolov.

#### Data Preparation



- Each sentence must be tokenized, into a list of words.

- The sentences can be text loaded into memory once,
or we can build a data pipeline which iteratively feeds data to the model.


In [35]:
import gensim
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [11]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [12]:
sw = set(stopwords.words('english'))

In [15]:
# read file
with open('bollywood.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    
    sentences = sent_tokenize(text)

    data = []
    for sent in sentences:
        words = word_tokenize(sent)
        words = [w.lower() for w in words if w not in sw and len(w)>2]
        data.append(words)

In [17]:
print(data)

[['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018'], ['the', 'deepika', 'ranveer', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple'], ['from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'deepika', 'ranveer', 'wedding', 'style', 'file'], ['not', 'ambanis', 'deepika', 'ranveer', 'priyanka', 'nick'], ['man', 'proves', 'wedding', 'the', 'year', 'this', 'year', 'year', 'big', 'fat', 'lavish', 'extravagant', 'weddings'], ['from', 'isha', 'ambani', 'anand', 'piramal', 'deepika', 'padukone', 'ranveer', 'singh', 'priyanka', 'chopra', 'nick', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', '2018', 'saw', 'many', 'grand', 'weddings'], ['but', 'nothing', 'beats', 'man', 'wedding', 'the', 'year', 'award', 'social', 'media'], ['priyanka', 'also', 'shared', 'video', 'featuring', 'nick', 'jonaswas', 'also', 'celebrating',

In [18]:
from gensim.models import Word2Vec

In [67]:
model = Word2Vec(data, size=300, window=5, min_count=1)

In [68]:
print(model)

Word2Vec(vocab=116, size=300, alpha=0.025)


In [69]:
words = list(model.wv.vocab)

In [70]:
print(words)

['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018', 'the', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple', 'from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'style', 'file', 'not', 'ambanis', 'priyanka', 'nick', 'man', 'proves', 'year', 'this', 'big', 'fat', 'lavish', 'extravagant', 'weddings', 'isha', 'ambani', 'anand', 'piramal', 'chopra', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', 'saw', 'many', 'grand', 'but', 'nothing', 'beats', 'award', 'social', 'media', 'shared', 'video', 'featuring', 'jonaswas', 'celebrating', 'family', 'first', 'celebrated', 'christmas', 'london', 'pictures', 'new', 'outstanding', 'glimpses', 'celebration', 'verbier', 'switzerland', 'married', 'december', 'three', 'receptions', 'delhi', 'mumbai', 'jaggo', 'night', 'made', 'even', 'special', 'industry', 'friends', 'long', '

In [71]:
model.wv['deepika'].shape

(300,)

In [72]:
model.wv['ranveer']

array([ 1.44092215e-03,  2.69657292e-04,  1.50082808e-03, -8.05233431e-04,
        1.90877297e-04, -9.89122665e-04,  1.18747761e-03, -1.52962306e-03,
       -6.61236933e-04,  8.82194785e-04, -1.04016124e-03,  1.93682339e-04,
        1.32358156e-03, -1.44713395e-03,  1.33452460e-03,  1.13327603e-03,
        1.26779743e-03,  1.38444698e-03, -1.55230775e-03,  1.21903303e-03,
        1.75473644e-04,  2.65393581e-04,  1.72259170e-04,  8.50962606e-05,
       -1.61960907e-03, -5.88213734e-04,  1.60836359e-03,  9.91995985e-05,
       -1.60788535e-03,  6.71664195e-04, -5.78499225e-04,  7.95483880e-04,
        1.03943734e-04,  1.02804624e-03, -3.84471554e-04,  4.30128421e-05,
        9.53660565e-05,  1.43488683e-03,  1.15316932e-03,  9.92862042e-04,
       -1.22958489e-04,  1.33881904e-03, -2.48046970e-04, -5.52785757e-04,
        1.12698786e-03, -1.46097655e-03,  1.61723612e-04, -1.18425081e-03,
       -8.16246495e-04, -1.26065256e-03, -7.76827394e-04,  1.02788070e-03,
        1.37063267e-03, -

In [73]:
actors = ["ranveer", "deepika", "padukone", "singh", "nick", "jonas", "priyanka", "chopra", "virat", "anushka"]

def predict_word(a,b,c):
    a,b,c = a.lower(), b.lower(), c.lower()
    
    # similarity |b-a| = |d-c|  should be max
    max_sim = -100
    
    d = None
    
    
    wa,wb,wc = model.wv[a], model.wv[b], model.wv[c]
    
    for w in actors:
        if w in [a,b,c]:
            continue
        
        wd = model.wv[w]
        sim = cosine_similarity([wb-wa], [wd-wc])
        
        if sim> max_sim:
            max_sim = sim
            d = w
            
    return d

In [74]:
triad = ("nick", "priyanka", "virat")
predict_word(*triad)

'deepika'

In [75]:
triad = ("virat", "anushka", "nick")
predict_word(*triad)

'deepika'

In [76]:
triad = ("deepika", "padukone", "priyanka")
predict_word(*triad)

'chopra'