In [1]:
import gensim
import numpy as  np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors, word2vec

In [7]:
word_vectors = KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin", binary=True)

In [15]:
v_mango = word_vectors['mango']
v_banana = word_vectors['banana']

In [18]:
cosine_similarity([v_banana],[v_banana])

array([[1.0000001]], dtype=float32)

In [35]:
def odd_one_out(words):
    all_words_vector  = [word_vectors[w] for w in words]
    
    avg_vector = np.mean(all_words_vector , axis=0)
    
    odd_one_out = None
    min_similarity = 1.0
    
    for w in words:
        sim = cosine_similarity([word_vectors[w]], [avg_vector])
        if sim < min_similarity:
            min_similarity = sim
            odd_one_out = w
            
    return odd_one_out

In [43]:
odd_one_out(input_4)

'paris'

In [37]:
input_1 = ["apple","mango","juice","party","orange"] 
input_2 = ["music","dance","sleep","dancer","food"]        
input_3  = ["match","player","football","cricket","dancer"]
input_4 = ["india","paris","russia","france","germany"]

In [53]:
import pandas as pd

In [58]:
df = pd.read_csv("Test/Test.csv").values

In [61]:
all_df =  [list(i) for i in df]

In [63]:
result = []
for i in all_df:
    result.append(odd_one_out(i))

In [64]:
result

['snake',
 'teacher',
 'cat',
 'pineapple',
 'India',
 'is',
 'was',
 'Australia',
 'Money',
 'think',
 'ship',
 'Rome',
 'Pool',
 'Egypt',
 'mouse',
 'helmet',
 'Universe',
 'Kill',
 'Club',
 'Sun']

In [65]:
df = pd.DataFrame({'OddOne':result})

In [68]:
df.to_csv("result.csv", index=False)

## 2. Word Analogy Task

In [50]:
def find_analogy(a,b,c):
    a,b,c = a.lower(), b.lower(), c.lower()
    
    wa, wb, wc = word_vectors[a], word_vectors[b],word_vectors[c]
    
    d= None
    max_sim = -100
    
    for w in word_vectors.vocab.keys():
        if w in [a,b,c]:
            continue
            
        wd = word_vectors[w]
        
        sim = cosine_similarity([wa-wb],[wd-wc])
        
        if sim > max_sim:
            max_sim = sim
            d = w
            
    return d

In [51]:
find_analogy("man","woman","king")

'clown_prince'

In [52]:
word_vectors.most_similar(positive=["woman","king"], negative=["man"], topn=1)

KeyboardInterrupt: 

## 3. Training Own Model

In [2]:
import nltk
from nltk.corpus import stopwords

In [3]:
sw = set(stopwords.words('english'))

In [4]:
file = open("bollywood.txt",'r', encoding='utf8')
file = file.read()

In [5]:
sent_token = nltk.sent_tokenize(file)

In [6]:
data = []
for sent in sent_token:
    words = nltk.word_tokenize(sent)
    words = [w.lower() for w in words if len(w)>2 and w not in sw]
    data.append(words)

In [7]:
print(data)

[['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018'], ['the', 'deepika', 'ranveer', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple'], ['from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'deepika', 'ranveer', 'wedding', 'style', 'file'], ['not', 'ambanis', 'deepika', 'ranveer', 'priyanka', 'nick'], ['man', 'proves', 'wedding', 'the', 'year', 'this', 'year', 'year', 'big', 'fat', 'lavish', 'extravagant', 'weddings'], ['from', 'isha', 'ambani', 'anand', 'piramal', 'deepika', 'padukone', 'ranveer', 'singh', 'priyanka', 'chopra', 'nick', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', '2018', 'saw', 'many', 'grand', 'weddings'], ['but', 'nothing', 'beats', 'man', 'wedding', 'the', 'year', 'award', 'social', 'media'], ['priyanka', 'also', 'shared', 'video', 'featuring', 'nick', 'jonaswas', 'also', 'celebrating',

In [8]:
from gensim.models import Word2Vec

In [21]:
model = Word2Vec(data,size=300,window=10, min_count = 1)

  "C extension not loaded, training will be slow. "


In [22]:
print(model)

Word2Vec(vocab=116, size=300, alpha=0.025)


In [23]:
words  = list(model.wv.vocab)

In [24]:
words

['deepika',
 'padukone',
 'ranveer',
 'singh',
 'wedding',
 'one',
 'biggest',
 'bollywood',
 'events',
 'happened',
 '2018',
 'the',
 'celebrations',
 'hooked',
 'phones',
 'waiting',
 'come',
 'also',
 'gave',
 'enough',
 'reason',
 'believe',
 'stylish',
 'two',
 'couple',
 'from',
 'airport',
 'looks',
 'reception',
 'parties',
 'everything',
 'entire',
 'timeline',
 'style',
 'file',
 'not',
 'ambanis',
 'priyanka',
 'nick',
 'man',
 'proves',
 'year',
 'this',
 'big',
 'fat',
 'lavish',
 'extravagant',
 'weddings',
 'isha',
 'ambani',
 'anand',
 'piramal',
 'chopra',
 'jonas',
 'kapil',
 'sharma',
 'ginni',
 'chatrath',
 'saw',
 'many',
 'grand',
 'but',
 'nothing',
 'beats',
 'award',
 'social',
 'media',
 'shared',
 'video',
 'featuring',
 'jonaswas',
 'celebrating',
 'family',
 'first',
 'celebrated',
 'christmas',
 'london',
 'pictures',
 'new',
 'outstanding',
 'glimpses',
 'celebration',
 'verbier',
 'switzerland',
 'married',
 'december',
 'three',
 'receptions',
 'delhi',

In [25]:
model.wv['deepika']

array([ 6.54790376e-04, -1.42996910e-03, -1.17495924e-03, -6.08305039e-04,
       -2.79092463e-04,  1.81849464e-05,  1.08861644e-03, -1.11284910e-03,
        1.19322131e-03,  6.43113861e-04, -1.56304298e-03, -1.51688379e-04,
       -3.45179025e-04,  2.99815350e-04,  8.00156195e-05,  8.22971924e-04,
        1.05872180e-03, -5.85068890e-04,  4.06241947e-04,  5.12084342e-04,
       -4.88362566e-04, -1.47371262e-03,  6.28699840e-04,  1.48862507e-03,
       -1.29115779e-03, -5.10995043e-04,  3.10722739e-04, -4.48817591e-04,
       -7.41055526e-04, -8.98540544e-04, -1.10568269e-03,  1.45723356e-03,
       -8.44730166e-05,  9.49555484e-04,  1.10128091e-03,  5.68656134e-04,
       -7.61021744e-04, -5.39008994e-04, -1.54762238e-03, -1.65822252e-03,
       -4.80267743e-04, -1.27190305e-03, -1.65298802e-03, -6.20207749e-04,
       -1.47833163e-03,  3.08823728e-05,  1.07155414e-03,  6.24459062e-05,
        1.19501527e-03,  1.87169877e-04,  1.00833317e-03,  2.16082262e-04,
       -1.33038883e-03, -

In [50]:
model.save("bollywood.bin")

In [52]:
a = Word2Vec.load("bollywood.bin")

#### Create model

In [33]:
def custom_analogy(a,b,c, model):
    a,b,c = a.lower(), b.lower(),c.lower()
    wa,wb,wc = model[a],model[b],model[c]
    
    d = None
    max_sim = -10
    
    actors = ["ranveer","deepika","padukone","singh","nick","jonas","chopra","priyanka","virat","anushka","ginni", "sharma"]
    
    for v in actors:
        if v in [a,b,c]:
            continue
        sim = cosine_similarity([wb-wa],[model[v] - wc])
        if sim > max_sim:
            max_sim = sim
            d = v
    return d

In [34]:
custom_analogy("deepika","padukone","anushka",model.wv)

'singh'

In [45]:

def predict_actor(a,b,c,word_vectors):
    """Accepts a triad of words, a,b,c and returns d such that a is to b : c is to d"""
    a,b,c = a.lower(),b.lower(),c.lower()
    max_similarity = -100 
    
    d = None
    words = actors
    
    wa,wb,wc = word_vectors[a],word_vectors[b],word_vectors[c]
    
    #to find d s.t similarity(|b-a|,|d-c|) should be max
    
    for w in words:
        if w in [a,b,c]:
            continue
        
        wv = word_vectors[w]
        sim = cosine_similarity([wb-wa],[wv-wc])
        
        if sim > max_similarity:
            max_similarity = sim
            d = w
    return d

In [47]:
triad = ("deepika","ranveer","priyanka")
predict_actor(*triad,model.wv)

'jonas'