In [1]:
import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
w2v = KeyedVectors.load_word2vec_format('../datasets/GoogleNews-vectors-negative300.bin', binary=True)

In [4]:
# w2v['google']

In [142]:
v1 = w2v["rupee"] 
v2 = w2v["dollar"]

In [93]:
print(v1.shape)
print(v2.shape)

(300,)
(300,)


In [143]:
cosine_similarity([v1],[v2])

array([[0.62001854]], dtype=float32)

In [116]:
w2v.most_similar('disparity')

[('disparities', 0.8024708032608032),
 ('discrepancy', 0.7583654522895813),
 ('inequity', 0.7000349760055542),
 ('inequities', 0.6604803800582886),
 ('widening_gap', 0.6310858726501465),
 ('inequalities', 0.6251864433288574),
 ('imbalance', 0.6233763694763184),
 ('gap', 0.6184989213943481),
 ('racial_disparities', 0.6160184144973755),
 ('inequality', 0.6087830662727356)]

In [146]:
def find_odd_one_out(words):    

    word_vectors = [w2v[word] for word in words]
    mean_vector = np.mean(word_vectors, axis=0)
    print(mean_vector.shape)
    
    odd_one = None
    min_similarity = 1
    
    for word in words:
        sim = cosine_similarity( [w2v[word]], [mean_vector] )
        if sim < min_similarity:
            min_similarity = sim
            odd_one = word
    
        print("Similarity between mean vector and {} = {}".format(word, min_similarity))
            
    return odd_one

In [152]:
find_odd_one_out(["Apple", "Google", "Facebook", "Microsoft", "Zara"])

(300,)
Similarity between mean vector and Apple = [[0.75132143]]
Similarity between mean vector and Google = [[0.75132143]]
Similarity between mean vector and Facebook = [[0.6587838]]
Similarity between mean vector and Microsoft = [[0.6587838]]
Similarity between mean vector and Zara = [[0.44133237]]


'Zara'

In [153]:
len(w2v.vocab)

3000000

In [154]:
w2v["man"].shape

(300,)

In [60]:
# w2v.vocab.keys()

In [160]:
# Bad Implementation. Exhaustive Search
def find_analogy(words):
    # a:b :: c:?
    
    max_similarity = -1.01
    
    d = None
    
    vocabulary = w2v.vocab.keys()
    
    v_a, v_b, v_c = [w2v[w] for w in words]
    
    relation_1 = v_b - v_a
    
    # To find word d such that similarity(|v_b - v_a|, |v_d - v|c|) is maximum
    
    for word in vocabulary:
        if word in words:
            continue
        
        v_d = w2v[word]
        relation_2 = v_d - v_c
        sim = cosine_similarity([relation_1], [relation_2])
        
        if sim > max_similarity:
            max_similarity = sim
            d = word
            
    return d

In [162]:
from datetime import datetime
t0 = datetime.now()

print(find_analogy(["man", "woman", "king"]))

print("Time Taken:", datetime.now() - t0)

queen
Time Taken: 0:21:56.538407


In [164]:
# Using the built-in method
t0 = datetime.now()
print(w2v.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
print("Time Taken:", datetime.now() - t0)

[('queen', 0.7118192911148071)]
Time Taken: 0:00:01.558389


In [165]:
import re
f = open('../datasets/sherlock.txt')
text = f.read()
f.close()

data = text[3433:]

data = data.lower()
data = re.sub('[^A-Za-z.]+', ' ', data)
data = data.split('.')

for i in range(len(data)):
    data[i] = data[i].split()

In [166]:
from gensim.models import Word2Vec
model = Word2Vec(data, size=300, window=10, min_count=1)

In [167]:
model.wv.most_similar('baker')

[('street', 0.9737527370452881),
 ('station', 0.9403195977210999),
 ('hansom', 0.9368016719818115),
 ('oxford', 0.9342535138130188),
 ('gate', 0.9330577254295349),
 ('godolphin', 0.9291698932647705),
 ('strolled', 0.9271606206893921),
 ('farrington', 0.9251514077186584),
 ('breakfast', 0.9219212532043457),
 ('branching', 0.9175331592559814)]

In [168]:
model.wv.save_word2vec_format("sherlock_w2v.bin")