### Gensim

In [1]:
!pip install gensim

Collecting gensim
  Downloading gensim-3.8.3-cp38-cp38-macosx_10_9_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 2.6 MB/s eta 0:00:01     |███████████████                 | 11.4 MB 348 kB/s eta 0:00:37     |█████████████████████████████▏  | 22.1 MB 1.5 MB/s eta 0:00:02
Collecting smart-open>=1.8.1
  Downloading smart_open-4.1.2-py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 2.7 MB/s eta 0:00:01
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.8.3 smart-open-4.1.2


In [12]:
import numpy as  np
import gensim
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Pretrained word2vec embeddings, read from a local file in binary word2vec format.
WORD2VEC_PATH = "./GoogleNews-vectors-negative300.bin"
word_vectors = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [4]:
# Fetch the embeddings for the three example words in one pass.
v_apple, v_mango, v_india = (
    word_vectors[word] for word in ("apple", "mango", "india")
)

In [5]:
v_apple.shape

(300,)

In [6]:
# Cosine similarity between the 'apple' and 'mango' embeddings (result is a 1x1 array).
cosine_similarity([v_apple], [v_mango])

array([[0.57518554]], dtype=float32)

In [7]:
# Cosine similarity between the 'apple' and 'india' embeddings (result is a 1x1 array).
cosine_similarity([v_apple], [v_india])

array([[0.17158596]], dtype=float32)

### Odd One Out

- Example: given `india, mango, banana, orange`, find the word that does not belong with the others.

In [22]:
def odd_one_out(words):
    """Return the word whose embedding is least similar to the group average.

    Every word is looked up in the notebook-level ``word_vectors``, the
    vectors are averaged, and the word with the lowest cosine similarity to
    that average is returned. If several words tie, the earliest one wins.

    Parameters
    ----------
    words : iterable of str
        Words to compare, e.g. ``['india', 'mango', 'banana', 'orange']``.

    Returns
    -------
    str or None
        The odd word, or ``None`` when ``words`` is empty.

    Notes
    -----
    A word missing from ``word_vectors`` is not handled here; the lookup
    error propagates to the caller.
    """
    # Materialise once so one-shot iterables (e.g. generators) can be both
    # embedded and indexed afterwards.
    words = list(words)
    if not words:
        return None

    # One lookup per word; rows line up with ``words``.
    vectors = np.array([word_vectors[w] for w in words])
    avg_vec = vectors.mean(axis=0)

    # Shape (n_words, 1) -> flat array of similarities to the average.
    sims = cosine_similarity(vectors, [avg_vec]).ravel()

    # argmin returns the first minimum, matching a strict "<" scan.
    return words[int(np.argmin(sims))]

In [23]:
odd_one_out(['india', 'mango', 'banana', 'orange'])

(300,)


'india'

In [19]:
odd_one_out(['india', 'pakistan', 'america', 'cricket'])

'cricket'

In [21]:
odd_one_out(['cricket', 'hockey', 'football', 'chess'])

'chess'

## Word Analogies Task

In [28]:
# Vocabulary size of the loaded model.
# Gensim 4 replaced `KeyedVectors.vocab` with `key_to_index`; fall back to
# `.vocab` when running on Gensim 3.x.
try:
    vocab_index = word_vectors.key_to_index
except AttributeError:
    vocab_index = word_vectors.vocab

words = vocab_index.keys()
len(words)

3000000

In [35]:
def predict_word(a, b, c, candidates=None):
    """Solve the analogy "a is to b as c is to ?" over a set of candidates.

    The answer ``d`` is the candidate that maximises the cosine similarity
    between the offsets ``(b - a)`` and ``(d - c)``, using embeddings from the
    notebook-level ``word_vectors``. If several candidates tie, the earliest
    one wins.

    Parameters
    ----------
    a, b, c : str
        Analogy terms. They are lower-cased before lookup.
    candidates : iterable of str, optional
        Words to choose the answer from. They are looked up as given (not
        lower-cased). Defaults to a small built-in demo list.

    Returns
    -------
    str or None
        The best-matching candidate, or ``None`` if ``candidates`` is empty.

    Notes
    -----
    The input words themselves are not excluded from the candidates.
    Words missing from ``word_vectors`` are not handled here; the lookup
    error propagates to the caller.
    """
    a, b, c = a.lower(), b.lower(), c.lower()

    if candidates is None:
        candidates = ["cow", "boy", "spanish", "princess", "queen",
                      "girl", "man", "india", "tokyo"]
    else:
        # Materialise so one-shot iterables can be indexed afterwards.
        candidates = list(candidates)

    wa, wb, wc = word_vectors[a], word_vectors[b], word_vectors[c]

    if not candidates:
        return None

    # Row i holds (d_i - c) for candidate i.
    offsets = np.array([word_vectors[w] for w in candidates]) - wc

    # Shape (1, n_candidates) -> flat array of similarities to (b - a).
    sims = cosine_similarity([wb - wa], offsets).ravel()

    # argmax returns the first maximum, matching a strict ">" scan.
    return candidates[int(np.argmax(sims))]

In [36]:
predict_word("Man", "woman", "prince")

'princess'

In [38]:
predict_word("italy", "italian", "spain")

'spanish'

### Built-in `most_similar` Methods

In [40]:
# From the listed candidates, pick the one most similar to 'mango'.
word_vectors.most_similar_to_given("mango", ["apple", "india", "pen", "football"])

'apple'

In [None]:
# Analogy query over the model's vocabulary: 'woman' and 'king' contribute
# positively, 'man' negatively; keep only the single best match.
word_vectors.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)