# Neural Word Embeddings

## With gensim and word2vec

In [1]:
import os
if not os.path.exists('GoogleNews-vectors-negative300.bin'):
    os.system('wget -nc https://lazyprogrammer.me/course_files/nlp/GoogleNews-vectors-negative300.bin.gz')
    os.system('gunzip GoogleNews-vectors-negative300.bin.gz')

In [2]:
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', 
    binary=True,
)  # type: KeyedVectors

In [3]:
# a : b :: ? : d
complete_analogy = lambda a, b, d: vectors.most_similar([a, d], [b])[0][0]
analogy = lambda a, b, d: '%s : %s :: %s : %s' % (a, b, complete_analogy(a, b, d), d)

In [4]:
analogy('king', 'man', 'woman')

'king : man :: queen : woman'

In [5]:
analogy('france', 'french', 'english')

'france : french :: england : english'

In [6]:
analogy('december', 'november', 'july')

'december : november :: february : july'

In [7]:
analogy('einstein', 'scientist', 'painter')

'einstein : scientist :: jude : painter'

In [8]:
analogy('man', 'woman', 'sister')

'man : woman :: brother : sister'

In [9]:
analogy('man', 'woman', 'aunt')

'man : woman :: uncle : aunt'

In [10]:
analogy('man', 'woman', 'mom')

'man : woman :: dad : mom'

In [11]:
analogy('man', 'woman', 'actress')

'man : woman :: actor : actress'

In [12]:
analogy('nephew', 'niece', 'aunt')

'nephew : niece :: uncle : aunt'

In [13]:
# similar words
similar = lambda word: [x[0] for x in vectors.most_similar([word])]

In [14]:
similar('king')

['kings',
 'queen',
 'monarch',
 'crown_prince',
 'prince',
 'sultan',
 'ruler',
 'princes',
 'Prince_Paras',
 'throne']

In [15]:
similar('mom')

['Mom',
 'dad',
 'mother',
 'grandma',
 'mommy',
 'daddy',
 'moms',
 'Dad',
 'daughter',
 'mama']

In [16]:
similar('you')

['You',
 'your',
 'yourself',
 'I',
 'we',
 "Don'tI",
 'somebody',
 'yours',
 'ifyou',
 'youre']

In [17]:
similar('nothing')

['anything',
 'Nothing',
 'something',
 'everything',
 'nobody',
 'no',
 'anybody',
 'NOTHING',
 'never',
 'certainly']

In [18]:
similar('jesus')

['jesus_christ',
 'christ',
 'cuz_u',
 'jesse',
 'jessie',
 'lmfao',
 'hahah',
 'angelina',
 'hitler',
 'michele']

In [19]:
similar('einstein')

['nikki',
 'lmfao',
 'albert',
 'armstrong',
 'joan',
 'becky',
 'mcmahon',
 'conrad',
 'lori',
 'haley']

In [20]:
similar('japan')

['japanese',
 'tokyo',
 'america',
 'europe',
 'germany',
 'chinese',
 'india',
 'hawaii',
 'usa',
 'korea']

In [21]:
similar('february')

['january',
 'april',
 'september',
 'december',
 'july',
 'october',
 'november',
 'june',
 'feb',
 'norway']

## With GloVe

In [22]:
import numpy as np
import zipfile

In [23]:
if not os.path.exists('glove.6B.zip'):
    os.system('wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip')

In [24]:
zf = zipfile.ZipFile('glove.6B.zip')
vecfile = zf.open('glove.6B.300d.txt')
vectors = {}  # type: dict[str, np.ndarray]
for line in vecfile:
    parts = line.strip().split()
    vectors[parts[0].decode()] = np.fromiter(map(float, parts[1:]), np.float64)
vecfile.close()
zf.close()

In [25]:
len(vectors)

400000

In [26]:
def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
    """
    Calculate the cosine distance between vectors A and B.
    :param a: First vector.
    :param b: Second vector.
    :returns: The cosine distance between the two vectors.
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [27]:
def most_similar(
    positive: 'list[str]' = [], negative: 'list[str]' = [], n: int = 10
) -> 'list[tuple[str, float]]':
    """
    Find the most similar word in the word embeddings based on vector arithmetic.
    :param positive: Words that contribute positively.
    :param negative: Words that contribute negatively.
    :param n: The number of words to return. Defaults to 10.
    :returns: A list of words that match, along with the similarity score.
    """
    vector = (
        sum(map(vectors.__getitem__, positive)) - 
        sum(map(vectors.__getitem__, negative))
    )
    return sorted(
        map(lambda x: (x, cosine_distance(vectors[x], vector)), vectors),
        key=lambda x: x[1],
        reverse=True
    )[1:n+1]

In [28]:
most_similar(['japan'])

[('japanese', 0.7691404009675367),
 ('tokyo', 0.6798692510962566),
 ('korea', 0.6341603444298923),
 ('china', 0.5610884241078687),
 ('asia', 0.5156282096770336),
 ('osaka', 0.5043541522145066),
 ('hashimoto', 0.5004937296110379),
 ('taiwan', 0.49872112407565194),
 ('philippines', 0.49657005762400086),
 ('thailand', 0.4923372993147826)]

In [31]:
# a : b :: ? : d
complete_analogy = lambda a, b, d: most_similar([a, d], [b])[0][0]
analogy = lambda a, b, d: '%s : %s :: %s : %s' % (a, b, complete_analogy(a, b, d), d)
similar = lambda word: [x[0] for x in most_similar([word])]

In [32]:
analogy('king', 'man', 'woman')

'king : man :: queen : woman'

In [33]:
analogy('france', 'french', 'english')

'france : french :: england : english'

In [34]:
analogy('december', 'november', 'july')

'december : november :: december : july'

In [35]:
analogy('einstein', 'scientist', 'painter')

'einstein : scientist :: picasso : painter'

In [36]:
analogy('man', 'woman', 'sister')

'man : woman :: brother : sister'

In [37]:
analogy('man', 'woman', 'aunt')

'man : woman :: uncle : aunt'

In [38]:
analogy('man', 'woman', 'mom')

'man : woman :: mom : mom'

In [39]:
analogy('man', 'woman', 'actress')

'man : woman :: actress : actress'

In [40]:
analogy('nephew', 'niece', 'aunt')

'nephew : niece :: aunt : aunt'

In [41]:
similar('king')

['queen',
 'prince',
 'monarch',
 'kingdom',
 'throne',
 'ii',
 'iii',
 'crown',
 'reign',
 'kings']

In [42]:
similar('mom')

['dad',
 'mother',
 'grandmother',
 'kids',
 'moms',
 'aunt',
 'grandma',
 'my',
 'parents',
 'girl']

In [43]:
similar('you')

["'ll", "n't", '?', 'know', 'i', 'do', 'want', 'me', 'really', 'think']

In [44]:
similar('nothing')

['anything',
 'something',
 'else',
 "n't",
 'what',
 'certainly',
 'thing',
 'really',
 'know',
 'nobody']

In [45]:
similar('jesus')

['christ',
 'crucifixion',
 'god',
 'resurrection',
 'crucified',
 'disciples',
 'latter-day',
 'holy',
 'apostles',
 'baptism']

In [46]:
similar('einstein')

['relativity',
 'bohr',
 'physicists',
 'heisenberg',
 'sigmund',
 'freud',
 'equations',
 'theory',
 'physics',
 'physicist']

In [47]:
similar('japan')

['japanese',
 'tokyo',
 'korea',
 'china',
 'asia',
 'osaka',
 'hashimoto',
 'taiwan',
 'philippines',
 'thailand']

In [48]:
similar('february')

['october',
 'december',
 'january',
 'november',
 'april',
 'august',
 'september',
 'june',
 'july',
 'march']