In [1]:
import torch
import torchtext.vocab

In [2]:
# Load the GloVe "6B" word vectors at 100 dimensions through torchtext and
# report how many tokens the resulting vocabulary holds.
# NOTE(review): presumably this downloads and caches the vector files on first
# run — confirm against the torchtext documentation before relying on it offline.
glove = torchtext.vocab.GloVe(name="6B", dim=100)
print(f"There are {len(glove.itos)} words in the vocabulary.")

There are 400000 words in the vocabulary.


In [3]:
# Embedding matrix shape: one row per vocabulary entry, one column per dimension.
glove.vectors.shape

torch.Size([400000, 100])

In [4]:
# `itos` is the index -> token list; peek at its first 15 entries.
glove.itos[:15]

['the',
 ',',
 '.',
 'of',
 'to',
 'and',
 'in',
 'a',
 '"',
 "'s",
 'for',
 '-',
 'that',
 'on',
 'is']

In [5]:
# `stoi` is the reverse mapping (token -> index); "the" is the first `itos` entry.
glove.stoi["the"]

0

In [6]:
# Index lookup for another token.
glove.stoi["dazzle"]

36623

In [7]:
# Index lookup for another token.
glove.stoi["shenanigans"]

43165

In [8]:
def get_vector(embeddings, word):
    """Look up the embedding row for ``word``.

    Args:
        embeddings: object exposing ``stoi`` (token -> row index mapping) and
            ``vectors`` (indexable by that row index).
        word: token to look up.

    Returns:
        ``embeddings.vectors[embeddings.stoi[word]]``.

    Raises:
        AssertionError: if ``word`` is not a key of ``embeddings.stoi``.
    """
    index_of = embeddings.stoi
    assert word in index_of, f"*{word}* is not in the vocab!"

    row = index_of[word]
    return embeddings.vectors[row]

In [9]:
# Show the raw embedding tensor stored for "paper".
get_vector(glove, "paper")

tensor([-0.8503,  0.3336, -0.6589, -0.4987,  0.3659, -0.1925,  0.2566, -0.0534,
         0.3147,  0.2443,  0.2934, -0.4492,  0.1517,  0.3931, -0.3179,  0.0605,
         0.8177, -0.3885,  0.7676, -1.1041, -0.1544,  0.3165, -0.3724, -0.1148,
         0.5163, -0.3929,  0.1630, -0.2532, -0.5098,  0.1520,  0.2781,  0.5252,
        -0.3882, -0.3472, -0.6182,  0.1702,  0.1225, -0.2419, -0.3888, -0.5318,
        -0.4699, -0.7050, -0.6213, -0.3869, -0.8564, -0.4100, -0.4749, -0.2108,
        -0.8134, -0.5240,  0.4989,  0.3791,  0.5543,  1.1230, -0.4212, -1.5674,
        -0.5689,  0.4082,  1.7949,  0.1686, -0.0029,  0.2879, -0.9009, -0.0942,
         0.7999, -0.3910,  0.7629,  0.7131,  0.1319, -0.4076, -0.1869,  0.8956,
         0.4687, -0.0029,  0.0253,  1.0084,  0.1714,  0.5974, -1.1003,  0.4931,
         0.4178,  0.1728, -0.4947,  0.0878, -0.9669, -1.0920,  0.3390, -0.5129,
         0.2464,  0.2714,  0.2421, -0.2171,  0.5504,  0.0082, -0.4557,  0.1353,
        -0.0431, -0.4141,  0.7005,  0.18

In [10]:
def closest(embeddings, vector, n=6):
    """Return the ``n`` vocabulary entries nearest to ``vector``.

    Distance is the Euclidean (L2) distance between ``vector`` and each row of
    ``embeddings.vectors``, i.e. the same metric as ``torch.dist`` with its
    default ``p=2``.

    Args:
        embeddings: object exposing ``itos`` (list of tokens) and ``vectors``
            (2-D tensor with one row per token). Assumes row ``i`` of
            ``vectors`` belongs to ``itos[i]``, i.e. ``stoi[itos[i]] == i``
            — TODO confirm for non-torchtext embedding objects.
        vector: query tensor, broadcastable against the rows of
            ``embeddings.vectors``.
        n: number of neighbours to return (applied as a slice, so a value
            larger than the vocabulary simply returns every entry).

    Returns:
        List of ``(token, distance)`` tuples in ascending distance order,
        where ``distance`` is a 0-dim tensor.
    """
    # Compute every distance in one broadcasted pass instead of calling
    # torch.dist once per vocabulary entry from a Python loop.
    deltas = embeddings.vectors - vector
    distances = deltas.pow(2).sum(dim=1).sqrt()

    # A stable sort keeps tied entries in vocabulary order, matching what
    # Python's stable sorted() produced for the per-word list.
    _, order = torch.sort(distances, dim=0, stable=True)

    return [(embeddings.itos[i], distances[i]) for i in order[:n].tolist()]

In [11]:
# Nearest neighbours of "paper"; the query word itself comes back first at distance 0.
closest(glove, get_vector(glove, "paper"))

[('paper', tensor(0.)),
 ('papers', tensor(3.8442)),
 ('printed', tensor(4.1970)),
 ('print', tensor(4.2666)),
 ('sheet', tensor(4.3835)),
 ('printing', tensor(4.4179))]

In [14]:
# Same nearest-neighbour query for "shenanigans".
closest(glove, get_vector(glove, "shenanigans"))

[('shenanigans', tensor(0.)),
 ('chicanery', tensor(2.3785)),
 ('hijinks', tensor(2.6764)),
 ('escapades', tensor(2.7821)),
 ('machinations', tensor(2.8699)),
 ('gamesmanship', tensor(2.9044))]

In [15]:
def print_tuples(tuples):
    """Print one ``(distance) word`` line per entry of ``tuples``.

    Args:
        tuples: iterable of indexable pairs where item 0 is the label and
            item 1 is a number; the number is shown with four decimals.
    """
    for entry in tuples:
        score = entry[1]
        label = entry[0]
        print("(%.4f) %s" % (score, label))

In [16]:
# Nearest neighbours of "stupendous", printed one "(distance) word" line each.
print_tuples(closest(glove, get_vector(glove, "stupendous")))

(0.0000) stupendous
(2.5795) marvellous
(2.7539) frightful
(2.8506) stupefying
(2.8561) awe-inspiring
(2.9179) mind-blowing


In [17]:
# Nearest neighbours of "amen", printed one "(distance) word" line each.
print_tuples(closest(glove, get_vector(glove, "amen")))

(0.0000) amen
(4.2017) hurrah
(4.3753) ahrts
(4.4097) shuhn
(4.4148) dwayk
(4.4528) fuehrer


In [18]:
def analogy(embeddings, w1, w2, w3, n=6):
    """Answer "w1 is to w2 as w3 is to ?" with vector arithmetic.

    Prints a header line describing the query, then searches for the words
    nearest to ``vec(w2) - vec(w1) + vec(w3)``.

    Args:
        embeddings: embedding object passed through to ``get_vector`` and
            ``closest``.
        w1, w2, w3: the three query words.
        n: maximum number of candidates to return.

    Returns:
        Up to ``n`` ``(word, distance)`` tuples as produced by ``closest``,
        with the three query words removed.
    """
    print("\n[{} : {} :: {} : ?]".format(w1, w2, w3))

    # Look the words up in the same order as the arithmetic uses them.
    vec_w2 = get_vector(embeddings, w2)
    vec_w1 = get_vector(embeddings, w1)
    vec_w3 = get_vector(embeddings, w3)
    target = vec_w2 - vec_w1 + vec_w3

    # Ask for three extra candidates so that n results remain even if all
    # three query words appear among the nearest neighbours.
    candidates = closest(embeddings, target, n + 3)

    query_words = [w1, w2, w3]
    kept = []
    for candidate in candidates:
        if candidate[0] in query_words:
            continue
        kept.append(candidate)

    return kept[:n]

In [19]:
# Analogy query "moon : night :: sun : ?", i.e. nearest words to night - moon + sun.
print_tuples(analogy(glove, "moon", "night", "sun"))


[moon : night :: sun : ?]
(5.7069) morning
(5.7276) afternoon
(5.8023) evening
(6.1410) hours
(6.2797) saturday
(6.3056) sunday


In [20]:
# Analogy query "fly : bird :: swim : ?", i.e. nearest words to bird - fly + swim.
print_tuples(analogy(glove, "fly", "bird", "swim"))


[fly : bird :: swim : ?]
(5.9754) swimming
(6.2409) shark
(6.4822) dolphin
(6.5421) whale
(6.6276) cat
(6.6457) gorilla


In [26]:
# Analogy query "earth : moon :: sun : ?", i.e. nearest words to moon - earth + sun.
# NOTE(review): the recorded results do not look astronomy-related (they read
# like personal names) — possibly "sun" is used mostly as a name in the
# training text; worth a prose caveat next to this cell. Unverified.
print_tuples(analogy(glove, "earth", "moon", "sun"))


[earth : moon :: sun : ?]
(6.2294) lee
(6.4125) kang
(6.4644) tan
(6.4757) yang
(6.4853) lin
(6.5220) chong
