# Context-free Word Embeddings

Before transformers, the state of the art in context-free embeddings was [GloVe](https://nlp.stanford.edu/projects/glove/).  You can download various sizes of embeddings from that page.

You can download the code [here](https://github.com/stanfordnlp/GloVe), but it requires compilation.  There are some demo Python scripts that come with it.

But we will use the embeddings directly, following [this blog](https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db).



In [1]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [6]:
# Location of the GloVe vectors text file opened by the loading cell.
glove_path = "/Users/dkl0pjh/ML/tools/GloVe/glove.840B.300d.txt"
# Maps each word to its embedding vector; filled in by the loading cell.
embeddings_dict = dict()

In [None]:
# IPython shell escape: long listing of the embeddings file with human-readable size.
!ls -lh {glove_path}

In [None]:
# Fill embeddings_dict from the GloVe text file: the first whitespace-separated
# field of each line is the word, the remaining fields are the vector.
with open(glove_path, 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: there is no word to index.
            continue
        word = values[0]
        try:
            vector = np.asarray(values[1:], "float32")
        except ValueError:
            # A field after the first is not numeric (presumably the token
            # itself contains whitespace). Skip the line instead of storing
            # the previous word's vector under this word.
            continue
        embeddings_dict[word] = vector

In [None]:
# Report and then remove every entry whose vector does not have exactly
# 300 components. Removal happens in a second pass so the dictionary is
# not modified while it is being iterated.
bad_keys = []
for key in embeddings_dict:
    if embeddings_dict[key].shape[0] == 300:
        continue
    print(key)
    bad_keys.append(key)
for key in bad_keys:
    embeddings_dict.pop(key)

In [None]:
def find_closest_embeddings(embedding):
    """Return all words of ``embeddings_dict`` ordered by Euclidean distance.

    The word whose vector is nearest to ``embedding`` comes first.
    """
    def distance_to_query(word):
        return spatial.distance.euclidean(embeddings_dict[word], embedding)

    return sorted(embeddings_dict, key=distance_to_query)

In [None]:
def find_closest_embeddings2(embedding):
    """Return all words of ``embeddings_dict`` ordered by cosine distance.

    The word whose vector has the smallest cosine distance to ``embedding``
    comes first.
    """
    def cosine_to_query(word):
        return spatial.distance.cosine(embeddings_dict[word], embedding)

    return sorted(embeddings_dict, key=cosine_to_query)

In [None]:
# First six words in the Euclidean ranking for the "king" vector (index 0 included).
find_closest_embeddings(embeddings_dict["king"])[0:6]

In [None]:
# Words at positions 1-5 of the cosine ranking for the "king" vector (position 0 is skipped).
find_closest_embeddings2(embeddings_dict["king"])[1:6]

In [7]:
# Vector arithmetic for the analogy "France is to Paris as England is to ?".
target = embeddings_dict["Paris"] - embeddings_dict["France"] + embeddings_dict["England"]
target

KeyError: 'Paris'

In [None]:
# Five words nearest to the analogy vector by Euclidean distance.
find_closest_embeddings(target)[:5]

In [None]:
# Five words nearest to the analogy vector by cosine distance.
find_closest_embeddings2(target)[:5]

In [None]:
def make_analogy(words, cosine=False):
    """Print the completion of "words[0] is to words[1] as words[2] is to ?".

    The target vector is ``words[1] - words[0] + words[2]``; the answer is the
    closest vocabulary word that is not one of the three inputs.

    Args:
        words: Sequence of three keys of ``embeddings_dict``.
        cosine: When true, rank candidates with ``find_closest_embeddings2``
            (cosine distance); otherwise with ``find_closest_embeddings``
            (Euclidean distance).

    Raises:
        KeyError: If one of the three words is not in ``embeddings_dict``.
        ValueError: If the ranking contains no word other than the inputs.
    """
    target = embeddings_dict[words[1]] - embeddings_dict[words[0]] + embeddings_dict[words[2]]
    rank = find_closest_embeddings2 if cosine else find_closest_embeddings
    # The inputs themselves usually rank near the top, so skip them.
    analogy = next((w for w in rank(target) if w not in words), None)
    if analogy is None:
        # Previously this case fell through to an unbound local variable.
        raise ValueError("no candidate word outside the input words")
    print(f"{words[0]} is to {words[1]} as {words[2]} is to {analogy}")

In [None]:
# "Paris is to France as London is to ?" with the default Euclidean ranking.
make_analogy(['Paris', 'France', 'London'])

In [None]:
# "man is to woman as uncle is to ?" with the default Euclidean ranking.
make_analogy(['man', 'woman', 'uncle'])

In [None]:
# "man is to woman as doctor is to ?" with the default Euclidean ranking.
make_analogy(['man', 'woman', 'doctor'])

In [None]:
# "head is to body as roof is to ?" with the default Euclidean ranking.
make_analogy(['head', 'body', 'roof'])

In [None]:
# Same "head : body :: roof : ?" query, ranked by cosine distance (cosine=True).
make_analogy(['head', 'body', 'roof'], True)

In [None]:
# "body is to head as house is to ?" with the default Euclidean ranking.
make_analogy(['body', 'head', 'house'])

In [None]:
# Same "body : head :: house : ?" query, ranked by cosine distance (cosine=True).
make_analogy(['body', 'head', 'house'], True)

In [None]:
# "cold is to ice as heat is to ?" with the default Euclidean ranking.
make_analogy(['cold', 'ice', 'heat'])

## Plotting with t-SNE and PCA

In [3]:
# Words whose embeddings are looked up and plotted in the cells below.
words = ['man', 'woman', 'king', 'queen', 'aunt', 'uncle', 'ice', 'water', 'steam']

In [8]:
# One embedding per entry of `words`, in the same order; raises KeyError if a
# word is not in embeddings_dict (e.g. when the loading cell has not been run).
vectors = [embeddings_dict[word] for word in words]

KeyError: 'man'

In [None]:
# Display the list of looked-up vectors as the cell output.
vectors

In [None]:
# Project the selected word vectors to two dimensions with t-SNE and draw a
# labelled scatter plot of the result.
vectors = [embeddings_dict[word] for word in words]
tsne = TSNE(n_components=2, perplexity=3)
Y = tsne.fit_transform(vectors)

xs, ys = Y[:, 0], Y[:, 1]
plt.scatter(xs, ys)
for label, x, y in zip(words, xs, ys):
    # Place each word's text exactly on its point (zero offset).
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords="offset points")
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Reduce the word vectors to two principal components.
pca = PCA(n_components=2)
Y = pca.fit_transform(vectors)

In [None]:
# Scatter the 2-D projection held in Y and label each point with its word.
xs, ys = Y[:, 0], Y[:, 1]
plt.scatter(xs, ys)
for label, x, y in zip(words, xs, ys):
    # Place each word's text exactly on its point (zero offset).
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords="offset points")
plt.show()