# GloVe word cluster

We are going to explore the word embeddings provided by GloVe.

Given some categories, such as:
- algebra
- music
- numbers
- science
- technology

For each category, we look up the words whose embeddings are closest to the category word. We then reduce the embedding space from 100 to 2 dimensions with t-SNE so that it can be visualized.

In [1]:
import pandas as pd 
from plotnine import ggplot, aes, geom_text, labs
from sklearn.manifold import TSNE
import torchtext.vocab as vocab
import torch

In [2]:
# Load the GloVe '6B' vectors through torchtext.
glove_dim = 100  # width of every embedding vector requested below
glove = vocab.GloVe(dim = glove_dim, name = '6B')


In [3]:
# get closest words from word input
def get_embedding_vector(word):
    """Return the GloVe vector stored for ``word``.

    The word is resolved to a row number through ``glove.stoi`` and that row
    of ``glove.vectors`` is returned. A word that is missing from
    ``glove.stoi`` raises whatever that mapping raises for an unknown key
    (presumably ``KeyError`` -- verify against torchtext).
    """
    return glove.vectors[glove.stoi[word]]

def get_closest_words_from_word(word, max_n = 5):
    """Return the vocabulary words whose GloVe vectors are nearest to ``word``.

    Nearness is the Euclidean distance between embedding vectors. The query
    word itself is at distance zero, so it normally comes first in the result.

    Args:
        word: Query word; resolved through ``get_embedding_vector``.
        max_n: Upper bound on the number of words returned. It is applied
            with the same semantics as the slice ``[:max_n]`` over the whole
            vocabulary ordered by distance.

    Returns:
        A list of words ordered from nearest to farthest.
    """
    word_emb = get_embedding_vector(word)

    # One vectorised pass over the whole embedding matrix instead of a
    # Python-level torch.dist call (plus a dict lookup) per vocabulary entry.
    # NOTE(review): assumes row i of glove.vectors belongs to glove.itos[i]
    # -- confirm against torchtext.
    distances = torch.linalg.norm(glove.vectors - word_emb, dim = 1)

    # Number of items the slice [:max_n] would keep (handles None/negatives).
    n_closest = len(range(distances.numel())[:max_n])

    # largest=False picks the smallest distances; topk returns them sorted.
    closest_idx = torch.topk(distances, n_closest, largest = False).indices
    return [glove.itos[idx] for idx in closest_idx.tolist()]

# Quick check of the helper on a single word.
get_closest_words_from_word('football')

['football', 'soccer', 'basketball', 'league', 'rugby']

In [5]:
# `words` is not read in the visible cells; kept in case a later cell uses it.
words = []
categories = ['numbers','algebra','music','science','technology']

# Collect one (category, word) row per neighbour and build the frame once.
# Growing a DataFrame with pd.concat inside the loop re-copies every earlier
# row on each pass, and concatenating onto an empty seed frame may emit a
# FutureWarning on recent pandas versions.
word_rows = []
for category in categories:
    closest_words = get_closest_words_from_word(word = category, max_n = 20)
    word_rows.extend((category, closest_word) for closest_word in closest_words)

df_word_cloud = pd.DataFrame(word_rows, columns = ['category', 'word'])

In [6]:
# Show the assembled table of categories and their closest words.
df_word_cloud

Unnamed: 0,category,word
0,numbers,numbers
1,numbers,number
2,numbers,though
3,numbers,instance
4,numbers,fact
...,...,...
95,technology,applications
96,technology,innovations
97,technology,developing
98,technology,research


In [7]:
# Gather the GloVe vector of every collected word into one matrix:
# one row per row of df_word_cloud, one column per embedding dimension.

n_rows = len(df_word_cloud)
n_cols = glove_dim
x = torch.empty((n_rows, n_cols))
for row, token in enumerate(df_word_cloud['word']):
    x[row] = get_embedding_vector(token)
    

In [8]:
# Project the embedding matrix down to two dimensions for plotting.
# random_state is pinned so that re-running the cell reproduces the same
# layout; without it the result can differ from run to run.
tsne = TSNE(n_components = 2, random_state = 0)
x_tsne = tsne.fit_transform(x.cpu().numpy())

: 

In [None]:
# Attach the two t-SNE coordinates to the word table.
df_word_cloud['x'], df_word_cloud['y'] = x_tsne[:, 0], x_tsne[:, 1]

# Draw a random subset of 25 words as text labels, coloured by category.
plot_sample = df_word_cloud.sample(25)
(
    ggplot(data = plot_sample)
    + aes(x = 'x', y = 'y', label = 'word', color = 'category')
    + geom_text()
    + labs(title = 'GloVe Word Embeddings and Categories')
)