# GloVe Clustering

In [1]:
from lib.datasets import load_wikipedia_wordvecs
from lib.spectral_clustering import similarity_matrix, laplacian_matrix, spectral_clustering
import numpy as np
import matplotlib.pyplot as plt

In [2]:
words, vecs = load_wikipedia_wordvecs().values()

In [3]:
def similarity_matrix(data, s=1):
    n = len(data)
    scale = [np.sum(np.linalg.norm( data - x.reshape((1, -1)), axis=0)) for x in data ]
    similarity_matrix = np.zeros((n, n))

    for i in range(n):
        for j in range(n):
            similarity_matrix[i][j] = s * np.linalg.norm(data[i] - data[j]) / (scale[i]+scale[j])
    return similarity_matrix

In [4]:
codenames = "penguin,Germany,spy,battery,stadium,opera,shop,ambulance,brush,forest,Mexico,beat,fire,whip,\
switch,horse,band,deck,concert,horn,link,charge,row,line,lock".split(",")
indices = np.array([words.index(c) for c in codenames])


word_vecs = vecs[indices]

s = similarity_matrix(word_vecs)


In [5]:
k = 14
assns, (evals, evecs) = spectral_clustering(word_vecs, k=k, lform = "rw", metric="g", s=0.5, with_eigen=True)

In [6]:
evals

array([-1.66553831e-44, -6.29299062e-48, -4.77148844e-50, -1.31014512e-52,
       -3.58188440e-53, -4.81077410e-57, -1.79874631e-57,  2.46392021e-61,
        2.46392021e-61,  2.46392021e-61,  2.46392021e-61,  2.46392021e-61,
        2.46392021e-61,  2.46392021e-61])

In [7]:
assns

array([ 1, 12, 10,  2,  9,  4, 13,  0,  6,  7, 11, 10,  0,  6,  8, 12,  3,
        4,  3,  3,  5,  2,  8, 10,  8])

In [187]:
[[codenames[j] for j in range(len(codenames)) if assns[j] == i] for i in range(k)]

[['spy', 'beat', 'line'],
 ['opera'],
 ['stadium'],
 ['switch', 'lock'],
 ['ambulance', 'fire'],
 ['deck'],
 ['band', 'concert'],
 ['row'],
 ['Mexico'],
 ['shop'],
 ['horse'],
 ['penguin', 'Germany', 'forest', 'horn', 'link'],
 ['brush', 'whip'],
 ['battery', 'charge']]

In [188]:
def avg_vec(words, candidates, vocabulary):
    # words is a list of word strings
    # candidates is a Nx300 dataset of all candidate word vectors
    # vocabulary is a list of word strings, 1 per vector in candidate
    mean = np.mean([candidates[vocabulary.index(word)] for word in words], axis=0).reshape((1, -1))
    diff = np.linalg.norm(candidates - mean, axis=0)
    return vocabulary[np.argmin(diff)]

In [189]:
avg_vec(('fly', 'Moscow'), vecs, words)

'hold'