# Word Embeddings

In [1]:
import pandas as pd
import numpy as np

from glove import Glove
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Path to the pre-trained GloVe vectors file that load_stanford reads
# (machine-specific absolute path; adjust for your environment).
glove_vectors_path = '/mnt/ml-team/minerva/pretrained/glove.twitter.27B.50d.txt'
embedding_model = Glove.load_stanford(glove_vectors_path)

In [3]:
# Inspect the dimensions of the embedding matrix
# (presumably vocabulary size x embedding dimension -- confirm).
embedding_model.word_vectors.shape

(1193514, 50)

In [4]:
# Look up the integer the model's dictionary maps 'apple' to; the similarity()
# helper further down uses such values as row indices into word_vectors.
embedding_model.dictionary['apple']

1881

In [5]:
# Same dictionary lookup for 'tree'.
embedding_model.dictionary['tree']

3341

In [6]:
# Display the raw embedding stored in row 12 of the matrix
# (row 12 is not the index of either word looked up above).
embedding_model.word_vectors[12]

array([  5.85600000e-01,  -3.69640000e-01,   6.34800000e-02,
         8.16750000e-01,   1.80250000e-01,   1.57370000e-01,
         4.56830000e-01,   7.10000000e-01,   2.59260000e-02,
         1.65250000e+00,   5.22630000e-01,  -7.22200000e-04,
        -2.54560000e+00,   1.11840000e-01,  -1.91340000e-01,
        -1.11230000e-01,  -2.12640000e-01,   2.00260000e-01,
         2.31950000e-01,  -6.98620000e-01,   1.30720000e-01,
         2.89150000e-01,  -8.01590000e-01,  -1.79840000e-01,
         1.03380000e+00,  -1.11680000e+00,  -2.15190000e-01,
        -7.47140000e-02,   7.92850000e-01,  -1.23780000e-01,
        -5.74510000e-01,   4.91380000e-01,  -1.30870000e+00,
         6.36270000e-01,   3.26560000e-01,   5.14370000e-01,
        -2.98680000e-01,  -4.11720000e-02,  -3.52770000e-02,
        -7.79260000e-01,  -2.21040000e+00,  -2.89090000e-01,
        -8.62570000e-02,   2.36370000e-01,   2.78310000e-01,
         2.09150000e-02,  -2.14610000e-01,   5.66560000e-01,
        -3.64040000e-01,

In [7]:
def similarity(glove_instance, word1, word2):
    """Return the cosine similarity between the embeddings of two words.

    Parameters
    ----------
    glove_instance : object
        Model exposing ``dictionary`` (word -> row index) and
        ``word_vectors`` (indexable by that row index, each row supporting
        ``reshape``).
    word1, word2 : str
        The words to compare. Both must be keys of
        ``glove_instance.dictionary``.

    Returns
    -------
    scalar
        The single entry of the 1x1 matrix returned by
        ``cosine_similarity(vec1, vec2)``.

    Raises
    ------
    KeyError
        If either word is not present in ``glove_instance.dictionary``.
    """
    vocabulary = glove_instance.dictionary
    # Check both words up front so the error names every missing word instead
    # of surfacing as a bare KeyError from the first failed lookup.
    missing = [w for w in (word1, word2) if w not in vocabulary]
    if missing:
        raise KeyError(
            'word(s) not in the embedding vocabulary: {}'.format(
                ', '.join(repr(w) for w in missing)
            )
        )
    # cosine_similarity expects 2-D inputs, hence the (1, n_features) reshape.
    vec1, vec2 = (
        glove_instance.word_vectors[vocabulary[w]].reshape(1, -1)
        for w in (word1, word2)
    )
    return cosine_similarity(vec1, vec2)[0][0]

# Example: cosine similarity between the vectors for 'orange' and 'fire'.
similarity(embedding_model, 'orange', 'fire')

0.64002566245727655

In [8]:
# Nearest neighbours of 'war' in the embedding space, as (word, score) pairs.
# NOTE(review): the recorded output lists 19 entries for number=20, so the
# count appears to include the query word itself -- confirm against the
# glove library's most_similar implementation.
embedding_model.most_similar('war', 20)

[('battle', 0.75805667486763095),
 ('wars', 0.74856592737144523),
 ('hunger', 0.74185530217795237),
 ('death', 0.74021377220782814),
 ('also', 0.73600199881899764),
 ('land', 0.72875225468552862),
 ('of', 0.72164431730084133),
 ('revolution', 0.71980641113138932),
 ('an', 0.71426458082296751),
 ('laden', 0.70881477615975541),
 ('killing', 0.70314928262776055),
 ('man', 0.70253268138253688),
 ('the', 0.69718906040429651),
 ('halt', 0.69696662375956842),
 ('will', 0.69588754113263984),
 ('’s', 0.69125647572506488),
 ('fallen', 0.68941965526320736),
 ('attack', 0.68937456389049157),
 ('chaos', 0.68697709467212231)]

In [9]:
# Load the frequency list (one word per line). The file is only read, so open
# it read-only; 'r+' would needlessly require write permission.
with open('twitter_sentiment/frequency_list.txt', 'r') as f:
    most_frequent = [line.strip() for line in f.read().lower().split('\n')]
# Drop blank entries (e.g. the empty string left by a trailing newline) so an
# empty "word" can never be sampled into the corpus.
most_frequent = [word for word in most_frequent if word]

# Words that must always be part of the corpus (highlighted in the plot later).
interesting = ['apple', 'tree', 'human', 'child']

# Use a dedicated, seeded generator so the sampled corpus is the same on every
# Restart & Run All. Sampling is with replacement (numpy's default), so the
# de-duplicated corpus may hold fewer than 100 sampled words.
rng = np.random.RandomState(0)
sampled_words = rng.choice(most_frequent, 100).tolist()

# sorted() gives a deterministic order; plain set iteration order can differ
# between interpreter runs.
word_corpus = sorted(set(sampled_words + interesting))

In [10]:
%load_ext autoreload
% autoreload 2
from unboxer.bokeh_tsne.tsne_vis import TsneText

tsne_text = TsneText(embedding_model, verbose=1)
tsne_text.fit(word_corpus, highlight_words = interesting)

Using TensorFlow backend.


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 100 samples in 0.000s...
[t-SNE] Computed neighbors for 100 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 100 / 100
[t-SNE] Mean sigma: 1.775393
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.053955
[t-SNE] Error after 1000 iterations: 0.522973


In [11]:
# Render the fitted projection (presumably a Bokeh figure, judging by the
# module path of TsneText -- confirm).
tsne_text.plot()