In [1]:
import cPickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [2]:
def generateSortedVocab():
    """Write the lines of ``vocab.txt``, sorted, to ``sorted_vocab.txt``.

    Each input line is stripped of surrounding whitespace, the resulting
    entries are sorted, and they are written newline-separated (no trailing
    newline) to ``sorted_vocab.txt`` in the current working directory.

    The output file is overwritten on every call so that re-running the
    cell is idempotent; appending would duplicate the vocabulary and glue
    the last word of one run onto the first word of the next.
    """
    # Read everything first so the input handle is closed before writing.
    with open("vocab.txt", "r") as f:
        contents = sorted(x.strip() for x in f)

    with open("sorted_vocab.txt", "w") as op:
        op.write("\n".join(contents))

In [3]:
def tsnePlotPkl(mappingPkl, wordFile, color='red', flag = 0, legend=None):
    """Load word vectors from a pickled mapping and plot or return them.

    Parameters:
        mappingPkl: path to a pickle file; the loaded object is indexed by
            word (``wvMap[word]``) to obtain that word's vector.
        wordFile: path to a text file with one word per line. Lines are
            stripped; a single empty trailing entry is dropped.
        color: colour forwarded to ``plotTSNEProjection``.
        flag: when non-zero, nothing is plotted and the data is returned.
        legend: optional legend label forwarded to ``plotTSNEProjection``.

    Returns:
        ``(wordVecMat, wordVocab)`` as numpy arrays (one row / entry per
        word, in word-file order) when ``flag != 0``; otherwise ``None``
        after plotting.

    Raises:
        KeyError: if a listed word is absent from the pickled mapping.
    """
    # NOTE(security): unpickling executes arbitrary code embedded in the
    # file -- only load pickles produced by a trusted source.
    # Pickle data is binary; text mode can corrupt it on some platforms.
    with open(mappingPkl, "rb") as f:
        wvMap = cPickle.load(f)

    with open(wordFile, "r") as f:
        selectedWords = [ x.strip() for x in f.readlines() ]
    # Guard the index so an empty word file does not raise IndexError.
    if selectedWords and selectedWords[-1] == '':
        del selectedWords[-1]

    wordVecMat = np.array([ wvMap[x] for x in selectedWords ])
    wordVocab = np.array(selectedWords)
    if flag != 0:
        return wordVecMat, wordVocab
    plotTSNEProjection(wordVecMat, wordVocab, color, legend)

In [4]:
def tsnePlotDualPkl(embeddingPkl, mappingPkl, wordFile, color='red', flag=0, legend=None):
    """Load vectors from an embedding pickle plus a vocabulary pickle.

    Parameters:
        embeddingPkl: path to a pickle whose loaded object is indexed by
            integer row (``wVecMat[i]``) to obtain a word vector.
        mappingPkl: path to a pickle holding the vocabulary sequence; the
            position of a word in it is used as its row index.
        wordFile: path to a text file with one word per line. Lines are
            stripped; a single empty trailing entry is dropped.
        color: colour forwarded to ``plotTSNEProjection``.
        flag: when non-zero, nothing is plotted and the data is returned.
        legend: optional legend label forwarded to ``plotTSNEProjection``.

    Returns:
        ``(wordVecMat, wordVocab)`` when ``flag != 0``; otherwise ``None``
        after plotting. ``wordVecMat`` has one row per word of ``wordFile``
        that exists in the vocabulary, in word-file order.
        NOTE: ``wordVocab`` is the *entire* loaded vocabulary (unchanged
        from the previous behaviour), not the selected words.

    Words from ``wordFile`` that are missing from the vocabulary are
    skipped silently.
    """
    # NOTE(security): unpickling executes arbitrary code embedded in the
    # file -- only load pickles produced by a trusted source.
    # Pickle data is binary; text mode can corrupt it on some platforms.
    with open(embeddingPkl, "rb") as f:
        wVecMat = cPickle.load(f)
    with open(mappingPkl, "rb") as f:
        wordVocab = cPickle.load(f)
    revDict = dict( zip( wordVocab, range(len(wordVocab)) ) )

    with open(wordFile, "r") as f:
        selectedWords = [ x.strip() for x in f.readlines() ]
    # Guard the index so an empty word file does not raise IndexError.
    if selectedWords and selectedWords[-1] == '':
        del selectedWords[-1]

    # Iterate in word-file order (not vocabulary order) so that row i of the
    # matrix really belongs to label i; keep only words that have a vector.
    plottedWords = [ w for w in selectedWords if w in revDict ]
    wordVecMat = np.array([ wVecMat[ revDict[w] ] for w in plottedWords ])
    if flag != 0:
        return wordVecMat, wordVocab
    # Label with the words actually present so labels and rows stay aligned.
    plotTSNEProjection(wordVecMat, plottedWords, color, legend)

In [5]:
def plotTSNEProjection(wordVectors, wordVocab, color='red', legend=None):
    """Project word vectors to 2-D with t-SNE and show an annotated scatter.

    Parameters:
        wordVectors: input passed to ``TSNE.fit_transform`` (one row per word).
        wordVocab: iterable of labels, paired positionally with the rows.
        color: colour of the scatter points.
        legend: optional label; when given, the scatter is labelled and a
            legend is drawn.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    tsne = TSNE(n_components=2)
    y_pred = tsne.fit_transform(wordVectors)
    xs, ys = y_pred[:, 0], y_pred[:, 1]

    # Only pass ``label`` when a legend was requested, so both cases share
    # one scatter call.
    scatterKwargs = {'color': color}
    if legend is not None:
        scatterKwargs['label'] = legend
    handle = plt.scatter(xs, ys, **scatterKwargs)

    # Write each word directly on top of its point.
    for label, x, y in zip(wordVocab, xs, ys):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    if legend is not None:
        plt.legend(handles=[handle])

    plt.show()

In [6]:
def plotTSNEMultiple(wv1, wv2, wv3, wv4, wordVocab, color1='red', color2='green', color3='blue', color4='cyan'):
    """Overlay the 2-D t-SNE projections of four embedding sets in one figure.

    Parameters:
        wv1, wv2, wv3, wv4: vector matrices, each passed to its own
            ``TSNE(n_components=2).fit_transform``. They are labelled
            "Manual SVD", "Library SVD", "NPLM" and "CBOW" respectively.
        wordVocab: labels paired positionally with the rows of every matrix.
        color1..color4: scatter colour for the corresponding matrix.

    Each projection is fitted independently, so the four point clouds do
    not share a coordinate system. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    series = [
        (wv1, color1, "Manual SVD"),
        (wv2, color2, "Library SVD"),
        (wv3, color3, "NPLM"),
        (wv4, color4, "CBOW"),
    ]

    # Fit every projection before drawing anything, using a fresh TSNE
    # instance per matrix.
    projections = [ TSNE(n_components=2).fit_transform(vectors) for vectors, _, _ in series ]

    handles = []
    for (_, shade, name), projected in zip(series, projections):
        xs, ys = projected[:, 0], projected[:, 1]
        handles.append(plt.scatter(xs, ys, color=shade, label=name))
        # Write each word directly on top of its point.
        for label, x, y in zip(wordVocab, xs, ys):
            plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    plt.legend(handles=handles)
    plt.show()

In [7]:
def plotAllVectors(wordFile):
    """Draw one separate t-SNE figure per embedding for the words in ``wordFile``.

    Order of figures: Manual SVD (red), Library SVD (green), NPLM (blue),
    CBOW (cyan).
    """
    # The two SVD embeddings are stored as single word->vector pickles.
    svdFigures = [
        ("word_vec_map_manual.pkl", 'red', "Manual SVD"),
        ("word_vec_map.pkl", 'green', "Library SVD"),
    ]
    for pklName, shade, title in svdFigures:
        tsnePlotPkl(pklName, wordFile, color=shade, legend=title)

    # NPLM is loaded from an embedding pickle plus a separate vocabulary pickle.
    tsnePlotDualPkl("embeddingMatrix.pkl", "topKWords.pkl", wordFile, color='blue', legend="NPLM")

    tsnePlotPkl("cbow_embedding.pkl", wordFile, color='cyan', legend="CBOW")

In [8]:
def plotAll(wordFile):
    """Overlay all four embeddings of the words in ``wordFile`` in one figure.

    Every loader is called with ``flag=1`` so it returns its data instead
    of plotting. The labels come from the first loader; the vocabularies
    returned by the other three are discarded.
    """
    returnData = 1

    manualVecs, labels = tsnePlotPkl("word_vec_map_manual.pkl", wordFile, flag=returnData)
    libraryVecs, _ignored = tsnePlotPkl("word_vec_map.pkl", wordFile, flag=returnData)
    nplmVecs, _ignored = tsnePlotDualPkl(
        "embeddingMatrix.pkl", "topKWords.pkl", wordFile, flag=returnData
    )
    cbowVecs, _ignored = tsnePlotPkl("cbow_embedding.pkl", wordFile, flag=returnData)

    plotTSNEMultiple(manualVecs, libraryVecs, nplmVecs, cbowVecs, labels)

In [9]:
def generatePlots():
    """Render the figures for every word-list file, in a fixed order.

    Every file gets the per-embedding figures from ``plotAllVectors``; the
    countries, numbers and opposites lists additionally get the combined
    overlay from ``plotAll``. A final stand-alone Manual SVD figure is
    drawn for the synonyms list.
    """
    # (word list, whether to also draw the combined overlay)
    schedule = [
        ("tsne_words.txt", False),
        ("tsne_countries.txt", True),
        ("tsne_numbers.txt", True),
        ("tsne_opposites.txt", True),
        ("tsne_synonyms.txt", False),
    ]
    for wordList, withOverlay in schedule:
        plotAllVectors(wordList)
        if withOverlay:
            plotAll(wordList)

    tsnePlotPkl("word_vec_map_manual.pkl", "tsne_synonyms.txt", color='red', legend="Manual SVD")

In [10]:
# Runs as soon as this cell executes: draw the combined four-embedding
# overlay for the synonyms word list.
plotAll("tsne_synonyms.txt")

In [None]:
# Entry point: first write sorted_vocab.txt from vocab.txt, then render
# every figure produced by generatePlots().
if __name__ == "__main__":
    generateSortedVocab()
    generatePlots()