In [2]:
import torch
# Load the embedding table and the word -> integer-id vocabulary from local files.
# Later cells index `embed` with the ids stored in `vocab_to_int`
# (presumably one embedding row per vocabulary word -- confirm against the files).
# NOTE(review): torch.load deserializes via pickle; only load files from a trusted source.
embed = torch.load('10k_word_embeddings.tar')
vocab_to_int = torch.load('vocab_to_int.tar')

In [35]:
# Sanity check: fetch the vector for the word 'profit' through its vocabulary id
# and display its shape as the cell output.
embeddings_profit = embed[vocab_to_int['profit']]
embeddings_profit.shape

(300,)

In [33]:
import plotly
import numpy as np
import plotly.graph_objs as go
from sklearn.decomposition import PCA

def display_pca_scatterplot_3D(model, vocab, user_input=None, words=None, label=None, color_map=None, topn=5, sample=10):
    """Plot word vectors on their first two principal components.

    Despite the ``3D`` in its (historical) name, this function draws a 2-D
    Plotly scatter plot (``go.Scatter``).

    ``words`` must be laid out as ``len(user_input)`` consecutive groups of
    ``topn`` words -- one group per entry of ``user_input``, in the same
    order -- followed by any remaining words. Each group becomes one trace
    named after its ``user_input`` entry; the remaining words are drawn as a
    separate black trace named ``'input words'``.

    Args:
        model: Indexable container of word vectors; ``model[vocab[w]]`` must
            return the vector of word ``w``.
        vocab: Mapping from word to its index in ``model``.
        user_input: Sequence of trace names, one per group of ``topn`` words.
            ``None`` is treated as "no groups", so every word ends up in the
            ``'input words'`` trace.
        words: Words to plot. When ``None``, ``sample`` words are drawn at
            random from ``vocab`` (``np.random.choice``, unseeded), or the
            whole vocabulary is used when ``sample`` is not positive.
        label: Unused; kept for backward compatibility.
        color_map: Unused; kept for backward compatibility.
        topn: Number of consecutive words belonging to each ``user_input``
            entry.
        sample: Number of random words to draw when ``words`` is ``None``.

    Returns:
        None. The figure is rendered through ``Figure.show()``.
    """
    # The default used to crash in ``len(user_input)``; treat it as "no groups".
    if user_input is None:
        user_input = []

    # ``is None`` (not ``== None``): comparing a NumPy array with ``==`` is
    # element-wise and makes the ``if`` ambiguous.
    if words is None:
        if sample > 0:
            words = np.random.choice(list(vocab.keys()), sample)
        else:
            words = list(vocab)
    word_vectors = np.array([model[vocab[w]] for w in words])

    # Fit a full PCA and keep only the first two components for the 2-D plot.
    two_dim = PCA(random_state=0).fit_transform(word_vectors)[:, :2]

    def _labelled_trace(rows, name, opacity, color):
        # Build one text-annotated scatter trace for the given slice of words.
        return go.Scatter(
            x=two_dim[rows, 0],
            y=two_dim[rows, 1],
            text=words[rows],
            name=name,
            textposition="top center",
            textfont_size=20,
            mode='markers+text',
            marker={
                'size': 10,
                'opacity': opacity,
                'color': color
            }
        )

    data = []
    count = 0

    # One trace per user-supplied name, each covering the next ``topn`` words.
    for group_name in user_input:
        data.append(_labelled_trace(slice(count, count + topn), group_name, 0.8, 2))
        count = count + topn

    # Whatever follows the grouped words is plotted as the input words themselves.
    data.append(_labelled_trace(slice(count, None), 'input words', 1, 'black'))

    # Configure the layout
    layout = go.Layout(
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
            x=1,
            y=0.5,
            font=dict(
                family="Courier New",
                size=25,
                color="black"
            )),
        font=dict(
            family=" Courier New ",
            size=15),
        autosize=False,
        width=1000,
        height=1000
    )

    plot_figure = go.Figure(data=data, layout=layout)
    plot_figure.show()
    
# The plotting function slices `words` into one block of `topn` words per entry
# of the input list (in the same order) and treats everything after those blocks
# as the input words themselves. The input words therefore go LAST, and `topn`
# must equal the block size (9 here); otherwise the traces are mislabelled.
display_pca_scatterplot_3D(
    embed,
    vocab_to_int,
    ["tesla", "meta"],
    words=[
        # Block for "tesla": its top cosine-similarity neighbours.
        'nissan',
        'roadster',
        'camera',
        'roadsters',
        'bicycles',
        'nvidia',
        'porsche',
        'bmw',
        'speck',
        # Block for "meta" (presumably its nearest neighbours -- confirm).
        'search',
        'bankrate',
        'google',
        'app',
        'bing',
        'yahoo',
        'web',
        'keywords',
        'keyword',
        # The input words, drawn as the trailing "input words" trace.
        'tesla',
        'meta',
    ],
    topn=9,
)



In [31]:
from numpy import dot
from numpy.linalg import norm
import pandas as pd

def cosine_sim(a, b):
    """Return the absolute cosine similarity between the embeddings of two words.

    Both arguments are words; each is mapped to its vector through
    ``vocab_to_int`` and ``embed``. The result is the magnitude of the cosine
    of the angle between the two vectors, so vectors pointing in opposite
    directions score as high as vectors pointing the same way.
    """
    vec_a = embed[vocab_to_int[a]]
    vec_b = embed[vocab_to_int[b]]
    scale = norm(vec_a) * norm(vec_b)
    return abs(dot(vec_a, vec_b) / scale)

# Score every vocabulary word against the query word, then show the ten
# highest-scoring words (the query word itself included).
input_word = "tesla"
rank = [[word, cosine_sim(input_word, word)] for word in vocab_to_int]
(
    pd.DataFrame(rank, columns=["word", "cos"])
    .sort_values("cos", ascending=False)
    .head(10)["word"]
    .tolist()
)

['tesla',
 'nissan',
 'roadster',
 'camera',
 'roadsters',
 'bicycles',
 'nvidia',
 'porsche',
 'bmw',
 'speck']