# t-SNE Visualization on Amazon Food Review Dataset

## Import Required Modules

In [1]:
import os # for file management
import shutil # for file management
from pathlib import Path
import sqlite3
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import time # for time measurement
import imageio # for GIF creation

from sklearn.feature_extraction.text import CountVectorizer # for Bag Of Words
from sklearn.feature_extraction.text import TfidfVectorizer # for text to vector creation
from gensim.models import Word2Vec

from sklearn.preprocessing import StandardScaler # for Column Standardization - DO WE NEED THIS?
from sklearn.manifold import TSNE # for t-SNE

In [2]:
## Configure Matplotlib for nice image in PDF
from IPython.display import set_matplotlib_formats

# Ask the inline backend to emit both PDF and PNG representations of each figure
set_matplotlib_formats('pdf', 'png')

# Apply every plot default in one batch (same keys, same order, same values)
plt.rcParams.update({
    'savefig.dpi': 75,
    'figure.figsize': (10, 6),
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 10,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
})

In [3]:
# Directory (relative to the working directory) that receives the generated GIFs
output_dir = 'Output'
# exist_ok=True avoids the check-then-create race and keeps the cell safe to re-run
os.makedirs(output_dir, exist_ok=True)

## Load Data

In [4]:
# Open the SQLite file that holds the already-cleaned reviews
con = sqlite3.connect('./cleaned.sqlite')

# Read the complete Reviews table into a DataFrame and preview the first rows
df = pd.read_sql_query(sql=""" SELECT * from Reviews""", con=con)
df.head()

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,1,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,b'witti littl book make son laugh loud recit c...
1,138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,1,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",b'grew read sendak book watch realli rosi movi...
2,138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,1,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,b'fun way children learn month year learn poem...
3,138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,1,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,b'great littl book read nice rhythm well good ...
4,138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,1,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,b'book poetri month year goe month cute littl ...


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,index,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,364106.0,364106.0,364106.0,364106.0,364106.0,364106.0
mean,261221.056821,282777.564772,1.738411,2.186231,0.843164,1296157000.0
std,152361.122483,164601.735167,6.716471,7.339767,0.363647,48598210.0
min,0.0,1.0,0.0,0.0,0.0,939340800.0
25%,129625.25,140699.25,0.0,0.0,1.0,1270858000.0
50%,257307.5,278947.5,0.0,1.0,1.0,1311379000.0
75%,396338.75,428557.75,2.0,2.0,1.0,1332893000.0
max,525813.0,568454.0,866.0,878.0,1.0,1351210000.0


In [6]:
# Column dtypes of the loaded table
df.dtypes

index                      int64
Id                         int64
ProductId                 object
UserId                    object
ProfileName               object
HelpfulnessNumerator       int64
HelpfulnessDenominator     int64
Score                      int64
Time                       int64
Summary                   object
Text                      object
CleanedText               object
dtype: object

In [7]:
# Keep the pre-processed review text (CleanedText column) as its own Series
df_text = df.loc[:, 'CleanedText']
print(df_text.shape)
df_text.head()

(364106,)


0    b'witti littl book make son laugh loud recit c...
1    b'grew read sendak book watch realli rosi movi...
2    b'fun way children learn month year learn poem...
3    b'great littl book read nice rhythm well good ...
4    b'book poetri month year goe month cute littl ...
Name: CleanedText, dtype: object

In [8]:
def genTSNEGif(std_data, ndp, p, itr_list, file_prefix, closePlt=False, labels=None):
    '''
    Generate one t-SNE scatter plot per entry of itr_list and save them as an animated GIF.

    The GIF is written to '<cwd>/<output_dir>/<file_prefix>_tsne_ndp_<ndp>_p_<p>.gif',
    with one frame per iteration count, played at 1 frame per second.

    Where:
        std_data - Feature matrix, one row per data point (list of vectors or 2-D array)
        ndp - Number of Data Points to consider; only the first ndp rows of std_data
              (and the first ndp labels) are used
        p - Perplexity
        itr_list - Iterable of iteration counts, each one becomes a frame in the GIF
        file_prefix - Prefix for the GIF file name and for the plot titles
        closePlt - If truthy, close each figure once its frame has been captured so
                   it is not displayed in the Notebook
        labels - Optional class labels aligned row-by-row with std_data. When omitted,
                 the notebook-level final_reviews_scores is used (looked up at call time)

    Raises:
        ValueError - if the number of selected rows and selected labels differ
    '''
    image_name = '{0}_tsne_ndp_{1}_p_{2}.gif'.format(file_prefix, ndp, p)
    print('No.Of Data Points - {0}, Perplexity - {1}, Iterations - {2}, ImageName - {3}'.format(
            ndp, p, itr_list, image_name))

    # Restrict BOTH the data and the labels to the first ndp points so they stay aligned
    p_data = std_data[:ndp]
    label_source = final_reviews_scores if labels is None else labels
    # np.asarray drops any pandas index so the slice is purely positional
    p_labels = np.asarray(label_source)[:ndp]

    n_rows = p_data.shape[0] if hasattr(p_data, 'shape') else len(p_data)
    if n_rows != len(p_labels):
        raise ValueError('Data has {0} rows but {1} labels were supplied'.format(n_rows, len(p_labels)))

    # list to hold the frames
    frames = []
    for itr_val in itr_list:
        img_title = '{0}-ndp={1} p={2} itr={3}'.format(file_prefix, ndp, p, itr_val)

        time_start = time.time()
        # Fixed random_state so every frame starts from the same initialisation
        model = TSNE(n_components=2, random_state=0, perplexity=p, n_iter=itr_val)
        tsne_data = model.fit_transform(p_data)
        time_elapsed = time.time() - time_start
        print('{0} ==> t-SNE done! Time elapsed: {1} seconds'.format(img_title, time_elapsed))

        # Append the label as a third column next to the two embedding dimensions
        tsne_data = np.vstack((tsne_data.T, p_labels)).T
        tsne_df = pd.DataFrame(tsne_data, columns=['Dim_1', 'Dim_2', 'Score'])

        g = sns.FacetGrid(tsne_df, hue='Score', height=10).map(plt.scatter, 'Dim_1', 'Dim_2').add_legend()
        g.fig.suptitle(img_title)

        # Render the figure off-screen and capture its pixels as an (H, W, 3) uint8 frame
        # NOTE(review): tostring_rgb appears to be deprecated in recent Matplotlib
        # releases - confirm and migrate (e.g. to buffer_rgba) when upgrading
        g.fig.canvas.draw()
        image = np.frombuffer(g.fig.canvas.tostring_rgb(), dtype='uint8')
        image = image.reshape(g.fig.canvas.get_width_height()[::-1] + (3,))
        frames.append(image)

        if closePlt:
            # Close this specific figure rather than whichever one happens to be current
            plt.close(g.fig)

    imageio.mimsave(Path.cwd() / output_dir / image_name, frames, fps=1)

## Training Data for Visualization - 500 Points

In [9]:
# We can't process all 364K reviews, so select a subset of them
total_data_set_size = 500
# Fixed seed so that a fresh "Restart & Run All" draws the same reviews every time
sample_seed = 0

# Create a balanced dataset: the same number of +ve and -ve reviews
reviews_per_class = total_data_set_size // 2
df_positive_reviews = df[df.Score == 1].sample(reviews_per_class, random_state=sample_seed)
df_negative_reviews = df[df.Score == 0].sample(reviews_per_class, random_state=sample_seed)

# Positives first, then negatives; the labels keep the same row order
final_reviews = pd.concat([df_positive_reviews, df_negative_reviews])
final_reviews_scores = final_reviews['Score']

print('Shape of Training Data {0}'.format(final_reviews.shape))
print('Shape of Training Label {0}'.format(final_reviews_scores.shape))

Shape of Training Data (500, 12)
Shape of Training Label (500,)


In [10]:
# Preview the sampled rows
final_reviews.head()

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
146239,49378,53627,B0016687F2,A56T9S9XCZI80,cathy,0,0,1,1344297600,Was delivered in 1 day!!!,I purchased this for my Marine Corps husband t...,b'purchas marin corp husband put care packag d...
200081,250182,271258,B001IZHZGS,A2ZKXGAW9W0C40,"Christopher A. Dowling ""tintinet""",3,3,1,1283644800,"Best gum currently available, IMO.","The perfect size, nice, long lasting flavor. ...",b'perfect size nice long last flavor coat does...
2935,238287,258517,B0000CNTZI,A1Q2AD6OYWO4S8,"chad-roscoe ""chad-roscoe""",0,0,1,1293408000,The scent of heaven,I first had this tea at a vegetarian Vietnames...,b'first tea vegetarian vietnames restaur order...
191631,52669,57206,B001EYUE2A,A2N9B0XGETYOLO,M. ZELLARS,1,1,1,1311206400,Coffee,This is my favorite coffee for my Keurig. Thi...,b'favorit coffe keurig blend smooth pleas high...
316739,432232,467431,B004M8KV6Y,A3OQ054KF9C1K8,averagepunter,10,12,1,1304553600,Might just save my Tassimo from the scrapheap ...,The demise of the relationship between Kraft a...,b'demis relationship kraft starbuck threaten m...


# Bag of Words (BoW)

In [11]:
# Bag-of-Words features over unigrams and bigrams of the cleaned review text
count_vect = CountVectorizer(ngram_range=(1, 2))
final_counts = count_vect.fit_transform(final_reviews['CleanedText'].values)
print('Shape of BoW Vectorizer: ', final_counts.shape)
print('Total no.of unique words: ', final_counts.shape[1])

# Column-standardize a dense float64 copy of the count matrix
standardized_data = StandardScaler().fit_transform(
    final_counts.toarray().astype(np.float64)
)
print('Shape of Standardized data', standardized_data.shape)

Shape of BoW Vectorizer:  (500, 19908)
Total no.of unique words:  19908
Shape of Standardized data (500, 19908)


In [12]:
# Standardized BoW features at a single perplexity of 30
genTSNEGif(
    std_data=standardized_data,
    ndp=len(standardized_data),
    p=30,
    itr_list=range(1000, 3001, 1000),
    file_prefix='BoW-std',
    closePlt=True,
)

# Raw (non-standardized) BoW counts, swept over perplexities 10, 20, ..., 100
dense_mat = final_counts.toarray().astype(np.float64)

for p in range(10, 101, 10):
    genTSNEGif(
        std_data=dense_mat,
        ndp=len(dense_mat),
        p=p,
        itr_list=range(1000, 5001, 1000),
        file_prefix='BoW',
        closePlt=True,
    )

No.Of Data Points - 500, Perplexity - 30, Iterations - range(1000, 3001, 1000), ImageName - BoW-std_tsne_ndp_500_p_30.gif
BoW-std-ndp=500 p=30 itr=1000 ==> t-SNE done! Time elapsed: 16.620235443115234 seconds
BoW-std-ndp=500 p=30 itr=2000 ==> t-SNE done! Time elapsed: 20.292940855026245 seconds
BoW-std-ndp=500 p=30 itr=3000 ==> t-SNE done! Time elapsed: 21.018538236618042 seconds
No.Of Data Points - 500, Perplexity - 10, Iterations - range(1000, 5001, 1000), ImageName - BoW_tsne_ndp_500_p_10.gif
BoW-ndp=500 p=10 itr=1000 ==> t-SNE done! Time elapsed: 14.674026012420654 seconds
BoW-ndp=500 p=10 itr=2000 ==> t-SNE done! Time elapsed: 18.46445941925049 seconds
BoW-ndp=500 p=10 itr=3000 ==> t-SNE done! Time elapsed: 18.840811252593994 seconds
BoW-ndp=500 p=10 itr=4000 ==> t-SNE done! Time elapsed: 18.71005940437317 seconds
BoW-ndp=500 p=10 itr=5000 ==> t-SNE done! Time elapsed: 19.041726112365723 seconds
No.Of Data Points - 500, Perplexity - 20, Iterations - range(1000, 5001, 1000), ImageN

# TFIDF

In [13]:
# Create TF-IDF vectors over unigrams and bigrams of the cleaned review text
tf_idf_vec = TfidfVectorizer(ngram_range=(1,2))
# NOTE: the name final_counts is kept (although it now holds TF-IDF weights, not
# counts) because the later TF-IDF t-SNE cell reads the matrix through this name
final_counts = tf_idf_vec.fit_transform(final_reviews['CleanedText'].values)

# Label fixed: this matrix comes from the TF-IDF vectorizer, not from BoW
print('Shape of TF-IDF Vectorizer: ', final_counts.get_shape())
print('Total no.of unique words: ', final_counts.get_shape()[1])

# Standardize the Data (dense float64 copy, column-wise)
standardized_data = StandardScaler().fit_transform(final_counts.toarray().astype(np.float64))
print('Shape of Standardized data', standardized_data.shape)

Shape of BoW Vectorizer:  (500, 19908)
Total no.of unique words:  19908
Shape of Standardized data (500, 19908)


In [16]:
# Standardized TF-IDF features at perplexity 30; one GIF frame per iteration count from 1000 to 6000
genTSNEGif(standardized_data, len(standardized_data), 30, range(1000,6001,1000), 'tfidf-std',closePlt=True)

No.Of Data Points - 500, Perplexity - 30, Iterations - range(1000, 6001, 1000), ImageName - tfidf-std_tsne_ndp_500_p_30.gif
tfidf-std-ndp=500 p=30 itr=1000 ==> t-SNE done! Time elapsed: 17.061641216278076 seconds
tfidf-std-ndp=500 p=30 itr=2000 ==> t-SNE done! Time elapsed: 22.879101276397705 seconds
tfidf-std-ndp=500 p=30 itr=3000 ==> t-SNE done! Time elapsed: 27.63390612602234 seconds
tfidf-std-ndp=500 p=30 itr=4000 ==> t-SNE done! Time elapsed: 30.708401203155518 seconds
tfidf-std-ndp=500 p=30 itr=5000 ==> t-SNE done! Time elapsed: 30.955078840255737 seconds
tfidf-std-ndp=500 p=30 itr=6000 ==> t-SNE done! Time elapsed: 30.72924494743347 seconds


In [19]:
# Raw (non-standardized) TF-IDF weights, swept over perplexities 10, 20, ..., 60
dense_mat = final_counts.toarray().astype(np.float64)
for p in range(10, 61, 10):
    genTSNEGif(
        std_data=dense_mat,
        ndp=len(dense_mat),
        p=p,
        itr_list=range(1000, 6001, 1000),
        file_prefix='tfidf',
        closePlt=True,
    )

No.Of Data Points - 500, Perplexity - 10, Iterations - range(1000, 6001, 1000), ImageName - tfidf_tsne_ndp_500_p_10.gif
tfidf-ndp=500 p=10 itr=1000 ==> t-SNE done! Time elapsed: 15.436394453048706 seconds
tfidf-ndp=500 p=10 itr=2000 ==> t-SNE done! Time elapsed: 18.523593187332153 seconds
tfidf-ndp=500 p=10 itr=3000 ==> t-SNE done! Time elapsed: 22.647663116455078 seconds
tfidf-ndp=500 p=10 itr=4000 ==> t-SNE done! Time elapsed: 26.716687202453613 seconds
tfidf-ndp=500 p=10 itr=5000 ==> t-SNE done! Time elapsed: 30.898597717285156 seconds
tfidf-ndp=500 p=10 itr=6000 ==> t-SNE done! Time elapsed: 34.807520389556885 seconds
No.Of Data Points - 500, Perplexity - 20, Iterations - range(1000, 6001, 1000), ImageName - tfidf_tsne_ndp_500_p_20.gif
tfidf-ndp=500 p=20 itr=1000 ==> t-SNE done! Time elapsed: 14.992748975753784 seconds
tfidf-ndp=500 p=20 itr=2000 ==> t-SNE done! Time elapsed: 18.98304295539856 seconds
tfidf-ndp=500 p=20 itr=3000 ==> t-SNE done! Time elapsed: 22.846832513809204 seco

KeyboardInterrupt: 

# Word2Vec

Each word is represented by a 50-dimensional vector.
The size was picked arbitrarily; it is not the result of any tuning or calculation.

In [20]:
# Tokenize every review for Word2Vec: decode the stored bytes as UTF-8, then split on whitespace
list_of_sent = [
    review.decode("utf-8").split()
    for review in final_reviews['CleanedText'].values
]

print(final_reviews.CleanedText.values[0])
print(len(list_of_sent), list_of_sent[0])

b'purchas marin corp husband put care packag deploy afghanistan deliv day couldnt believ thought email made mistak nope even hour later doorstep talk fast deliveri'
500 ['purchas', 'marin', 'corp', 'husband', 'put', 'care', 'packag', 'deploy', 'afghanistan', 'deliv', 'day', 'couldnt', 'believ', 'thought', 'email', 'made', 'mistak', 'nope', 'even', 'hour', 'later', 'doorstep', 'talk', 'fast', 'deliveri']


In [21]:
# Dimensionality of every learned word vector
w2v_d = 50

# Train Word2Vec on the tokenized reviews, ignoring words seen fewer than 5 times in the corpus
w2v_model = Word2Vec(sentences=list_of_sent, size=w2v_d, min_count=5, workers=4)

# Words that the trained model holds a vector for
w2v_words = list(w2v_model.wv.vocab)
print("number of words that occured minimum 5 times : ",len(w2v_words))
print("sample words ", w2v_words[:50])

number of words that occured minimum 5 times :  884
sample words  ['purchas', 'husband', 'put', 'care', 'packag', 'deliv', 'day', 'couldnt', 'believ', 'thought', 'made', 'even', 'hour', 'later', 'fast', 'deliveri', 'perfect', 'size', 'nice', 'long', 'last', 'flavor', 'coat', 'doesnt', 'get', 'also', 'first', 'tea', 'restaur', 'order', 'green', 'came', 'pot', 'could', 'smell', 'alreadi', 'fact', 'find', 'tast', 'know', 'like', 'actual', 'usual', 'one', 'expens', 'love', 'experi', 'wish', 'decaf', 'favorit']


## Avg-W2V

In [22]:
# Computing average w2v for each review in selected training dataset
# A set gives constant-time membership tests; scanning the w2v_words list for
# every token of every review was needlessly slow
w2v_vocab = set(w2v_words)

review_vectors = []
for sent in tqdm(list_of_sent, ascii=True):
    sent_vec = np.zeros(w2v_d) # running sum of word vectors; stays all-zero if no word is known
    no_of_words_in_review = 0 # number of words with valid vector in this review

    # sum the vectors of all words known to the w2v model, then take the average
    for word in sent:
        if word in w2v_vocab:
            sent_vec += w2v_model.wv[word]
            no_of_words_in_review += 1
    if no_of_words_in_review != 0:
        sent_vec /= no_of_words_in_review
    review_vectors.append(sent_vec)

print(len(review_vectors))
print(len(review_vectors[0]))

100%|##########| 500/500 [00:00<00:00, 1062.10it/s]

500
50





In [23]:
# t-SNE using Average Word2Vec, swept over perplexities 10, 20, ..., 100
for p in range(10, 101, 10):
    genTSNEGif(
        std_data=review_vectors,
        ndp=len(review_vectors),
        p=p,
        itr_list=range(1000, 5001, 1000),
        file_prefix='avg-w2v',
        closePlt=True,
    )

No.Of Data Points - 500, Perplexity - 10, Iterations - range(1000, 5001, 1000), ImageName - avg-w2v_tsne_ndp_500_p_10.gif
avg-w2v-ndp=500 p=10 itr=1000 ==> t-SNE done! Time elapsed: 2.695676803588867 seconds
avg-w2v-ndp=500 p=10 itr=2000 ==> t-SNE done! Time elapsed: 4.945517301559448 seconds
avg-w2v-ndp=500 p=10 itr=3000 ==> t-SNE done! Time elapsed: 7.169031143188477 seconds
avg-w2v-ndp=500 p=10 itr=4000 ==> t-SNE done! Time elapsed: 9.788629531860352 seconds
avg-w2v-ndp=500 p=10 itr=5000 ==> t-SNE done! Time elapsed: 11.720644235610962 seconds
No.Of Data Points - 500, Perplexity - 20, Iterations - range(1000, 5001, 1000), ImageName - avg-w2v_tsne_ndp_500_p_20.gif
avg-w2v-ndp=500 p=20 itr=1000 ==> t-SNE done! Time elapsed: 2.78783917427063 seconds
avg-w2v-ndp=500 p=20 itr=2000 ==> t-SNE done! Time elapsed: 5.255230665206909 seconds
avg-w2v-ndp=500 p=20 itr=3000 ==> t-SNE done! Time elapsed: 7.647207021713257 seconds
avg-w2v-ndp=500 p=20 itr=4000 ==> t-SNE done! Time elapsed: 8.593375

  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


avg-w2v-ndp=500 p=90 itr=1000 ==> t-SNE done! Time elapsed: 3.7089908123016357 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


avg-w2v-ndp=500 p=90 itr=2000 ==> t-SNE done! Time elapsed: 3.550213098526001 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


avg-w2v-ndp=500 p=90 itr=3000 ==> t-SNE done! Time elapsed: 3.5614142417907715 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


avg-w2v-ndp=500 p=90 itr=4000 ==> t-SNE done! Time elapsed: 3.559083938598633 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


avg-w2v-ndp=500 p=90 itr=5000 ==> t-SNE done! Time elapsed: 3.5471158027648926 seconds
No.Of Data Points - 500, Perplexity - 100, Iterations - range(1000, 5001, 1000), ImageName - avg-w2v_tsne_ndp_500_p_100.gif


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


avg-w2v-ndp=500 p=100 itr=1000 ==> t-SNE done! Time elapsed: 3.695096015930176 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


avg-w2v-ndp=500 p=100 itr=2000 ==> t-SNE done! Time elapsed: 3.6859006881713867 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


avg-w2v-ndp=500 p=100 itr=3000 ==> t-SNE done! Time elapsed: 3.707569122314453 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


avg-w2v-ndp=500 p=100 itr=4000 ==> t-SNE done! Time elapsed: 3.692902088165283 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


avg-w2v-ndp=500 p=100 itr=5000 ==> t-SNE done! Time elapsed: 3.7627687454223633 seconds


## TFIDF Weighted W2V

Computing tfidf weighted w2v over the selected training dataset

In [24]:
# Fit a TF-IDF model (unigrams + bigrams) on the sampled reviews
tf_idf_model = TfidfVectorizer(ngram_range=(1,2))
tf_idf_matrix = tf_idf_model.fit_transform(final_reviews['CleanedText'].values)

# Map every feature (word or bigram) to its IDF weight
tf_idf_dict = {
    feature: idf
    for feature, idf in zip(tf_idf_model.get_feature_names(), tf_idf_model.idf_)
}
len(tf_idf_dict)

19908

In [25]:
tf_idf_feat = tf_idf_model.get_feature_names()
# A set gives constant-time membership tests instead of scanning the w2v_words list
w2v_vocab = set(w2v_words)

# Computing tf-idf weighted w2v for each review in selected training dataset
review_vectors = []
for sent in tqdm(list_of_sent, ascii=True):
    sent_vec = np.zeros(w2v_d) # running weighted sum of word vectors
    weight_sum = 0.0 # running sum of the tf-idf weights applied in this review

    for word in sent:
        # Skip words without a w2v vector, and words the TF-IDF vectorizer did not
        # keep as a feature (its default token pattern drops one-character tokens),
        # which previously raised KeyError
        if word in w2v_vocab and word in tf_idf_dict:
            vec = w2v_model.wv[word]
            # tf-idf weight of this word: idf * (term frequency within the review)
            tf_idf = tf_idf_dict[word] * (sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    # A weighted average divides by the sum of the weights, not by the word count
    if weight_sum != 0:
        sent_vec /= weight_sum
    review_vectors.append(sent_vec)

print(len(review_vectors))
print(len(review_vectors[0]))

100%|##########| 500/500 [00:00<00:00, 831.26it/s]

500
50





In [26]:
# t-SNE using tf-idf weighted w2v, swept over perplexities 10, 20, ..., 100
for p in range(10, 101, 10):
    genTSNEGif(
        std_data=review_vectors,
        ndp=len(review_vectors),
        p=p,
        itr_list=range(1000, 5001, 1000),
        file_prefix='tfidf-weighted-w2v',
        closePlt=True,
    )

No.Of Data Points - 500, Perplexity - 10, Iterations - range(1000, 5001, 1000), ImageName - tfidf-weighted-w2v_tsne_ndp_500_p_10.gif
tfidf-weighted-w2v-ndp=500 p=10 itr=1000 ==> t-SNE done! Time elapsed: 2.5075390338897705 seconds
tfidf-weighted-w2v-ndp=500 p=10 itr=2000 ==> t-SNE done! Time elapsed: 4.713393211364746 seconds
tfidf-weighted-w2v-ndp=500 p=10 itr=3000 ==> t-SNE done! Time elapsed: 6.844162702560425 seconds
tfidf-weighted-w2v-ndp=500 p=10 itr=4000 ==> t-SNE done! Time elapsed: 9.025461196899414 seconds
tfidf-weighted-w2v-ndp=500 p=10 itr=5000 ==> t-SNE done! Time elapsed: 11.266832113265991 seconds
No.Of Data Points - 500, Perplexity - 20, Iterations - range(1000, 5001, 1000), ImageName - tfidf-weighted-w2v_tsne_ndp_500_p_20.gif
tfidf-weighted-w2v-ndp=500 p=20 itr=1000 ==> t-SNE done! Time elapsed: 2.715935468673706 seconds
tfidf-weighted-w2v-ndp=500 p=20 itr=2000 ==> t-SNE done! Time elapsed: 4.954932928085327 seconds
tfidf-weighted-w2v-ndp=500 p=20 itr=3000 ==> t-SNE do

  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


tfidf-weighted-w2v-ndp=500 p=90 itr=1000 ==> t-SNE done! Time elapsed: 4.156220197677612 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


tfidf-weighted-w2v-ndp=500 p=90 itr=2000 ==> t-SNE done! Time elapsed: 4.141205072402954 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


tfidf-weighted-w2v-ndp=500 p=90 itr=3000 ==> t-SNE done! Time elapsed: 4.069101095199585 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


tfidf-weighted-w2v-ndp=500 p=90 itr=4000 ==> t-SNE done! Time elapsed: 4.1565775871276855 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


tfidf-weighted-w2v-ndp=500 p=90 itr=5000 ==> t-SNE done! Time elapsed: 4.461912155151367 seconds
No.Of Data Points - 500, Perplexity - 100, Iterations - range(1000, 5001, 1000), ImageName - tfidf-weighted-w2v_tsne_ndp_500_p_100.gif


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


tfidf-weighted-w2v-ndp=500 p=100 itr=1000 ==> t-SNE done! Time elapsed: 5.4860756397247314 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


tfidf-weighted-w2v-ndp=500 p=100 itr=2000 ==> t-SNE done! Time elapsed: 5.561805248260498 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


tfidf-weighted-w2v-ndp=500 p=100 itr=3000 ==> t-SNE done! Time elapsed: 5.460094928741455 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


tfidf-weighted-w2v-ndp=500 p=100 itr=4000 ==> t-SNE done! Time elapsed: 5.468733072280884 seconds


  result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind


tfidf-weighted-w2v-ndp=500 p=100 itr=5000 ==> t-SNE done! Time elapsed: 5.332413673400879 seconds
