In [1]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer



# using the SQLite Table to read data.
con = sqlite3.connect('database.sqlite') 



#filtering only positive and negative reviews i.e. 
# not taking into consideration those reviews with Score=3
filtered_data = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 limit 1000
""", con) 




# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

#changing reviews with score less than 3 to be positive and vice-versa
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative

In [2]:
filtered_data.shape #looking at the number of attributes and size of the data
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId="AR5J8UI46CURR"
ORDER BY ProductID
""", con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [4]:
#Sorting data according to ProductId in ascending order
sorted_data=filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

In [5]:
#Deduplication of entries
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape

(998, 10)

In [6]:
#Checking to see how much % of data still remains
(final['Id'].size*1.0)/(filtered_data['Id'].size*1.0)*100

99.8

<b>Observation:-</b> It was also seen that in two rows given below the value of HelpfulnessNumerator is greater than HelpfulnessDenominator which is not practically possible hence these two rows too are removed from calcualtions

In [7]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND Id=44737 OR Id=64422
ORDER BY ProductID
""", con)
display


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [8]:
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]


In [9]:
#Before starting the next phase of preprocessing lets see the number of entries left
print(final.shape)

#How many positive and negative reviews are present in our dataset?
final['Score'].value_counts()

(998, 10)


positive    841
negative    157
Name: Score, dtype: int64

In [11]:
# find sentences containing HTML tags
i=0;
for sent in final['Text'].values:
    if (len(re.findall('<.*?>', sent))):
        print(i)
        print(sent)
        break;
    i += 1;    

        

0
I don't know if it's the cactus or the tequila or just the unique combination of ingredients, but the flavour of this hot sauce makes it one of a kind!  We picked up a bottle once on a trip we were on and brought it back home with us and were totally blown away!  When we realized that we simply couldn't find it anywhere in our city we were bummed.<br /><br />Now, because of the magic of the internet, we have a case of the sauce and are ecstatic because of it.<br /><br />If you love hot sauce..I mean really love hot sauce, but don't want a sauce that tastelessly burns your throat, grab a bottle of Tequila Picante Gourmet de Inclan.  Just realize that once you taste it, you will never want to use any other sauce.<br /><br />Thank you for the personal, incredible service!


In [10]:
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the word of any html-tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned
print(stop)
print('************************************')
print(sno.stem('tasty'))

{'so', "you've", 'did', 'few', 'nor', 'just', 'if', 'they', 'their', 'a', 'it', 'what', 'them', 'same', 'and', 'her', "hadn't", 'do', 'before', 'each', 'for', 'again', 'hadn', 'which', 'have', 'at', "couldn't", 'd', 's', "mustn't", 'when', 'more', 'through', 'weren', 'between', 'there', 'themselves', "you're", 'these', 'while', 'should', 'no', 'haven', 'mightn', 'wasn', "isn't", 'because', 'was', 'once', 'be', "mightn't", 'over', 'own', 'or', 'wouldn', "should've", 'about', 'll', 'he', "won't", "shan't", 'an', 'theirs', 'am', 'in', 'will', 'further', "it's", 'its', 'to', "doesn't", 'itself', 'as', 'she', 'his', 'were', 'your', 'but', 'down', 'too', 'having', 'than', 'doing', 'myself', 'y', 'under', 'ours', 'ourselves', 'ma', 'm', "aren't", 'does', 'during', 'both', 'mustn', 'yourselves', "haven't", 'against', 'himself', 'needn', 'this', "shouldn't", 'now', 'out', 'after', 'other', 'won', 'all', 'o', 'until', 'any', 'here', 'being', 'yours', 'only', 'those', 'you', 'most', "you'll", 'sh

In [12]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase
# this code takes a while to run as it needs to run on 500k sentences.
i=0
str1=' '
final_string=[]
all_positive_words=[] # store words from +ve reviews here
all_negative_words=[] # store words from -ve reviews here.
s=''
for sent in final['Text'].values:
    filtered_sentence=[]
    #print(sent);
    sent=cleanhtml(sent) # remove HTMl tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):    
                if(cleaned_words.lower() not in stop):
                    s=(sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if (final['Score'].values)[i] == 'positive': 
                        all_positive_words.append(s) #list of all words used to describe positive reviews
                    if(final['Score'].values)[i] == 'negative':
                        all_negative_words.append(s) #list of all words used to describe negative reviews reviews
                else:
                    continue
            else:
                continue 
    #print(filtered_sentence)
    str1 = b" ".join(filtered_sentence) #final string of cleaned words
    #print("***********************************************************************")
    
    final_string.append(str1)
    i+=1

In [24]:
final['CleanedText']=final_string #adding a column of CleanedText which displays the data after pre-processing of the review 

In [13]:
final.head(3) #below the processed review can be seen in the CleanedText Column 


# store final table into an SQlLite table for future.
conn = sqlite3.connect('final.sqlite')
c=conn.cursor()
conn.text_factory = str
final.to_sql('Reviews', conn, flavor=None, schema=None, if_exists='replace', index=True, index_label=None, chunksize=None, dtype=None)

# [7.2.2] Bag of Words (BoW)

In [26]:
#BoW
count_vect = CountVectorizer() #in scikit-learn
final_counts = count_vect.fit_transform(final['Text'].values)


In [27]:
type(final_counts)

scipy.sparse.csr.csr_matrix

In [28]:
final_counts.get_shape()

(998, 6033)

## [7.2.4] Bi-Grams and n-Grams.

**Motivation**

Now that we have our list of words describing positive and negative reviews lets analyse them.<br>

We begin analysis by getting the frequency distribution of the words as shown below

In [29]:
freq_dist_positive=nltk.FreqDist(all_positive_words)
freq_dist_negative=nltk.FreqDist(all_negative_words)
print("Most Common Positive Words : ",freq_dist_positive.most_common(20))
print("Most Common Negative Words : ",freq_dist_negative.most_common(20))

Most Common Positive Words :  [(b'chip', 508), (b'flavor', 365), (b'like', 358), (b'tast', 308), (b'good', 295), (b'love', 295), (b'great', 281), (b'use', 249), (b'bag', 245), (b'one', 243), (b'food', 217), (b'tea', 213), (b'tri', 200), (b'product', 199), (b'best', 155), (b'eat', 154), (b'get', 151), (b'make', 150), (b'buy', 145), (b'price', 144)]
Most Common Negative Words :  [(b'chip', 106), (b'tast', 97), (b'like', 91), (b'product', 68), (b'bag', 63), (b'one', 56), (b'food', 53), (b'flavor', 52), (b'use', 46), (b'kettl', 46), (b'tri', 43), (b'would', 43), (b'good', 41), (b'box', 40), (b'eat', 38), (b'brand', 36), (b'look', 35), (b'dont', 34), (b'buy', 34), (b'amazon', 33)]


<b>Observation:-</b> From the above it can be seen that the most common positive and the negative words overlap for eg. 'like' could be used as 'not like' etc. <br>
So, it is a good idea to consider pairs of consequent words (bi-grams) or q sequnce of n consecutive words (n-grams)

In [30]:
#bi-gram, tri-gram and n-gram

#removing stop words like "not" should be avoided before building n-grams
count_vect = CountVectorizer(ngram_range=(1,2) ) #in scikit-learn
final_bigram_counts = count_vect.fit_transform(final['Text'].values)


In [31]:
final_bigram_counts.get_shape()

(998, 43815)

# [7.2.5] TF-IDF

In [14]:

tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(final['Text'].values)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [15]:
final_tf_idf.get_shape()


(998, 43815)

In [16]:
features = tf_idf_vect.get_feature_names()
len(features)


43815

In [17]:
features[100000:100010]


[]

In [18]:
# covnert a row in saprsematrix to a numpy array
print(final_tf_idf[3,:].toarray()[0]) 


[0. 0. 0. ... 0. 0. 0.]


In [19]:
# source: https://buhrmann.github.io/tfidf-analysis.html
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

top_tfidf = top_tfidf_feats(final_tf_idf[1,:].toarray()[0],features,25)

In [20]:
top_tfidf


Unnamed: 0,feature,tfidf
0,dog,0.232083
1,inside cracked,0.175442
2,off ball,0.175442
3,half not,0.175442
4,minutes in,0.175442
5,tentacle was,0.175442
6,tentacle,0.175442
7,toy,0.175442
8,toy disappointed,0.175442
9,one tentacle,0.175442


# [7.2.6] Word2Vec

In [21]:
# Using Google News Word2Vectors
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

# in this project we are using a pretrained model by google
# its 3.3G file, once you load this into your memory 
# it occupies ~9Gb, so please do this step only if you have >12G of ram
# we will provide a pickle file wich contains a dict , 
# and it contains all our courpus words as keys and  model[word] as values
# To use this code-snippet, download "GoogleNews-vectors-negative300.bin" 
# from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# it's 1.9GB in size.


model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)



In [40]:
model.wv['computer']


  if __name__ == '__main__':


array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [12]:
model.wv.similarity('woman', 'man')

0.76640122309953518

In [131]:
model.wv.most_similar('woman')

[('man', 0.7664012312889099),
 ('girl', 0.7494641542434692),
 ('teenage_girl', 0.7336830496788025),
 ('teenager', 0.6317086219787598),
 ('lady', 0.6288787126541138),
 ('teenaged_girl', 0.6141784191131592),
 ('mother', 0.607630729675293),
 ('policewoman', 0.6069462299346924),
 ('boy', 0.5975908041000366),
 ('Woman', 0.5770982503890991)]

In [14]:
model.wv.most_similar('tasti')  # "tasti" is the stemmed word for tasty, tastful

KeyError: "word 'tasti' not in vocabulary"

In [155]:
model.wv.most_similar('tasty') 


[('delicious', 0.8730389475822449),
 ('scrumptious', 0.8007042407989502),
 ('yummy', 0.7856923341751099),
 ('flavorful', 0.7420164346694946),
 ('delectable', 0.7385422587394714),
 ('juicy_flavorful', 0.7114803791046143),
 ('appetizing', 0.701721727848053),
 ('crunchy_salty', 0.7012301087379456),
 ('flavourful', 0.6912214159965515),
 ('flavoursome', 0.6857703328132629)]

In [137]:
model.wv.similarity('tasty', 'tast')


0.44035054190088901

In [22]:
# Train your own Word2Vec model using your own text corpus
import gensim
i=0
list_of_sent=[]
for sent in final['Text'].values:
    filtered_sentence=[]
    sent=cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if(cleaned_words.isalpha()):    
                filtered_sentence.append(cleaned_words.lower())
            else:
                continue 
    list_of_sent.append(filtered_sentence)
    

In [24]:
print(final['Text'].values[0])
print("*****************************************************************")
print(list_of_sent[0])

I don't know if it's the cactus or the tequila or just the unique combination of ingredients, but the flavour of this hot sauce makes it one of a kind!  We picked up a bottle once on a trip we were on and brought it back home with us and were totally blown away!  When we realized that we simply couldn't find it anywhere in our city we were bummed.<br /><br />Now, because of the magic of the internet, we have a case of the sauce and are ecstatic because of it.<br /><br />If you love hot sauce..I mean really love hot sauce, but don't want a sauce that tastelessly burns your throat, grab a bottle of Tequila Picante Gourmet de Inclan.  Just realize that once you taste it, you will never want to use any other sauce.<br /><br />Thank you for the personal, incredible service!
*****************************************************************
['i', 'dont', 'know', 'if', 'its', 'the', 'cactus', 'or', 'the', 'tequila', 'or', 'just', 'the', 'unique', 'combination', 'of', 'ingredients', 'but', 'the

In [25]:
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=50, workers=4)    


In [26]:
words = list(w2v_model.wv.vocab)
print(len(words))

1455


In [191]:
w2v_model.wv.most_similar('tasty')

[('tastey', 0.909038245677948),
 ('satisfying', 0.8556904792785645),
 ('yummy', 0.8543208837509155),
 ('filling', 0.8233586549758911),
 ('delicious', 0.8229926228523254),
 ('flavorful', 0.8061250448226929),
 ('addicting', 0.771919846534729),
 ('delish', 0.7653154730796814),
 ('nutritious', 0.7626035213470459),
 ('tasteful', 0.7547359466552734)]

In [46]:
w2v_model.wv.most_similar('like')

  if np.issubdtype(vec.dtype, np.int):


[('eat', 0.9997897148132324),
 ('but', 0.9997896552085876),
 ('do', 0.9997698664665222),
 ('even', 0.9997590780258179),
 ('how', 0.9997547268867493),
 ('that', 0.9997442960739136),
 ('per', 0.9997331500053406),
 ('or', 0.9997247457504272),
 ('definitely', 0.9997227191925049),
 ('used', 0.99972003698349)]

In [27]:
count_vect_feat = count_vect.get_feature_names() # list of words in the BoW
count_vect_feat.index('like')
print(count_vect_feat[64055])

NameError: name 'count_vect' is not defined

# [7.2.7] Avg W2V, TFIDF-W2V

In [39]:
# average Word2Vec
# compute average word2vec for each review.

sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
        except:
            pass
    sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))
sent_vec

998
50


array([ 0.07955175, -0.02436958,  0.17058235,  0.13514131, -0.30002981,
        0.37945892, -0.18049733, -0.21179724, -0.2852362 , -0.18991219,
        0.18886622,  0.02474314, -0.10806109, -0.21039487, -0.89121005,
       -0.0557072 , -0.46302511,  0.15970458, -0.8334584 , -0.18052699,
       -0.01025608,  0.63583836,  0.11945489, -0.12762   , -0.37112765,
       -0.62301455,  0.45701785, -0.31754823, -0.19227316, -0.30769795,
       -0.07151253, -0.1968691 , -0.32359017,  0.05649076,  0.10649632,
        0.30325628, -0.13906288,  0.24784614,  0.46538584, -1.41143863,
       -0.36102903, -0.10704026, -0.49764589, -0.1616796 , -0.10057753,
       -0.09397046,  0.94281924,  0.17999168,  0.23375574, -0.09154336])

In [37]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf1 = tf_idf_vect.fit_transform(final['Text'].values)
final_tf_idf =final_tf_idf.todense()

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [41]:
# TF-IDF weighted Word2Vec
tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
        except:
            pass
    sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1
    
list_of_sent
    



[['i',
  'dont',
  'know',
  'if',
  'its',
  'the',
  'cactus',
  'or',
  'the',
  'tequila',
  'or',
  'just',
  'the',
  'unique',
  'combination',
  'of',
  'ingredients',
  'but',
  'the',
  'flavour',
  'of',
  'this',
  'hot',
  'sauce',
  'makes',
  'it',
  'one',
  'of',
  'a',
  'kind',
  'we',
  'picked',
  'up',
  'a',
  'bottle',
  'once',
  'on',
  'a',
  'trip',
  'we',
  'were',
  'on',
  'and',
  'brought',
  'it',
  'back',
  'home',
  'with',
  'us',
  'and',
  'were',
  'totally',
  'blown',
  'away',
  'when',
  'we',
  'realized',
  'that',
  'we',
  'simply',
  'couldnt',
  'find',
  'it',
  'anywhere',
  'in',
  'our',
  'city',
  'we',
  'were',
  'bummed',
  'now',
  'because',
  'of',
  'the',
  'magic',
  'of',
  'the',
  'internet',
  'we',
  'have',
  'a',
  'case',
  'of',
  'the',
  'sauce',
  'and',
  'are',
  'ecstatic',
  'because',
  'of',
  'it',
  'if',
  'you',
  'love',
  'hot',
  'sauce',
  'i',
  'mean',
  'really',
  'love',
  'hot',
  'sauce'

In [44]:
weight_sum

0

In [36]:
from sklearn.manifold import TSNE
model = TSNE(n_components=2,perplexity=7, random_state=1, learning_rate=80 )
tsne_data = model.fit_transform(tfidf_sent_vectors)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').