In [6]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
# from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.metrics import confusion_matrix
# from sklearn import metrics
# from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

In [7]:
# using the SQLite Table to read data.
con = sqlite3.connect('./database.sqlite') 

In [8]:
#filtering only positive and negative reviews i.e. 
# not taking into consideration those reviews with Score=3
filtered_data = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3
""", con)

In [9]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [10]:
filtered_data.dtypes

Id                         int64
ProductId                 object
UserId                    object
ProfileName               object
HelpfulnessNumerator       int64
HelpfulnessDenominator     int64
Score                      int64
Time                       int64
Summary                   object
Text                      object
dtype: object

In [11]:
# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

In [12]:
filtered_data.loc[:,'Score'].apply(partition).head()

0    positive
1    negative
2    positive
3    negative
4    positive
Name: Score, dtype: object

In [13]:
#changing reviews with score less than 3 to be positive and vice-versa
positiveNegative = filtered_data.loc[:,'Score'].apply(partition)
filtered_data['Score'] = positiveNegative

In [14]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [15]:
filtered_data.shape #looking at the number of attributes and size of the data

(525814, 10)

In [16]:
sorted_dataframe = filtered_data.sort_values(by=['UserId','Time'],axis=0,ascending=[True,True],inplace=False)

In [17]:
sorted_dataframe.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
125648,136323,B006Q820X0,#oc-R103C0QSV1DF5E,C,1,2,positive,1343088000,Great for the Price,I have to say I was a little apprehensive to b...
477217,516062,B008I1XPKA,#oc-R109MU5OBBZ59U,AayGee,0,1,positive,1350086400,AWESOME Coffee!!!!,Received my free K cups as a sample promotion ...
477233,516079,B008I1XPKA,#oc-R10LFEMQEW6QGZ,Julie,0,1,positive,1345939200,Brooklyn Bean Roastery Breakfast Blend K-Cups,Brooklyn Bean Roastery Blend K-Cups are great ...
125844,136545,B006Q820X0,#oc-R10UA029WVWIUI,Kim D,0,0,negative,1342483200,Less than satisfactory. I gave the Brooklyn K...,"Brooklyn ""French Roast"" K-Cup Coffee is not on..."
76583,83318,B005ZBZLT4,#oc-R115TNMSPFT9I7,Breyton,2,3,negative,1331510400,"""Green"" K-cup packaging sacrifices flavor",Overall its just OK when considering the price...


In [18]:
# drop duplicates

In [19]:
sorted_dataframe.duplicated(subset={'UserId','Time'},keep="first").tail(20)

441919    False
406441    False
118191    False
131735     True
199890    False
213538    False
272167     True
373695     True
334124    False
382989     True
339876    False
449898    False
227040    False
288431    False
312651    False
391066     True
175844    False
205607    False
121277    False
169538    False
dtype: bool

In [20]:
sorted_dataframe.duplicated(subset={'UserId','Time','Text'},keep="first").tail(20)

441919    False
406441    False
118191    False
131735     True
199890    False
213538    False
272167     True
373695     True
334124    False
382989     True
339876    False
449898    False
227040    False
288431    False
312651    False
391066    False
175844    False
205607    False
121277    False
169538    False
dtype: bool

In [21]:
sum(sorted_dataframe.duplicated(subset={'UserId','Time'},keep="first"))

197082

In [22]:
sum(sorted_dataframe.duplicated(subset={'UserId','Time','Text'},keep='first'))

161681

In [23]:
sum(filtered_data.duplicated(subset={'UserId','Time'},keep="first"))

197082

In [24]:
sum(sorted_dataframe.duplicated(subset={'UserId','Time','Text'},keep='first'))

161681

In [25]:
# So it means whether you sort the dataframe or not, it would show up the duplicates in the same way and 
# same in number.

In [26]:
# Now I would try to find out those rows where UserId and Time is same but Text differs in that. 

In [27]:
merged_dataframe = pd.merge(filtered_data,filtered_data,on=['UserId','Time'],how='inner')
# I did this type of merging to get the results, for which i did operation in next cell.

In [28]:
merged_dataframe[merged_dataframe['Text_x'] != merged_dataframe['Text_y']].head()
# these are those results where User reviewed at least 2 products at the same Time but Text is different

Unnamed: 0,Id_x,ProductId_x,UserId,ProfileName_x,HelpfulnessNumerator_x,HelpfulnessDenominator_x,Score_x,Time,Summary_x,Text_x,Id_y,ProductId_y,ProfileName_y,HelpfulnessNumerator_y,HelpfulnessDenominator_y,Score_y,Summary_y,Text_y
18,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,positive,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...,401972,B0006349W6,Carol A. Reed,0,0,positive,Good Training Treat,My dog will come in from outside when I am tra...
19,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,positive,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...,402156,B0006349WQ,Carol A. Reed,0,0,positive,Good Training Treat,My dog will come in from outside when I am tra...
20,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,positive,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...,402201,B0006349WG,Carol A. Reed,0,0,positive,Good Training Treat,My dog will come in from outside when I am tra...
21,401972,B0006349W6,A21BT40VZCCYT4,Carol A. Reed,0,0,positive,1351209600,Good Training Treat,My dog will come in from outside when I am tra...,10,B00171APVA,Carol A. Reed,0,0,positive,Healthy Dog Food,This is a very healthy dog food. Good for thei...
25,402156,B0006349WQ,A21BT40VZCCYT4,Carol A. Reed,0,0,positive,1351209600,Good Training Treat,My dog will come in from outside when I am tra...,10,B00171APVA,Carol A. Reed,0,0,positive,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [29]:
merged_dataframe[merged_dataframe['Text_x'] == merged_dataframe['Text_y']].head()
# these are those results where User reviewed at least 2 products at the same Time and Text is also same

Unnamed: 0,Id_x,ProductId_x,UserId,ProfileName_x,HelpfulnessNumerator_x,HelpfulnessDenominator_x,Score_x,Time,Summary_x,Text_x,Id_y,ProductId_y,ProfileName_y,HelpfulnessNumerator_y,HelpfulnessDenominator_y,Score_y,Summary_y,Text_y
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1,B001E4KFG0,delmartian,1,1,positive,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,2,B00813GRG4,dll pa,0,0,negative,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,3,B000LQOCH0,"Natalia Corres ""Natalia Corres""",1,1,positive,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,4,B000UA0QIQ,Karl,3,3,negative,Cough Medicine,If you are looking for the secret ingredient i...
4,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,136304,B002Y7526Y,Karl,3,3,negative,Cough Medicine,If you are looking for the secret ingredient i...


#  Exploratory Data Analysis

## [7.1.2] Data Cleaning: Deduplication

In [30]:
#Deduplication of entries
final=filtered_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape

(364173, 10)

In [31]:
final.size

3641730

In [32]:
# in order to check how much percentage of data remained
(final['Id'].size/filtered_data['Id'].size)*100

69.25890143662969

In [33]:
# some more duplication

In [34]:
final = final[final['HelpfulnessNumerator']<=final['HelpfulnessDenominator']]

In [35]:
#Before starting the next phase of preprocessing lets see the number of entries left
print(final.shape)

#How many positive and negative reviews are present in our dataset?
final['Score'].value_counts()

(364171, 10)


positive    307061
negative     57110
Name: Score, dtype: int64

In [36]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## 7.2.3  Text Preprocessing: Stemming, stop-word removal and Lemmatization.

Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally Snowball Stemming the word (it was obsereved to be better than Porter Stemming)<br>

After which we collect the words used to describe positive and negative reviews

In [37]:
final['Text'].values # here we are converting Series to Numpy array

array([ 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
       'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
       'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
   

In [38]:
import re
# find sentences containing HTML tags
i=0;
for sent in final['Text'].values:
    if (len(re.findall('<.*?>', sent))):
        print(i)
        print(sent)
        break;
    i += 1;

10
I don't know if it's the cactus or the tequila or just the unique combination of ingredients, but the flavour of this hot sauce makes it one of a kind!  We picked up a bottle once on a trip we were on and brought it back home with us and were totally blown away!  When we realized that we simply couldn't find it anywhere in our city we were bummed.<br /><br />Now, because of the magic of the internet, we have a case of the sauce and are ecstatic because of it.<br /><br />If you love hot sauce..I mean really love hot sauce, but don't want a sauce that tastelessly burns your throat, grab a bottle of Tequila Picante Gourmet de Inclan.  Just realize that once you taste it, you will never want to use any other sauce.<br /><br />Thank you for the personal, incredible service!


In [39]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to
[nltk_data]    |     /home/jovyan/n

[nltk_data]    |   Unzipping corpora/sinica_treebank.zip.
[nltk_data]    | Downloading package smultron to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/smultron.zip.
[nltk_data]    | Downloading package state_union to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/state_union.zip.
[nltk_data]    | Downloading package stopwords to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/stopwords.zip.
[nltk_data]    | Downloading package subjectivity to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/subjectivity.zip.
[nltk_data]    | Downloading package swadesh to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/swadesh.zip.
[nltk_data]    | Downloading package switchboard to
[nltk_data]    |     /home/jovyan/nltk_data...
[nltk_data]    |   Unzipping corpora/switchboard.zip.
[nltk_data]    | Downloading package tim

True

In [40]:
# in order to test
print(re.findall(r'[,|/|\'|\\|.]',"Hi, I am \your/ friends' friend."))
print(re.findall(r'[,/\'\\.]',"Hi, I am \your/ friends' friend."))
# so in a pattern having | or not having |, inside [] --> is same thing

print(re.sub(r'[,/\'\\.]',r'',"Hi, I am \your/ friends' friend."))
print(re.sub(r'[,|/|\'|\\|.]',r'',"Hi, I am \your/ friends' friend."))

[',', '\\', '/', "'", '.']
[',', '\\', '/', "'", '.']
Hi I am your friends friend
Hi I am your friends friend


In [41]:
print(re.findall(r'<.*?>','Hi<br>, i am your friend</br>. What\'s plan<span> for today</span>.'))
print(re.findall(r'<.*>','Hi<br>, i am your friend</br>. What\'s plan<span> for today</span>.'))
#   https://stackoverflow.com/questions/16771177/what-does-the-regex-mean

['<br>', '</br>', '<span>', '</span>']
["<br>, i am your friend</br>. What's plan<span> for today</span>"]


In [42]:
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
# from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the word of any html-tags
    cleanr = re.compile('<.*?>')
    cleantext = cleanr.sub(' ', sentence)
    return cleantext

def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\\|/]',r' ',cleaned)
    return  cleaned

# removing not from stop words
for ele in ['not','very']:
    stop.remove(ele)


print(stop)
print('************************************')
print(sno.stem('tasty'))

{'and', 'their', 'had', 'theirs', 'which', 'against', 'where', 'her', "wasn't", 'about', 'ourselves', 'only', "you've", 'yourself', 'from', 'he', 'too', 'should', 'a', "mustn't", 'here', 'how', 'during', 'don', 'now', 'itself', 'down', 'd', 'weren', 'll', 'for', 'aren', 'hers', 'she', 'herself', 'other', 'me', 'wouldn', 'in', 'we', 'will', 'just', 'am', 'such', 'its', "shouldn't", 'between', 'isn', "hadn't", 'be', 'why', 'ours', 'there', 'who', "don't", 'his', 'or', 'when', 'than', 'both', 'to', 'needn', 'if', 's', "didn't", 'out', 'didn', 'of', 'i', 'after', 'off', 'o', 'are', 'myself', 'through', 'wasn', 'couldn', 'it', "that'll", "should've", "doesn't", 'no', 'as', 'what', 'but', 'does', 'the', 'an', 'our', 'being', 'before', 'did', "won't", 'was', 'below', 'all', 'shan', 'won', 'whom', 'those', 'were', "it's", 'y', "she's", 'him', 'over', 'have', 'can', 'ain', 'further', 'haven', 'your', 'mightn', "you're", "hasn't", 'most', 'hasn', 'my', 'again', "you'll", 'once', "wouldn't", 've'

In [43]:
print(re.split(r'\s+','Hi    whats  up')) # this is better than below process
print('Hi .   Whats .  up'.split(' '))

['Hi', 'whats', 'up']
['Hi', '.', '', '', 'Whats', '.', '', 'up']


In [44]:
utfstr = "ボールト"
b = utfstr.encode()
print(b)
print(b.decode())
print('᧿'.encode())

# since characters can come from any language, so encoding of those is important to convert them to the 
# unicode format(utf-8) i.e common format

b'\xe3\x83\x9c\xe3\x83\xbc\xe3\x83\xab\xe3\x83\x88'
ボールト
b'\xe1\xa7\xbf'


In [45]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase
# this code takes a while to run as it needs to run on 500k sentences.

i = 0
final_string=[]
all_positive_words=[] # store words from +ve reviews here
all_negative_words=[] # store words from -ve reviews here.

for sentence in final['Text'].values:
        cleaned_words = []
        cleaned_sentence = cleanpunc(cleanhtml(sentence))
        for word in re.split(r'\s+',cleaned_sentence):
            word = word.lower()
            if word.isalpha() and (len(word)>2) and (word not in stop):
                stemmed_word = sno.stem(word).encode('utf-8')
                cleaned_words.append(stemmed_word)
                if (final['Score'].values)[i] == 'positive':
                    all_positive_words.append(stemmed_word) #list of all words used to describe positive reviews
                elif (final['Score'].values)[i] == 'negative':
                    all_negative_words.append(stemmed_word)#list of all words used to describe negative reviews reviews
            else:
                continue
            
        final_string.append(b" ".join(cleaned_words))#byte must be there since we converted everything to utf-8        
        i += 1

In [46]:
final_string[0:5]

[b'bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better',
 b'product arriv label jumbo salt peanut peanut actual small size unsalt not sure error vendor intend repres product jumbo',
 b'confect around centuri light pillowi citrus gelatin nut case filbert cut tini squar liber coat powder sugar tini mouth heaven not chewi veri flavor high recommend yummi treat familiar stori lewi lion witch wardrob treat seduc edmund sell brother sister witch',
 b'look secret ingredi robitussin believ found got addit root beer extract order good made cherri soda flavor veri medicin',
 b'great taffi great price wide assort yummi taffi deliveri veri quick taffi lover deal']

In [47]:
final['CleanedText']=final_string 
#adding a column of CleanedText which displays the data after pre-processing of the review 

In [48]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,b'bought sever vital can dog food product foun...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,b'product arriv label jumbo salt peanut peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,b'confect around centuri light pillowi citrus ...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,b'look secret ingredi robitussin believ found ...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...,b'great taffi great price wide assort yummi ta...


In [49]:
final.head(3) #below the processed review can be seen in the CleanedText Column 

# store final table into an SQlLite table for future.
conn = sqlite3.connect('final.sqlite')
conn.text_factory = str
final.to_sql('Reviews', conn, flavor=None, schema=None, if_exists='replace', index=True, index_label=None,\
             chunksize=None, dtype=None)

In [50]:
#filtering only positive and negative reviews i.e. 
# not taking into consideration those reviews with Score=3
final_data = pd.read_sql_query("""
SELECT *
FROM Reviews
""", conn)

In [51]:
final_data.head()

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,b'bought sever vital can dog food product foun...
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,b'product arriv label jumbo salt peanut peanut...
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,b'confect around centuri light pillowi citrus ...
3,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,b'look secret ingredi robitussin believ found ...
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...,b'great taffi great price wide assort yummi ta...


In [52]:
final_few = final.head(100)

In [53]:
final_few.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,b'bought sever vital can dog food product foun...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,b'product arriv label jumbo salt peanut peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,b'confect around centuri light pillowi citrus ...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,b'look secret ingredi robitussin believ found ...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...,b'great taffi great price wide assort yummi ta...


In [54]:
import re
# find sentences containing HTML tags
i=0;
for sent in final['CleanedText'].values:
    if (len(re.findall('<.*?>', sent.decode()))):
        print(i)
        print(sent)
        break;
    i += 1;

In [55]:
# No output, it means html tags have been removed. :)

# [7.2.2] Bag of Words (BoW)

In [56]:
#BoW (Bag of Words)
count_vect = CountVectorizer() #in scikit-learn
final_counts = count_vect.fit_transform(final_few['CleanedText'].values)

In [57]:
type(final_counts)

scipy.sparse.csr.csr_matrix

In [58]:
final_counts.shape

(100, 1151)

In [59]:
final_counts.get_shape()

(100, 1151)

In [60]:
dense_matrix = final_counts.todense()

In [61]:
import numpy
numpy.set_printoptions(threshold=numpy.nan)

In [120]:
dense_matrix.shape

(100, 1151)

In [121]:
type(dense_matrix)

numpy.matrixlib.defmatrix.matrix

In [122]:
dense_matrix[0:2]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         0, 0, 0, 0, 0, 0

## [7.2.4] Bi-Grams and n-Grams.

**Motivation**

Now that we have our list of words describing positive and negative reviews lets analyse them.<br>

We begin analysis by getting the frequency distribution of the words as shown below

In [63]:
freq_dist_positive=nltk.FreqDist(all_positive_words)
freq_dist_negative=nltk.FreqDist(all_negative_words)
print("Most Common Positive Words : ",freq_dist_positive.most_common(20))
print("Most Common Negative Words : ",freq_dist_negative.most_common(20))

Most Common Positive Words :  [(b'not', 146798), (b'like', 139426), (b'tast', 129045), (b'good', 112766), (b'flavor', 109629), (b'love', 107357), (b'use', 103886), (b'great', 103871), (b'one', 96723), (b'product', 91033), (b'veri', 90838), (b'tri', 86790), (b'tea', 83893), (b'coffe', 78813), (b'make', 75107), (b'get', 72124), (b'food', 64803), (b'would', 55566), (b'time', 55264), (b'buy', 54198)]
Most Common Negative Words :  [(b'not', 54377), (b'tast', 34587), (b'like', 32333), (b'product', 28218), (b'one', 20572), (b'flavor', 19571), (b'would', 17974), (b'tri', 17754), (b'veri', 17011), (b'use', 15304), (b'good', 15041), (b'coffe', 14717), (b'get', 13787), (b'buy', 13752), (b'order', 12871), (b'food', 12753), (b'dont', 11877), (b'tea', 11660), (b'even', 11088), (b'box', 10843)]


In [64]:
#bi-gram, tri-gram and n-gram

#removing stop words like "not" should be avoided before building n-grams
count_vect = CountVectorizer(ngram_range=(1,3) ) #in scikit-learn
final_bigram_counts = count_vect.fit_transform(final_few['CleanedText'].values)


In [116]:
count_vect.get_feature_names()

['abbi',
 'abdomin',
 'abdomin cramp',
 'abdomin cramp symptom',
 'abl',
 'abl buy',
 'abl buy right',
 'abl save',
 'abl save almost',
 'abl tell',
 'abl tell sure',
 'absenc',
 'absenc especi',
 'absenc especi colleagu',
 'absolut',
 'absolut scrumptuous',
 'absolut scrumptuous husband',
 'accept',
 'accept howev',
 'accept howev compani',
 'accord',
 'accord guin',
 'accord guin book',
 'account',
 'account your',
 'account your actual',
 'across',
 'across amazon',
 'across amazon one',
 'across bike',
 'across bike shop',
 'activ',
 'activ perspir',
 'activ perspir gland',
 'actual',
 'actual cane',
 'actual cane sugar',
 'actual get',
 'actual get bang',
 'actual like',
 'actual like slight',
 'actual small',
 'actual small size',
 'actual tell',
 'actual tell three',
 'ad',
 'ad appl',
 'ad appl cinnamon',
 'ad boil',
 'ad boil water',
 'add',
 'add fact',
 'add fact get',
 'add flavor',
 'add flavor also',
 'add microwav',
 'add microwav oatmeal',
 'add milk',
 'add milk top',


In [65]:
type(final_bigram_counts) # this is sparse matrix representation

scipy.sparse.csr.csr_matrix

In [66]:
final_bigram_counts.get_shape()

(100, 7241)

# [7.2.5] TF-IDF

In [67]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(final_few['CleanedText'].values)

In [68]:
type(final_tf_idf)

scipy.sparse.csr.csr_matrix

In [69]:
final_tf_idf

<100x4136 sparse matrix of type '<class 'numpy.float64'>'
	with 6001 stored elements in Compressed Sparse Row format>

In [70]:
final_tf_idf.shape

(100, 4136)

In [71]:
features = tf_idf_vect.get_feature_names()

In [72]:
features[0:10]

['abbi',
 'abdomin',
 'abdomin cramp',
 'abl',
 'abl buy',
 'abl save',
 'abl tell',
 'absenc',
 'absenc especi',
 'absolut']

In [73]:
type(tf_idf_vect.get_feature_names())

list

In [74]:
len(tf_idf_vect.get_feature_names()) # this is same as columns of csr matrix

4136

In [75]:
final_tf_idf[3,:]

<1x4136 sparse matrix of type '<class 'numpy.float64'>'
	with 37 stored elements in Compressed Sparse Row format>

In [76]:
type(final_tf_idf[3,:])

scipy.sparse.csr.csr_matrix

In [77]:
final_tf_idf[3,:].shape

(1, 4136)

In [78]:
type(final_tf_idf[3,:].toarray())

numpy.ndarray

In [79]:
final_tf_idf[3,:].toarray().shape
# list of list

(1, 4136)

In [80]:
type(final_tf_idf[3,:].toarray()[0])

numpy.ndarray

In [81]:
final_tf_idf[3,:].toarray()[0].shape

(4136,)

In [82]:
# covnert a row in saprsematrix to a numpy array
print(final_tf_idf[3,:].toarray()[0]) 


[ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.15660796  0.
  0.18227759  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.       

In [83]:
np.argsort(np.array([4,2,1,3]))

array([2, 1, 3, 0])

In [84]:
np.argsort(np.array([4,2,1,3]))[::-1]

array([0, 3, 1, 2])

In [85]:
# source: https://buhrmann.github.io/tfidf-analysis.html
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

top_tfidf = top_tfidf_feats(final_tf_idf[1,:].toarray()[0],features,25)

# [7.2.6] Word2Vec

In [93]:
# Train your own Word2Vec model using your own text corpus
# since i have done the text preprocessing already, hence no need here 

import gensim
list_of_sent=[]
for sent in final_few['CleanedText'].values:
    filtered_sentence=[]
    for w in re.split(r'\s+',sent.decode()):
        filtered_sentence.append(w) 
    list_of_sent.append(filtered_sentence)

In [97]:
print(final_few['CleanedText'].values[0:2])
print("*****************************************************************")
print(list_of_sent[0:2])

[ b'bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better'
 b'product arriv label jumbo salt peanut peanut actual small size unsalt not sure error vendor intend repres product jumbo']
*****************************************************************
[['bought', 'sever', 'vital', 'can', 'dog', 'food', 'product', 'found', 'good', 'qualiti', 'product', 'look', 'like', 'stew', 'process', 'meat', 'smell', 'better', 'labrador', 'finicki', 'appreci', 'product', 'better'], ['product', 'arriv', 'label', 'jumbo', 'salt', 'peanut', 'peanut', 'actual', 'small', 'size', 'unsalt', 'not', 'sure', 'error', 'vendor', 'intend', 'repres', 'product', 'jumbo']]


In [98]:
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=50, workers=20)

In [99]:
w2v_model.wv.vocab
# so it is a dictionary where keys are words and values are corresponding 50 dimentional vector.

{'bought': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c2b0>,
 'sever': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c1d0>,
 'dog': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c160>,
 'food': <gensim.models.keyedvectors.Vocab at 0x3ffecd07cac8>,
 'product': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c198>,
 'found': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c358>,
 'good': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c588>,
 'qualiti': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c080>,
 'look': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c0b8>,
 'like': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c518>,
 'better': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c240>,
 'arriv': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c4e0>,
 'peanut': <gensim.models.keyedvectors.Vocab at 0x3ffecd07c390>,
 'actual': <gensim.models.keyedvectors.Vocab at 0x3ffecc60fba8>,
 'small': <gensim.models.keyedvectors.Vocab at 0x3ffecc60fb70>,
 'size': <gensim.models.keyedvectors.V

In [103]:
w2v_model.wv.vocab.keys()

dict_keys(['bought', 'sever', 'dog', 'food', 'product', 'found', 'good', 'qualiti', 'look', 'like', 'better', 'arriv', 'peanut', 'actual', 'small', 'size', 'not', 'sure', 'powder', 'sugar', 'veri', 'flavor', 'high', 'recommend', 'got', 'order', 'made', 'great', 'taffi', 'price', 'pound', 'bag', 'enjoy', 'mani', 'grape', 'bit', 'much', 'husband', 'last', 'two', 'would', 'brand', 'soft', 'candi', 'well', 'expens', 'love', 'buy', 'right', 'eat', 'also', 'amount', 'everi', 'dont', 'know', 'hot', 'sauc', 'make', 'one', 'bottl', 'back', 'home', 'find', 'realli', 'want', 'tast', 'never', 'use', 'thank', 'need', 'go', 'year', 'new', 'differ', 'tri', 'first', 'bowl', 'full', 'ive', 'came', 'pack', 'fresh', 'delici', 'twizzler', 'strawberri', 'six', 'sweet', 'take', 'time', 'purchas', 'share', 'appl', 'dri', 'ever', 'get', 'store', 'perfect', 'amazon', 'alway', 'tasti', 'still', 'wont', 'say', 'hard', 'free', 'chocol', 'ill', 'excel', 'coffe', 'littl', 'easi', 'prepar', 'less', 'minut', 'water',

In [104]:
len(w2v_model.wv.vocab.keys())

173

In [100]:
words = list(w2v_model.wv.vocab)
words

['bought',
 'sever',
 'dog',
 'food',
 'product',
 'found',
 'good',
 'qualiti',
 'look',
 'like',
 'better',
 'arriv',
 'peanut',
 'actual',
 'small',
 'size',
 'not',
 'sure',
 'powder',
 'sugar',
 'veri',
 'flavor',
 'high',
 'recommend',
 'got',
 'order',
 'made',
 'great',
 'taffi',
 'price',
 'pound',
 'bag',
 'enjoy',
 'mani',
 'grape',
 'bit',
 'much',
 'husband',
 'last',
 'two',
 'would',
 'brand',
 'soft',
 'candi',
 'well',
 'expens',
 'love',
 'buy',
 'right',
 'eat',
 'also',
 'amount',
 'everi',
 'dont',
 'know',
 'hot',
 'sauc',
 'make',
 'one',
 'bottl',
 'back',
 'home',
 'find',
 'realli',
 'want',
 'tast',
 'never',
 'use',
 'thank',
 'need',
 'go',
 'year',
 'new',
 'differ',
 'tri',
 'first',
 'bowl',
 'full',
 'ive',
 'came',
 'pack',
 'fresh',
 'delici',
 'twizzler',
 'strawberri',
 'six',
 'sweet',
 'take',
 'time',
 'purchas',
 'share',
 'appl',
 'dri',
 'ever',
 'get',
 'store',
 'perfect',
 'amazon',
 'alway',
 'tasti',
 'still',
 'wont',
 'say',
 'hard',
 '

In [105]:
print(len(words))

173


In [108]:
w2v_model.wv.most_similar('tast')

[('food', 0.5709247589111328),
 ('good', 0.5328111052513123),
 ('use', 0.4848178029060364),
 ('cramp', 0.4786970317363739),
 ('better', 0.47707125544548035),
 ('order', 0.4611034095287323),
 ('made', 0.44186699390411377),
 ('box', 0.4392208456993103),
 ('day', 0.4319041967391968),
 ('back', 0.419863760471344)]

In [109]:
w2v_model.wv.most_similar('like')

[('ill', 0.5896182656288147),
 ('realli', 0.5076995491981506),
 ('milk', 0.5058864951133728),
 ('littl', 0.5003845691680908),
 ('fact', 0.4969714283943176),
 ('make', 0.49423909187316895),
 ('get', 0.4739544093608856),
 ('alway', 0.4531171917915344),
 ('allergi', 0.45302557945251465),
 ('food', 0.44719216227531433)]

In [112]:
w2v_model.wv['like']

array([  4.96302173e-03,  -1.60947852e-02,  -5.76658174e-03,
        -2.56135664e-03,   4.33374383e-03,  -1.33428695e-02,
         2.24790443e-03,   1.22242924e-02,   3.86054016e-04,
        -7.26728979e-03,  -5.41890087e-03,  -7.88634527e-04,
         8.37130751e-03,  -1.56237604e-07,   1.67105848e-03,
         6.11446518e-03,   1.00968080e-02,  -2.19820859e-03,
        -3.21818865e-03,  -6.89471746e-03,   1.16849330e-03,
         8.04630574e-03,   3.41490493e-03,  -7.11216184e-04,
        -1.62737258e-03,  -2.31684675e-03,   6.17465889e-03,
         6.46564690e-03,   5.35779865e-04,   1.24602439e-02,
         7.95503613e-03,   9.43786651e-03,   5.93123725e-03,
        -1.18331471e-03,  -1.06092368e-03,  -6.55614398e-03,
        -6.77660014e-03,   1.38096418e-03,   2.66746472e-04,
         3.24396708e-04,   1.64484903e-02,   1.51724536e-02,
        -1.92939176e-03,   5.05620800e-03,  -8.44767224e-03,
        -1.22896964e-02,   1.04833636e-02,   7.59449974e-03,
         6.00809837e-03,

In [114]:
w2v_model.wv.similarity('tasti', 'tast')

0.20409823806953387

In [118]:
# see this from above BOW/Bigram part
count_vect_feat = count_vect.get_feature_names() # list of words in the BoW
print(count_vect_feat.index('like'))
print(count_vect_feat[3635])

3635
like


# [7.2.7] Avg W2V, TFIDF-W2V

In [149]:
# average Word2Vec
# compute average word2vec for each review.
sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
        except:
            pass
    sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

100
50


In [129]:
type(sent_vectors[0])

numpy.ndarray

In [126]:
type(sent_vectors)

list

In [125]:
sent_vectors
# it is a list of numpy arrrays

[array([  5.19458174e-03,  -7.05407043e-03,   2.18238129e-04,
          4.84800021e-03,   3.82162756e-03,  -1.46473371e-03,
          3.47717395e-03,   5.42932081e-03,  -3.20060354e-03,
         -4.52842237e-03,   4.92138637e-03,   5.30925986e-03,
          3.04800156e-03,   4.82954462e-04,  -2.87104099e-03,
          1.69420136e-03,   3.17163831e-03,   6.19283857e-04,
         -2.47537578e-03,   1.38386830e-03,  -3.85118931e-05,
         -3.02817296e-03,  -1.75559541e-03,   2.54800366e-03,
         -1.93972195e-03,  -6.05809580e-04,  -4.98646679e-04,
         -3.00082525e-03,  -3.73364326e-04,   6.27686196e-03,
          8.42018544e-04,   3.96647525e-03,   2.07613270e-04,
          3.51998961e-03,   5.95706658e-04,  -3.18829530e-03,
         -2.81280211e-03,  -1.15300004e-03,   7.26516101e-03,
          5.78648195e-04,   8.38754444e-03,   3.97511310e-03,
         -7.46903118e-04,  -2.89209815e-03,  -2.61476167e-03,
         -1.85519288e-03,   1.72972187e-03,  -2.76588319e-04,
        

In [152]:
# TF-IDF weighted Word2Vec
tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try: 
            vec = w2v_model.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            tf_idf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
        except:
            continue
    sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1

In [153]:
tfidf_sent_vectors

[array([  5.01238328e-03,  -7.72013876e-03,  -5.30832770e-04,
          4.20309208e-03,   3.30017182e-03,  -9.09302709e-06,
          4.98050260e-03,   5.40195633e-03,  -3.27678460e-03,
         -5.82008389e-03,   5.07931226e-03,   5.10627806e-03,
          9.82008410e-04,  -5.87911319e-04,  -2.29331942e-03,
          3.58928852e-03,   9.98499814e-04,   1.91625305e-03,
         -2.34399117e-03,   2.85095165e-03,   5.35105880e-04,
         -3.03226403e-03,  -1.56946075e-03,   3.92269205e-03,
         -2.45770005e-03,   1.17646419e-03,   2.45275735e-04,
         -2.75192499e-03,  -1.70320584e-03,   6.40654296e-03,
          1.60709787e-04,   5.43086219e-03,   7.42009795e-04,
          3.11459266e-03,   1.21110089e-03,  -3.56133367e-03,
         -3.89817555e-03,  -1.47472928e-03,   8.26490609e-03,
          6.05111942e-04,   9.27490541e-03,   2.28814100e-03,
         -1.01581574e-03,  -3.68299014e-03,  -2.83868437e-03,
         -8.82527227e-04,   1.25134375e-03,  -3.58092083e-04,
        

In [156]:
len(tfidf_sent_vectors)

100

In [157]:
print(len(tfidf_sent_vectors[0]))

50


In [None]:
# if you want to save the dataframe in csv then few values might be null in dataframe which could be nan in
# csv. So drop those rows which are having nan values.