# Loading data

In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os

#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# using the SQLite Table to read data.
con = sqlite3.connect('/content/drive/MyDrive/ML/database.sqlite') 
#filtering only positive and negative reviews i.e. 
# not taking into consideration those reviews with Score=3
filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 """, con) 


In [4]:
# drop duplicate rows
data1 = filtered_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"},keep='first')

# drop rows that do not meet the condition
data1 = data1[data1['HelpfulnessNumerator'] <= data1['HelpfulnessDenominator']]

In [5]:
# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
def partition(x):
    if x < 3:
        return 0
    return 1

#changing reviews with score less than 3 to be positive and vice-versa
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Label'] = positiveNegative
print("Number of data points in our data", filtered_data.shape)
filtered_data.head(3)

Number of data points in our data (525814, 11)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Label
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,0
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1


In [6]:
# count score values
filtered_data['Score'].value_counts()

5    363122
4     80655
1     52268
2     29769
Name: Score, dtype: int64

In [7]:
# Randomely select 20000 samples from each'Score' 1,2,4,5 
S1 = filtered_data[filtered_data['Score'] ==1].sample(n=2500,random_state=0)
S2 = filtered_data[filtered_data['Score'] ==2].sample(n=2500,random_state=0)
S4 = filtered_data[filtered_data['Score'] ==4].sample(n=2500,random_state=0)
S5 = filtered_data[filtered_data['Score'] ==5].sample(n=2500,random_state=0)
data2 = pd.concat([S1,S2,S4,S5])
data2.shape
data2.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Label
234216,254114,B0047726E0,AGZNKD9JJ0BYY,lan,0,1,1,1318377600,cancelled but sent,I cancelled this item along with another coffe...,0
438705,474410,B003SBZC1U,A17A6KEW3OF239,William Fulkerson,31,35,1,1314748800,Not Natural at all!!,I bought three cases of this stuff. It is all ...,0
844,915,B000ER6YO0,AB0BXP1IKDBIA,TreGemellini,1,1,1,1278892800,Runny and odd-tasting,"My triplets will not eat this, eventhe one who...",0
428370,463254,B001FA1AWG,A140GGDL7VAUTL,J. Roper,0,2,1,1298937600,How anyone could eat this is beyond me,The chocolate that they use on these things ta...,0
457270,494421,B003ZVG4WY,A3B4NB57O7J6IY,beth,0,0,1,1349222400,don't bother,i'll start of by saying that i LOVE the Golden...,0


# [1] Text Preprocessing
## [1.1] Data Cleaning: Deduplication

In [8]:
#Sorting data according to ProductId in ascending order
final=data2.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

## [1.2] Stemming, stop-word removal and Lemmatization.

In [9]:
# find sentences containing HTML tags
import re
i=0;
for sent in final['Text'].values:
    if (len(re.findall('<.*?>', sent))):
        print(i)
        print(sent)
        break;
    i += 1;

3
If I could rate this fly trap lower than one star, I would.  I think flies have come from miles away just to come in and laugh at this thing.  I'd have more success taking the flies into a vat of scalding water than getting a fly to randomly run into this box of ridiculousness.<br />WASTE OF $$!


In [10]:

stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the word of any html-tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned
print(stop)
print('************************************')
print(sno.stem('tasty'))

{'re', 'ma', 'd', "shan't", 'nor', 'its', 'or', 'has', 'was', 'any', 'm', 'you', 'too', 'above', 'herself', 'and', 'those', 'aren', 'why', 'on', 'being', 'll', 'a', "won't", 'himself', 'does', 'my', 'do', 'she', "you're", 'both', 'don', 'more', "you'd", 'ours', 'wasn', 'them', "wouldn't", 'hadn', "don't", 'that', 'weren', 'during', 'at', "couldn't", "hasn't", 'few', 'can', 'about', 'from', 'how', 'hasn', 'shouldn', 'shan', "weren't", 'me', 'did', "wasn't", 'theirs', 'off', "should've", 'been', 'each', 'needn', 'under', 've', 'won', 'now', 'haven', 'we', 'ourselves', 'over', 'isn', 'most', 'no', 'should', 'in', 'whom', 'having', 'doesn', 'as', 'below', 'him', 'of', 'then', 'while', 'ain', 't', 'same', 'against', "aren't", 'be', 'between', 'again', 'by', 'o', 'some', 'he', 'her', 'when', 'here', "mustn't", 'such', 'before', 'yourselves', 'just', 'who', 'our', 'after', 'but', 'have', 'there', "hadn't", "doesn't", 'because', 'are', "you'll", 'into', "it's", 'themselves', "mightn't", 'might

In [11]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase

if not os.path.isfile('final.sqlite'):
    i=0
    str1=' '
    final_string=[]
    all_positive_words=[] # store words from +ve reviews here
    all_negative_words=[] # store words from -ve reviews here.
    s=''
    for sent in tqdm(final['Text'].values):
        filtered_sentence=[]
        #print(sent);
        sent=cleanhtml(sent) # remove HTMl tags
        for w in sent.split():
            for cleaned_words in cleanpunc(w).split():
                if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):    
                    if(cleaned_words.lower() not in stop):
                        s=(sno.stem(cleaned_words.lower())).encode('utf8')
                        filtered_sentence.append(s)
                        if (final['Score'].values)[i] == 'positive': 
                            all_positive_words.append(s) #list of all words used to describe positive reviews
                        if(final['Score'].values)[i] == 'negative':
                            all_negative_words.append(s) #list of all words used to describe negative reviews reviews
                    else:
                        continue
                else:
                    continue 
        #print(filtered_sentence)
        str1 = b" ".join(filtered_sentence) #final string of cleaned words
        #print("***********************************************************************")

        final_string.append(str1)
        i+=1

    #############---- storing the data into .sqlite file ------########################
    final['CleanedText']=final_string #adding a column of CleanedText which displays the data after pre-processing of the review 
    final['CleanedText']=final['CleanedText'].str.decode("utf-8")
        # store final table into an SQlLite table for future.
    conn = sqlite3.connect('final.sqlite')
    c=conn.cursor()
    conn.text_factory = str
    final.to_sql('Reviews', conn,  schema=None, if_exists='replace', \
                 index=True, index_label=None, chunksize=None, dtype=None)
    conn.close()
    
    
    with open('positive_words.pkl', 'wb') as f:
        pickle.dump(all_positive_words, f)
    with open('negitive_words.pkl', 'wb') as f:
        pickle.dump(all_negative_words, f)

100%|██████████| 10000/10000 [00:18<00:00, 544.41it/s]


In [12]:
if os.path.isfile('final.sqlite'):
    conn = sqlite3.connect('final.sqlite')
    final = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 """, conn)
    conn.close()
else:
    print("Please the above cell")

In [13]:
final.to_pickle("./amazon.pkl")

In [14]:
# read data from pickle file from previous stage
data = pd.read_pickle("./amazon.pkl")
data.shape

(10000, 13)

# [2] Sorting data based on time

In [15]:
# Random sampling
#df = final.take(np.random.permutation(len(final))[:10000])
#df.head(2)

In [16]:
df = data
df['Time'] = pd.to_datetime(df['Time'])
# Sort by time
data = df.sort_values(by='Time')

print(data.shape)
print(data['Score'].value_counts())

(10000, 13)
5    2500
4    2500
2    2500
1    2500
Name: Score, dtype: int64


# [3] Storing into train and test

In [17]:
data.head(5)

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Label,CleanedText
6,346041,374343,B00004CI84,A1B2IZU1JLZA6,Wes,19,23,1,1970-01-01 00:00:00.948240000,WARNING: CLAMSHELL EDITION IS EDITED TV VERSION,"I, myself always enjoyed this movie, it's very...",0,alway enjoy movi funni entertain didnt hesit p...
7,346053,374357,B00004CI84,A31RM5QU797HPJ,Drez,1,2,4,1970-01-01 00:00:01.024531200,"&quot;I'm the ghost with the most, babe&quot;",Simply put: Beetlejuice is the funniest comedy...,1,simpli beetlejuic funniest comedi kind sinc gh...
9,346040,374342,B00004CI84,A10L8O1ZMUIMR2,G. Kleinschmidt,61,79,2,1970-01-01 00:00:01.040947200,Great movie turned bad,"Just to let you know, this movie is one of my ...",0,let know movi one person favorit ghost movi sa...
52,388413,419994,B0000A0BS5,A238V1XTSK9NFE,Andrew Lynn,46,59,2,1970-01-01 00:00:01.064361600,Not actually for use in espresso machines,Few things I'd like to point out:<p>1. &quot;...,0,thing like point roast drink northern itali ma...
53,38889,42227,B0000A0BS8,A1IU7S4HCK1XK0,Joanna Daneman,5,5,4,1970-01-01 00:00:01.067644800,I gave up on other coffees,The grocery store has all kinds of &quot;gourm...,1,groceri store kind coffe laid one tri tast lik...


In [18]:
# Spliting into Train and test
#X_train, X_test, y_train, y_test = train_test_split(data['CleanedText'].values,data['Score'].values,test_size=0.3,shuffle=False)
X=data['CleanedText'].values
y_score = data['Score'].values
y_label = data['Label'].values

# [4] Bag of Words (BoW)

In [19]:
X=data['CleanedText'].values
y_score = data['Score'].values
y_label = data['Label'].values

In [20]:
#Bag of words
count_vect = CountVectorizer(max_features=1000, min_df=10) 

X_bow = count_vect.fit_transform(X)
#Normalize Data
X_bow = preprocessing.normalize(X_bow)
print("Train Data Size: ",X_bow.shape)


Train Data Size:  (10000, 1000)


In [21]:
from scipy.sparse import csr_matrix

df = pd.DataFrame(data=csr_matrix.todense(X_bow))
df.to_csv('data_BOW.csv', index=False)

# [5] TF-IDF

In [22]:
# Spliting into Train and test
X=data['CleanedText'].values
y_score = data['Score'].values
y_label = data['Label'].values

In [23]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=500, min_df=10) #Using bi-grams
X_tfidf = tfidf.fit_transform(X)
#Normalize Data
X_tfidf = preprocessing.normalize(X_tfidf)
print("Train Data Size: ",X_tfidf.shape)


Train Data Size:  (10000, 500)


In [24]:
data_tfidf = pd.DataFrame(X_tfidf)
data_tfidf['Score'] = y_score
data_tfidf['Label'] = y_label
data_tfidf

Unnamed: 0,0,Score,Label
0,"(0, 475)\t0.19076536226522964\n (0, 474)\t0...",1,0
1,"(0, 226)\t0.3916743389841021\n (0, 262)\t0....",4,1
2,"(0, 89)\t0.11714468575883168\n (0, 455)\t0....",2,0
3,"(0, 290)\t0.0423628920510666\n (0, 478)\t0....",2,0
4,"(0, 442)\t0.34726816820715223\n (0, 205)\t0...",4,1
...,...,...,...
9995,"(0, 282)\t0.3320056698143062\n (0, 88)\t0.4...",2,0
9996,"(0, 334)\t0.12382720220294353\n (0, 123)\t0...",5,1
9997,"(0, 287)\t0.3045800949770464\n (0, 494)\t0....",5,1
9998,"(0, 13)\t0.4241331868249202\n (0, 121)\t0.5...",1,0


In [25]:
# save the dataframe as a csv file
data_tfidf.to_csv("data_tfidf.csv")

# [6] Word2Vec

In [26]:
# Train your own Word2Vec model using your own text corpus
i=0
list_of_sent=[]
for sent in data['CleanedText'].values:
    list_of_sent.append(sent.split())

In [27]:
print(data['CleanedText'].values[0])
print("*****************************************************************")
print(list_of_sent[0])

alway enjoy movi funni entertain didnt hesit pick clamshel edit guess market plan make movi famili someth elimin strong profan element usual edit televis version warn want uncut version avoid clamshel edit
*****************************************************************
['alway', 'enjoy', 'movi', 'funni', 'entertain', 'didnt', 'hesit', 'pick', 'clamshel', 'edit', 'guess', 'market', 'plan', 'make', 'movi', 'famili', 'someth', 'elimin', 'strong', 'profan', 'element', 'usual', 'edit', 'televis', 'version', 'warn', 'want', 'uncut', 'version', 'avoid', 'clamshel', 'edit']


In [28]:
# min_count = 5 considers only words that occured atleast 5 times
w2v_model=Word2Vec(list_of_sent,min_count=5,size=50, workers=4)

In [29]:
w2v_words = list(w2v_model.wv.vocab)
print("number of words that occured minimum 5 times ",len(w2v_words))
print("sample words ", w2v_words[0:50])

number of words that occured minimum 5 times  4671
sample words  ['alway', 'enjoy', 'movi', 'funni', 'entertain', 'didnt', 'hesit', 'pick', 'edit', 'guess', 'market', 'plan', 'make', 'famili', 'someth', 'elimin', 'strong', 'element', 'usual', 'version', 'warn', 'want', 'avoid', 'simpli', 'kind', 'sinc', 'michael', 'play', 'titl', 'charact', 'ghost', 'like', 'mischief', 'call', 'coupl', 'baldwin', 'get', 'rid', 'peopl', 'live', 'hous', 'let', 'know', 'one', 'person', 'favorit', 'said', 'feel', 'need', 'tell']


# [7] Avg Word2Vec

In [30]:
# average Word2Vec
# compute average word2vec for each review.
sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(list_of_sent): # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

100%|██████████| 10000/10000 [00:10<00:00, 950.19it/s]

10000
50





In [31]:
# Spliting into Train and test
X_word2vec = sent_vectors
y_score = data['Score'].values
y_label = data['Label'].values
# X_train, X_test, y_train, y_test = train_test_split(sent_vectors,data['Score'].values,test_size=0.3,shuffle=False)

In [32]:
data_word2vec = pd.DataFrame(X_word2vec)
data_word2vec['Score'] = y_score
data_word2vec['Label'] = y_label
data_word2vec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,Score,Label
0,0.044891,-0.256138,-0.598166,-0.039018,0.195092,-0.442484,0.580296,-0.245235,0.080043,0.049164,0.261092,0.405602,-0.134290,-0.185865,-0.005600,-0.172665,0.119234,0.076928,-0.435445,-0.089709,-0.663312,-0.258832,-0.322153,0.382087,0.539053,-0.167788,0.019849,0.067652,0.300960,0.241558,0.102649,-0.103725,0.499754,0.249976,-0.143729,0.556381,-0.193111,0.058652,-0.079847,0.054239,-0.212595,-0.019018,0.461213,0.284678,-0.446596,-0.007204,-0.059594,0.258854,-0.563844,-0.053044,1,0
1,0.152413,-0.113530,-0.445075,0.007054,0.064675,-0.500506,0.443998,-0.153978,-0.062823,0.037497,0.141807,0.361689,-0.139014,-0.220442,0.040109,-0.237165,0.041039,0.060067,-0.370917,0.026846,-0.495134,-0.337890,-0.327433,0.324653,0.419561,-0.094801,0.054318,0.043873,0.304693,0.104381,0.071967,-0.194673,0.585458,0.345897,-0.095424,0.559222,-0.217531,-0.040563,-0.090264,0.002842,-0.163664,-0.006462,0.521571,0.194156,-0.424933,0.019791,-0.051937,0.150371,-0.469673,-0.025651,4,1
2,0.061683,-0.100678,-0.551078,0.078619,0.145116,-0.697096,0.468621,-0.251489,0.042705,0.148208,0.074293,0.297637,-0.160052,-0.325402,0.029803,-0.172086,0.215017,0.181165,-0.470747,-0.028614,-0.673408,-0.259001,-0.367351,0.298833,0.494648,-0.208461,0.110738,0.049144,0.326414,0.067134,0.073732,-0.227355,0.529893,0.209998,-0.093570,0.681038,-0.273163,-0.045612,-0.108927,0.031248,-0.159813,-0.074102,0.559340,0.133942,-0.484988,-0.061727,-0.072371,0.208605,-0.602103,-0.113836,2,0
3,-0.112709,-0.761445,-0.987060,-0.162646,0.435036,0.038056,0.852981,-0.330765,0.077339,0.039278,0.315445,0.798171,0.393390,-0.370587,-0.129832,-0.260898,0.381345,0.080859,-0.545484,-0.434313,-0.563223,-0.261247,-0.397727,0.390598,0.626574,-0.335781,-0.004405,-0.032514,0.621462,0.278556,-0.037291,-0.013664,0.745157,0.312444,-0.406944,0.889659,0.081404,0.025808,-0.060079,0.156098,-0.216924,0.061572,0.283327,0.364249,-0.609897,-0.396770,-0.228655,0.111830,-0.594518,-0.525978,2,0
4,-0.344697,-0.734372,-1.073051,-0.189044,0.529297,0.109918,0.878116,-0.453972,0.246173,-0.026920,0.384824,0.676637,0.174974,-0.363531,-0.140483,-0.271777,0.188507,0.023731,-0.518481,-0.509702,-0.731762,-0.254866,-0.471286,0.451922,0.822379,-0.326034,-0.017168,0.013766,0.545127,0.399590,0.019720,0.021545,0.738233,0.503468,-0.285744,1.000909,0.114558,0.035093,0.059317,0.215474,-0.224057,0.083944,0.262109,0.474362,-0.641506,-0.416906,-0.117117,0.226551,-0.708950,-0.443153,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.270722,-0.401977,-1.017921,-0.040838,0.255802,-0.056724,0.775431,-0.421471,0.304188,0.243206,0.192108,0.385441,-0.554647,-0.491858,-0.005803,-0.202277,-0.039301,-0.085032,-0.723369,-0.384925,-0.899896,0.032006,-0.432142,0.433597,0.610779,-0.385966,0.124395,0.209685,0.438224,0.441406,0.089623,-0.120910,0.608923,0.130728,0.024581,0.673516,0.214184,0.119958,0.021610,0.093078,-0.005144,-0.192057,0.284768,0.567618,-0.502028,-0.280753,0.008234,0.141311,-0.877727,-0.242622,2,0
9996,-0.207984,-0.344058,-0.762161,-0.075227,0.341618,-0.494803,0.715904,-0.325467,0.085930,0.324591,0.367880,0.550536,-0.298424,-0.226291,-0.143375,-0.131556,0.017770,0.123542,-0.294731,-0.266422,-0.713243,-0.237671,-0.503137,0.381189,0.660882,-0.106112,0.147937,0.061379,0.135106,0.185254,0.076675,-0.063460,0.578228,0.359718,-0.385383,0.868523,-0.473685,0.065424,-0.176236,0.165420,-0.271446,-0.062899,0.577950,0.155501,-0.415488,-0.128720,-0.086415,0.499306,-0.657485,-0.264165,5,1
9997,-0.197664,-0.681960,-0.757087,-0.084210,0.665644,-0.374096,0.703344,-0.378064,0.061744,-0.011634,0.346844,0.596604,0.168136,-0.337475,-0.360935,0.049986,0.289282,0.385823,-0.650075,-0.152333,-1.056397,-0.296937,-0.280475,0.441174,0.587208,-0.534592,-0.296424,0.084420,0.536752,0.125372,-0.040285,0.172888,0.508525,0.302158,-0.250128,0.764951,-0.053838,-0.003005,-0.110832,0.187312,-0.207441,0.220868,0.390859,0.375785,-0.251807,-0.167964,-0.176038,0.340710,-0.655241,-0.289507,5,1
9998,0.416650,0.015591,-0.357376,-0.012044,0.000436,-0.676851,0.669726,-0.069594,-0.200938,0.058657,0.240161,0.379498,0.044396,-0.095545,-0.059229,-0.048393,0.099855,0.081008,-0.499969,0.143792,-0.455096,-0.405300,-0.575155,0.236903,0.414581,-0.204982,-0.018220,0.061277,0.527398,0.274069,-0.224803,-0.328006,0.680863,0.287812,-0.475617,0.765130,-0.268940,-0.071712,-0.400443,0.204975,-0.403461,0.056953,0.557636,-0.042403,-0.444944,-0.058923,-0.107094,0.249899,-0.515709,-0.227109,1,0


In [33]:
# save the dataframe as a csv file
data_word2vec.to_csv("data_word2vec.csv")

# [8] TF-IDF Word2Vec

In [34]:
model = TfidfVectorizer()
tf_idf_matrix = model.fit_transform(data['CleanedText'].values)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(model.get_feature_names(), list(model.idf_)))

In [35]:
# TF-IDF weighted Word2Vec
tfidf_feat = model.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in tqdm(list_of_sent): # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
#             tf_idf = tf_idf_matrix[row, tfidf_feat.index(word)]
            # to reduce the computation we are 
            # dictionary[word] = idf value of word in whole courpus
            # sent.count(word) = tf valeus of word in this review
            tf_idf = dictionary[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1

100%|██████████| 10000/10000 [00:15<00:00, 658.24it/s]


In [36]:
# Spliting into Train and test
X_tfidf_word2vec = tfidf_sent_vectors
y_score = data['Score'].values
y_label = data['Label'].values
#X_train, X_test, y_train, y_test = train_test_split(tfidf_sent_vectors, data['Score'].values, test_size=0.3, shuffle=False)

In [37]:
data_tfidf_word2vec = pd.DataFrame(X_tfidf_word2vec)
data_tfidf_word2vec['Score'] = y_score
data_tfidf_word2vec['Label'] = y_label
data_tfidf_word2vec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,Score,Label
0,0.083159,-0.194632,-0.453659,-0.035444,0.174189,-0.381779,0.505880,-0.159757,0.035549,0.089590,0.238918,0.362034,-0.101119,-0.141308,-0.037564,-0.131101,0.080507,0.030553,-0.346248,-0.042610,-0.512646,-0.204641,-0.327454,0.315965,0.441291,-0.142503,0.048039,0.069096,0.225739,0.218030,0.025470,-0.100050,0.416521,0.177028,-0.197739,0.491731,-0.182386,0.025821,-0.137783,0.078272,-0.197740,-0.038066,0.381924,0.189625,-0.330671,-0.040524,-0.080685,0.266879,-0.467894,-0.094108,1,0
1,0.182530,-0.084354,-0.353159,0.009299,0.048890,-0.437712,0.386937,-0.098773,-0.058942,0.037930,0.129005,0.307724,-0.082574,-0.161125,0.026752,-0.170740,0.054280,0.033531,-0.317387,0.047027,-0.400568,-0.283178,-0.290259,0.270442,0.345292,-0.089674,0.031065,0.040394,0.250634,0.114687,0.038071,-0.180921,0.491093,0.250372,-0.118238,0.473697,-0.178963,-0.034015,-0.120572,0.028771,-0.156422,-0.027089,0.446578,0.126239,-0.351581,0.013410,-0.064434,0.163877,-0.407288,-0.058466,4,1
2,0.055931,-0.103146,-0.515731,0.065286,0.154119,-0.593493,0.412991,-0.244884,0.045804,0.157224,0.068207,0.277927,-0.167613,-0.310023,0.009620,-0.142680,0.199333,0.135882,-0.459699,-0.049237,-0.603663,-0.228993,-0.361754,0.296317,0.449939,-0.213134,0.089973,0.056595,0.314425,0.085024,0.047759,-0.221201,0.487626,0.189532,-0.106455,0.646187,-0.231834,-0.052792,-0.094456,0.054439,-0.145626,-0.096931,0.480236,0.125176,-0.418755,-0.080614,-0.055943,0.223159,-0.569929,-0.127378,2,0
3,-0.301735,-1.454538,-1.443953,-0.423058,0.745548,0.854333,1.137385,-0.320483,0.108481,-0.183206,0.496736,1.307802,1.130323,-0.390024,-0.237633,-0.260837,0.659045,0.003235,-0.582552,-0.860957,-0.331522,-0.346311,-0.405468,0.491110,0.771854,-0.423842,-0.190281,-0.157826,0.892140,0.412639,-0.132194,0.191795,1.071672,0.498969,-0.726253,1.155514,0.449098,0.033367,0.036988,0.366774,-0.288338,0.238161,0.141313,0.503574,-0.837724,-0.704257,-0.411111,0.052158,-0.543236,-1.124051,2,0
4,-0.360219,-0.905066,-1.144784,-0.225217,0.583697,0.332576,0.918250,-0.396582,0.250063,-0.107090,0.410182,0.798729,0.393564,-0.351672,-0.140062,-0.238309,0.264878,-0.009531,-0.538710,-0.608617,-0.651600,-0.256359,-0.431437,0.467313,0.812631,-0.353242,-0.079837,-0.018272,0.602352,0.441954,-0.005931,0.078273,0.799167,0.493931,-0.334667,1.003607,0.252370,0.055673,0.081734,0.274823,-0.238522,0.103669,0.246696,0.515018,-0.709276,-0.455902,-0.190197,0.204533,-0.687956,-0.605506,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.163092,-0.355032,-0.914873,-0.039714,0.231773,-0.043753,0.749274,-0.326036,0.258905,0.229230,0.170351,0.385586,-0.460938,-0.425561,-0.037664,-0.138333,-0.019238,-0.101709,-0.721374,-0.309469,-0.775802,-0.002793,-0.426320,0.387509,0.526209,-0.363301,0.091780,0.183316,0.442504,0.451305,0.023736,-0.129733,0.570134,0.081353,-0.053752,0.588969,0.193793,0.105418,-0.027110,0.124627,-0.020541,-0.206675,0.267586,0.473664,-0.430063,-0.264652,-0.022971,0.148273,-0.809627,-0.280319,2,0
9996,-0.341011,-0.513429,-0.964311,-0.189818,0.462490,-0.297201,0.722084,-0.377693,0.142995,0.440913,0.365508,0.659601,-0.394281,-0.228071,-0.271340,0.029441,0.020985,0.110402,-0.285605,-0.413718,-0.697065,-0.137035,-0.522460,0.393528,0.642025,-0.116644,0.088274,0.096630,0.082774,0.193708,0.051664,0.035999,0.610939,0.300733,-0.477410,0.858952,-0.484857,0.149402,-0.152221,0.262421,-0.236383,-0.094631,0.462599,0.073970,-0.259563,-0.178919,-0.007505,0.630865,-0.732386,-0.417839,5,1
9997,-0.203475,-0.620582,-0.754626,-0.068063,0.643990,-0.280575,0.672656,-0.311254,0.105479,-0.032455,0.307351,0.580371,0.103899,-0.346229,-0.305727,0.008889,0.229507,0.290814,-0.640825,-0.139449,-0.951683,-0.293946,-0.318863,0.424384,0.578558,-0.480542,-0.229807,0.095570,0.531431,0.164421,-0.038022,0.121791,0.536296,0.303890,-0.215264,0.723741,-0.022116,-0.018357,-0.059790,0.190892,-0.187125,0.169207,0.356784,0.349531,-0.256880,-0.182173,-0.146780,0.319651,-0.644908,-0.316173,5,1
9998,0.381873,0.042101,-0.332223,0.043093,0.046490,-0.562722,0.545775,-0.016921,-0.140354,0.078318,0.151185,0.343640,-0.021557,-0.166892,-0.013785,-0.020157,0.130410,0.042459,-0.546472,0.133871,-0.387513,-0.367666,-0.563361,0.231773,0.352244,-0.233120,0.031643,0.082262,0.525724,0.247824,-0.207455,-0.366197,0.652663,0.227508,-0.390924,0.693286,-0.200515,-0.090087,-0.343469,0.203983,-0.297496,-0.039235,0.508315,-0.038029,-0.388524,-0.091803,-0.137194,0.241283,-0.500043,-0.306679,1,0


In [38]:
# save the dataframe as a csv file
data_tfidf_word2vec.to_csv("data_tfidf_word2vec.csv")