## Loading Data

In [7]:
import sqlite3
import pandas as pd

con = sqlite3.connect("dataset/amazon-fire-food-review/database.sqlite")

filtered_data = pd.read_sql_query("""
SELECT * 
FROM Reviews 
WHERE Score != 3
""",con)

filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Changing rating to postive or negative

In [8]:
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition)
filtered_data['Score'] = positiveNegative

filtered_data.shape
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Data Cleaning: Deduplication

In [9]:
sorted = filtered_data.sort_values('ProductId',axis=0, ascending=True, inplace=False, kind="quicksort",na_position="last")
final = sorted.drop_duplicates(subset={"UserId","ProfileName","Time","Text"},inplace=False,keep="first")
final.shape

(364173, 10)

In [10]:
(final["Id"].size*1.0/filtered_data["Id"].size*1.0)*100

69.25890143662969

Remove values where HelpfullnessNumerator is greater than denominator

In [11]:
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]
final.shape

(364171, 10)

### Text Preprocessing 

In [12]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/d2c/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
def clean_html(sentence):
    cleaner = re.compile('<.*?>')
    cleaned_sentence = re.sub(cleaner,' ',sentence)
    return cleaned_sentence

def clean_punctuation(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return cleaned

stop = set(stopwords.words('english'))

sno = SnowballStemmer('english')
sno.stem('tasty')

'tasti'

In [15]:
strl = ' '
final_string = []
all_positive_words = []
all_negative_words = []
i=0
s=''

for sentence in final['Text'].values:
    filtered_sentence = []
    sentence = clean_html(sentence)
    for w in sentence.split():
        for cleaned_word in clean_punctuation(w).split(): 
            if ((cleaned_word.isalpha()) and (len(cleaned_word) > 2)):
                if (cleaned_word.lower() not in stop):
                    s = (sno.stem(cleaned_word.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if (final['Score'].values)[i] == 'positive':
                        all_positive_words.append(s)
                    if (final['Score'].values)[i] == 'negative':
                        all_negative_words.append(s)
                else:
                    continue
            else:
                continue
    strl = b" ".join(filtered_sentence)
    final_string.append(strl)
    i+=1

In [16]:
final_string[:10]

[b'witti littl book make son laugh loud recit car drive along alway sing refrain hes learn whale india droop love new word book introduc silli classic book will bet son still abl recit memori colleg',
 b'grew read sendak book watch realli rosi movi incorpor love son love howev miss hard cover version paperback seem kind flimsi take two hand keep page open',
 b'fun way children learn month year learn poem throughout school year like handmot invent poem',
 b'great littl book read nice rhythm well good repetit littl one like line chicken soup rice child get month year wonder place like bombay nile eat well know get eat kid mauric sendak version ice skate treat rose head long time wont even know came surpris came littl witti book',
 b'book poetri month year goe month cute littl poem along love book realli fun way learn month poem creativ author purpos write book give children fun way learn month children also learn thing poetri rhythm read book',
 b'charm rhyme book describ circumst eat do

In [17]:
final["CleanedText"] = final_string

In [18]:
conn = sqlite3.connect('./dataset/final.sqlite')
c=conn.cursor()
conn.text_factory = str
final.to_sql('Reviews', conn, schema=None, if_exists='replace', index=True, index_label=None, chunksize=None, dtype=None)

### Bag of Words

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer()
final_counts = count_vector.fit_transform(final['CleanedText'].values)
type(final_counts)

scipy.sparse.csr.csr_matrix

In [23]:
final_counts.get_shape()

(364171, 71624)

In [24]:
print(count_vector.get_feature_names()[:10])

['aa', 'aaa', 'aaaa', 'aaaaa', 'aaaaaaaaaaaaaa', 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 'aaaaaaaaaaaaaaaaaaaargh', 'aaaaaaaaagghh', 'aaaaaaah', 'aaaaaaahhhhhh']


In [26]:
frequency_positive = nltk.FreqDist(all_positive_words)
frequency_negative = nltk.FreqDist(all_negative_words)

print("Most common positive words: ", frequency_positive.most_common(20))
print("\n")
print("Most common positive words: ", frequency_negative.most_common(20))

Most common positive words:  [(b'like', 139429), (b'tast', 129047), (b'good', 112766), (b'flavor', 109624), (b'love', 107357), (b'use', 103888), (b'great', 103870), (b'one', 96726), (b'product', 91033), (b'tri', 86791), (b'tea', 83888), (b'coffe', 78814), (b'make', 75107), (b'get', 72125), (b'food', 64802), (b'would', 55568), (b'time', 55264), (b'buy', 54198), (b'realli', 52715), (b'eat', 52004)]


Most common positive words:  [(b'tast', 34585), (b'like', 32330), (b'product', 28218), (b'one', 20569), (b'flavor', 19575), (b'would', 17972), (b'tri', 17753), (b'use', 15302), (b'good', 15041), (b'coffe', 14716), (b'get', 13786), (b'buy', 13752), (b'order', 12871), (b'food', 12754), (b'dont', 11877), (b'tea', 11665), (b'even', 11085), (b'box', 10844), (b'amazon', 10073), (b'make', 9840)]


As we can observe that the words occuring in positive as well as negative are quite similar therefore we can predict based on the words alone.

### Bi-gram and n-gram

In [27]:
count_vect = CountVectorizer(ngram_range=(1,2))
final_bigram_counts = count_vect.fit_transform(final["Text"])
final_bigram_counts.get_shape()

(364171, 2910192)

### TF-IDF

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tfidf = tfidf_vect.fit_transform(final["CleanedText"])
final_tfidf.get_shape()

(364171, 2923725)

<strong>Get top tf-idf of a row</strong>

In [34]:
import numpy as np

features = tfidf_vect.get_feature_names()

def get_top_tfidf(row, features, n_top=25):
    top_indices = np.argsort(row)[::-1][:n_top]
    top_features = [(features[i],row[i]) for i in top_indices]
    df  = pd.DataFrame(top_features)
    df.columns = ['features','tfidf']
    return df

In [38]:
top_tfidf = get_top_tfidf(final_tfidf[1,:].toarray()[0],features,25)
top_tfidf

Unnamed: 0,features,tfidf
0,page open,0.192673
1,read sendak,0.192673
2,movi incorpor,0.192673
3,paperback seem,0.192673
4,version paperback,0.192673
5,flimsi take,0.192673
6,incorpor love,0.192673
7,rosi movi,0.192673
8,keep page,0.192673
9,grew read,0.192673


### Word2Vec

#### Training word2vec model

In [40]:
import gensim

i = 0 
list_of_sentence = []
for sentence in final["Text"].values:
    filtered_sentence = []
    sentence = clean_html(sentence)
    for w in sentence.split():
        for cleaned_word in clean_punctuation(w).split():
            if cleaned_word.isalpha():
                filtered_sentence.append(cleaned_word.lower())
            else:
                continue
    list_of_sentence.append(filtered_sentence)



In [49]:
print(list_of_sentence[0])

['this', 'witty', 'little', 'book', 'makes', 'my', 'son', 'laugh', 'at', 'loud', 'i', 'recite', 'it', 'in', 'the', 'car', 'as', 'were', 'driving', 'along', 'and', 'he', 'always', 'can', 'sing', 'the', 'refrain', 'hes', 'learned', 'about', 'whales', 'india', 'drooping', 'i', 'love', 'all', 'the', 'new', 'words', 'this', 'book', 'introduces', 'and', 'the', 'silliness', 'of', 'it', 'all', 'this', 'is', 'a', 'classic', 'book', 'i', 'am', 'willing', 'to', 'bet', 'my', 'son', 'will', 'still', 'be', 'able', 'to', 'recite', 'from', 'memory', 'when', 'he', 'is', 'in', 'college']


In [48]:
from gensim.models import Word2Vec
w2v_model = Word2Vec(list_of_sentence ,min_count=5,vector_size=50, workers=4)

In [55]:
words = list(w2v_model.wv.key_to_index)
len(words)

33783

In [59]:
w2v_model.wv.most_similar('tasty')

[('tastey', 0.9014021158218384),
 ('yummy', 0.8504840135574341),
 ('satisfying', 0.8436217904090881),
 ('filling', 0.8280801773071289),
 ('delicious', 0.8170642256736755),
 ('flavorful', 0.7954986095428467),
 ('tasteful', 0.7861400246620178),
 ('delectable', 0.75958251953125),
 ('addicting', 0.7516376972198486),
 ('nutritious', 0.7488244771957397)]

### Average Word2Vec and TF-IDF Word2Vec

#### Average Word2Vec

In [60]:
sent_vectors= []
for sent in list_of_sentence:
    sent_vec = np.zeros(50)
    count_words =0
    for word in sent:
        try:
            vec = w2v_model.wv[word]
            sent_vec += vec
            count_words += 1
        except:
            pass
    sent_vec /= count_words
    sent_vectors.append(sent_vec)

print(len(sent_vectors))
print(len(sent_vectors[0]))

  sent_vec /= count_words


364171
50


#### TF-IDF Word2Vec

In [None]:
tfidf_sent_vectors= []
row =0
for sent in list_of_sentence:
    sent_vec = np.zeros(50)
    weight_sum =0
    for word in sent:
        try:
            vec = w2v_model.wv[word]
            tfidf = final_tfidf[row,features.index(word)]
            sent_vec += (vec*tfidf)
            weight_sum += tfidf
        except:
            pass
    sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row+=1

print(len(tfidf_sent_vectors))
print(len(tfidf_sent_vectors[0]))