## Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import sqlite3
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
from collections import Counter 
from sklearn.model_selection import train_test_split


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
np.random.seed(7)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Loading data

In [2]:
con = sqlite3.connect('amazon-fine-food-reviews\database.sqlite')
filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3""", con) 
def partition(x):
    if x < 3:
        return 0
    return 1
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative
print("Number of data points in our data", filtered_data.shape)
filtered_data.head(3)

Number of data points in our data (525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [3]:
sorted_data=filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape

(364173, 10)

In [4]:
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]
print(final.shape)
final['Score'].value_counts()

(364171, 10)


1    307061
0     57110
Name: Score, dtype: int64

In [5]:
# printing some random reviews
sent_0 = final['Text'].values[0]
print(sent_0)
print("="*50)

sent_1000 = final['Text'].values[1000]
print(sent_1000)
print("="*50)

sent_1500 = final['Text'].values[1500]
print(sent_1500)
print("="*50)

sent_4900 = final['Text'].values[4900]
print(sent_4900)
print("="*50)

this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college
I was really looking forward to these pods based on the reviews.  Starbucks is good, but I prefer bolder taste.... imagine my surprise when I ordered 2 boxes - both were expired! One expired back in 2005 for gosh sakes.  I admit that Amazon agreed to credit me for cost plus part of shipping, but geez, 2 years expired!!!  I'm hoping to find local San Diego area shoppe that carries pods so that I can try something different than starbucks.
Great ingredients although, chicken should have been 1st rather than chicken broth, the only thing I do not think belongs in it is Canola oil. Canola or rapeseed is not someting a do

In [6]:
# remove urls from text
sent_0 = re.sub(r"http\S+", "", sent_0)
sent_1000 = re.sub(r"http\S+", "", sent_1000)
sent_150 = re.sub(r"http\S+", "", sent_1500)
sent_4900 = re.sub(r"http\S+", "", sent_4900)
print(sent_0)

this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college


In [7]:
soup = BeautifulSoup(sent_0, 'html.parser')
text = soup.get_text()
print(text)
print("="*50)

soup = BeautifulSoup(sent_1000, 'html.parser')
text = soup.get_text()
print(text)
print("="*50)

soup = BeautifulSoup(sent_1500, 'html.parser')
text = soup.get_text()
print(text)
print("="*50)

soup = BeautifulSoup(sent_4900, 'html.parser')
text = soup.get_text()
print(text)

this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college
I was really looking forward to these pods based on the reviews.  Starbucks is good, but I prefer bolder taste.... imagine my surprise when I ordered 2 boxes - both were expired! One expired back in 2005 for gosh sakes.  I admit that Amazon agreed to credit me for cost plus part of shipping, but geez, 2 years expired!!!  I'm hoping to find local San Diego area shoppe that carries pods so that I can try something different than starbucks.
Great ingredients although, chicken should have been 1st rather than chicken broth, the only thing I do not think belongs in it is Canola oil. Canola or rapeseed is not someting a do

In [8]:
def decontracted(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

sent_1500 = decontracted(sent_1500)
print(sent_1500)
print("="*50)

Great ingredients although, chicken should have been 1st rather than chicken broth, the only thing I do not think belongs in it is Canola oil. Canola or rapeseed is not someting a dog would ever find in nature and if it did find rapeseed in nature and eat it, it would poison them. Today is Food industries have convinced the masses that Canola oil is a safe and even better oil than olive or virgin coconut, facts though say otherwise. Until the late 70 is it was poisonous until they figured out a way to fix that. I still like it but it could be better.


In [9]:
#remove words with numbers
sent_0 = re.sub("\S*\d\S*", "", sent_0).strip()
print(sent_0)

this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college


In [10]:
#remove spacial character
sent_1500 = re.sub('[^A-Za-z0-9]+', ' ', sent_1500)
print(sent_1500)

Great ingredients although chicken should have been 1st rather than chicken broth the only thing I do not think belongs in it is Canola oil Canola or rapeseed is not someting a dog would ever find in nature and if it did find rapeseed in nature and eat it it would poison them Today is Food industries have convinced the masses that Canola oil is a safe and even better oil than olive or virgin coconut facts though say otherwise Until the late 70 is it was poisonous until they figured out a way to fix that I still like it but it could be better 


In [11]:
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [12]:
preprocessed_reviews = []
# tqdm is for printing the status bar
for sentance in tqdm(final['Text'].values):
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'html.parser').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    # https://gist.github.com/sebleier/554280
    sentance = ' '.join(e.lower() for e in sentance.split() if e.lower() not in stopwords)
    preprocessed_reviews.append(sentance.strip())
preprocessed_reviews[1500]

100%|██████████| 364171/364171 [01:51<00:00, 3277.28it/s]


'great ingredients although chicken rather chicken broth thing not think belongs canola oil canola rapeseed not someting dog would ever find nature find rapeseed nature eat would poison today food industries convinced masses canola oil safe even better oil olive virgin coconut facts though say otherwise late poisonous figured way fix still like could better'

## Creating a list of all the words present in reviews

In [13]:
vocabulary = []
for sentance in preprocessed_reviews:
    list_of_sentance = sentance.split()
    for word in list_of_sentance:
        vocabulary.append(word)
vocabualry = vocabulary.sort()
len(vocabulary)

14435977

## Getting the frequency of each word

In [14]:
word_frequency = Counter(vocabulary)
word_frequency = word_frequency.most_common()
word_frequency

[('not', 381088),
 ('like', 154620),
 ('good', 125152),
 ('great', 109229),
 ('one', 107916),
 ('taste', 105882),
 ('product', 98760),
 ('would', 95865),
 ('coffee', 88586),
 ('flavor', 88355),
 ('tea', 85906),
 ('love', 79982),
 ('no', 67999),
 ('get', 66960),
 ('food', 63268),
 ('really', 62010),
 ('amazon', 59551),
 ('use', 57390),
 ('much', 57143),
 ('also', 54665),
 ('time', 54163),
 ('little', 51565),
 ('best', 50593),
 ('find', 49327),
 ('buy', 49281),
 ('price', 48944),
 ('make', 48622),
 ('well', 47635),
 ('tried', 47350),
 ('even', 47178),
 ('better', 43537),
 ('try', 43384),
 ('chocolate', 40677),
 ('eat', 39645),
 ('sugar', 38470),
 ('first', 37801),
 ('water', 37668),
 ('used', 36866),
 ('could', 36473),
 ('found', 35276),
 ('made', 33603),
 ('sweet', 33374),
 ('free', 32874),
 ('bag', 32798),
 ('bought', 32479),
 ('drink', 32066),
 ('dog', 31894),
 ('box', 31830),
 ('cup', 31792),
 ('store', 30983),
 ('way', 30932),
 ('two', 29763),
 ('delicious', 29449),
 ('tastes', 2934

## Ranking the words based on frequency and creating dictionary

In [15]:
i=1
word_dict = dict(word_frequency)
word_dict
for key in word_dict:
    word_dict[key] = i
    i+=1
word_dict

{'not': 1,
 'like': 2,
 'good': 3,
 'great': 4,
 'one': 5,
 'taste': 6,
 'product': 7,
 'would': 8,
 'coffee': 9,
 'flavor': 10,
 'tea': 11,
 'love': 12,
 'no': 13,
 'get': 14,
 'food': 15,
 'really': 16,
 'amazon': 17,
 'use': 18,
 'much': 19,
 'also': 20,
 'time': 21,
 'little': 22,
 'best': 23,
 'find': 24,
 'buy': 25,
 'price': 26,
 'make': 27,
 'well': 28,
 'tried': 29,
 'even': 30,
 'better': 31,
 'try': 32,
 'chocolate': 33,
 'eat': 34,
 'sugar': 35,
 'first': 36,
 'water': 37,
 'used': 38,
 'could': 39,
 'found': 40,
 'made': 41,
 'sweet': 42,
 'free': 43,
 'bag': 44,
 'bought': 45,
 'drink': 46,
 'dog': 47,
 'box': 48,
 'cup': 49,
 'store': 50,
 'way': 51,
 'two': 52,
 'delicious': 53,
 'tastes': 54,
 'order': 55,
 'since': 56,
 'day': 57,
 'think': 58,
 'go': 59,
 'mix': 60,
 'recommend': 61,
 'nice': 62,
 'still': 63,
 'many': 64,
 'know': 65,
 'bit': 66,
 'add': 67,
 'got': 68,
 'never': 69,
 'hot': 70,
 'milk': 71,
 'give': 72,
 'favorite': 73,
 'stuff': 74,
 'want': 75,
 

In [16]:
len(word_dict)

117492

In [17]:
preprocessed_reviews

['witty little book makes son laugh loud recite car driving along always sing refrain learned whales india drooping roses love new words book introduces silliness classic book willing bet son still able recite memory college',
 'grew reading sendak books watching really rosie movie incorporates love son loves however miss hard cover version paperbacks seem kind flimsy takes two hands keep pages open',
 'fun way children learn months year learn poems throughout school year like handmotions invent poem',
 'great little book read aloud nice rhythm well good repetition little ones like lines chicken soup rice child gets go months year go wonderful places like bombay nile eating well know get eat kids maurice sendak version ice skating treat roses heads long time not even know came surprise came little witty book',
 'book poetry months year goes month cute little poem go along love book really fun way learn months poems creative author purpose writing book give children fun way learn months

In [18]:
preprocessed_reviews[1] 

'grew reading sendak books watching really rosie movie incorporates love son loves however miss hard cover version paperbacks seem kind flimsy takes two hands keep pages open'

In [19]:
preprocessed_reviews[0] 

'witty little book makes son laugh loud recite car driving along always sing refrain learned whales india drooping roses love new words book introduces silliness classic book willing bet son still able recite memory college'

## Replacing the words in the review with it's rank

In [20]:
list_of_sentance = []
ranked_reviews = []
for sentance in preprocessed_reviews:
    for j in sentance.split():
        j = word_dict[j]
        list_of_sentance.append(str(j))
        str1 = " ".join(list_of_sentance)
    ranked_reviews.append(str1.split(" "))
    list_of_sentance.clear()
ranked_reviews

[['22917',
  '22',
  '1408',
  '77',
  '364',
  '4907',
  '4430',
  '28040',
  '1264',
  '3279',
  '505',
  '79',
  '7358',
  '12074',
  '1261',
  '33674',
  '2210',
  '23184',
  '3861',
  '12',
  '153',
  '1616',
  '1408',
  '16680',
  '25016',
  '1431',
  '1408',
  '1789',
  '2289',
  '364',
  '63',
  '239',
  '28040',
  '3347',
  '1936'],
 ['1105',
  '686',
  '19714',
  '3354',
  '1421',
  '16',
  '18554',
  '1545',
  '12454',
  '12',
  '364',
  '105',
  '114',
  '1197',
  '112',
  '1391',
  '388',
  '53489',
  '340',
  '251',
  '4213',
  '551',
  '52',
  '778',
  '102',
  '7976',
  '329'],
 ['677',
  '51',
  '830',
  '1978',
  '226',
  '182',
  '1978',
  '30045',
  '1610',
  '1066',
  '182',
  '2',
  '81767',
  '14830',
  '15771'],
 ['4',
  '22',
  '1408',
  '330',
  '22961',
  '62',
  '14903',
  '28',
  '3',
  '20307',
  '22',
  '272',
  '2',
  '3560',
  '154',
  '260',
  '147',
  '1025',
  '419',
  '59',
  '226',
  '182',
  '59',
  '133',
  '1542',
  '2',
  '11259',
  '13931',
  

In [21]:
len(preprocessed_reviews) == len(ranked_reviews)

True

In [22]:
len(ranked_reviews)

364171

In [23]:
max_review_length = len(max(ranked_reviews, key = len))
max_review_length

1596

## Creating train and test sets

In [24]:
X_train, X_test, y_train, y_test = train_test_split(ranked_reviews, final['Score'], test_size = 0.30, random_state = 0)
print(X_train[1])
print(type(X_train[1]))
print(len(X_train[1]))

['61', '1004', '4648', '37', '194', '1', '1214', '60', '1214', '37', '194', '22', '37', '421', '592', '303', '3918', '964', '242', '22', '652', '1783', '2103', '2947', '592', '181', '146', '269', '911', '2947', '14', '3687', '31', '964', '37', '194', '36', '1204', '60', '527', '10', '6', '170', '3', '437', '388', '1756', '843', '6', '160', '14', '38', '843', '6', '6', '1187', '14', '1163', '6', '322', '166', '35', '664', '1952', '2838', '169', '804', '843', '551', '14', '38', '843', '971', '1', '25', '890', '843', '58', '843', '4770', '380', '60', '46']
<class 'list'>
83


In [25]:
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
print(X_train.shape)
print(X_train[1])

(254919, 1596)
[  0   0   0 ... 380  60  46]


In [26]:
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(len(word_dict)+1, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1596, 32)          3759776   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 3,813,077
Trainable params: 3,813,077
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model.fit(X_train, y_train, epochs=10, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/10
Epoch 2/10
 10176/254919 [>.............................] - ETA: 3:40:25 - loss: 0.1429 - acc: 0.9471