# Objectives

***
## Obtain Data

In [1]:
# import libraries
import pandas as pd
import numpy as np
import spacy
from spacy.matcher import Matcher
from collections import Counter
from wordcloud import WordCloud
from pickle import dump, load

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

from myFunc import *

The file *off_gt_3000.csv* is a subset of the *Office_Products.json* file.  It contains the reviews of those office products that each have more than 3,000 reviews.
<p>We subset the data because the <i>Office_Products.json</i> file contains more than 5 million rows.  If we were to load all the data, we would not have enough system memory to build the model.</p>

In [2]:
# read the review subset; the first CSV column is used as the row index
df = pd.read_csv(
    'data/off_gt_3000.csv',
    index_col=0,
)

# show the leading rows, then report how many rows were loaded
display(df.head(5))
print(f'\n Number of rows: {len(df)}')

Unnamed: 0,asin,overall,review
79839,B00000JRRD,5,I have had this pad for over a year now and I ...
79840,B00000JRRD,5,Comfortable hand and wrist pad. Easy to keep c...
79841,B00000JRRD,5,Belkin === quality;\nEnough said. Five Stars
79842,B00000JRRD,3,"Good enough I suppose, but not very much wrist..."
79843,B00000JRRD,1,The wrist pad started coming apart at the seam...



 Number of rows: 143631


***
## Clean Data

In [3]:
# collapse the star rating into a binary label: 1-3 stars -> 0, 4-5 stars -> 1
rating_dict = {1: 0, 2: 0, 3: 0, 4: 1, 5: 1}

# pop() removes the 'overall' column from the frame and hands back its values,
# which are mapped straight into the new 'label' column
df['label'] = df.pop('overall').map(rating_dict)

# preview data
df.head()

Unnamed: 0,asin,review,label
79839,B00000JRRD,I have had this pad for over a year now and I ...,1
79840,B00000JRRD,Comfortable hand and wrist pad. Easy to keep c...,1
79841,B00000JRRD,Belkin === quality;\nEnough said. Five Stars,1
79842,B00000JRRD,"Good enough I suppose, but not very much wrist...",0
79843,B00000JRRD,The wrist pad started coming apart at the seam...,0


In [4]:
# count missing values per column
df.isna().sum()

asin        0
review    104
label       0
dtype: int64

In [5]:
# discard every row that holds a missing value in any column
# (per the null check above, only `review` had any)
df.dropna(axis=0, how='any', inplace=True)

# confirm that no nulls remain
df.isna().sum()

asin      0
review    0
label     0
dtype: int64

In [6]:
# Collect the index of every review that is a string but carries no visible
# text, i.e. it is either the empty string or made up of whitespace only.
# `str.isspace()` returns False for '', so the emptiness test uses `strip()`
# to cover both cases.
no_texts = [
    ind
    for ind, review in df['review'].items()
    if isinstance(review, str) and not review.strip()
]

# report how many blank reviews were found
print(f'Reviews with empty string: {len(no_texts)}')

Reviews with empty string: 0


In [7]:
# relative share of positive (1) and negative (0) labels
df.loc[:, 'label'].value_counts(normalize=True)

1    0.807987
0    0.192013
Name: label, dtype: float64

***
## Explore Data

In [8]:
# load the large English spaCy pipeline with the parser and NER components disabled
nlp = spacy.load(
    'en_core_web_lg',
    disable=['parser', 'ner'],
)
# raise the maximum text length the pipeline accepts
nlp.max_length = 23_395_460

### Add/Remove Stop Words

In [9]:
# add new stop words
# NOTE: this cell is intentionally disabled -- every line below is commented out.
# If re-enabled, it would register the whitespace tokens listed in `new_sw`
# as stop words on the loaded `nlp` pipeline.
# new_sw = [' ', '\n', '\n\n']
# for w in new_sw:
#     nlp.Defaults.stop_words.add(w)
#     nlp.vocab[w].is_stop = True

In [10]:
# Remove negation words from the default stop-word list so that they are
# no longer flagged as stop words.
rm_sw = ['no', 'not']
for w in rm_sw:
    # discard() instead of remove(): re-running this cell in the same kernel
    # must not raise KeyError once the word has already been taken out
    nlp.Defaults.stop_words.discard(w)
    nlp.vocab[w].is_stop = False

### Generate and Plot Most Used Words

***
## Model C - LSTM Keep Upper Case

In [9]:
# import library and set random seeds
import random

# Seed Python's generator and NumPy's global generator; seeding only `random`
# leaves every NumPy-driven random step unseeded.
# NOTE(review): TensorFlow keeps its own random state -- seed it as well if
# fully repeatable training runs are required.
random.seed(123)
np.random.seed(123)

### Define X and y

In [10]:
# ⏰ This cell takes about 00:05:45 to run
# convert the review column to a plain list of strings
sentences = list(df['review'])

# define X: pass every review through sep_punc (imported from myFunc) with
# lower=False so the original casing is kept; per the original note this
# step removes html tags and punctuation
X = [sep_punc(sen, lower=False) for sen in sentences]

# preview processed texts
X[:3]

["I have had this pad for over a year now and I just could n't go without it When I was on the computer alot my hand would start to hurt but with the gel pad I no longer have any pain It is very soft and supports my wrist very well Overall this is a great mouse pad with great wrist support Gel is very comfortable",
 'Comfortable hand and wrist pad Easy to keep clean and very functional I would definitely buy again and recommend Comfy',
 'Belkin quality Enough said Five Stars']

In [11]:
# save X
# the context manager guarantees the pickle file is flushed and closed,
# even if dump() raises part-way through
with open('X_keepCase', 'wb') as x_file:
    dump(X, x_file)

In [14]:
# target vector: the binary label column copied into a NumPy array
y = np.array(df['label'].values)

### Create Train and Test Data

In [15]:
# hold out 30% of the samples for testing; the fixed random_state keeps the
# shuffled split repeatable
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=36,
    test_size=0.30,
)

### Create Embedded Layer

In [20]:
# import library
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Activation, Dense, Dropout
from tensorflow.keras.layers import Embedding, Flatten, LSTM
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Sequential, save_model, load_model



  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [21]:
# Create a word-to-index dictionary from the training texts.
#
# fit_on_texts updates the tokenizer's internal vocabulary from a list of
# texts and builds the index by word frequency.  Given something like
# "The cat sat on the mat." it produces word_index["the"] = 1,
# word_index["cat"] = 2, ... so every word gets a unique integer and a lower
# integer means a more frequent word (the first few are often stop words).
# 0 is reserved for padding.
#
# NOTE(review): Tokenizer is created with its default settings apart from
# num_words -- confirm that its default case handling is what the
# "keep upper case" variant intends.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train_c)

# Create text sequences: texts_to_sequences replaces each word of each text
# with its integer from the word_index dictionary.
X_train_c_ts, X_test_c_ts = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train_c, X_test_c)
)

# preview train data: raw text, a blank gap, then its integer sequence
print(X_train_c[0], '\n', X_train_c_ts[0], sep='\n')

Great household shredder Easily handles several sheets is reasonably quiet and has n't given us any trouble so far Five Stars


[12, 1264, 52, 226, 1307, 283, 196, 9, 1305, 331, 4, 54, 22, 697, 601, 117, 656, 23, 116, 27, 20]


In [22]:
# bring every sequence to a fixed length of 100, padding at the end ('post')
pad_opts = {'maxlen': 100, 'padding': 'post'}
X_train_c_ts = pad_sequences(X_train_c_ts, **pad_opts)
X_test_c_ts = pad_sequences(X_test_c_ts, **pad_opts)

# preview the first padded training sequence
X_train_c_ts[0]

array([  12, 1264,   52,  226, 1307,  283,  196,    9, 1305,  331,    4,
         54,   22,  697,  601,  117,  656,   23,  116,   27,   20,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0])

In [23]:
# count the distinct words the tokenizer recorded while fitting
word_counts = tokenizer.word_counts
vocab_size = len(word_counts)
print(vocab_size)

31942


In [24]:
# load pre-trained word embedding matrix
# Each line of the GloVe file is "<word> <v1> <v2> ..."; the result maps
# word -> float32 vector.
embed_dict = dict()

# the context manager closes the file even if a malformed line raises
# part-way through the loop
with open('data/glove.6B.100d.txt', encoding='utf8') as glove_file:
    for line in glove_file:
        # split the line
        row = line.split()
        # skip blank lines instead of failing on row[0]
        if not row:
            continue
        # get the word
        word = row[0]
        # assign the vector to array
        vector_dimensions = np.asarray(row[1:], dtype='float32')
        # pair the word and the vector
        embed_dict[word] = vector_dimensions

# preview matrix
embed_dict['great']

array([-0.013786 ,  0.38216  ,  0.53236  ,  0.15261  , -0.29694  ,
       -0.20558  , -0.41846  , -0.58437  , -0.77355  , -0.87866  ,
       -0.37858  , -0.18516  , -0.128    , -0.20584  , -0.22925  ,
       -0.42599  ,  0.3725   ,  0.26077  , -1.0702   ,  0.62916  ,
       -0.091469 ,  0.70348  , -0.4973   , -0.77691  ,  0.66045  ,
        0.09465  , -0.44893  ,  0.018917 ,  0.33146  , -0.35022  ,
       -0.35789  ,  0.030313 ,  0.22253  , -0.23236  , -0.19719  ,
       -0.0053125, -0.25848  ,  0.58081  , -0.10705  , -0.17845  ,
       -0.16206  ,  0.087086 ,  0.63029  , -0.76649  ,  0.51619  ,
        0.14073  ,  1.019    , -0.43136  ,  0.46138  , -0.43585  ,
       -0.47568  ,  0.19226  ,  0.36065  ,  0.78987  ,  0.088945 ,
       -2.7814   , -0.15366  ,  0.01015  ,  1.1798   ,  0.15168  ,
       -0.050112 ,  1.2626   , -0.77527  ,  0.36031  ,  0.95761  ,
       -0.11385  ,  0.28035  , -0.02591  ,  0.31246  , -0.15424  ,
        0.3778   , -0.13599  ,  0.2946   , -0.31579  ,  0.4294

In [25]:
# Build the embedding weight matrix: row i holds the pre-trained vector of the
# word whose tokenizer index is i.  Row 0 and the rows of words that are
# missing from embed_dict stay all-zero.

vocab_size = 1 + len(tokenizer.word_index)
embed_matrix = np.zeros(shape=(vocab_size, 100))

for token, row_idx in tokenizer.word_index.items():
    if token in embed_dict:
        embed_matrix[row_idx] = embed_dict[token]

# check shape of the matrix
embed_matrix.shape

(31943, 100)

In [27]:
# embedding layer initialised from embed_matrix and frozen (trainable=False)
embed_layer = Embedding(
    input_dim=vocab_size,
    output_dim=100,
    weights=[embed_matrix],
    input_length=100,
    trainable=False,
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


### Create and Fit Model 

In [42]:
# assemble the network in one go:
# frozen embeddings -> LSTM (full sequence) -> max-pool over time -> dense head
model_d = Sequential([
    embed_layer,
    LSTM(200, return_sequences=True),
    GlobalMaxPool1D(),
    Dense(100, activation='relu'),  # prec0: .93, prc1: .94, rec0: .78, rec1:.98, acc:.947
    # Dense(50, activation='relu'),  # prec0: .78, prc1: .98, rec0: .93, rec1:.93, acc:.938
    Dense(1, activation='sigmoid'),
])

# compile model for binary classification
model_d.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# view model summary
model_d.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          3194300   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 200)          240800    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 200)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total params: 3,455,301
Trainable params: 261,001
Non-trainable params: 3,194,300
_________________________________________________________________


In [43]:
# ⏰ This cell takes about 00:12:30 to run
# train for 10 epochs in batches of 128, with 10% of the training data
# set aside for validation
history_d = model_d.fit(
    X_train_c_ts,
    y_train_c,
    batch_size=128,
    epochs=10,
    validation_split=0.1,
)

Train on 90421 samples, validate on 10047 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
# save model
# NOTE: persistence is currently disabled -- every line below is commented out.
# The first pair would save the trained model and pickle the fitted tokenizer;
# the second pair would load a model and a tokenizer back.
# NOTE(review): the load lines reference different file names than the save
# lines -- confirm which artefacts are intended before re-enabling.
# model_d.save('sentiment_LSTM_keepCase_2Dense.h5')
# dump(tokenizer, open('my_tokenizer_keepCase', 'wb'))

# model = load_model('sentiment_LSTM.h5')
#tokenizer = load(open('my_tokenizer', 'rb'))

In [44]:
# ⏰ This cell takes about 00:05:00 to run
# report train vs. test accuracy for the fitted model
# (eval_train_test is provided by myFunc)
eval_train_test(
    model_d,
    X_train_c_ts, y_train_c,
    X_test_c_ts, y_test_c,
)



Train Accuracy: [1m 0.9877[0m
Test Accuracy: [1m 0.9476[0m


Percent difference: [1m -0.04[0m


In [45]:
# plot train & validation accuracy and loss per epoch
# (plot_acc_los_epoch is provided by myFunc; the 10 presumably matches the
# epochs=10 used when fitting -- confirm against the helper's signature)
plot_acc_los_epoch(history_d, 10)

In [46]:
# ⏰ This cell takes about 00:01:30 to run
# model outputs for the test set
pred_mld_d = model_d.predict(X_test_c_ts)
# threshold at 0.5: values below become 0, everything else becomes 1
pred_mld_d_label = np.where(pred_mld_d.ravel() < 0.5, 0, 1).tolist()

In [47]:
# show the classification report for the test-set predictions
# (get_clsrpt_confmat is provided by myFunc; judging by its name it also
# produces a confusion matrix -- confirm against the helper)
get_clsrpt_confmat(y_test_c, pred_mld_d_label)

Unnamed: 0,precision,recall,f1-score,support
0,0.842424,0.898471,0.869545,8372.0
1,0.975095,0.959437,0.967203,34687.0
accuracy,0.947584,0.947584,0.947584,0.947584
macro avg,0.908759,0.928954,0.918374,43059.0
weighted avg,0.9493,0.947584,0.948215,43059.0
