In [1]:
### Load all the required libraries/modules
import os
import pandas as pd
import numpy as np


##tensorflow modules
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LSTM, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential

## text preprocessing modules
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import Word2Vec
import re
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
### Load the data
with open("/content/drive/MyDrive/Cell_Phones_and_Accessories_5.json", "r") as f:
    data = pd.read_json(f, orient="records", lines=True)

In [4]:
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


- As we can see now there are two columns in the data- The review and its sentiment.

In [5]:
### Take the reviews text as a list of elements and then perform the basic preprocessing 
reviews= list(data['reviewText'])
reviews[0:2]

["They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again",
 'These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)']

In [6]:
### Basic text preprocessing where we remove special characters, any tags etc
#### Removal of stopwords
text=[]
for r in reviews:
  text.append(remove_stopwords(r))

text[0:3]

["They look good stick good! I don't like rounded shape I bumping Siri kept popping irritating. I won't buy product like",
 'These stickers work like review says do. They stick great stay phone. They super stylish I share sister. :)',
 'These awesome phone look stylish! I far year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!']

In [7]:
### Removal of tags/special characters

### Basic text preprocessing where we remove special characters, any tags etc
#### Removal of stopwords
#### Removal of tags
text=[]
for r in reviews:
  r = re.sub(r'<.*?>',"",r)
  text.append(remove_stopwords(r))

text[0:3]



["They look good stick good! I don't like rounded shape I bumping Siri kept popping irritating. I won't buy product like",
 'These stickers work like review says do. They stick great stay phone. They super stylish I share sister. :)',
 'These awesome phone look stylish! I far year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!']

In [8]:
### Basic text preprocessing where we remove special characters, any tags etc
#### Removal of stopwords
#### Removal of tags
#### Removal of special characters
#### Removal of email id
#### Removal of numbers
text=[]
for r in reviews:
  r = re.sub(r'<.*?>',"",r)
  r = re.sub(r'\S+@\S+\s','',r)
  r = re.sub(r'\d+','',r)
  r = re.sub(r'[#$!\*\)\(\\%:;,\'_-]','',r)
  text.append(remove_stopwords(r))

text[0:3]


['They look good stick good I dont like rounded shape I bumping Siri kept popping irritating. I wont buy product like',
 'These stickers work like review says do. They stick great stay phone. They super stylish I share sister.',
 'These awesome phone look stylish I far year CAN YOU BELIEVE THAT ONE YEAR Great quality']

In [9]:
### Now the reviews are faily clean, we will tokenize the reviews and get word embeddings for these
tokenizer=Tokenizer(num_words=5000,lower=True,oov_token='UNK')


In [10]:
len(text)

194439

In [11]:
text 

['They look good stick good I dont like rounded shape I bumping Siri kept popping irritating. I wont buy product like',
 'These stickers work like review says do. They stick great stay phone. They super stylish I share sister.',
 'These awesome phone look stylish I far year CAN YOU BELIEVE THAT ONE YEAR Great quality',
 'Item arrived great time perfect condition. However I ordered buttons great deal included FREE screen protector. I received one. Though big deal wouldve nice claim comes one.',
 'awesome stays looks great. multiple apple products. especially having nails helps elevated key.',
 'These home button easy. My daughter I like them. I purchase again. Well worth price.',
 'Came described.. It doesnt come unstuck cute People ask I got & great driving.',
 'worked week charge phone . waste money.',
 'Good case solid build. Protects phone good access buttons. Battery charges battery lasts day. I usually leave house return pm. Im glad lasts start end. /',
 'This fantastic case. Very

In [12]:
type(text)

list

# Train Validation Test Splitting

In [13]:
## Before applying the tokenizer, lets split the data into train test
train_text= text[: int(len(text)*0.7)]
test_text = text[int(len(text)*0.7): int(len(text)*0.9)]
val_text = text[int(len(text)*0.9):]



In [14]:
Y=list(data['overall'])
train_y= Y[: int(len(text)*0.7)]
test_y = Y[int(len(text)*0.7):int(len(text)*0.9)]
val_y = Y[int(len(text)*0.9):]

print(len(train_y),len(test_y), len(val_y))

136107 38888 19444


In [15]:
len(val_text)

19444

In [16]:
len(test_text)

38888

In [17]:
len(train_text)

136107

In [18]:
### Now lets tokenize the reviews
tokenizer.fit_on_texts(train_text)
tokenizer.fit_on_texts(test_text)
tokenizer.fit_on_texts(val_text)


In [19]:
tokenizer.word_index

{'UNK': 1,
 'i': 2,
 'phone': 3,
 'case': 4,
 'the': 5,
 'it': 6,
 'like': 7,
 'this': 8,
 'great': 9,
 'use': 10,
 'screen': 11,
 'good': 12,
 'battery': 13,
 'iphone': 14,
 'charge': 15,
 'charger': 16,
 'product': 17,
 'time': 18,
 'dont': 19,
 'works': 20,
 's': 21,
 'little': 22,
 'nice': 23,
 'price': 24,
 'love': 25,
 'work': 26,
 'charging': 27,
 'quality': 28,
 'fit': 29,
 'im': 30,
 'easy': 31,
 'usb': 32,
 'device': 33,
 'its': 34,
 'protector': 35,
 'ive': 36,
 'power': 37,
 'got': 38,
 'better': 39,
 'bought': 40,
 'doesnt': 41,
 'cable': 42,
 'recommend': 43,
 'need': 44,
 'fits': 45,
 'if': 46,
 'cover': 47,
 'looks': 48,
 'buy': 49,
 'protection': 50,
 'cases': 51,
 'new': 52,
 'car': 53,
 'way': 54,
 'sound': 55,
 'my': 56,
 'phones': 57,
 'you': 58,
 'well': 59,
 'galaxy': 60,
 'thing': 61,
 'bluetooth': 62,
 'want': 63,
 'hard': 64,
 'but': 65,
 'so': 66,
 'devices': 67,
 'a': 68,
 'pretty': 69,
 'long': 70,
 'one': 71,
 'headset': 72,
 'samsung': 73,
 'bit': 74,
 'p

In [20]:
len(tokenizer.word_index)

108086

In [22]:

#### Now we need embeddings for these words  
## Get the glove vectors
embeddings_index= dict()
glove= open('/content/drive/MyDrive/glove.6B.200d.txt','r',encoding='utf-8')
for line in glove:
  values = line.split()
  word = values[0]
  coefs = np.asarray(values[1:], dtype='float32')
  embeddings_index[word] = coefs

glove.close()


In [23]:
len(embeddings_index)

400000

In [25]:
### Create an embedding matrix for the vocabulary created for the reviews 
vocab= len(tokenizer.word_index)+1
embedding_matrix = np.zeros((vocab, 200)) 
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [26]:
embedding_matrix.shape

(108087, 200)

In [27]:
train_indices=tokenizer.texts_to_sequences(train_text)

In [28]:
train_indices[0:3]

[[97,
  77,
  12,
  500,
  12,
  2,
  19,
  7,
  1956,
  921,
  2,
  1,
  2009,
  651,
  2763,
  3466,
  2,
  165,
  49,
  17,
  7],
 [158,
  1813,
  26,
  7,
  106,
  482,
  363,
  97,
  500,
  9,
  368,
  3,
  97,
  300,
  859,
  2,
  1665,
  1667],
 [158, 267, 3, 77, 859, 2, 92, 303, 1071, 58, 584, 135, 71, 303, 9, 28]]

In [29]:
## As mentioned we are limiting the number of words say 64
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length=64


In [30]:
train_indices=pad_sequences(train_indices,maxlen=max_length,padding='post')
train_indices=np.asarray(train_indices)
train_y=np.asarray(train_y)

In [31]:
test_indices=tokenizer.texts_to_sequences(test_text)
test_indices=pad_sequences(test_indices,maxlen=max_length,padding='post')



test_indices=np.asarray(test_indices)
test_y=np.asarray(test_y)

val_indices=tokenizer.texts_to_sequences(val_text)
val_indices=pad_sequences(val_indices,maxlen=max_length,padding='post')

val_indices=np.asarray(val_indices)
val_y=np.asarray(val_y)

#RNN 

In [None]:
#####   RNN with 1 layer, 20 neurons
from keras.layers import RNN
model10=Sequential()
model10.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length,weights=[embedding_matrix]))
model10.add(SimpleRNN(20))
model10.add(Dense(6,activation='softmax'))
model10.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model10.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 64, 100)           10808700  
                                                                 
 simple_rnn (SimpleRNN)      (None, 20)                2420      
                                                                 
 dense (Dense)               (None, 6)                 126       
                                                                 
Total params: 10,811,246
Trainable params: 10,811,246
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model10.fit(train_indices,train_y,batch_size=32,epochs=10)
# Final evaluation of the model
scores = model10.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 56.81%


In [None]:
#####   RNN with 1 layer, 50 neurons
from keras.layers import RNN
model11=Sequential()
model11.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length,weights=[embedding_matrix],trainable = False))
model11.add(SimpleRNN(50))
model11.add(Dense(6,activation='softmax'))
model11.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model11.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 64, 100)           10808700  
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 50)                7550      
                                                                 
 dense_1 (Dense)             (None, 6)                 306       
                                                                 
Total params: 10,816,556
Trainable params: 7,856
Non-trainable params: 10,808,700
_________________________________________________________________
None


In [None]:
model11.fit(train_indices,train_y,batch_size=32,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7c7233bcd0>

In [None]:
# Final evaluation of the model
scores = model11.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores[1]*100))

Test Accuracy: 57.02%


In [None]:
#####   RNN with 1 layer, 100 neurons
from keras.layers import RNN
model12=Sequential()
model12.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length,weights=[embedding_matrix], trainable = False))
model12.add(SimpleRNN(100))
model12.add(Dropout(0.3))
model12.add(Dense(6,activation='softmax'))
model12.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model12.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 64, 100)           10808700  
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 100)               20100     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 6)                 606       
                                                                 
Total params: 10,829,406
Trainable params: 20,706
Non-trainable params: 10,808,700
_________________________________________________________________
None


In [None]:
model12.fit(train_indices,train_y,batch_size=32,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f577ff6b590>

In [None]:
# Final evaluation of the model
scores12 = model12.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
#####   RNN with 1 layer, 200 neurons (dropout)
from keras.layers import RNN
model13=Sequential()
model13.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length,weights=[embedding_matrix], trainable= False))
model13.add(SimpleRNN(200))
model13.add(Dropout(0.3))
model13.add(BatchNormalization())
model13.add(Dense(6,activation='softmax'))
model13.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model13.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 64, 100)           10808700  
                                                                 
 simple_rnn_4 (SimpleRNN)    (None, 200)               60200     
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 batch_normalization (BatchN  (None, 200)              800       
 ormalization)                                                   
                                                                 
 dense_4 (Dense)             (None, 6)                 1206      
                                                                 
Total params: 10,870,906
Trainable params: 61,806
Non-trainable params: 10,809,100
_____________________________________

In [None]:
model13.fit(train_indices,train_y,batch_size=32,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7c7228d550>

In [None]:
# Final evaluation of the model
scores13 = model13.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores[1]*100))

Test Accuracy: 57.02%


In [None]:
#####   RNN with 2 layers, 20 neurons each
from keras.layers import RNN
model14=Sequential()
model14.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length,weights=[embedding_matrix], trainable = False))
model14.add(SimpleRNN(20, return_sequences = True))
model14.add(SimpleRNN(20, return_sequences = False))
model14.add(Dense(6,activation='softmax'))
model14.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model14.summary())
model14.fit(train_indices,train_y,batch_size=32,epochs=5)

scores14 = model14.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 64, 100)           10808700  
                                                                 
 simple_rnn_10 (SimpleRNN)   (None, 64, 20)            2420      
                                                                 
 simple_rnn_11 (SimpleRNN)   (None, 20)                820       
                                                                 
 dense_5 (Dense)             (None, 6)                 126       
                                                                 
Total params: 10,812,066
Trainable params: 3,366
Non-trainable params: 10,808,700
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 57.02%


In [None]:
#####   RNN with 2 layers, 50 neurons each
from keras.layers import RNN
model15=Sequential()
model15.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length,weights=[embedding_matrix],trainable= False))
model15.add(SimpleRNN(50,return_sequences = True))
model15.add(SimpleRNN(50,return_sequences = False))
model15.add(Dense(6,activation='softmax'))
model15.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model15.summary())
model15.fit(train_indices,train_y,batch_size=32,epochs=10)
scores15 = model15.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores[1]*100))


Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 64, 100)           10808700  
                                                                 
 simple_rnn_16 (SimpleRNN)   (None, 64, 50)            7550      
                                                                 
 simple_rnn_17 (SimpleRNN)   (None, 50)                5050      
                                                                 
 dense_8 (Dense)             (None, 6)                 306       
                                                                 
Total params: 10,821,606
Trainable params: 12,906
Non-trainable params: 10,808,700
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 57.02%


In [None]:
#####   RNN with 2 layers, 100 neurons each
from keras.layers import RNN
model16=Sequential()
model16.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length,weights=[embedding_matrix],trainable= False))
model16.add(SimpleRNN(100,return_sequences = True))
model16.add(SimpleRNN(100,return_sequences = False))
model16.add(Dense(6,activation='softmax'))
model16.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model16.summary())
model16.fit(train_indices,train_y,batch_size=32,epochs=5)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 64, 100)           10808700  
                                                                 
 simple_rnn (SimpleRNN)      (None, 64, 100)           20100     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 100)               20100     
                                                                 
 dense (Dense)               (None, 6)                 606       
                                                                 
Total params: 10,849,506
Trainable params: 40,806
Non-trainable params: 10,808,700
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


NameError: ignored

In [None]:
scores16 = model16.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores16[1]*100))

Test Accuracy: 57.72%


In [None]:
#####   RNN with 2 layers, 200 neurons each
from keras.layers import RNN
model17=Sequential()
model17.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length,weights=[embedding_matrix],trainable= False))
model17.add(SimpleRNN(200,return_sequences = True))
model17.add(SimpleRNN(200,return_sequences = False))
model17.add(Dense(6,activation='softmax'))
model17.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model17.summary())
model17.fit(train_indices,train_y,batch_size=32,epochs=5)
scores17 = model17.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores17[1]*100))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 64, 100)           10808700  
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 64, 200)           60200     
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 200)               80200     
                                                                 
 dense_1 (Dense)             (None, 6)                 1206      
                                                                 
Total params: 10,950,306
Trainable params: 141,606
Non-trainable params: 10,808,700
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 57.72%


# LSTM 

In [None]:
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
#########                MODEL 1 (LSTM with 2 hidden layers and 100 neurons)
model= Sequential()
model.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length,weights=[embedding_matrix],trainable=False))
#adding a LSTM layer of dim 1--
model.add(LSTM(100, return_sequences=True));
model.add(LSTM(100, return_sequences=False));
#adding a dense layer with activation function of relu
model.add(Dense(100, activation='relu'));#best 50,relu
#adding the final output activation with activation function of softmax
model.add(Dense(6, activation='softmax'));
#model.add(LSTM(32))
#model.add(Dense(6, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 64, 100)           10808700  
                                                                 
 lstm (LSTM)                 (None, 64, 100)           80400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 6)                 606       
                                                                 
Total params: 10,980,206
Trainable params: 171,506
Non-trainable params: 10,808,700
_________________________________________________________________
None


In [None]:
data.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

In [None]:
data['overall'].value_counts()

5    108664
4     39993
3     21439
1     13279
2     11064
Name: overall, dtype: int64

In [None]:
model.fit(train_indices,train_y,batch_size=32,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5780d73210>

In [None]:
# Final evaluation of the model
scores = model.evaluate(test_indices, test_y, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 64.83%


In [None]:
scores

[0.9088448882102966, 0.6483491063117981]

In [None]:
#####        MODEL 2 : LSTM with 2 layers, 200 neurons
model2= Sequential()
model2.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length,weights=[embedding_matrix],trainable=False))
#adding a LSTM layer of dim 1--
model2.add(LSTM(200, return_sequences=True));
model2.add(LSTM(200, return_sequences=False));
#adding a dense layer with activation function of relu
model2.add(Dense(200, activation='relu'));#best 50,relu
#adding the final output activation with activation function of softmax
model2.add(Dense(6, activation='softmax'));
#model.add(LSTM(32))
#model.add(Dense(6, activation='softmax'))
model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model2.summary())


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 64, 100)           10808700  
                                                                 
 lstm_2 (LSTM)               (None, 64, 200)           240800    
                                                                 
 lstm_3 (LSTM)               (None, 200)               320800    
                                                                 
 dense_2 (Dense)             (None, 200)               40200     
                                                                 
 dense_3 (Dense)             (None, 6)                 1206      
                                                                 
Total params: 11,411,706
Trainable params: 603,006
Non-trainable params: 10,808,700
_________________________________________________________________
None


In [None]:
model2.fit(train_indices,train_y,batch_size=32,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f578061d610>

In [None]:
# Final evaluation of the model
scores_test = model2.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores[1]*100))

#scores = model2.evaluate(val_indices, val_y, verbose=0)
#print("Validation Accuracy: %.2f%%" % (scores[1]*100))

Test Accuracy: 64.83%


In [33]:
#####        MODEL 3 : LSTM with 2 layers, 50 neurons
model8= Sequential()
model8.add(Embedding(input_dim=vocab,output_dim=200,input_length=max_length,weights=[embedding_matrix],trainable=False))
#adding a LSTM layer of dim 1--
model8.add(LSTM(50, return_sequences=True));
model8.add(LSTM(50, return_sequences=False));
#adding a dense layer with activation function of relu
model8.add(Dense(50, activation='relu'));#best 50,relu
#adding the final output activation with activation function of softmax
model8.add(Dense(6, activation='softmax'));
#model.add(LSTM(32))
#model.add(Dense(6, activation='softmax'))
model8.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model8.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 64, 200)           21617400  
                                                                 
 lstm (LSTM)                 (None, 64, 50)            50200     
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dense (Dense)               (None, 50)                2550      
                                                                 
 dense_1 (Dense)             (None, 6)                 306       
                                                                 
Total params: 21,690,656
Trainable params: 73,256
Non-trainable params: 21,617,400
_________________________________________________________________
None


In [34]:
model8.fit(train_indices,train_y,batch_size=32,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f400cbad7d0>

In [37]:
# Final evaluation of the model
scores_test8 = model8.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores_test8[1]*100))



Test Accuracy: 64.99%


In [39]:
#####        MODEL 4 : LSTM with 2 layers, 20 neurons
model9= Sequential()
model9.add(Embedding(input_dim=vocab,output_dim=200,input_length=max_length,weights=[embedding_matrix],trainable=False))
#adding a LSTM layer of dim 1--
model9.add(LSTM(20, return_sequences=True));
model9.add(LSTM(20, return_sequences=False));
#adding a dense layer with activation function of relu
model9.add(Dense(50, activation='relu'));#best 50,relu
#adding the final output activation with activation function of softmax
model9.add(Dense(6, activation='softmax'));
#model.add(LSTM(32))
#model.add(Dense(6, activation='softmax'))
model9.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model9.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 64, 200)           21617400  
                                                                 
 lstm_5 (LSTM)               (None, 64, 20)            17680     
                                                                 
 lstm_6 (LSTM)               (None, 20)                3280      
                                                                 
 dense_2 (Dense)             (None, 50)                1050      
                                                                 
 dense_3 (Dense)             (None, 6)                 306       
                                                                 
Total params: 21,639,716
Trainable params: 22,316
Non-trainable params: 21,617,400
_________________________________________________________________
None


In [40]:
model9.fit(train_indices,train_y,batch_size=32,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f400c363350>

In [41]:
# Final evaluation of the model
scores_test9 = model9.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores_test9[1]*100))

Test Accuracy: 64.92%


# Bidirectional LSTM

In [None]:
#######        MODEL -1 Bidirectional LSTM with 50 neurons
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

model3= Sequential()
model3.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length,weights=[embedding_matrix],trainable=False))
#adding a LSTM layer of dim 1--
model3.add(Bidirectional(LSTM(50, return_sequences=True)))
model3.add(Bidirectional(LSTM(50, return_sequences=False)))
#adding a dense layer with activation function of relu
model3.add(Dense(50, activation='relu'))
model3.add(BatchNormalization())
#adding the final output activation with activation function of softmax
model3.add(Dense(6, activation='softmax'))
#model.add(LSTM(32))
#model.add(Dense(6, activation='softmax'))
model3.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model3.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 64, 100)           10808700  
                                                                 
 bidirectional (Bidirectiona  (None, 64, 100)          60400     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 100)              60400     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 50)                5050      
                                                                 
 batch_normalization (BatchN  (None, 50)               200       
 ormalization)                                                   
                                                      

In [None]:
model3.fit(train_indices,train_y,batch_size=32,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f577f2a5350>

In [None]:
# Final evaluation of the model
scores_test = model3.evaluate(test_indices, test_y, verbose=0)
#scores_val = model3.evaluate(val_indices, val_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores_test[1]*100))
#rint("Val Accuracy: %.2f%%" % (scores_val[1]*100))

Test Accuracy: 64.69%


In [43]:
#######        MODEL -2 Bidirectional LSTM with 200 neurons
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

model5= Sequential()
model5.add(Embedding(input_dim=vocab,output_dim=200,input_length=max_length,weights=[embedding_matrix],trainable=False))
#adding a LSTM layer of dim 1--
model5.add(Bidirectional(LSTM(200, return_sequences=True)))
model5.add(Bidirectional(LSTM(200, return_sequences=False)))
#adding a dense layer with activation function of relu
model5.add(Dense(50, activation='relu'))
model5.add(BatchNormalization())
#adding the final output activation with activation function of softmax
model5.add(Dense(6, activation='softmax'))
model5.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model5.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 64, 200)           21617400  
                                                                 
 bidirectional (Bidirectiona  (None, 64, 400)          641600    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 400)              961600    
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 50)                20050     
                                                                 
 batch_normalization (BatchN  (None, 50)               200       
 ormalization)                                                   
                                                      

In [44]:
model5.fit(train_indices,train_y,batch_size=32,epochs=5)
# Final evaluation of the model
scores_test5 = model5.evaluate(test_indices, test_y, verbose=0)
print("Test Accuracy: %.2f%%" % (scores_test5[1]*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 65.83%
