In [9]:
import pandas as pd
import numpy as np
import nltk
import re
import string

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix

#deep learning library
from keras.models import *
from keras.layers import *
from keras.callbacks import *

from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the train/test CSVs and discard the 'Unnamed: 0' column from each
# (presumably an index column written out when the CSVs were saved -- confirm).
train_data = pd.read_csv('train_data.csv').drop(columns='Unnamed: 0')
test_data = pd.read_csv('test_data.csv').drop(columns='Unnamed: 0')

In [3]:
# Model inputs: the review text, coerced to str.
# NOTE: astype(str) also turns any missing value into the literal string 'nan'.
xtrain = train_data['Cleaned_Review'].astype(str)
xtest = test_data['Cleaned_Review'].astype(str)

# Targets: the Category column of each split.
y_train = train_data['Category']
y_test = test_data['Category']

# LSTM with GloVe Embeddings

In [167]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Build the vocabulary from the training texts only; the test texts are
# encoded with that same vocabulary below.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(xtrain))

# Encode each text as a list of word indices, then pad/truncate every
# sequence to a fixed length of 100.
# NOTE: despite its name, x_val_seq holds the encoded *test* texts (xtest).
x_tr_seq = pad_sequences(tokenizer.texts_to_sequences(xtrain), maxlen=100)
x_val_seq = pad_sequences(tokenizer.texts_to_sequences(xtest), maxlen=100)

# One extra slot on top of the word index (index 0 is used for padding).
size_of_vocabulary = 1 + len(tokenizer.word_index)
print(size_of_vocabulary)

15827


In [168]:
# Parse the pre-trained GloVe file into a {word: vector} lookup.
# Each line is "<word> <v1> <v2> ... <vN>" separated by whitespace.
embeddings_index = dict()

# Open with an explicit UTF-8 encoding so the load does not depend on the
# platform's default codec, and use a context manager so the file handle is
# released even if a line fails to parse.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

# One 300-d row per vocabulary index, initialised to zeros.
embedding_matrix = np.zeros((size_of_vocabulary, 300))

Loaded 400000 word vectors.


In [169]:
# Copy the GloVe vector of every tokenizer word into its row of the matrix.
# Words that have no GloVe entry keep their existing (zero) row.
for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [171]:
# Binary text classifier: frozen GloVe embeddings -> LSTM -> max-pool -> dense.
model=Sequential()

#embedding layer: initialised from embedding_matrix and frozen (trainable=False)
model.add(Embedding(size_of_vocabulary,300,weights=[embedding_matrix],input_length=100,trainable=False))

#lstm layer: return_sequences=True keeps one output per time step for pooling
model.add(LSTM(128,return_sequences=True,dropout=0.3))

#Global Maxpooling: collapse the time axis to a single 128-d vector
model.add(GlobalMaxPooling1D())

#Dense Layer
model.add(Dense(64,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

#Add loss function, metrics, optimizer
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=["acc"])

#Adding callbacks
# NOTE: early stopping watches val_loss while the checkpoint keeps the epoch
# with the best val_acc, so the two criteria can pick different epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)
mc=ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True,verbose=1)

#Print summary of model
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 100, 300)          4748100   
_________________________________________________________________
lstm_8 (LSTM)                (None, 100, 128)          219648    
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 65        
Total params: 4,976,069
Trainable params: 227,969
Non-trainable params: 4,748,100
_________________________________________________________________
None


In [172]:
# Train for up to 10 epochs, holding out 20% of the training data for
# validation; `es` may stop early and `mc` writes the best checkpoint.
# NOTE(review): Keras takes the validation split from the *last* rows before
# shuffling -- confirm the training data is not ordered by class.
history = model.fit(
    np.array(x_tr_seq),
    np.array(y_train),
    epochs=10,
    batch_size=128,
    validation_split=0.2,
    callbacks=[es, mc],
    verbose=1,
)

Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.69246, saving model to best_model.h5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.69246
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.69246
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.69246
Epoch 5/10

Epoch 00005: val_acc improved from 0.69246 to 0.70957, saving model to best_model.h5
Epoch 6/10

Epoch 00006: val_acc improved from 0.70957 to 0.75127, saving model to best_model.h5
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.75127
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.75127
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.75127
Epoch 00009: early stopping


In [173]:
#loading best model (the checkpoint written by ModelCheckpoint during training)
from keras.models import load_model
model = load_model('best_model.h5')

# Sequential.predict_classes is deprecated and was removed in TensorFlow 2.6.
# For a single sigmoid output the documented equivalent is thresholding the
# predicted probability at 0.5.
y_pred = (model.predict(x_val_seq) > 0.5).astype("int32")

# NOTE: x_val_seq / y_test are the held-out *test* set, so `val_acc` here is
# test accuracy; the name is kept for compatibility with any later cells.
# The loss is bound to a real name rather than `_`, which IPython uses for the
# last cell output.
val_loss, val_acc = model.evaluate(x_val_seq,y_test, batch_size=128)
print(val_acc)

# Creating classification report
# (classification_report is already imported in the notebook's first cell)
print(classification_report(y_test, y_pred))

0.7945454716682434
              precision    recall  f1-score   support

           0       0.94      0.77      0.85      1212
           1       0.58      0.87      0.69       438

    accuracy                           0.79      1650
   macro avg       0.76      0.82      0.77      1650
weighted avg       0.84      0.79      0.80      1650

