# Recurrent Neural Network

In [1]:
import pandas as pd
import itertools
import h5py
import pickle
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.models import Model, Input, Sequential
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, SpatialDropout1D, Activation
from keras.layers import Conv1D, Bidirectional, GlobalMaxPool1D, MaxPooling1D, BatchNormalization
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# Folder the notebook reads its inputs from (tokenizer pickle and X/Y CSV files).
in_folder = 'data/3-processed_data'
# Folder the notebook writes the trained model and the performance log to.
out_folder = 'models'

In [3]:
# Restore the pickled tokenizer object (presumably written by the preprocessing
# step -- confirm against the notebook that produces it).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open(in_folder + '/tokenizer.pickle', mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [4]:
# Load the feature and label tables for the train and test splits.
X_train = pd.read_csv(in_folder + '/X_train.csv')
Y_train = pd.read_csv(in_folder + '/Y_train.csv')
X_test = pd.read_csv(in_folder + '/X_test.csv')
Y_test = pd.read_csv(in_folder + '/Y_test.csv')

# Label names come from the header row of the test-label file.
labels = Y_test.columns.tolist()

In [7]:
# Preview only the first rows instead of asking the notebook to render the whole
# training frame. Each row is one sample and each column one sequence position;
# the values look like zero-padded integer token ids (assumption -- confirm
# against the preprocessing notebook).
X_train.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0,0,0,0,0,0,0,0,...,39,29,144,74,3474,90,3075,4630,2284,985
1,0,0,0,0,0,0,0,0,0,0,...,2699,22,95,39,807,2688,986,589,8493,183
2,0,0,0,0,0,0,0,0,0,0,...,3,436,58,36,1,2402,94,1,733,468
3,0,0,0,0,0,0,0,0,0,0,...,11,1,499,639,3594,31,99,24,3557,4973
4,0,0,0,0,0,0,0,0,0,0,...,28,3595,55,1064,7,580,40,29,205,16
5,0,0,0,0,0,0,0,0,0,0,...,32,37,18,101,81,1,2239,101,13347,39
6,0,0,0,0,0,0,0,0,0,0,...,0,0,1884,148,7,3487,323,16,28,142
7,0,0,0,0,0,0,0,0,0,0,...,46,60,34,12,126,26,7,45,17,652
8,0,0,0,0,0,0,0,0,0,0,...,160,323,22,3578,11,13917,3475,4601,3916,4931
9,0,0,0,0,0,0,0,0,0,0,...,16,14,240,5,54,20,1812,3,143,4


In [5]:
# Vocabulary size: passed to the Embedding layer as `input_dim`.
max_features = 20000

# Sequence length per sample: passed to the Embedding layer as `input_length`.
max_len = 200

# Width of each learned embedding vector: passed to the Embedding layer as `output_dim`.
embedding_dims = 128

In [6]:
# Build the recurrent classifier as a linear stack of layers.
rnn_model = Sequential()

# Embedding: turns each of the `max_len` integer ids of a sample into an
# `embedding_dims`-wide vector; ids must be smaller than `max_features`.
rnn_model.add(Embedding(input_dim=max_features, input_length=max_len,
                        output_dim=embedding_dims))

# SpatialDropout1D drops whole embedding channels (entire 1D feature maps across
# all timesteps) with rate 0.5, rather than individual activations.
rnn_model.add(SpatialDropout1D(0.5))

# Bidirectional LSTM with 25 units per direction (50 features per timestep).
# return_sequences=True keeps the per-timestep outputs so they can be pooled below.
rnn_model.add(Bidirectional(LSTM(25, return_sequences=True)))

# Normalize the recurrent features before pooling.
rnn_model.add(BatchNormalization())

# Collapse the time axis by taking the maximum of every feature over all timesteps.
rnn_model.add(GlobalMaxPool1D())

# Regular dropout: zero 50% of the pooled features during training.
rnn_model.add(Dropout(0.5))

# Fully connected hidden layer with 50 ReLU units.
rnn_model.add(Dense(50, activation='relu'))

# Output layer: one independent sigmoid unit per label column. The width is
# derived from the training targets instead of a hard-coded 7 so the model
# cannot drift out of sync with the label file.
rnn_model.add(Dense(Y_train.shape[1], activation='sigmoid'))

In [7]:
# Configure training: binary cross-entropy loss (one sigmoid per output),
# Adam with a learning rate of 0.01, and accuracy reported as the metric.
rnn_model.compile(
    optimizer=Adam(0.01),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview (output shapes and parameter counts).
rnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 128)          2560000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 128)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 50)           30800     
_________________________________________________________________
batch_normalization_1 (Batch (None, 200, 50)           200       
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 50)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2550      
__________

In [8]:
# Train for 5 epochs in mini-batches of 128 samples, with 20% of the training
# rows held out for validation; the returned History object is kept in `rnn_hist`.
rnn_hist = rnn_model.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.2,
)

Train on 127656 samples, validate on 31915 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
# Score the trained model on the test split; `evaluate` returns the loss
# followed by the metric requested at compile time (accuracy).
rnn_test_loss, rnn_test_acc = rnn_model.evaluate(
    X_test,
    Y_test,
    batch_size=128,
)
print('Test Loss:    ', rnn_test_loss)
print('Test Accuracy:', rnn_test_acc)

Test Loss:     0.08233811530550131
Test Accuracy: 0.9668189548902771


## Saving the Model

In [12]:
# Persist the trained network as an .h5 file inside the output folder.
rnn_model.save(out_folder + '/rnn_model.h5')

In [11]:
# Record this model's test loss and accuracy in the shared performance log.
# The log is read from and written to the same path under `out_folder`, so the
# two can no longer diverge if the output folder is changed.
performance_path = out_folder + '/models_performance.csv'
models_performance = pd.read_csv(performance_path)

# Row layout is positional: model name, test loss, test accuracy.
stats = ['Recurrent Model', rnn_test_loss, rnn_test_acc]

# Drop any row previously written for this model (the name lives in the first
# column, since `stats` is assigned across all columns in order) so that
# re-running the cell replaces the entry instead of appending a duplicate.
name_column = models_performance.columns[0]
models_performance = models_performance[
    models_performance[name_column] != stats[0]
].reset_index(drop=True)

# After reset_index the labels are 0..n-1, so label n appends a new row.
models_performance.loc[len(models_performance), :] = stats
models_performance.to_csv(performance_path, index=False)