# Baseline Neural Network

In [1]:
import os

import pandas as pd
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.models import Model, Input, Sequential
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, SpatialDropout1D, Activation
from keras.layers import Conv1D, Bidirectional, GlobalMaxPool1D, MaxPooling1D, BatchNormalization
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# Folder the processed train/test CSVs are read from.
in_folder = 'data/3-processed_data'

# Folder the trained model and the performance table are written to.
out_folder = 'models'

In [3]:
def _load_split(name):
    """Read `<in_folder>/<name>.csv` into a DataFrame."""
    return pd.read_csv(in_folder + '/' + name + '.csv')


# Training features / targets, then test features / targets.
X_train = _load_split('X_train')
Y_train = _load_split('Y_train')
X_test = _load_split('X_test')
Y_test = _load_split('Y_test')

In [4]:
# Vocabulary size: passed as `input_dim` to the Embedding layer.
max_features = 20000

# Sequence length per comment: passed as `input_length` to the Embedding layer.
max_len = 200

# Size of each word vector: passed as `output_dim` to the Embedding layer.
embedding_dims = 128

In [8]:
# Size the output layer from the targets instead of hard-coding it, so the
# model always matches the number of columns in Y_train.
# NOTE(review): the original comment said "six units" while the layer was
# hard-coded to 7. If Y_train carries a non-label column (e.g. a saved
# index/id), drop it upstream rather than predicting it -- TODO confirm.
n_outputs = Y_train.shape[1]

# instantiate NN model
base_model = Sequential()

# embedding layer: maps each of the max_len token ids (vocabulary of
# max_features) to an embedding_dims-sized vector
base_model.add(Embedding(input_dim=max_features, input_length=max_len,
                         output_dim=embedding_dims))

# global max pooling over the sequence axis: collapses (max_len, embedding_dims)
# to a single embedding_dims-sized feature vector per comment
base_model.add(GlobalMaxPool1D())

# hidden dense layer with 50 units and relu activation
base_model.add(Dense(50, activation='relu'))

# regularizing dropout layer with a rate of 0.3
base_model.add(Dropout(0.3))

# output layer: one sigmoid unit per target column
base_model.add(Dense(n_outputs, activation='sigmoid'))

In [9]:
# Adam optimizer built with 0.01 as its first positional argument
# (the learning rate in Keras' Adam).
adam_optimizer = Adam(0.01)

# Binary cross-entropy matches the sigmoid outputs; accuracy is tracked as
# the reported metric.
base_model.compile(optimizer=adam_optimizer,
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

# Print the layer-by-layer overview (output shapes and parameter counts).
base_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 128)          2560000   
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                6450      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 7)                 357       
Total params: 2,566,807
Trainable params: 2,566,807
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Train for 5 epochs in mini-batches of 128; validation_split=0.2 holds out
# a fifth of the training rows for validation. The returned history object
# is kept in base_hist.
base_hist = base_model.fit(
    x=X_train,
    y=Y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.2,
)

Train on 127656 samples, validate on 31915 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
# Score the trained model on the test split. With one compiled metric,
# evaluate() yields two values that are unpacked as (loss, accuracy).
base_test_scores = base_model.evaluate(X_test, Y_test, batch_size=32)
base_test_loss, base_test_acc = base_test_scores

print('Test Loss:    ', base_test_loss)
print('Test Accuracy:', base_test_acc)

Test Loss:     0.11250046690034898
Test Accuracy: 0.9534840224969718


## Saving the Model

In [13]:
# Saving Model
# Create the output folder first: on a fresh checkout it may not exist, and
# save() would otherwise fail only after training has already completed.
os.makedirs(out_folder, exist_ok=True)

# Persist the trained model as an HDF5 file under out_folder.
base_model.save(out_folder+'/bnn_model.h5')

In [None]:
# Saving performance
# One record per model: name, test loss, test accuracy.
stats=['Baseline Model', base_test_loss,base_test_acc]

# Build the table in one step from the record instead of enlarging an empty
# frame row-by-row with .loc, which leaves every column as object dtype.
models_performance=pd.DataFrame([stats], columns=['Model Name', 'Test Loss', 'Test Accuracy'])

# Write the table (without the index column) next to the saved model.
models_performance.to_csv(out_folder+'/models_performance.csv',index=False)