# Sentiment Analysis with Deep Learning

# Phase 2- Modelling

This notebook contains the functions and code for modelling.  

### CRISP-DM phases

The Modelling and Evaluation phases of CRISP-DM can be found in this notebook.

#### 4.Modeling
Modeling techniques are selected and applied. 

#### 5.Evaluation
Once one or more models have been built that appear to have high quality based on whichever loss functions have been selected, these need to be tested to ensure they generalize against unseen data and that all key business issues have been sufficiently considered.  The end result is the selection of the champion model(s).


### Table of Contents

- 1. Import Libraries
- 2. Define Functions 
- 3. Modeling With Neural Networks  
- 4. Tuning the Best Model

## 1. Import Libraries

In [1]:
import pandas as pd
import gc
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import seaborn as sns
import nltk
from nltk import word_tokenize, FreqDist
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
np.random.seed(0)
import pickle

In [2]:
from keras.models import Model, Sequential, Input
from keras.layers import Dense, Embedding, Input, Conv1D, GlobalMaxPool1D, GlobalAveragePooling1D, Dropout, concatenate, Layer, InputSpec, CuDNNLSTM, SpatialDropout1D, Activation, LSTM
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras import activations, initializers, regularizers, constraints, optimizers, layers
from keras.utils.conv_utils import conv_output_length
from keras.regularizers import l2
from keras.constraints import maxnorm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, SpatialDropout1D, Activation
from keras.layers import Conv1D, Bidirectional, GlobalMaxPool1D, MaxPooling1D, BatchNormalization
from keras.optimizers import Adam, SGD
import pickle

Using TensorFlow backend.


## 2. Functions 

In [3]:
# Vocabulary size: number of unique words we want to use.
# Passed to the Embedding layer as input_dim when GloVe is not used.
max_features = 8192
# Max number of words in a comment to use.
# Passed to the Embedding layer as input_length.
max_len = 128
# Dimension of each learned word vector.
# Passed to the Embedding layer as output_dim when GloVe is not used.
embedding_dims = 64

In [12]:
def load_smalldata():
    """Load the small train/test split from ``data/vectors_small``.

    Reads four pickle files (paths are relative to the working directory).
    Per the original notes they hold the partial train/test data, already
    tokenized, sequenced and padded.

    Returns:
        tuple: ``(X_train2, X_test2, y_train2, y_test2)``, in that order.

    Raises:
        FileNotFoundError: if any of the four pickle files is missing.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files produced
    # by this project.
    loaded = []
    for stem in ("X_train2", "X_test2", "y_train2", "y_test2"):
        # The context manager closes each handle even if unpickling fails
        # (the previous version leaked all four handles).
        with open("data/vectors_small/" + stem + "_file.pickle", "rb") as pickle_in:
            loaded.append(pickle.load(pickle_in))

    X_train2, X_test2, y_train2, y_test2 = loaded
    return X_train2, X_test2, y_train2, y_test2


def load_tokenizer():
    """Return the object unpickled from ``tokenizer.pickle`` in the working directory."""
    with open('tokenizer.pickle', 'rb') as pickle_file:
        return pickle.load(pickle_file)
    
def create_model(hidden_layers,
                 loss='binary_crossentropy',
                 optimizer=None,
                 metrics=['accuracy'],
                 embedding_matrix=None,
                 max_len=max_len,
                 embedding_dims=embedding_dims,
                 max_features=max_features,
                 glove=False,
                 ):
    """Build, compile and summarise a Sequential binary classifier.

    Layer order: Embedding -> ``hidden_layers`` -> GlobalMaxPool1D ->
    Dropout(0.5) -> Dense(50, relu) -> Dense(1, sigmoid).

    Args:
        hidden_layers: list of layer instances added, in order, right after
            the embedding layer. May be empty.
        loss: loss passed to ``model.compile``.
        optimizer: optimizer passed to ``model.compile``. When ``None``
            (default) a fresh ``Adam(0.01)`` is created for this model.
        metrics: metrics passed to ``model.compile``.
        embedding_matrix: 2-D array of pre-trained vectors
            (rows = vocabulary, columns = vector size). Required when
            ``glove`` is true, ignored otherwise.
        max_len: ``input_length`` of the embedding layer.
        embedding_dims: ``output_dim`` of the trainable embedding
            (used only when ``glove`` is false).
        max_features: ``input_dim`` of the trainable embedding
            (used only when ``glove`` is false).
        glove: if true, use ``embedding_matrix`` as frozen embedding weights;
            otherwise train an embedding from scratch.

    Returns:
        The compiled model. ``model.summary()`` is printed as a side effect.

    Raises:
        ValueError: if ``glove`` is true but no ``embedding_matrix`` is given.
    """
    # A default of `Adam(0.01)` in the signature would be evaluated once, at
    # definition time, and the same stateful optimizer object would then be
    # shared by every model built with the default. Create one per model.
    if optimizer is None:
        optimizer = Adam(0.01)

    if glove:
        if embedding_matrix is None:
            raise ValueError("embedding_matrix must be provided when glove=True")
        # Pre-trained vectors: sizes come from the matrix, weights stay frozen.
        emb_layer = Embedding(input_dim=embedding_matrix.shape[0], input_length=max_len,
                              output_dim=embedding_matrix.shape[1],
                              weights=[embedding_matrix], trainable=False)
    else:
        # Embedding learned from scratch with the configured sizes.
        emb_layer = Embedding(input_dim=max_features, input_length=max_len,
                              output_dim=embedding_dims)

    model = Sequential()
    model.add(emb_layer)

    # Caller-supplied feature extractor (dropout / convolution / recurrent layers).
    for layer in hidden_layers:
        model.add(layer)

    # Collapse the sequence axis before the classifier head.
    model.add(GlobalMaxPool1D())

    # Drop 50% of the pooled features during training.
    model.add(Dropout(0.5))

    model.add(Dense(50, activation='relu'))

    # Single sigmoid unit for the binary prediction.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=metrics)

    model.summary()
    return model


def run_model(model, model_name, results, epochs, batch_size=32):
    """Fit, evaluate and save ``model``, returning a one-row summary.

    Training and test data are read from the notebook-level variables
    ``X_train2``, ``y_train2``, ``X_test2`` and ``y_test2``. 10% of the
    training data is held out for validation during fitting.

    Args:
        model: compiled Keras model. It must be compiled with exactly one
            metric named ``accuracy``, because the evaluate result is unpacked
            as ``(loss, accuracy)`` and the history is read under the
            ``accuracy`` / ``val_accuracy`` keys.
        model_name: label used in the printed report, in the returned row and
            as the file name passed to ``save_model``.
        results: unused; kept so existing calls keep working.
        epochs: number of training epochs.
        batch_size: batch size used for both fitting and evaluation.

    Returns:
        list: ``[model_name, train_acc, val_acc, test_acc, test_loss]`` where
        the train/validation accuracies are those of the last epoch.
    """
    print(epochs)
    hist = model.fit(X_train2, y_train2,
                     batch_size=batch_size,
                     epochs=epochs,
                     validation_split=0.1)

    # Use the caller's batch size here too (it was hard-coded to 32 before).
    test_loss, test_acc = model.evaluate(X_test2, y_test2, batch_size=batch_size)
    print("-------------------------------------------")
    print("")
    print("")
    print(model_name + ' Test Loss:    ', test_loss)
    print(model_name + ' Test Accuracy:', test_acc)
    print("")
    print("")
    print("")

    # Last-epoch train/validation accuracy plus the test metrics.
    row = [model_name, hist.history['accuracy'][-1],
           hist.history['val_accuracy'][-1], test_acc, test_loss]

    save_model(model, model_name)
    return row



def save_model(model, model_name):
    """Save ``model`` to ``models/<model_name>.h5`` (relative to the working directory).

    The target directory is created first if it does not exist, so the save
    does not fail on a fresh checkout.

    Args:
        model: object exposing a ``save(path)`` method (e.g. a Keras model).
        model_name: file name without extension.
    """
    import os  # local import: `os` is not imported at the top of the notebook

    path = "models/" + model_name + ".h5"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    model.save(path)

def save_results(results, row):
    """Return a copy of ``results`` with one model's accuracies appended.

    Args:
        results: DataFrame of previous results (not modified).
        row: sequence whose first four items are
            ``model name, train accuracy, validation accuracy, test accuracy``.
            Any further items (e.g. the test loss) are ignored.

    Returns:
        pandas.DataFrame: new frame with the extra row and a fresh 0..n-1 index.
    """
    record = pd.DataFrame([{'model': row[0],
                            'train_acc': row[1],
                            'val_acc': row[2],
                            'test_acc': row[3]}])

    if len(results) == 0:
        # Nothing to concatenate with: keep the existing column order and add
        # any new columns at the end. This also avoids pandas' warning about
        # concatenating empty frames.
        columns = list(results.columns) + [c for c in record.columns
                                           if c not in results.columns]
        return record.reindex(columns=columns)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([results, record], ignore_index=True)


### Loading Glove Dictionary
def load_glove(path, embedding_dim=300):
    """Build an embedding matrix for the notebook's tokenizer from a GloVe text file.

    Each line of the file is expected to be a word followed by its vector
    components, separated by single spaces. The notebook-level ``tokenizer``
    is read to find which words need a row in the matrix.

    Args:
        path: path to the GloVe ``.txt`` file.
        embedding_dim: length of each vector in the file (default 300).

    Returns:
        numpy.ndarray: array of shape
        ``(len(tokenizer.word_index) + 1, embedding_dim)``. Row ``i`` holds
        the vector of the word with index ``i``; words missing from the file
        (and row 0) stay all-zero.
    """
    embeddings_index = dict()

    # Explicit encoding so decoding does not depend on the platform locale;
    # the context manager closes the file even if a line fails to parse.
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Split on a single space rather than any whitespace (per the
            # original note, plain split() can raise on some files).
            values = line.rstrip().split(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print('Loaded %s word vectors.' % len(embeddings_index))

    # One row per tokenizer index; +1 because row 0 is not assigned to any word.
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))

    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


## 3. Modeling With Neural Networks                       

I will first try 5 different models (DNN, CNN, RNN, CNN with GloVe and RNN with GloVe) to see which one gives the best result.

#### Set parameters for modeling

#### Create a dataframe to store the accuracy results

In [5]:
# Empty table that collects one row of train/validation/test accuracy per trained model
results=pd.DataFrame(columns=["model", "train_acc","val_acc" ,"test_acc"])

In [13]:
# Load the small dataset used to train and test the models.
# run_model reads these four names as notebook-level variables.
X_train2, X_test2, y_train2, y_test2=load_smalldata()
# Load the pickled tokenizer; load_glove reads its word_index to build the embedding matrix
tokenizer=load_tokenizer()

#### Define parameters for each model such as hidden layers and glove

In [8]:
# Each dictionary holds the keyword arguments create_model needs for one model.

# Baseline: embedding followed directly by the pooling/dense head
dnn_params = dict(
    hidden_layers=[],
    glove=False,
    embedding_matrix=None,
)

# Convolutional model with a trainable embedding
cnn_params = dict(
    hidden_layers=[
        SpatialDropout1D(0.5),
        Conv1D(filters=100, kernel_size=4, padding='same', activation='relu'),
        BatchNormalization(),
    ],
    glove=False,
    embedding_matrix=None,
)

# Bidirectional LSTM model with a trainable embedding
rnn_params = dict(
    hidden_layers=[
        SpatialDropout1D(0.5),
        Bidirectional(LSTM(25, return_sequences=True)),
    ],
    glove=False,
    embedding_matrix=None,
)

# Load the GloVe embedding vectors from the txt file
embedding_matrix = load_glove("glove.6B.300d.txt")

# Same convolutional architecture on top of the GloVe embedding
cnn_glove_params = dict(
    hidden_layers=[
        SpatialDropout1D(0.5),
        Conv1D(filters=100, kernel_size=4, padding='same', activation='relu'),
        BatchNormalization(),
    ],
    glove=True,
    embedding_matrix=embedding_matrix,
)

# Same recurrent architecture on top of the GloVe embedding
rnn_glove_params = dict(
    hidden_layers=[
        SpatialDropout1D(0.5),
        Bidirectional(LSTM(25, return_sequences=True)),
    ],
    glove=True,
    embedding_matrix=embedding_matrix,
)


Loaded 400000 word vectors.


In [11]:
# Model names and their create_model settings, kept in the same order so they can be zipped together
names=["dnn","cnn", "rnn", "cnn_glove", "rnn_glove"]
params=[dnn_params, cnn_params, rnn_params, 
        cnn_glove_params, rnn_glove_params]
# Number of training epochs used for the model comparison run
epochs=1

#### For loop for modelling

Run a for loop to create, compile and fit each model, then save its results and the model itself.

In [None]:
# Build, train and evaluate every candidate model in turn (epochs is set in the previous cell).
# run_model also saves each trained model to the models folder; the metrics are collected in `results`.
for name, param in zip(names, params):
    # Each dictionary holds exactly the create_model keywords:
    # hidden_layers, glove and embedding_matrix.
    model = create_model(**param)
    row = run_model(model, name, results, epochs)
    results = save_results(results, row)


## Checking results and picking the model for tuning


In [37]:
results

Unnamed: 0,model,train_acc,val_acc,test_acc
0,dnn,0.869548,0.876733,0.875133
1,cnn,0.499652,0.500067,0.503233
2,rnn,0.85943,0.8828,0.879867
3,cnn_glove,0.859741,0.884067,0.889767
4,rnn_glove,0.823593,0.861,0.865367


Based on these results, the CNN with GloVe embedding is the best-performing model so far. From this point forward, I will tune only this model and try to improve its performance. 

## 4. Tuning the Best Model

## Iteration 1:  CNN with SGD optimizer

I will use a different optimizer for the same model structure. I will use SGD.  

In [39]:
# SGD with Nesterov momentum, tried as an alternative to create_model's default Adam optimizer
sgd = optimizers.SGD(lr=0.05, decay=1e-6, momentum=0.5, nesterov=True)

In [41]:
# Iteration 1: same CNN + GloVe architecture, trained with the SGD optimizer
model = create_model(
    hidden_layers=[
        SpatialDropout1D(0.5),
        Conv1D(filters=100, kernel_size=4, padding='same', activation='relu'),
        BatchNormalization(),
    ],
    glove=True,
    embedding_matrix=embedding_matrix,
    optimizer=sgd,
)
row = run_model(model, "cnn_glove_sgd", results, epochs)
results = save_results(results, row)
 

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_22 (Embedding)     (None, 128, 300)          264087600 
_________________________________________________________________
spatial_dropout1d_15 (Spatia (None, 128, 300)          0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 128, 100)          120100    
_________________________________________________________________
batch_normalization_7 (Batch (None, 128, 100)          400       
_________________________________________________________________
global_max_pooling1d_22 (Glo (None, 100)               0         
_________________________________________________________________
dropout_22 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_43 (Dense)             (None, 50)              

The SGD optimizer seems to perform slightly better than Adam, so let's use SGD.  

## Iteration 2:  CNN with 2 convolution layers


In [42]:
# Iteration 2: CNN + GloVe with two stacked convolution + batch-normalization blocks
# NOTE(review): no optimizer is passed here, so create_model's default (Adam) is used
# rather than `sgd` — confirm this is intended.
model=create_model(hidden_layers=[SpatialDropout1D(0.5),
                        Conv1D(filters=100,kernel_size=4, padding='same', activation='relu'),
                        BatchNormalization(),
                        Conv1D(filters=100,kernel_size=4, padding='same', activation='relu'),
                        BatchNormalization()],
                        glove = True,
                        embedding_matrix=embedding_matrix,
                      )
# run_model requires `epochs`; omitting it raised a TypeError
row=run_model(model, "cnn_glove_2cnv", results, epochs)

results=save_results(results, row)
 

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 128, 300)          264087600 
_________________________________________________________________
spatial_dropout1d_16 (Spatia (None, 128, 300)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 128, 100)          120100    
_________________________________________________________________
batch_normalization_8 (Batch (None, 128, 100)          400       
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 128, 100)          40100     
_________________________________________________________________
batch_normalization_9 (Batch (None, 128, 100)          400       
_________________________________________________________________
global_max_pooling1d_23 (Glo (None, 100)             

Adding a 2nd convolution layer did not improve the model. 

### Iteration 3:  CNN_Glove  with 8 epochs

In [29]:
# Iteration 3: single-convolution CNN + GloVe with create_model's default optimizer, trained for 8 epochs
model=create_model(hidden_layers=[SpatialDropout1D(0.5),
                        Conv1D(filters=100,kernel_size=4, padding='same', activation='relu'),
                        BatchNormalization()],
                      glove = True,
                      embedding_matrix=embedding_matrix)
# NOTE: this rebinds the notebook-level `epochs`, so cells re-run after this one will also train for 8 epochs
epochs=8
row=run_model(model, "cnn_glove_8epochs", results, epochs)

results=save_results(results, row)
 

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 128, 300)          17691000  
_________________________________________________________________
spatial_dropout1d_15 (Spatia (None, 128, 300)          0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 128, 100)          120100    
_________________________________________________________________
batch_normalization_13 (Batc (None, 128, 100)          400       
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 100)               0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 50)              

KeyboardInterrupt: 

In [None]:
results