# Convolutional Neural Network

# Import Statements

In [61]:
#import statements
import numpy as np
from pandas import DataFrame
import pandas as pd
from sklearn.cross_validation import train_test_split,KFold
from collections import defaultdict
from collections import Counter
import re
from scipy.special import comb
import itertools
from collections import Counter 
import theano
theano.config.mode = 'FAST_COMPILE'

# Keras import statements

In [62]:
from keras.models import Sequential, Graph
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

# Read data section

In [63]:
#######################################################################
'''
Get the file in which the data is present and read the lines in the file.
'''
#######################################################################
def readFile(fileName):
    """Return every line of the text file at ``fileName`` as a list of strings.

    The file is opened in text read mode and is closed again before the
    function returns. Each element is one line exactly as ``readlines``
    yields it, i.e. including its trailing newline when present.
    """
    with open(fileName, 'r') as source:
        return source.readlines()

We used the labelled sentences of the IMDb data set (`imdb_labelled.txt`) to train and test our model.

In [64]:
# Load the raw labelled lines; the relative path means the file is looked up
# in the current working directory.
lines_read = readFile('imdb_labelled.txt')

# Prepare and transform data section

In [65]:
###############################################################################################################################
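'''
This function takes a sentence from the dataset and strips the unwanted characters that will not help much with the
classification: everything except letters, digits and a few punctuation marks is replaced by a space.
For example, contractions are split at the apostrophe ("isn't" becomes "is n't") and punctuation marks are separated
from the neighbouring words. The cleaned sentence is returned in lower case.

'''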
###############################################################################################################################
def stripnonalphanumeric(string):
    """Normalise a raw sentence into lower-case, space-separated tokens.

    Steps, applied in this order:

    1. Every character other than ASCII letters, digits, ``( ) , ! ?``,
       the apostrophe and the backtick is replaced by a space.
    2. The contraction suffixes ``'s 've n't 're 'd 'll`` get a leading
       space so that they become tokens of their own.
    3. The marks ``, ! ( ) ?`` are padded with a space on both sides.
    4. Runs of whitespace are collapsed to one space, the ends are trimmed
       and the result is lower-cased.

    Args:
        string: the raw sentence text.

    Returns:
        The cleaned sentence as a single string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)

    # Plain substring replacement is enough here: none of these suffixes
    # contains a regex metacharacter.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)

    # The replacement is the bare mark. A backslash in the replacement text
    # of re.sub (as in " \( ") is emitted literally and would end up inside
    # the token, producing "\(" instead of "(".
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [66]:
##############################################################################################################################
'''
This function is used to split the data based on the available classes. It takes the lines of the data set as input;
since each line contains the text and its label separated by a tab, we split every line into its text and its label
using the tab as the delimiter.

Once the data is split, we add the texts to separate lists (positive and negative) for future use.

'''
#############################################################################################################################
def split_data(sent):
    """Partition labelled lines into positive and negative sentences.

    Each line is expected to hold the sentence text, a tab, and an integer
    label. A label of 0 marks a negative sentence; any other integer marks a
    positive one.

    Args:
        sent: iterable of raw lines (trailing newlines are tolerated).

    Returns:
        A tuple ``(pos_sent, neg_sent)`` of two lists holding the sentence
        texts, in input order.

    Raises:
        IndexError: if a non-blank line contains no tab.
        ValueError: if the label is not an integer.
    """
    neg_sent = []
    pos_sent = []
    for s in sent:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no sample, so skip them instead of failing on the missing label.
        if not s.strip():
            continue
        # Split on the LAST tab only: the label is always the final field,
        # so a tab inside the sentence text must stay part of the text.
        fields = s.rsplit('\t', 1)
        text = fields[0]
        label = int(fields[1])  # int() ignores the trailing newline
        if label == 0:
            neg_sent.append(text)
        else:
            pos_sent.append(text)
    return pos_sent, neg_sent

In [67]:
# Separate the raw lines into positive and negative sentence texts.
pos_sent, neg_sent = split_data(lines_read)

In [68]:
###############################################################################################################################
'''
This function is where the actual data is prepared for further processing. First we combine the different data (positive
and negative sentences) into one list. Once combined, we call the stripnonalphanumeric function to get rid of the characters
that are not involved in the classification process.

After this, we split each text on spaces to get the individual words and assign the labels to the sentences.

'''
###############################################################################################################################
def prepare_data(pos_sent, neg_sent):
    """Tokenise all sentences and build their one-hot label rows.

    The positive sentences come first, followed by the negative ones. Each
    sentence is cleaned with ``stripnonalphanumeric`` and split on single
    spaces. Positive sentences are labelled ``[1, 0]`` and negative
    sentences ``[0, 1]``.

    Returns:
        ``(x, y)`` where ``x`` is a list of token lists and ``y`` is the
        NumPy array obtained by concatenating the two label lists.
    """
    tokenised = []
    for sentence in pos_sent + neg_sent:
        tokenised.append(stripnonalphanumeric(sentence).split(' '))

    positive_labels = [[1, 0] for _ in pos_sent]
    negative_labels = [[0, 1] for _ in neg_sent]
    labels = np.concatenate([positive_labels, negative_labels], 0)
    return tokenised, labels

In [69]:
# Token lists (x) and one-hot label rows (y) for the whole data set.
x, y = prepare_data(pos_sent, neg_sent)

In [70]:
#################################################################################################################################
'''
This function is used to bring all sentences in the dataset to the same length. This is how the function works: get the max
sentence length from the available sentences. If the length of the sentence under consideration is less than the
max length, pad the sentence with 'fill' tokens. In our case we just used '!!FILL!!' to fill the gaps.

'''
################################################################################################################################

def pool_data(x, fill_word='!!FILL!!'):
    """Pad every token list to the length of the longest one.

    Args:
        x: sequence of token lists.
        fill_word: token appended to the shorter sentences; defaults to
            ``'!!FILL!!'``.

    Returns:
        A new list of new token lists, all of equal length. The input lists
        are not modified. An empty input yields an empty list.
    """
    # max() of an empty sequence raises ValueError, so handle "no sentences"
    # up front. len() is used rather than truthiness so that array-like
    # inputs are accepted as well.
    if len(x) == 0:
        return []
    max_sent_length = max(len(s) for s in x)
    return [sent + [fill_word] * (max_sent_length - len(sent)) for sent in x]

In [71]:
# Pad all sentences to a common length.
data = pool_data(x)

In [72]:
################################################################################################################################
'''
This function is used to create a dictionary (vocabulary) of the words available in the dataset. It takes all the sentences
and runs a counter over them to determine the number of times a particular word appears in the whole dataset (term frequency).

Feature extraction is done by ranking the words from the most common to the least common. For example, a word appearing 100
times in a corpus is more important than words appearing once or twice in the document.

After creating the dictionary of words, we obtain the indices of the words to create the feature vector.
'''
################################################################################################################################
def vocab_dict_builder(sent):
    """Build the vocabulary of a corpus, ordered by descending frequency.

    Args:
        sent: iterable of token lists.

    Returns:
        A two-element list ``[vocabulary, index_of]``: ``vocabulary`` is the
        list of distinct tokens in ``Counter.most_common()`` order, and
        ``index_of`` maps each token to its position in that list.
    """
    frequencies = Counter(itertools.chain.from_iterable(sent))
    vocabulary = [token for token, _count in frequencies.most_common()]
    index_of = dict(
        (token, position) for position, token in enumerate(vocabulary)
    )
    return [vocabulary, index_of]

In [73]:
# Vocabulary list and token -> index lookup, built from the padded sentences.
words, words_idx = vocab_dict_builder(data)

In [74]:
################################################################################################################################
'''
Based on the above, we generate the feature vector. There are two approaches for creating the feature vector. The first
approach is to binarize the data, which signifies whether a word is present or not.
The second approach is to pad every sentence to the max sentence length with an arbitrary fill word and then represent each
word by its index in the vocabulary.

The latter approach is used here.
'''
################################################################################################################################
def feat_vec(data, words, words_idx):
    """Encode each sentence as the vocabulary indices of its tokens.

    Args:
        data: sequence of token lists.
        words: vocabulary list; accepted for interface symmetry but not
            read by this function.
        words_idx: mapping from token to integer index.

    Returns:
        A NumPy array built from one index row per sentence.

    Raises:
        KeyError: if a token is missing from ``words_idx``.
    """
    rows = []
    for sentence in data:
        rows.append([words_idx[token] for token in sentence])
    return np.array(rows)

In [75]:
# Replace the token lists by their integer-index representation.
x = feat_vec(data, words, words_idx)

In [76]:
# Randomly shuffle the data. One permutation is applied to both x and y so
# that every sample keeps its own label.
shuffle = np.random.permutation(np.arange(len(y)))
# argmax collapses each two-column label row to a single class index.
# NOTE: prepare_data labels positives [1, 0] and negatives [0, 1], so after
# argmax a positive sentence is class 0 and a negative sentence is class 1.
x, y = x[shuffle], y[shuffle].argmax(axis=1)


# Build the model

In [77]:
# -----------------------------------------------------------------------------
# Model hyper-parameters
#
# The notebook builds the "CNN-rand" variant of the CNN model from the paper
# the authors followed (the other variants being "static" and "non-static");
# it was chosen because it is time efficient.
#
# Reference: http://keras.io/getting-started/sequential-model-guide/
# -----------------------------------------------------------------------------

# Label of the chosen variant. The name `model` is rebound to the Sequential
# instance further down in this notebook.
model = 'CNN-rand'

# Number of features (token positions) in each text.
no_features = x.shape[1]

# Word-embedding size; the input matrix per text is therefore
# no_features x word_embed (81 x 10 according to the original notes).
word_embed = 10

# Filter sizes: one convolution branch is created per entry.
n_grams = [3, 4]

# Number of filters in every convolution layer. Kept constant across the
# branches since no region size is specified.
num_filters = 50

# Dropout rates, applied at different layers to prevent over-fitting.
dropouts = [0.1, 0.5]

# Number of hidden units.
h_units = 11

# Number of times the training process is repeated.
iterations = 15

# NOTE(review): no use of lstm_op is visible in this file.
lstm_op = 30

# CNN-rand needs no explicit pre-trained embedding such as Word2Vec.
word_embeddings = None

In [82]:
# -----------------------------------------------------------------------------
# Convolution branches
#
# A Graph object is used as a container holding one branch per filter size in
# n_grams. Each branch is: 1-D convolution -> max-pooling (pool_length=2) ->
# Flatten. The flattened branch outputs form the graph output; with more than
# one branch they are merged with merge_mode='concat'.
# -----------------------------------------------------------------------------
graph = Graph()
graph.add_input(name='input', input_shape=(no_features, word_embed))

flatten_names = []
for gram in n_grams:
    conv_name = 'C-%s' % gram
    pool_name = 'pool-%s' % gram
    flatten_name = 'flatten-%s' % gram

    # 1-D convolution with the hyper-parameters defined above.
    conv = Convolution1D(
        nb_filter=num_filters,
        filter_length=gram,
        border_mode='valid',
        activation='relu',
        subsample_length=1,
    )
    # Pooling layer fed by the convolution.
    pool = MaxPooling1D(pool_length=2)

    graph.add_node(conv, name=conv_name, input='input')
    graph.add_node(pool, name=pool_name, input=conv_name)
    # Flatten this branch's pooled output.
    graph.add_node(Flatten(), name=flatten_name, input=pool_name)
    flatten_names.append(flatten_name)

# Expose the branch outputs: a single branch is passed through unchanged,
# several branches are concatenated.
if len(n_grams) <= 1:
    graph.add_output(name='output', input=flatten_names[0])
else:
    graph.add_output(name='output', inputs=flatten_names, merge_mode='concat')

In [79]:
# -----------------------------------------------------------------------------
# Full classifier
#
# The model is a Sequential stack; the layers sit one on top of the other
# rather than in separate branches (the only branching happens inside `graph`):
#
#   1) Embedding layer turning word indices into word_embed-sized vectors
#   2) Dropout with the first dropout rate
#   3) The convolution graph built above (all filter sizes combined)
#   4) Dense hidden layer with h_units units
#   5) Dropout with the second dropout rate, followed by a sigmoid activation
#   6) Dense output layer with a single unit, followed by a sigmoid activation
#
# Sigmoid is used as the activation function since the data has only two
# classes.
#
# NOTE: the original notes for this cell describe an LSTM layer stacked on the
# convolution; no recurrent layer is added by the code below.
# -----------------------------------------------------------------------------
model = Sequential()

layer_stack = [
    # Embedding over the whole vocabulary.
    Embedding(len(words), word_embed, input_length=no_features, weights=word_embeddings),
    # Dropout applied to the embedded input.
    Dropout(dropouts[0], input_shape=(no_features, word_embed)),
    # Convolution branches for the different filter sizes.
    graph,
    # Hidden units.
    Dense(h_units),
    # Second dropout, on the hidden layer's output.
    Dropout(dropouts[1]),
    Activation('sigmoid'),
    # Output layer.
    Dense(1),
    Activation('sigmoid'),
]
for layer in layer_stack:
    model.add(layer)


In [80]:
# -----------------------------------------------------------------------------
# Compile the model built so far.
#
# Several objectives are available (e.g. MSE, mean squared error); since this
# is a 2-class problem, binary cross-entropy is used. The optimizer passed
# here is 'rmsprop' with its default settings (other choices would be sgd,
# adagrad, adam, etc.). Accuracy is requested as the reported metric.
#
# NOTE: the original notes for this cell mention stochastic gradient descent,
# but the call below uses 'rmsprop'.
# -----------------------------------------------------------------------------
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=["accuracy"],
)

In [81]:
# -----------------------------------------------------------------------------
# Fit the model on the transformed data.
#
# The batch size is 10 and the number of training passes is `iterations`
# (15). No explicit predict/evaluate call is made: validation_split=0.2 sets
# 20% of the data aside for validation, so fit() itself reports both the
# training and the validation results.
# -----------------------------------------------------------------------------
model.fit(
    x,
    y,
    batch_size=10,
    nb_epoch=iterations,
    validation_split=0.2,
    verbose=1,
)

Train on 1000 samples, validate on 200 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0xd9e5518>