# N-gram Multichannel Convolutional Neural Networks for Sentiment Analysis


## Data Preparation

In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation
import os
from pickle import dump

[nltk_data] Downloading package stopwords to /home/gaurav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading and Cleaning Reviews

In [2]:
# load doc
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()

# turning a document into clean tokens
def clean_doc(doc):
    """Turn raw review text into a single string of cleaned tokens.

    Each whitespace-separated token has its punctuation stripped and is then
    kept only if it is purely alphabetic, is not an English stop word and is
    longer than one character.

    Args:
        doc: Raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    strip_punct = str.maketrans('', '', punctuation)
    english_stops = set(stopwords.words('english'))
    kept = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        # drop anything that is not purely alphabetic after stripping
        if not word.isalpha():
            continue
        # drop stop words (the comparison is case-sensitive)
        if word in english_stops:
            continue
        # drop single-character leftovers
        if len(word) <= 1:
            continue
        kept.append(word)
    return ' '.join(kept)

def process_docs(directory, is_train):
    """Load and clean every review in ``directory`` that belongs to one split.

    Files whose names start with ``cv9`` form the test split; all other files
    form the training split.

    Args:
        directory: Folder containing one review per file.
        is_train: True to return the training reviews, False to return the
            test reviews.

    Returns:
        A list with one cleaned review (as produced by ``clean_doc``) per
        selected file, in sorted filename order.
    """
    documents = list()
    # os.listdir returns names in arbitrary order; sorting keeps the resulting
    # dataset identical from run to run and from machine to machine
    for filename in sorted(os.listdir(directory)):
        is_test_file = filename.startswith('cv9')
        # skip files that belong to the other split
        if bool(is_train) == is_test_file:
            continue
        path = os.path.join(directory, filename)
        documents.append(clean_doc(load_doc(path)))
    return documents

# save a dataset to file
def save_dataset(dataset, filename):
    """Pickle ``dataset`` to ``filename`` and print a confirmation line.

    Args:
        dataset: Object to serialise.
        filename: Destination path; an existing file is overwritten.
    """
    # close (and flush) the file deterministically instead of leaving the
    # handle open until it happens to be garbage-collected
    with open(filename, 'wb') as file:
        dump(dataset, file)
    print('Saved: %s' % filename)
    
    
# loading all training reviews
negative_docs = process_docs('review_polarity/txt_sentoken/neg', True)
positive_docs = process_docs('review_polarity/txt_sentoken/pos', True)
trainX = negative_docs + positive_docs
# derive the label counts from the documents actually loaded so the labels
# can never drift out of alignment with the reviews (0 = negative, 1 = positive)
trainy = [0] * len(negative_docs) + [1] * len(positive_docs)
save_dataset([trainX, trainy], 'train.pkl')

# loading all test reviews
negative_docs = process_docs('review_polarity/txt_sentoken/neg', False)
positive_docs = process_docs('review_polarity/txt_sentoken/pos', False)
testX = negative_docs + positive_docs
# same rule as above: one label per loaded review
testy = [0] * len(negative_docs) + [1] * len(positive_docs)
save_dataset([testX, testy], 'test.pkl')



Saved: train.pkl
Saved: test.pkl


## Defining the Model

In [3]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import SpatialDropout1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# load a clean dataset
def load_dataset(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever object was stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files written by
    # this notebook or another trusted source
    # the context manager closes the handle even when unpickling fails
    with open(filename, 'rb') as file:
        return load(file)

# fit a tokenizer
def create_tokenizer(lines):
    """Create a ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: Texts to fit the tokenizer on.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
# calculate the maximum document length
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The number of tokens in the longest string.
    """
    token_counts = (len(line.split()) for line in lines)
    return max(token_counts)
# encode a list of lines
def encode_text(tokenizer, lines, length):
    """Integer-encode ``lines`` and pad the sequences to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        lines: Texts to encode.
        length: ``maxlen`` handed to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` with padding applied at the end
        ('post') of each sequence.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

def define_model(length, vocab_size):
    """Build, compile, summarise and plot the three-channel CNN.

    All three channels read the same input. Each has its own embedding and a
    convolution with a different kernel size (4, 6 and 8). Their flattened
    outputs are concatenated and passed through a small dense head that ends
    in a single sigmoid unit.

    Args:
        length: Number of token ids in each input document.
        vocab_size: First argument of each ``Embedding`` layer.

    Returns:
        The compiled model. As side effects the summary is printed and a
        diagram is written to ``multichannel.png``.
    """
    inputs = Input(shape=(length,))

    def _channel(kernel_size):
        # one channel: embedding -> spatial dropout -> convolution -> dropout
        # -> max pooling -> flatten
        embedding = Embedding(vocab_size, 100)(inputs)
        sdrop = SpatialDropout1D(0.1)(embedding)
        conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(sdrop)
        drop = Dropout(0.7)(conv)
        pool = MaxPooling1D(pool_size=2)(drop)
        return Flatten()(pool)

    # channels 1-3 differ only in the width of the convolution kernel
    merge = concatenate([_channel(kernel_size) for kernel_size in (4, 6, 8)])

    # interpretation
    dense1 = Dense(10, activation='relu')(merge)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=[inputs], outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line
    model.summary()
    plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model

# read the cleaned training reviews and their labels back from disk
trainLines, trainLabels = load_dataset('train.pkl')
# the longest training review fixes the width of the model input
length = max_length(trainLines)
# build the word index from the training text
tokenizer = create_tokenizer(trainLines)
# vocabulary size is the size of the word index plus one
vocab_size = 1 + len(tokenizer.word_index)
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)
# integer-encode the reviews and pad them to the common length
trainX = encode_text(tokenizer, trainLines, length)
print(trainX.shape)

# build the network for this input width and vocabulary
model = define_model(length, vocab_size)
# train on the encoded reviews
model.fit(trainX, array(trainLabels), epochs=4, batch_size=16)
# persist the trained network for the evaluation cells
model.save('model.h5')

Max document length: 1380
Vocabulary size: 44277
(1800, 1380)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1380)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1380, 100)    4427700     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1380, 100)    4427700     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1380, 100)    4427700     input_1[0][0]                    
_______________________________________________

## Evaluating Model

In [5]:
# reload the cleaned train and test splits
trainLines, trainLabels = load_dataset('train.pkl')
testLines, testLabels = load_dataset('test.pkl')

# tokenizer and padded length both come from the training text
tokenizer = create_tokenizer(trainLines)
length = max_length(trainLines)

vocab_size = len(tokenizer.word_index)+1
print('Max document length: %d' % length)
# the template uses a printf-style '%d', so it must be filled with the %
# operator; str.format() leaves '%d' untouched and printed it literally
print('Vocabulary size : %d' % vocab_size)

# encode both splits with the same tokenizer and length
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)
print(trainX.shape, testX.shape)


Max document length: 1380
Vocabulary size : %d
(1800, 1380) (200, 1380)


In [6]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
 
# load a clean dataset
def load_dataset(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever object was stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files written by
    # this notebook or another trusted source
    # the context manager closes the handle even when unpickling fails
    with open(filename, 'rb') as file:
        return load(file)
 
# fit a tokenizer
def create_tokenizer(lines):
    """Create a ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: Texts to fit the tokenizer on.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
 
# calculate the maximum document length
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The number of tokens in the longest string.
    """
    token_counts = (len(line.split()) for line in lines)
    return max(token_counts)
 
# encode a list of lines
def encode_text(tokenizer, lines, length):
    """Integer-encode ``lines`` and pad the sequences to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        lines: Texts to encode.
        length: ``maxlen`` handed to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` with padding applied at the end
        ('post') of each sequence.
    """
    # integer-encode, then pad in one expression
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )
 
# read both cleaned splits back from disk
trainLines, trainLabels = load_dataset('train.pkl')
testLines, testLabels = load_dataset('test.pkl')

# the longest training review fixes the padded width
length = max_length(trainLines)
# build the word index from the training text
tokenizer = create_tokenizer(trainLines)
# vocabulary size is the size of the word index plus one
vocab_size = 1 + len(tokenizer.word_index)
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)
# encode both splits with the same tokenizer and width
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)
print(trainX.shape, testX.shape)

# restore the network saved to model.h5
model = load_model('model.h5')

# accuracy on the training reviews
loss, acc = model.evaluate(trainX, array(trainLabels), verbose=0)
print('Train Accuracy: %f' % (acc*100))

# accuracy on the test reviews
loss, acc = model.evaluate(testX, array(testLabels), verbose=0)
print('Test Accuracy: %f' % (acc*100))

Max document length: 1380
Vocabulary size: 44277
(1800, 1380) (200, 1380)
Train Accuracy: 100.000000
Test Accuracy: 85.000000
