# Train a CNN on Movie Reviews 

## Preprocessing input data

Let's start with some imports

In [None]:
import string
from collections import Counter
from nltk.corpus import stopwords
from os import listdir
import nltk
nltk.download('stopwords')

Next, some methods for reading and cleaning the data

In [None]:
# a very simple plain text reader
def load_doc(filename):
    """Read a text file and return its whole content as a single string.

    Args:
        filename: path of the file to open in text mode.

    Returns:
        The complete text of the file.
    """
    # the context manager closes the handle even if read() raises
    with open(filename, 'r') as handle:
        return handle.read()

Function for cleaning: includes tokenization, getting rid of punctuation, words that are not alphabetic, stopwords, and very short words

In [None]:
# text cleaning to prepare for further processing
def retrieve_cleaned_doc_tokens(doc):
    """Turn raw text into a list of cleaned tokens.

    The text is split on whitespace and every character listed in
    ``string.punctuation`` is removed from each token. A token is kept only
    if it is purely alphabetic, is not an NLTK English stop word, and is
    longer than one character. No case folding is applied.

    Args:
        doc: the raw document text.

    Returns:
        The surviving tokens, in document order.
    """
    # build the translation table and stop-word set once, not once per token
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    stripped = (token.translate(punctuation_table) for token in doc.split())
    return [
        token for token in stripped
        if token.isalpha() and token not in stop_words and len(token) > 1
    ]

Function to add a single document to the overall vocabulary, plugging things together

In [None]:
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    """Count the cleaned tokens of one file into ``vocab``.

    The file is read with ``load_doc`` and tokenised with
    ``retrieve_cleaned_doc_tokens``; ``vocab`` is modified in place through
    its ``update`` method and nothing is returned.
    """
    vocab.update(retrieve_cleaned_doc_tokens(load_doc(filename)))

Function to process directory with text and add its content to the overall vocabulary

In [None]:
# load all docs in a directory
def process_docs(directory, vocab, is_train):
    """Add every selected file of ``directory`` to ``vocab``.

    Files whose name starts with ``'cv9'`` form the held-out test split.
    With a truthy ``is_train`` those files are skipped; otherwise only
    those files are processed.

    Args:
        directory: folder whose entries are listed with ``listdir``.
        vocab: counter passed on to ``add_doc_to_vocab``.
        is_train: selects the training split (truthy) or the test split.
    """
    for name in listdir(directory):
        held_out = name.startswith('cv9')
        # training wants everything except cv9*, testing wants only cv9*
        wanted = (not held_out) if is_train else held_out
        if wanted:
            add_doc_to_vocab(directory + '/' + name, vocab)

This is where we'll actually start executing the code.
Might take a few seconds.

In [None]:
# a Counter is a dict specialised for tallying occurrences
vocab = Counter()
# tally tokens from the training split of both classes
process_docs('txt_sentoken/neg', vocab, is_train=True)
process_docs('txt_sentoken/pos', vocab, is_train=True)
# number of distinct tokens seen
print(len(vocab))
# the 50 most frequent tokens with their counts
print(vocab.most_common(50))

We want to store our vocabulary for later use. Furthermore, words that appear infrequently should be ignored.

In [None]:
# minimum number of occurrences a word needs in order to stay in the vocabulary
min_occurane = 2
tokens = [word for word, count in vocab.items() if count >= min_occurane]
# size of the reduced vocabulary
print(len(tokens))

def save_list(lines, filename):
    """Write ``lines`` to ``filename``, one item per line.

    The items are joined with newline characters (no trailing newline) and
    written in text mode, replacing any existing file content.

    Args:
        lines: iterable of strings to store.
        filename: path of the file to write.
    """
    # the context manager closes the handle even if write() raises
    with open(filename, 'w') as handle:
        handle.write('\n'.join(lines))
    
# persist the filtered vocabulary, one word per line
save_list(lines=tokens, filename='vocab.txt')

## Playing with vocabulary

But first, a few more imports

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

Load vocabulary for reference (remember, it is a simple line by line list of our words)

In [None]:
# read the stored vocabulary back: a list of words plus a set for fast lookups
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename).split()
vocab_set = set(vocab)

We need to turn single reviews into correctly preprocessed strings (see above) and only keep known words

In [None]:
def retrieve_cleaned_doc_tokens_list(doc, vocab):
    """Reduce a document to its in-vocabulary tokens, joined by single spaces.

    The text is split on whitespace, every character listed in
    ``string.punctuation`` is removed from each token, and only tokens
    contained in ``vocab`` are kept, in their original order.

    Args:
        doc: the raw document text.
        vocab: container of known words; a set keeps the lookups cheap.

    Returns:
        One string holding the kept tokens separated by single spaces
        (empty when nothing survives).
    """
    # build the translation table once instead of once per token
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = (token.translate(punctuation_table) for token in doc.split())
    return ' '.join(token for token in stripped if token in vocab)

And again, plug things together: load & process everything in a directory

In [None]:
# load all docs in a directory
def create_documents(directory, vocab, is_trian):
    """Load and clean every selected review in ``directory``.

    Files whose name starts with ``'cv9'`` form the held-out test split.
    A truthy ``is_trian`` (sic, the parameter name is kept for
    compatibility) selects all other files; a falsy value selects only the
    ``'cv9'`` files.

    Args:
        directory: folder whose entries are listed with ``listdir``.
        vocab: known words, forwarded to ``retrieve_cleaned_doc_tokens_list``.
        is_trian: chooses the training split (truthy) or the test split.

    Returns:
        A list with one cleaned, space-joined string per selected file, in
        the order ``listdir`` yields them.
    """
    documents = []
    for name in listdir(directory):
        in_test_split = name.startswith('cv9')
        # a file is skipped exactly when its split differs from the requested one
        if bool(is_trian) == in_test_split:
            continue
        raw_text = load_doc(directory + '/' + name)
        documents.append(retrieve_cleaned_doc_tokens_list(raw_text, vocab))
    return documents

Finally, load all reviews from the train set

In [None]:
# cleaned training reviews of both classes; negatives come first in train_docs
negative_docs = create_documents('txt_sentoken/neg', vocab_set, True)
positive_docs = create_documents('txt_sentoken/pos', vocab_set, True)
train_docs = negative_docs + positive_docs

For the embedding layer in Keras, we need to turn tokens into integers. ``keras.preprocessing.text.Tokenizer`` can do this for us. First, we map words to integers. Second, we encode the reviews accordingly. 

In [None]:
# build the word -> integer mapping from the training reviews only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs)
# replace every training review by its sequence of word indices
encoded_docs = tokenizer.texts_to_sequences(train_docs)

Ensure reviews have the same length with zero-padding (up to the longest review)

In [None]:
# token count of the longest training review
max_length = max(len(review.split()) for review in train_docs)
# append zeros so that every sequence has max_length entries
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

Now that we have training instances, we also need training labels.

In [None]:
# binary training labels in the same order as train_docs: 0 = negative, 1 = positive.
# The counts come from the document lists instead of a hard-coded 900/900 so the
# labels always match the data actually loaded. negative_docs / positive_docs must
# still hold the training reviews here (run this cell before the test-set cell).
ytrain = array([0] * len(negative_docs) + [1] * len(positive_docs))

And the same thing for the test set

In [None]:
# cleaned test reviews (the cv9* files); use the set for O(1) vocabulary lookups,
# exactly as for the training reviews
negative_docs = create_documents('txt_sentoken/neg', vocab_set, False)
positive_docs = create_documents('txt_sentoken/pos', vocab_set, False)
test_docs = negative_docs + positive_docs
# sequence encode with the tokenizer fitted on the training reviews
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences to the length used for training
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# test labels in the same order as test_docs, sized from the loaded documents
ytest = array([0] * len(negative_docs) + [1] * len(positive_docs))

## Define the model

First layer: embeddings

In [None]:
# one embedding row per word index, plus one for index 0
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

Finally, a full CNN

In [None]:
# using the Keras Sequential model: layers are applied in the listed order
model = Sequential([
    # embedding layer: 100-dimensional vectors learned on this task
    Embedding(vocab_size, 100, input_length=max_length),
    # conv layer
    Conv1D(filters=32, kernel_size=8, activation='relu'),
    # pooling layer
    MaxPooling1D(pool_size=2),
    Flatten(),
    # fully connected layer
    Dense(10, activation='relu'),
    # output: a single sigmoid unit for the binary label
    Dense(1, activation='sigmoid'),
])
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

Compile and fit to training data

In [None]:
# configure training: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# train for 10 epochs on the padded training reviews
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

Finally, evaluate on test data

In [None]:
# loss and accuracy on the held-out test reviews
loss, acc = model.evaluate(Xtest, ytest, verbose=2)
# accuracy is printed as a percentage
print('Test Accuracy: %f' % (acc * 100))

## Pre-trained embeddings

Next, we'll use unsupervised embeddings instead of learning them on the downstream task data

Start with a method to load a pretrained embedding from disk

In [None]:
# load embedding as a dict
def load_embedding(filename):
    """Load a whitespace-separated embedding file into a dictionary.

    Every non-blank line is expected to hold a word followed by its vector
    components. All such lines are read; no header line is skipped.

    Args:
        filename: path of the embedding text file, read as UTF-8.

    Returns:
        A dict mapping each word to a float32 numpy array of its components.
    """
    embedding = dict()
    # stream the file line by line instead of holding every line in memory;
    # the explicit encoding avoids platform-dependent decoding of non-ASCII words
    with open(filename, 'r', encoding='utf-8') as handle:
        for line in handle:
            parts = line.split()
            if not parts:
                # blank line: nothing to store
                continue
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

Next, prepare a mapping for task-specific vocab 

In [None]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab, dimension):
    """Arrange pretrained vectors in the row order given by ``vocab``.

    Args:
        embedding: mapping from word to its vector.
        vocab: mapping from word to integer row index (the notebook passes
            the Tokenizer's ``word_index``).
        dimension: length of every vector.

    Returns:
        A ``(len(vocab) + 1, dimension)`` array. Rows of words missing from
        ``embedding`` stay all-zero, as does any row no word maps to
        (presumably row 0, the padding index -- confirm against the Tokenizer).
    """
    weight_matrix = zeros((len(vocab) + 1, dimension))
    for word, row in vocab.items():
        vector = embedding.get(word)
        if vector is None:
            # no pretrained vector for this word: keep the zero row
            continue
        weight_matrix[row] = vector
    return weight_matrix

Put things together and prepare the embedding layer

In [None]:
# read the 50-dimensional GloVe vectors from disk
raw_embedding = load_embedding('glove.6B.50d.txt')
# one row per tokenizer index, filled with the matching GloVe vector
embedding_vectors = get_weight_matrix(
    embedding=raw_embedding,
    vocab=tokenizer.word_index,
    dimension=50,
)

print(len(tokenizer.word_index))
print(max_length)
# embedding layer initialised with the pretrained vectors and excluded from training
embedding_layer = Embedding(
    vocab_size,
    50,
    weights=[embedding_vectors],
    input_length=max_length,
    trainable=False,
)

A CNN like the one above, but built on the frozen pre-trained embedding layer and without the intermediate dense layer

In [None]:
# The tokenizer fitted earlier on train_docs is reused as is: Xtrain, Xtest and
# the embedding matrix were all built from its word_index, so fitting a fresh
# tokenizer here would only repeat work.

# define model on top of the pretrained embedding_layer built in the previous cell
model_pretrained = Sequential([
    embedding_layer,
    Conv1D(filters=32, kernel_size=8, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    # output: a single sigmoid unit for the binary label
    Dense(1, activation='sigmoid'),
])
# summary() prints the table itself and returns None, so it is not wrapped in print()
model_pretrained.summary()
# compile network
model_pretrained.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model_pretrained.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate
loss, acc = model_pretrained.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))