# Train a CNN on Movie Reviews 

## Preprocessing input data

Let's start with some imports

In [None]:
from string import punctuation
from collections import Counter
from nltk.corpus import stopwords
from os import listdir
import nltk
# fetch the NLTK 'stopwords' resource that stopwords.words('english') reads later on
nltk.download('stopwords')

Next, some methods for reading and cleaning the data

In [None]:
# a very simple plain text reader
def load_doc(filename):
	"""Read a text file and return its whole content as one string.

	Args:
		filename: Path of the file to read.

	Returns:
		The complete content of the file.

	Raises:
		OSError: If the file cannot be opened or read.
	"""
	# open, read and close the file; the context manager closes it even
	# when reading fails
	with open(filename, 'r') as file:
		text = file.read()
	return text

Function for cleaning: includes tokenization, getting rid of punctuation, words that are not alphabetic, stopwords, and very short words

In [None]:
# text cleaning to prepare for further processing
def retrieve_cleaned_doc_tokens(doc):
	"""Turn the raw text of one document into a list of cleaned tokens.

	Exercise skeleton: every `...` line marks a step that still has to be
	written; the comment above it says what that step should do.

	Args:
		doc: Raw text of one document.

	Returns:
		The tokens that remain after all cleaning steps.
	"""
	# whitespace splitting
	...
	# remove punctuation; `punctuation` is the name imported from the string
	# module (the module itself is not imported), and the translation table
	# is built once instead of once per token
	table = str.maketrans('', '', punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	...
	# filter out stop words
	stop_words = set(stopwords.words('english'))
	...
	# filter out short tokens
	...
	return tokens

Function to add a single document to the overall vocabulary, plugging things together

In [None]:
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	"""Count the cleaned tokens of one file into `vocab`.

	Args:
		filename: Path of the document to read.
		vocab: Running token tally; it is changed in place through its
			`update` method.
	"""
	# read the file, clean its text, then add the resulting tokens to the tally
	cleaned = retrieve_cleaned_doc_tokens(load_doc(filename))
	vocab.update(cleaned)

Function to process directory with text and add its content to the overall vocabulary

In [None]:
# load all docs in a directory
def process_docs(directory, vocab, is_train):
	"""Feed every entry of `directory` into the vocabulary tally.

	Args:
		directory: Folder whose entries are read.
		vocab: Token tally handed on to `add_doc_to_vocab`.
		is_train: Not read by the code present here; the `...` in the loop is
			the slot for the train/test filtering step.
	"""
	for entry in listdir(directory):
		# skip any reviews in the test set (step still to be written)
		...
		# hand the full path of the file on for counting
		add_doc_to_vocab(directory + '/' + entry, vocab)

This is where we'll actually start executing the code.
Might take a few seconds.

In [None]:
# collections.Counter is basically an enhanced version of a dict;
# here it holds one count per token
vocab = Counter()
# add all docs to vocab (placeholders for the code that fills `vocab`)
...
...
# print the size of the vocab, i.e. the number of distinct keys
print(len(vocab))
# print the top words in the vocab: the 50 highest counts with their tokens
print(vocab.most_common(50))

We want to store our vocabulary for later use. Furthermore, words that appear infrequently should be ignored.

In [None]:
# define how many times a word needs to appear in the vocab
# NOTE(review): `min_occurane` looks like a misspelling of "min_occurrence";
# the name is left as is because the placeholder below presumably refers to it
min_occurane = 2
# placeholder: must define `tokens`, presumably the words that reach the minimum count
...
# size of the resulting token collection
print(len(tokens))

def save_list(lines, filename):
	"""Write a sequence of strings to a text file, one entry per line.

	Entries are separated by a newline; no newline is written after the
	last entry.

	Args:
		lines: Iterable of strings to store.
		filename: Path of the file to create or overwrite.

	Raises:
		OSError: If the file cannot be opened or written.
	"""
	# write vocab to a single string, line by line
	data = '\n'.join(lines)
	# write text & close file; the context manager flushes and closes the
	# file even when writing fails
	with open(filename, 'w') as file:
		file.write(data)
    
# save vocabulary: write `tokens` to vocab.txt (relative path, so it lands in the working directory)
save_list(tokens, 'vocab.txt')

## Playing with vocabulary

But first, a few more imports

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

Load vocabulary for reference (remember, it is a simple line by line list of our words)

In [None]:
# load the vocabulary and create a set from it
# (same file name that was passed to save_list earlier in the notebook)
vocab_filename = 'vocab.txt'
# placeholder: read the file and build the set here
...

We need to turn single reviews into correctly preprocessed strings (see above) and only keep known words

In [None]:
def retrieve_cleaned_doc_tokens_list(doc, vocab):
	"""Clean one document and keep only the tokens found in `vocab`.

	Exercise skeleton: the three `...` lines are the steps still to be
	written, each described by the comment above it.

	Args:
		doc: Text of one document (presumably the raw file content).
		vocab: Collection of known words used by the filtering step.

	Returns:
		`tokens` as left by the final step. NOTE(review): the function name
		suggests a list, while the surrounding notebook text speaks of
		strings; confirm which form the later cells expect.
	"""
	# split into tokens by white space
	...
	# remove punctuation from each token
	...
	# filter out tokens not in vocab
	...
	return tokens

And again, plug things together: load & process everything in a directory

In [None]:
# load all docs in a directory
def create_documents(directory, vocab, is_trian):
	"""Load and clean every entry of `directory`.

	Args:
		directory: Folder whose entries are read.
		vocab: Passed on to `retrieve_cleaned_doc_tokens_list` for each file.
		is_trian: Not read by the code present here; the `...` in the loop is
			the slot for the train/test filtering step. The spelling is part
			of the signature and therefore kept.

	Returns:
		A list with one cleaned result per processed entry, in the order
		`listdir` yields them.
	"""
	collected = []
	for entry in listdir(directory):
		# skip any reviews in the test set (step still to be written)
		...
		# read the file from its full path, then clean the text
		raw = load_doc(directory + '/' + entry)
		collected.append(retrieve_cleaned_doc_tokens_list(raw, vocab))
	return collected

Finally, load all reviews from the train set

In [None]:
# load all training reviews
# placeholders: each `...` is to be replaced by the code that produces the value
positive_docs = ...
negative_docs = ...
# presumably the positive and negative reviews combined (TODO confirm)
train_docs = ...

For the embedding layer in Keras, we need to turn tokens into integers. ``keras.preprocessing.text.Tokenizer`` can do this for us. First, we map words to integers. Second, we encode the reviews accordingly. 

In [None]:
# create the tokenizer (placeholder; later cells read it as `tokenizer`)
...
# fit the tokenizer on the documents, i.e. map words to integers
...
# sequence encode: encode the reviews with that mapping
...

Ensure reviews have the same length with zero-padding (up to the longest review)

In [None]:
# common length for all reviews (placeholder; meant to be the length of the longest review)
max_length = ...
# pad at the end (placeholder for the zero-padded training sequences)
Xtrain = ...

Now that we have training instances, we also need training labels.

In [None]:
# define training labels as a simple array with binary values
# (placeholder; presumably one label per row of Xtrain, TODO confirm)
ytrain = ...

And the same thing for the test set

In [None]:
# load the test reviews (placeholders); note that `positive_docs` and
# `negative_docs` are the same names the training cell assigned, so their
# earlier values are replaced here
positive_docs = ...
negative_docs = ...
test_docs = ...
# sequence encode
...
# pad sequences
Xtest = ...
# define test labels
ytest = ...

## Define the model

First layer: embeddings

In [None]:
# define vocabulary size + 1 for unknown words
# (placeholder: must assign `vocab_size`, which is printed right below)
...
print(vocab_size)

Finally, a full CNN

In [None]:
# using the Keras Sequential model
model = Sequential()
# embedding layer (placeholder)
...
# conv layer (placeholder)
...
# pooling layer + flatten (placeholders, one line each)
...
...
# fully connected layer (placeholder)
...
# output (placeholder)
...
# NOTE(review): Keras' summary() presumably prints the table itself, in which
# case the outer print only adds a trailing "None"; confirm
print(model.summary())

Compile and fit to training data

In [None]:
# compile network (placeholder)
...
# fit network (placeholder; presumably trains on Xtrain / ytrain)
...

Finally, evaluate on test data

In [None]:
# evaluate (placeholder: must define `acc`, which is read below)
...
# acc is multiplied by 100 before printing, so it is presumably a fraction in [0, 1]
print('Test Accuracy: %f' % (acc*100))

## Pre-trained embeddings

Next, we'll use unsupervised embeddings instead of learning them on the downstream task data

Start with a method to load pretrained embeddings from disk

In [None]:
# load embedding as a dict
def load_embedding(filename):
	"""Load word vectors from a whitespace-separated text file into a dict.

	Every data line holds a word followed by its vector components. A first
	line of the form ``<count> <dimension>`` (two integers, as written by the
	word2vec text format) is treated as a header and skipped; files without
	such a header are read from their first line. Blank lines are ignored.

	Args:
		filename: Path of the embedding file.

	Returns:
		A dict mapping each word to a float32 numpy array with its vector.

	Raises:
		OSError: If the file cannot be opened or read.
		ValueError: If a vector component cannot be converted to a number.
	"""
	# create a map of words to vectors
	embedding = dict()
	# the context manager closes the file even when parsing fails; iterating
	# the handle reads line by line instead of building a list of all lines
	with open(filename, 'r') as file:
		for line_number, line in enumerate(file):
			parts = line.split()
			# nothing to store for a blank line
			if not parts:
				continue
			# a first line made of exactly two integers is a header, not a vector
			if line_number == 0 and len(parts) == 2 and all(p.isdigit() for p in parts):
				continue
			# key is string word, value is numpy array for the vector
			embedding[parts[0]] = asarray(parts[1:], dtype='float32')
	return embedding

Next, prepare a mapping for task-specific vocab 

In [None]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab, dimension):
	"""Build the weight matrix for an Embedding layer from loaded vectors.

	Args:
		embedding: Dict mapping a word to its vector.
		vocab: Dict mapping a word to its integer row index; indices are
			expected to lie in ``1..len(vocab)``.
		dimension: Length of every vector.

	Returns:
		A numpy array of shape ``(len(vocab) + 1, dimension)``. Row ``i`` holds
		the vector of the word with index ``i``; row 0 and the rows of words
		that have no entry in `embedding` stay all zero.
	"""
	# total vocabulary size, add '0' for unknown words
	vocab_size = len(vocab) + 1
	# define weight matrix dimensions with all 0
	weight_matrix = zeros((vocab_size, dimension))
	# go over vocab and add vectors from embedding to matrix
	for word, i in vocab.items():
		# a word missing from the embedding yields None and keeps its zero row
		vector = embedding.get(word)
		if vector is not None:
			weight_matrix[i] = vector
	return weight_matrix

Put things together and prepare the embedding layer

In [None]:
# load embedding from file (placeholder)
raw_embedding = ...
# get vectors in the right order (placeholder)
embedding_vectors = ...

# number of entries in the tokenizer's word index
print(len(tokenizer.word_index))
# create the embedding layer
# max_length is printed for reference before the layer is built
print(max_length)
embedding_layer = ...

A CNN, same as above

In [None]:
# create the tokenizer
# NOTE(review): the previous cell already reads `tokenizer.word_index`, so a
# tokenizer exists before this point; re-creating and re-fitting it here looks
# redundant. Confirm whether this is intended
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)

# define model (placeholder: must assign `model_pretrained`, which is read below)
...
print(model_pretrained.summary())
# compile network (placeholder)
...
# fit network (placeholder)
...
# evaluate (placeholder: must define `acc`, which is read below)
...
print('Test Accuracy: %f' % (acc*100))