In [18]:
import os
import numpy as np
# Root directory of the review corpus; the 'pos' and 'neg' sub-directory
# names are appended to this prefix by the cells below.
data_path = 'txt_sentoken/'


### Data preprocessing

In [None]:
from string import punctuation
from nltk.corpus import stopwords
def load_doc(fpath):
    """Return the entire contents of the text file at ``fpath`` as one string.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(fpath, 'r') as handle:
        return handle.read()
def clean_text(text):
    """Tokenise ``text`` on whitespace and return the list of kept tokens.

    For every whitespace-separated piece, the characters in
    ``string.punctuation`` are stripped. A token is kept only if what remains
    is purely alphabetic, longer than one character, and not in NLTK's English
    stop-word list. Letter case is left untouched.
    """
    pieces = text.split()
    strip_punct = str.maketrans('', '', punctuation)
    english_stops = set(stopwords.words('english'))

    kept = []
    for piece in pieces:
        word = piece.translate(strip_punct)
        # Drops numbers, mixed alphanumerics, empty strings and 1-letter words.
        if not word.isalpha() or len(word) <= 1:
            continue
        if word in english_stops:
            continue
        kept.append(word)
    return kept


In [3]:
def update_vocab(fpath, vocab):
    """Add the cleaned tokens of the document at ``fpath`` to ``vocab`` in place.

    ``vocab`` is updated through its ``update`` method with the token list
    produced by ``clean_text``; nothing is returned.
    """
    vocab.update(clean_text(load_doc(fpath)))

def build_vocab(dir_path, vocab, is_train):
    """Update ``vocab`` with the tokens of every selected file in ``dir_path``.

    Files whose name starts with ``'cv9'`` are the held-out split: they are
    skipped when ``is_train`` is true and are the only files used when it is
    false.

    Args:
        dir_path: directory containing the review files.
        vocab: object updated in place via ``update_vocab``.
        is_train: truthy to read the training files, falsy for the held-out
            ``cv9*`` files.
    """
    # os.listdir returns names in arbitrary order; sorting makes the insertion
    # order of ``vocab`` (and anything derived from it) reproducible.
    for fname in sorted(os.listdir(dir_path)):
        is_held_out = fname.startswith('cv9')
        # Skip held-out files in training mode and training files otherwise.
        if bool(is_train) == is_held_out:
            continue
        # os.path.join avoids a doubled separator when dir_path ends with '/'.
        update_vocab(os.path.join(dir_path, fname), vocab)

def process_doc(dir_path, is_train):
    """Load and clean the selected files in ``dir_path``.

    Files whose name starts with ``'cv9'`` are the held-out split: they are
    skipped when ``is_train`` is true and are the only files used when it is
    false.

    Args:
        dir_path: directory containing the review files.
        is_train: truthy to read the training files, falsy for the held-out
            ``cv9*`` files.

    Returns:
        A list with one string per selected file: that file's cleaned tokens
        joined by single spaces, in sorted file-name order.
    """
    docs = []
    # os.listdir returns names in arbitrary order; sorting makes the document
    # order reproducible across runs and machines.
    for fname in sorted(os.listdir(dir_path)):
        # Skip held-out files in training mode and training files otherwise.
        if bool(is_train) == fname.startswith('cv9'):
            continue
        # os.path.join avoids a doubled separator when dir_path ends with '/'.
        text = load_doc(os.path.join(dir_path, fname))
        docs.append(' '.join(clean_text(text)))
    return docs


In [7]:
def list_to_file(fpath, outlist):
    """Write ``outlist`` to ``fpath``, one item per line, no trailing newline.

    A string item is written as-is; any other item is treated as an iterable
    of strings and written as one tab-separated line. An empty ``outlist``
    produces an empty file.

    Args:
        fpath: path of the file to (over)write.
        outlist: list of strings, or list of iterables of strings.
    """
    # Decide per item (not from the first item only), so an empty list no
    # longer raises IndexError and a string is never split into characters.
    lines = [item if isinstance(item, str) else '\t'.join(item)
             for item in outlist]
    data = '\n'.join(lines)
    # The text is built before opening so bad input cannot truncate the file.
    with open(fpath, 'w') as outfile:
        outfile.write(data)
           

In [8]:
from collections import Counter
# Count tokens over the training split, positive reviews first and negative
# reviews second, printing the running vocabulary size after each directory.
vocab = Counter()
for label in ('pos', 'neg'):
    build_vocab(data_path + label, vocab, is_train=True)
    print(len(vocab))

32487
44276


In [9]:
# Show the ten most frequent training tokens together with their counts.
print(vocab.most_common(10))

[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844)]


In [10]:
# Keep only the words that occur at least min_count times in the training data.
min_count = 2
vocab_tokens = [k for k,v in vocab.items() if v >= min_count]
print(len(vocab_tokens))
# Save the filtered vocabulary, one word per line.
# NOTE(review): nothing visible in this notebook reads vocabulary.txt back or
# applies vocab_tokens to the documents, so the Tokenizer fitted later still
# sees every word - confirm whether the filtering was meant to be applied.
list_to_file('vocabulary.txt', vocab_tokens)

25767


In [11]:
# Load and clean the training split of each class, printing the document count.
pos_docs = process_doc(data_path+'pos', is_train=True)
print(len(pos_docs))
neg_docs = process_doc(data_path+'neg', is_train=True)
print(len(neg_docs))
# Positive documents come first; the training labels must follow this order.
train_docs = pos_docs + neg_docs
len(train_docs)

900
900


1800

In [None]:
# (Incomplete scratch statement `for doc` removed: it was a SyntaxError and did no work.)

In [15]:
from keras.preprocessing.text import Tokenizer
# Fit a Tokenizer with default settings on the training documents only; its
# word_index is used further down to size the embedding layer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs)

SyntaxError: invalid syntax (<ipython-input-15-a293dc8f2b09>, line 1)

In [13]:
# Encode each training document with the fitted tokenizer.
train_docs_encd = tokenizer.texts_to_sequences(train_docs)


In [29]:
from keras.preprocessing.sequence import pad_sequences
# Pad every encoded training document at its end ('post') up to the word count
# of the longest training document, then show the first padded row.
max_len = max(len(doc.split()) for doc in train_docs)
X_train = pad_sequences(train_docs_encd, padding='post', maxlen=max_len)
print(X_train[0])

1380


In [19]:
# Labels follow the order of train_docs: positives (1) first, then negatives (0).
# Counts come from the document lists instead of a hard-coded 900, so the
# labels stay aligned if the corpus or the split changes.
y_train = np.array([1] * len(pos_docs) + [0] * len(neg_docs))
print(len(y_train))

1800


In [22]:
# Process the held-out test data (files whose names start with 'cv9').
pos_docs1 = process_doc(data_path+'pos/', is_train=False)
print(len(pos_docs1))
neg_docs1 = process_doc(data_path+'neg/', is_train=False)
print(len(neg_docs1))
# Positive documents first, then negative - y_test below uses the same order.
test_docs = pos_docs1 + neg_docs1
print(len(test_docs))
# Reuse the tokenizer fitted on the training documents and the training
# max_len so the test matrix has the same width as X_train.
test_docs_encd = tokenizer.texts_to_sequences(test_docs)
X_test = pad_sequences(test_docs_encd, maxlen=max_len, padding='post')
print(len(X_test))
# Label 1 for each positive document, 0 for each negative one.
y_test = np.array([1 for i in range(len(pos_docs1))] + [0 for i in range(len(neg_docs1))])
print(y_test.shape)

100
100
200
200
(200,)


In [24]:
vocab_size = len(tokenizer.word_index) + 1 # +1 because word indices start at 1; index 0 is reserved (it is the padding value)
vocab_size

44277

### Model

In [25]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Conv1D, MaxPool1D,Flatten


In [27]:
# 1-D CNN text classifier: learned 100-d word embeddings -> one convolution
# over windows of 8 tokens -> max pooling -> small dense head with a single
# sigmoid output unit.
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_len))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPool1D(pool_size=2))  # halves the length of the conv output
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary(line_length=70)

______________________________________________________________________
Layer (type)                   Output Shape                Param #    
embedding_2 (Embedding)        (None, 1380, 100)           4427700    
______________________________________________________________________
conv1d_2 (Conv1D)              (None, 1373, 32)            25632      
______________________________________________________________________
max_pooling1d_2 (MaxPooling1D) (None, 686, 32)             0          
______________________________________________________________________
flatten_2 (Flatten)            (None, 21952)               0          
______________________________________________________________________
dense_3 (Dense)                (None, 10)                  219530     
______________________________________________________________________
dense_4 (Dense)                (None, 1)                   11         
Total params: 4,672,873
Trainable params: 4,672,873
Non-trainable params: 0
_

In [31]:
# Binary cross-entropy pairs with the model's single sigmoid output unit;
# accuracy is tracked as an additional metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
# Train for 5 epochs on the whole training set. No validation data is passed,
# so the log below reports training loss/accuracy only.
model.fit(X_train, y_train, epochs=5, verbose=2)

Epoch 1/5
36s - loss: 0.6908 - acc: 0.5261
Epoch 2/5
22s - loss: 0.5670 - acc: 0.7483
Epoch 3/5
21s - loss: 0.1268 - acc: 0.9939
Epoch 4/5
21s - loss: 0.0060 - acc: 1.0000
Epoch 5/5
21s - loss: 0.0023 - acc: 1.0000


<keras.callbacks.History at 0x7f272db69898>

In [33]:
# Score the trained model on the held-out test set; evaluate() yields the loss
# followed by the accuracy metric requested at compile time.
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Test loss = ", loss)
# accuracy is a fraction in [0, 1]; scale it to a percentage for display.
print("Test accuracy = ", accuracy*100)

Test loss =  0.438384304047
Test accuracy =  82.0
