In [31]:
import string 
import re 
from os import listdir 
from numpy import array 
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.utils.vis_utils import plot_model 
from keras.models import Sequential 
from keras.layers import Dense 
from keras.layers import Flatten 
from keras.layers import Embedding 
from keras.layers.convolutional import Conv1D 
from keras.layers.convolutional import MaxPooling1D
from nltk.corpus import stopwords
from collections import Counter

In [2]:
def load_doc(filename):
    f = open(filename, 'r')
    text = f.read()
    f.close()
    return text

In [3]:
def process_docs(directory,vocab):
    for fn in listdir(directory):
        if not fn.endswith('.txt'):
            next
        path = directory + '/' + fn
        add_doc_to_vocab(path, vocab)

In [4]:
def process_docs_after_vocab(directory, vocab, istrain):
    lines=list()
    for fn in listdir(directory):
        if istrain and fn.startswith('cv9' ):
            continue
        if not istrain and not fn.startswith('cv9' ):
            continue
        path = directory + '/' + fn
        line = doc_to_line(path, vocab)
        lines.append(line)
    return lines

In [5]:
def doc_to_line(fn, vocab):
    doc = load_doc(fn)
    tokens = clean_doc(doc)
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

In [6]:
def clean_doc(doc):
    tokens = doc.split()
    re_punc = re.compile(' [%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub(' ' , w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english')) 
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [7]:
def add_doc_to_vocab(fn, vocab):
    doc = load_doc(fn)
    tokens  = clean_doc(doc)
    vocab.update(tokens)

In [8]:
def save_list(lines, fn):
    data = '\n'.join(lines)
    file = open(fn,'w')
    file.write(data)
    file.close()

In [30]:
vocab=Counter()
process_docs('txt_sentoken/neg' , vocab) 
process_docs('txt_sentoken/pos' , vocab)
min_occurance = 5
tokens = [k for k,c in vocab.items() if c >= min_occurance] 
save_list(tokens, 'vocab.txt')
vocab = load_doc('vocab.txt')
vocab = vocab.split() 
vocab = set(vocab)

In [10]:
def load_clean_dataset(vocab, istrain):
    neg = process_docs_after_vocab('txt_sentoken/neg' , vocab, istrain)
    pos = process_docs_after_vocab('txt_sentoken/pos' , vocab, istrain) 
    docs = neg+pos
    labels = [0 for _ in range(len(neg))] + [1 for _ in range(len(neg))]
    return docs, labels

In [11]:
def create_tokens(lines):
    t = Tokenizer()
    t.fit_on_texts(lines)
    return t

In [12]:
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
print(len(ytest))

200


In [13]:
tokenizer = create_tokens(train_docs)

In [14]:
vocab_size = len(tokenizer.word_index) + 1

In [15]:
vocab_size

13850

In [16]:
max_length = max([len(s.split()) for s in train_docs])

In [21]:
def encode_docs(tokenizer, max_length, docs):
    encoded = tokenizer.texts_to_sequences(docs)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post' )
    return padded

In [22]:
Xtrain = encode_docs(tokenizer, max_length, train_docs)

In [24]:
def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model

In [25]:
model = define_model(vocab_size, max_length)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1186, 100)         1385000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1179, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 589, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 18848)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                188490    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 1,599,133
Trainable params: 1,599,133
Non-trainable params: 0
____________________________________________

In [26]:
model.fit(Xtrain, ytrain, epochs=10, verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x22cc5390f88>

In [27]:
Xtest = encode_docs(tokenizer, max_length, test_docs)

In [28]:
_, acc = model.evaluate(Xtest, ytest, verbose=0)

In [29]:
acc

0.875