In [25]:
%config IPCompleter.greedy=True

In [24]:
import string 
import re 
from os import listdir 
from numpy import array 
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.utils.vis_utils import plot_model 
from keras.models import Model
from keras.layers import Dense, Flatten, Dropout, Embedding, Input
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from collections import Counter
from nltk.corpus import stopwords

In [3]:
def load_doc(filename):
    f = open(filename, 'r')
    text = f.read()
    f.close()
    return text

In [4]:
def process_docs(directory,vocab):
    for fn in listdir(directory):
        if not fn.endswith('.txt'):
            next
        path = directory + '/' + fn
        add_doc_to_vocab(path, vocab)

In [5]:
def process_docs_after_vocab(directory, vocab, istrain):
    lines=list()
    for fn in listdir(directory):
        if istrain and fn.startswith('cv9' ):
            continue
        if not istrain and not fn.startswith('cv9' ):
            continue
        path = directory + '/' + fn
        line = doc_to_line(path, vocab)
        lines.append(line)
    return lines

In [6]:
def doc_to_line(fn, vocab):
    doc = load_doc(fn)
    tokens = clean_doc(doc)
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

In [7]:
def clean_doc(doc):
    tokens = doc.split()
    re_punc = re.compile(' [%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub(' ' , w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english')) 
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [8]:
def add_doc_to_vocab(fn, vocab):
    doc = load_doc(fn)
    tokens  = clean_doc(doc)
    vocab.update(tokens)

In [9]:
def save_list(lines, fn):
    data = '\n'.join(lines)
    file = open(fn,'w')
    file.write(data)
    file.close()

In [14]:
vocab=Counter()
process_docs('txt_sentoken/neg' , vocab) 
process_docs('txt_sentoken/pos' , vocab)
min_occurance = 5
tokens = [k for k,c in vocab.items() if c >= min_occurance] 
save_list(tokens, 'vocab.txt')
vocab = load_doc('vocab.txt')
vocab = vocab.split() 
vocab = set(vocab)

In [15]:
def load_clean_dataset(vocab, istrain):
    neg = process_docs_after_vocab('txt_sentoken/neg' , vocab, istrain)
    pos = process_docs_after_vocab('txt_sentoken/pos' , vocab, istrain) 
    docs = neg+pos
    labels = [0 for _ in range(len(neg))] + [1 for _ in range(len(neg))]
    return docs, labels

In [16]:
def create_tokens(lines):
    t = Tokenizer()
    t.fit_on_texts(lines)
    return t

In [17]:
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
print(len(ytest))

200


In [18]:
tokenizer = create_tokens(train_docs)

In [19]:
vocab_size = len(tokenizer.word_index) + 1

In [20]:
max_length = max([len(s.split()) for s in train_docs])

In [21]:
def encode_docs(tokenizer, max_length, docs):
    encoded = tokenizer.texts_to_sequences(docs)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post' )
    return padded

In [22]:
Xtrain = encode_docs(tokenizer, max_length, train_docs)
Xtest = encode_docs(tokenizer, max_length, test_docs)

In [28]:
def define_model(vocab_size, max_length):
    #chnl1
    inputs1 = Input(shape=(max_length,))
    embedding1 = Embedding(vocab_size, 100)(inputs1)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    
    #chnl2
    inputs2 = Input(shape=(max_length,))
    embedding2 = Embedding(vocab_size, 100)(inputs2)
    conv2 = Conv1D(filters=32, kernel_size=6, activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)
    
    #chnl3
    inputs3 = Input(shape=(max_length,))
    embedding3 = Embedding(vocab_size, 100)(inputs3)
    conv3 = Conv1D(filters=32, kernel_size=8, activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    
    merged = concatenate([flat1, flat2, flat3])
    dense1 = Dense(10, activation='relu')(merged)
    dense2 = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=dense2)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model

In [29]:
model = define_model(vocab_size, max_length)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 1186)         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 1186)         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1186)         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1186, 100)    1385000     input_4[0][0]                    
____________________________________________________________________________________________

In [30]:
model.fit([Xtrain, Xtrain, Xtrain], ytrain, epochs=7, batch_size=16, verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.callbacks.History at 0x19fbd6d11c8>

In [31]:
_, acc = model.evaluate([Xtest, Xtest, Xtest], ytest)



In [32]:
acc

0.8550000190734863