In [19]:
import numpy as np
import pandas as pd
import string
import re
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.models import Sequential
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model
from keras.layers.merge import concatenate

In [3]:
def load_doc(filename):
    file = open(filename,'r')
    data = file.read()
    file.close()
    return data

In [4]:
def clean_doc(data):
    tokens = data.split()
    
    re_punct = re.compile('[%s]'%re.escape(string.punctuation))
    
    cleaned_text = [re_punct.sub('',w) for w in tokens]
    
    cleaned_text = [word for word in cleaned_text if word.isalpha()]
    
    stop_words =  stopwords.words('english')
    cleaned_text = [word for word in cleaned_text if word not in stop_words]
    
    cleaned_text = [word for word in cleaned_text if len(word) > 1]
    
    return cleaned_text
    

In [5]:
def add_doc_to_vocab(filepath, vocabulary):
    data = load_doc(filepath)
    clean_text = clean_doc(data)
    vocabulary.update(clean_text)

In [6]:
def process_docs(directory,vocabulary):
   
    for file in listdir(directory):
        if file.startswith('cv9'):
            continue     
      
        filepath = directory+'/'+file
      
        add_doc_to_vocab(filepath, vocabulary)

In [7]:
def process_vocabulary(min_occurences,vocabulary):
    tokens = [word for word,count in vocabulary.items() if count >= min_occurences]
    return tokens

In [8]:
def save_list(word_list, file_name):
    string = '\n'.join(word_list)
    file = open(file_name,'w')
    file.write(string)
    file.close()


In [9]:
vocabulary = Counter()
negative_reviews = 'txt_sentoken/neg'
positive_reviews = 'txt_sentoken/pos'
min_occurences = 2
reviews = [negative_reviews, positive_reviews]
for directory in reviews:
    process_docs(directory,vocabulary)

In [10]:
min_occurences = 2
tokens = process_vocabulary(min_occurences,vocabulary)
save_list(tokens, 'vocabulary_new_3.txt')

In [11]:
def clean_doc_vocab(data, vocabulary):
    tokens = data.split()
    
    re_punct = re.compile('[%s]'%re.escape(string.punctuation))
    
    cleaned_text = [re_punct.sub('',w) for w in tokens]
    
    cleaned_text = [word for word in cleaned_text if word in vocabulary]
    
    tokens = ' '.join(cleaned_text)
    
    return tokens 

In [12]:
def process_docs_2(directory, vocabulary, is_train):
    documents = []
    for file in listdir(directory):
        if is_train and file.startswith('cv9'):
            continue
        if not is_train and not file.startswith('cv9'):
            continue
        
        file_path = directory+'/'+file
        
        data = load_doc(file_path)
        
        cleaned_data = clean_doc_vocab(data,vocabulary)
        
        documents.append(cleaned_data)
        
    return documents

In [13]:
def load_clean_docs(vocabulary, is_train):
    negative_reviews = 'txt_sentoken/neg'
    positive_reviews = 'txt_sentoken/pos'
    
    neg_docs = process_docs_2(negative_reviews, vocabulary, is_train)
    pos_docs = process_docs_2(positive_reviews, vocabulary, is_train)
    
    docs = neg_docs + pos_docs
    
    labels = np.array([0 for _ in range(len(neg_docs))] +  [1 for _ in range(len(pos_docs))])
    
    return docs, labels

In [14]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [15]:
def encode_pad(tokenizer, docs, max_len):
   
    encoded = tokenizer.texts_to_sequences(docs)
    padded_sequence = pad_sequences(encoded, maxlen = max_len, padding = 'post')
    return padded_sequence

In [16]:
is_train = True
vocabulary = load_doc('vocabulary_new_3.txt')
voacbulary = set(vocabulary.split())
train_docs,train_labels = load_clean_docs(vocabulary, is_train)
review_len = [len(item.split()) for item in train_docs]
max_len = max(review_len)
tokenizer = create_tokenizer(train_docs)
X_train = encode_pad(tokenizer, train_docs,max_len)

In [17]:
is_train =  False
test_docs,test_labels = load_clean_docs(vocabulary, is_train)
X_test= encode_pad(tokenizer, test_docs, max_len)

In [18]:
vocabulary_size = len(tokenizer.word_index)+1
output_dimension_size = 100
input_sequence_length = max_len

In [71]:
def make_model(input_sequnce, voabulary_length):
    input1 = Input(shape = (input_sequnce,))
    embedding = Embedding(voabulary_length, 100)(input1)
    cnn1 = Conv1D(32, 4, activation = 'relu')(embedding)
    dropout = Dropout(0.2)(cnn1)
    max_pool1 = MaxPooling1D()(dropout)
    flatten1 = Flatten()(max_pool1)
    
    input2 = Input(shape = (input_sequnce,))
    embedding = Embedding(voabulary_length, 100)(input2)
    cnn2 = Conv1D(32, 6, activation = 'relu')(embedding)
    dropout = Dropout(0.2)(cnn2)
    max_pool2 = MaxPooling1D()(dropout)
    flatten2 = Flatten()(max_pool2)
   
    input3 = Input(shape = (input_sequnce,))
    embedding = Embedding(voabulary_length, 100)(input3)
    cnn3 = Conv1D(32, 8, activation = 'relu')(embedding)
    dropout = Dropout(0.2)(cnn3)
    max_pool3 = MaxPooling1D()(dropout)
    flatten3 = Flatten()(max_pool3)

    
    merged = concatenate([flatten1, flatten2, flatten3])
    
    dense1 = Dense(20, activation = 'relu')(merged)
    dropout = Dropout(0.2)(dense1)
    dense2 = Dense(10, activation = 'relu')(dropout)
    
    outputs = Dense(1, activation = 'sigmoid')(dense2)
    model = Model(inputs = [input1,input2,input3], outputs = outputs)
    
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    
    return model

In [72]:
model = make_model(input_sequence_length, vocabulary_size)

In [73]:
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           (None, 2282)         0                                            
__________________________________________________________________________________________________
input_34 (InputLayer)           (None, 2282)         0                                            
__________________________________________________________________________________________________
input_35 (InputLayer)           (None, 2282)         0                                            
__________________________________________________________________________________________________
embedding_33 (Embedding)        (None, 2282, 100)    2767500     input_33[0][0]                   
____________________________________________________________________________________________

In [74]:
results = pd.DataFrame()
accuracy = []
batches = []
epochs = []
for b_size in [16,32,64]:
    for epoch in [10,20,30]:
        model.fit([X_train, X_train, X_train], train_labels,batch_size = b_size, epochs = epoch)
        _,acc = model.evaluate([X_test, X_test, X_test], test_labels)
        accuracy.append(acc)
        batches.append(b_size)
        epochs.append(epoch)

results['Batch'] = batches
results['Epochs'] = epochs
results['Accuracy'] = accuracy

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/30
Epoch 2/30


Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [76]:
results.sort_values(by = 'Accuracy', ascending = False)

Unnamed: 0,Batch,Epochs,Accuracy
0,16,10,0.885
1,16,20,0.86
3,32,10,0.845
2,16,30,0.84
4,32,20,0.84
5,32,30,0.83
7,64,20,0.82
8,64,30,0.82
6,64,10,0.815


In [34]:
model = load_model('model_cnn.h5')

In [35]:
model.evaluate(X_test, test_labels)[1]



0.8899999856948853