In [26]:
import numpy as np
import pandas as pd
import string
import re
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

# Load & Clean the text documents

In [2]:
def load_doc(filename):
    file = open(filename,'r')
    data = file.read()
    file.close()
    return data

In [3]:
def clean_doc(data):
    tokens = data.split()
    
    re_punct = re.compile('[%s]'%re.escape(string.punctuation))
    
    cleaned_text = [re_punct.sub('',w) for w in tokens]
    
    cleaned_text = [word for word in cleaned_text if word.isalpha()]
    
    stop_words =  stopwords.words('english')
    cleaned_text = [word for word in cleaned_text if word not in stop_words]
    
    cleaned_text = [word for word in cleaned_text if len(word) > 1]
    
    return cleaned_text
    

# Define Vocbulary

In [4]:
def add_doc_to_vocab(filepath, vocabulary):
    data = load_doc(filepath)
    clean_text = clean_doc(data)
    vocabulary.update(clean_text)

In [5]:
def process_docs(directory,vocabulary):
   
    for file in listdir(directory):
        if file.startswith('cv9'):
            continue     
      
        filepath = directory+'/'+file
      
        add_doc_to_vocab(filepath, vocabulary)

In [6]:
def process_vocabulary(min_occurences,vocabulary):
    tokens = [word for word,count in vocabulary.items() if count >= min_occurences]
    return tokens


In [7]:
def save_list(word_list, file_name):
    string = '\n'.join(word_list)
    file = open(file_name,'w')
    file.write(string)
    file.close()


In [8]:
vocabulary = Counter()
negative_reviews = 'txt_sentoken/neg'
positive_reviews = 'txt_sentoken/pos'
min_occurences = 2
reviews = [negative_reviews, positive_reviews]
for directory in reviews:
    process_docs(directory,vocabulary)

In [9]:
min_occurences = 2
tokens = process_vocabulary(min_occurences,vocabulary)
save_list(tokens, 'vocabulary_new.txt')

# Train CNN with Embedding Layer

In [10]:
def clean_doc_vocab(data, vocabulary):
    tokens = data.split()
    
    re_punct = re.compile('[%s]'%re.escape(string.punctuation))
    
    cleaned_text = [re_punct.sub('',w) for w in tokens]
    
    cleaned_text = [word for word in cleaned_text if word in vocabulary]
    
    tokens = ' '.join(cleaned_text)
    
    return tokens
    

In [11]:
def process_docs_2(directory, vocabulary, is_train):
    documents = []
    for file in listdir(directory):
        if is_train and file.startswith('cv9'):
            continue
        if not is_train and not file.startswith('cv9'):
            continue
        
        file_path = directory+'/'+file
        
        data = load_doc(file_path)
        
        cleaned_data = clean_doc_vocab(data,vocabulary)
        
        documents.append(cleaned_data)
        
    return documents

In [12]:
def load_clean_docs(vocabulary, is_train):
    negative_reviews = 'txt_sentoken/neg'
    positive_reviews = 'txt_sentoken/pos'
    
    neg_docs = process_docs_2(negative_reviews, vocabulary, is_train)
    pos_docs = process_docs_2(positive_reviews, vocabulary, is_train)
    
    docs = neg_docs + pos_docs
    
    labels = np.array([0 for _ in range(len(neg_docs))] +  [1 for _ in range(len(pos_docs))])
    
    return docs, labels


In [13]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [14]:
def encode_pad(tokenizer, docs, max_len):
   
    encoded = tokenizer.texts_to_sequences(docs)
    padded_sequence = pad_sequences(encoded, maxlen = max_len, padding = 'post')
    return padded_sequence

In [31]:
def make_model(vocabulary_size, output_dimension_size, input_sequence_length):
    model = Sequential()
    model.add(Embedding(input_dim = vocabulary_size, output_dim = output_dimension_size, input_length= input_sequence_length))
    model.add(Conv1D(filters = 32, kernel_size = 8, activation = 'relu'))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(10, activation = 'relu'))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(loss= 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [32]:
is_train = True
vocabulary = load_doc('vocabulary_new.txt')
voacbulary = set(vocabulary.split())
train_docs,train_labels = load_clean_docs(vocabulary, is_train)
max_len = max([len(item.split()) for item in train_docs])
tokenizer = create_tokenizer(train_docs)
X_train = encode_pad(tokenizer, train_docs,max_len)

In [47]:
vocabulary_size

27675

In [48]:
input_sequence_length

2282

In [33]:
vocabulary_size = len(tokenizer.word_index)+1
output_dimension_size = 100
input_sequence_length = max_len
model = make_model(vocabulary_size,output_dimension_size, input_sequence_length)
model.fit(X_train, train_labels, epochs = 10, verbose = 2)

Epoch 1/10
 - 29s - loss: 0.6921 - accuracy: 0.5389
Epoch 2/10
 - 31s - loss: 0.5305 - accuracy: 0.7561
Epoch 3/10
 - 29s - loss: 0.1179 - accuracy: 0.9683
Epoch 4/10
 - 29s - loss: 0.0111 - accuracy: 1.0000
Epoch 5/10
 - 30s - loss: 0.0030 - accuracy: 1.0000
Epoch 6/10
 - 29s - loss: 0.0016 - accuracy: 1.0000
Epoch 7/10
 - 28s - loss: 0.0011 - accuracy: 1.0000
Epoch 8/10
 - 30s - loss: 8.1593e-04 - accuracy: 1.0000
Epoch 9/10
 - 29s - loss: 6.3230e-04 - accuracy: 1.0000
Epoch 10/10
 - 29s - loss: 5.0176e-04 - accuracy: 1.0000


<keras.callbacks.callbacks.History at 0x1a726f39d0>

In [34]:
_,acc = model.evaluate(X_train,train_labels, verbose = 2)
print("Train Acccuracy : ",acc)

Train Acccuracy :  1.0


In [35]:
model.save('model_cnn.h5')

In [36]:
is_train =  False
test_docs,test_labels = load_clean_docs(vocabulary, is_train)
X_test= encode_pad(tokenizer, test_docs, max_len)
_,acc = model.evaluate(X_test,test_labels, verbose = 2)

In [46]:
np.savetxt('train_data.out',X_train)
np.savetxt('train_labels.out',train_labels)
np.savetxt('test_data.out',X_test)
np.savetxt('test_labels.out',test_labels)

In [37]:
print("Test Acccuracy : ",acc)

Test Acccuracy :  0.8899999856948853


In [38]:
def predict_sentiment(review, vocab, tokenizer, max_length, model): # clean review
    line = clean_doc_vocab(review, vocab)
   
    # encode and pad review
    padded = encode_pad(tokenizer, [line],  max_length)
    # predict sentiment
    yhat = model.predict(padded, verbose=0)
    # retrieve predicted percentage and label
    percent_pos = yhat[0,0]
    percent_neg = 1- percent_pos
    
    if percent_pos > percent_neg:
        return percent_pos,"Positive"
    else:
        return percent_neg, "Negative"

In [41]:
text = 'Everyone will enjoy this film. I relaly like it and my family did too. I love it, recommended!'
model = load_model('model_cnn.h5')
percent, sentiment = predict_sentiment(text, vocabulary, tokenizer, max_len, model) 
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))
# test negative text
text = 'This is a bad movie. Do not watch fucking watch it. Thge acting and storyline is terrible.It sucks.'
percent, sentiment = predict_sentiment(text, vocabulary, tokenizer, max_len, model) 
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [Everyone will enjoy this film. I relaly like it and my family did too. I love it, recommended!]
Sentiment: Positive (61.623%)
Review: [This is a bad movie. Do not watch fucking watch it. Thge acting and storyline is terrible.It sucks.]
Sentiment: Negative (64.164%)
