# Data Preparation

In [51]:
from os import listdir
from nltk.corpus import stopwords 
import string
from collections import Counter
import re
from keras.preprocessing.text import Tokenizer
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
import pandas as pd

In [2]:
def load_docs(filename):
    file = open(filename,'r')
    text = file.read()
    file.close()
    
    return text

In [3]:
def clean_docs(doc):
   
    # split data into tokens
    words = doc.split()
    
    # load the punctuations to be removed
    re_compile = re.compile('[%s]'%re.escape(string.punctuation))
    
    # remove the punctuations from each word from the list of words
    cleaned_text = [re_compile.sub('',w) for w in words]
    
    # remove all stop words from the text
    
    stop_words = stopwords.words('English')
    cleaned_text = [word for word in cleaned_text if word not in stop_words]
    
    # remove all numbers or other characters which are not letters
    
    cleaned_text = [word for word in cleaned_text if word.isalpha()]
    
    # remove all characters or words with length <= 1
    
    cleaned_text = [word for word in cleaned_text if len(word) > 1]
    
    return cleaned_text

In [4]:
def add_doc_to_vocab(filename, vocab):
    
    # after receving the filename , load the file from the list
    text = load_docs(filename)
    
    # clean & return the loaded file
    cleaned_text = clean_docs(text)
    
    # add all the cleaned words to the dictionary
    vocabulary.update(cleaned_text)

In [5]:
def process_docs(directory,vocabulary):
    # iterate through all the files in the directory
    for filename in listdir(directory):
        if filename.startswith('cv9'):
            continue
        # add path to the file
        path = directory+'/'+filename
        
        # pass the specific file to the below function to add the cleaned words in the file
        # to the dictionary
        
        add_doc_to_vocab(path, vocabulary)
        

In [6]:
def process_vocabulary(vocabulary, min_occurences):

    tokens = [word for word,count in vocabulary.items() if count >= min_occurences]

    return tokens
    

In [7]:
def save_list(word_list, filename):
    data = '\n'.join(word_list)
    file = open(filename,'w')
    file.write(data)
    file.close()

In [8]:
vocabulary = Counter()
negative_reviews = 'txt_sentoken/neg'
positive_reviews = 'txt_sentoken/pos'
min_occurences = 2
reviews = [negative_reviews, positive_reviews]
process_reviews = [ process_docs(review_directory,vocabulary)  for review_directory in reviews]

tokens = process_vocabulary(vocabulary, min_occurences)
save_list(tokens, "vocabulary_10.txt")
print(len(tokens))

25767


# Bag Of Words Representation

### Transforming Reviews to Lines of Tokens

In [9]:
def doc_to_line(filename, vocabulary):
    text = load_docs(filename)
    
    cleaned_text = clean_docs(text)
    
    tokens = [w for w in cleaned_text if w in vocabulary]
    
    return ' '.join(tokens)

Above lists the function doc to line() that will load a document, clean it, filter out tokens not in the vocabulary, then return the document as a string of white space separated tokens.

In [10]:
def process_docs_2(directory,vocabulary):
    # iterate through all the files in the directory
    lines = []
    for filename in listdir(directory):
        if filename.startswith('cv9'):
            continue
        # add path to the file
        path = directory+'/'+filename
        
        # pass the specific file to the below function so that the cleaned words can be added as a list,
        # so bascially each review will be a list appended to the master list
        
        line = doc_to_line(path, vocabulary)
        lines.append(line)
    return lines
        

Above lists the process docs() function that does just this, expecting a directory name and a vocabulary set as input arguments and returning a list of processed documents.

In [11]:
def load_clean_data(vocabulary):
    negative_reviews_list = process_docs_2('txt_sentoken/neg',vocabulary)
    positive_reviews_list = process_docs_2('txt_sentoken/pos',vocabulary)
    
    docs = negative_reviews_list + positive_reviews_list
    labels = [0 for _ in range(len(negative_reviews_list))] + [1 for _ in range(len(positive_reviews_list))]
    
    return docs,labels

In [12]:
vocabulary = 'vocabulary_10.txt'
vocabulary = load_docs(vocabulary)


In [13]:
docs, labels = load_clean_data(vocabulary)
print(len(docs), len(labels), sep= " : ")

1800 : 1800


### Movie Reviews to Bag Of Words Vectors

In [14]:
def create_tokens(docs):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(docs)
    return tokenizer

In [15]:
def process_docs_3(directory, vocabulary, is_train):
    # iterate through all the files in the directory
    lines = []
    for filename in listdir(directory):
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        # add path to the file
        path = directory+'/'+filename
        
        # pass the specific file to the below function so that the cleaned words can be added as a list,
        # so bascially each review will be a list appended to the master list
        
        line = doc_to_line(path, vocabulary)
        lines.append(line)
    return lines
    

In [16]:
def load_clean_data_2(vocabulary, is_train):
    negative_reviews_list = process_docs_3('txt_sentoken/neg',vocabulary,is_train)
    positive_reviews_list = process_docs_3('txt_sentoken/pos',vocabulary,is_train)
    
    docs = negative_reviews_list + positive_reviews_list
    labels = np.array([0 for _ in range(len(negative_reviews_list))] + [1 for _ in range(len(positive_reviews_list))])
    
    return docs,labels

In [17]:
train_docs, y_train = load_clean_data_2(vocabulary,True)
test_docs, y_test = load_clean_data_2(vocabulary,False)

In [18]:
tokenizer = create_tokens(train_docs)

In [19]:
Xtrain = tokenizer.texts_to_matrix(train_docs, mode = 'freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode = 'freq')

### Summary Here's what we have done so far:

 -  We had two folders : neg and pos
 -  In each of these folders there are 1000 files each, so all the files ranging from 0-899 in ether of the folders are taken as train dataset and the remaining 100, i.e. 900 to 999 are kept for test dataset
 - We go through each of the files kept for training and go through them to clean them and ultimately have a list of words, this list would be our entire vocabulary
 - Next we create a tokenizer, this tokenizer is fitted on the train dataset, which is a list of all the words in the train dataset, note that these words come from both the positive and negative reviews
 - Then we use the tokenizer.text_to_matrix to convert the train_doc to an array wherein each word is replaced by its frequency ( i.e. number of times a word appears in a document/ number of times it appears in all the docs)
 - We repeat the above step for the test doc too

# Sentiment Analysis Models

In [20]:
def make_model(n_words):
    model =Sequential()
    model.add(Dense(50, input_shape = (n_words,), activation = 'relu'))
    model.add(Dense(1,  activation = 'sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [21]:
n_words = Xtrain.shape[1]
model = make_model(n_words)
model.fit(Xtrain, y_train, batch_size = 32, epochs = 10, verbose = 2)
loss, acc = model.evaluate(Xtest, y_test, verbose = 0)
print(acc)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
 - 4s - loss: 0.6919 - accuracy: 0.5839
Epoch 2/10
 - 2s - loss: 0.6830 - accuracy: 0.8917
Epoch 3/10
 - 3s - loss: 0.6662 - accuracy: 0.8978
Epoch 4/10
 - 3s - loss: 0.6401 - accuracy: 0.9222
Epoch 5/10
 - 3s - loss: 0.6058 - accuracy: 0.9378
Epoch 6/10
 - 3s - loss: 0.5662 - accuracy: 0.9383
Epoch 7/10
 - 3s - loss: 0.5223 - accuracy: 0.9478
Epoch 8/10
 - 3s - loss: 0.4792 - accuracy: 0.9494
Epoch 9/10
 - 3s - loss: 0.4369 - accuracy: 0.9578
Epoch 10/10
 - 3s - loss: 0.3962 - accuracy: 0.9639
0.8700000047683716


# Comparing Word Scoring Methods

In [22]:
def prepare_train_test_data(method, vocabulary):
    train_docs, y_train = load_clean_data_2(vocabulary,True)
    test_docs, y_test = load_clean_data_2(vocabulary,False)
    tokenizer = create_tokens(train_docs)
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode = method)
    Xtest = tokenizer.texts_to_matrix(test_docs, mode = method)
    return Xtrain,Xtest, y_train, y_test, tokenizer

We put the training and testing dataset preparation set in a function, and pass the method as an argument

In [23]:
def evaluate_model(n_words,Xtrain,Xtest, y_train, y_test):
    n_repeats = 10
    scores = []
    for _ in range(n_repeats):
        model = make_model(n_words)
        model.fit(Xtrain, y_train, batch_size = 32, epochs = 10, verbose = 2)
        _, acc = model.evaluate(Xtest, y_test, verbose = 0)
        scores.append(acc)
    return scores

In [24]:
methods = ['binary', 'count', 'tfidf', 'freq']
results = pd.DataFrame()
for method in methods:
    Xtrain,Xtest, y_train, y_test, tokenizer = prepare_train_test_data(method, vocabulary)
    n_words = Xtrain.shape[1]
    results[method] = evaluate_model(n_words,Xtrain,Xtest, y_train, y_test)
    

Epoch 1/10
 - 3s - loss: 0.4731 - accuracy: 0.7856
Epoch 2/10
 - 2s - loss: 0.0586 - accuracy: 0.9961
Epoch 3/10
 - 2s - loss: 0.0172 - accuracy: 1.0000
Epoch 4/10
 - 2s - loss: 0.0083 - accuracy: 1.0000
Epoch 5/10
 - 2s - loss: 0.0048 - accuracy: 1.0000
Epoch 6/10
 - 2s - loss: 0.0028 - accuracy: 1.0000
Epoch 7/10
 - 2s - loss: 0.0018 - accuracy: 1.0000
Epoch 8/10
 - 2s - loss: 0.0013 - accuracy: 1.0000
Epoch 9/10
 - 2s - loss: 9.2482e-04 - accuracy: 1.0000
Epoch 10/10
 - 2s - loss: 6.9851e-04 - accuracy: 1.0000
Epoch 1/10
 - 3s - loss: 0.4712 - accuracy: 0.7850
Epoch 2/10
 - 3s - loss: 0.0555 - accuracy: 0.9967
Epoch 3/10
 - 2s - loss: 0.0143 - accuracy: 1.0000
Epoch 4/10
 - 2s - loss: 0.0066 - accuracy: 1.0000
Epoch 5/10
 - 2s - loss: 0.0038 - accuracy: 1.0000
Epoch 6/10
 - 2s - loss: 0.0025 - accuracy: 1.0000
Epoch 7/10
 - 2s - loss: 0.0017 - accuracy: 1.0000
Epoch 8/10
 - 2s - loss: 0.0013 - accuracy: 1.0000
Epoch 9/10
 - 2s - loss: 9.8083e-04 - accuracy: 1.0000
Epoch 10/10
 - 2s 

Epoch 9/10
 - 2s - loss: 0.0010 - accuracy: 1.0000
Epoch 10/10
 - 2s - loss: 8.2372e-04 - accuracy: 1.0000
Epoch 1/10
 - 3s - loss: 0.4601 - accuracy: 0.7833
Epoch 2/10
 - 2s - loss: 0.0653 - accuracy: 0.9878
Epoch 3/10
 - 2s - loss: 0.0164 - accuracy: 0.9994
Epoch 4/10
 - 2s - loss: 0.0073 - accuracy: 1.0000
Epoch 5/10
 - 2s - loss: 0.0042 - accuracy: 1.0000
Epoch 6/10
 - 2s - loss: 0.0028 - accuracy: 1.0000
Epoch 7/10
 - 2s - loss: 0.0020 - accuracy: 1.0000
Epoch 8/10
 - 2s - loss: 0.0015 - accuracy: 1.0000
Epoch 9/10
 - 2s - loss: 0.0012 - accuracy: 1.0000
Epoch 10/10
 - 2s - loss: 9.4613e-04 - accuracy: 1.0000
Epoch 1/10
 - 3s - loss: 0.4645 - accuracy: 0.7839
Epoch 2/10
 - 2s - loss: 0.0530 - accuracy: 0.9939
Epoch 3/10
 - 2s - loss: 0.0148 - accuracy: 1.0000
Epoch 4/10
 - 2s - loss: 0.0067 - accuracy: 1.0000
Epoch 5/10
 - 2s - loss: 0.0039 - accuracy: 1.0000
Epoch 6/10
 - 2s - loss: 0.0025 - accuracy: 1.0000
Epoch 7/10
 - 2s - loss: 0.0018 - accuracy: 1.0000
Epoch 8/10
 - 2s - lo

Epoch 5/10
 - 2s - loss: 0.6461 - accuracy: 0.9139
Epoch 6/10
 - 2s - loss: 0.6185 - accuracy: 0.9161
Epoch 7/10
 - 2s - loss: 0.5857 - accuracy: 0.9417
Epoch 8/10
 - 2s - loss: 0.5504 - accuracy: 0.9428
Epoch 9/10
 - 2s - loss: 0.5131 - accuracy: 0.9539
Epoch 10/10
 - 2s - loss: 0.4767 - accuracy: 0.9561
Epoch 1/10
 - 3s - loss: 0.6915 - accuracy: 0.5550
Epoch 2/10
 - 2s - loss: 0.6823 - accuracy: 0.6733
Epoch 3/10
 - 2s - loss: 0.6656 - accuracy: 0.8006
Epoch 4/10
 - 2s - loss: 0.6414 - accuracy: 0.8606
Epoch 5/10
 - 2s - loss: 0.6101 - accuracy: 0.9211
Epoch 6/10
 - 2s - loss: 0.5741 - accuracy: 0.9428
Epoch 7/10
 - 2s - loss: 0.5355 - accuracy: 0.9506
Epoch 8/10
 - 2s - loss: 0.4955 - accuracy: 0.9533
Epoch 9/10
 - 2s - loss: 0.4552 - accuracy: 0.9600
Epoch 10/10
 - 2s - loss: 0.4168 - accuracy: 0.9667
Epoch 1/10
 - 3s - loss: 0.6919 - accuracy: 0.5367
Epoch 2/10
 - 2s - loss: 0.6829 - accuracy: 0.5611
Epoch 3/10
 - 2s - loss: 0.6642 - accuracy: 0.7778
Epoch 4/10
 - 2s - loss: 0.63

In [25]:
results

Unnamed: 0,binary,count,tfidf,freq
0,0.93,0.91,0.905,0.87
1,0.935,0.91,0.885,0.86
2,0.925,0.915,0.885,0.875
3,0.93,0.89,0.86,0.865
4,0.935,0.9,0.885,0.87
5,0.92,0.905,0.87,0.865
6,0.92,0.905,0.85,0.855
7,0.93,0.905,0.91,0.87
8,0.92,0.9,0.885,0.865
9,0.915,0.9,0.86,0.87


In [26]:
results.describe()

Unnamed: 0,binary,count,tfidf,freq
count,10.0,10.0,10.0,10.0
mean,0.926,0.904,0.8795,0.8665
std,0.006992,0.006992,0.019501,0.005798
min,0.915,0.89,0.85,0.855
25%,0.92,0.9,0.8625,0.865
50%,0.9275,0.905,0.885,0.8675
75%,0.93,0.90875,0.885,0.87
max,0.935,0.915,0.91,0.875


In [48]:
# def predict_sentiment(text, vocabulary, model, tokenizer):
       
#     cleaned_text = clean_docs(text)

#     cleaned_text = [w for w in cleaned_text if w in vocabulary]

#     line = ' '.join(cleaned_text)

#     encoded = tokenizer.texts_to_matrix([line], mode = 'binary')
    
#     #print(model.summmary())
# #     y_hat = model.predict(encoded, verbose = 0)

# #     percent_positive = y_hat[0,0]
    
# #     if round(percent_positive) == 0:
# #         return (1-percent_positive),"Negative"
# #     return percent_positive,"Positive"

In [33]:
model.fit(Xtrain, y_train, batch_size = 32, epochs = 10, verbose = 2)

Epoch 1/10
 - 2s - loss: 0.3588 - accuracy: 0.9639
Epoch 2/10
 - 2s - loss: 0.3259 - accuracy: 0.9733
Epoch 3/10
 - 2s - loss: 0.2940 - accuracy: 0.9800
Epoch 4/10
 - 2s - loss: 0.2665 - accuracy: 0.9822
Epoch 5/10
 - 2s - loss: 0.2412 - accuracy: 0.9861
Epoch 6/10
 - 2s - loss: 0.2193 - accuracy: 0.9861
Epoch 7/10
 - 2s - loss: 0.1985 - accuracy: 0.9906
Epoch 8/10
 - 2s - loss: 0.1805 - accuracy: 0.9917
Epoch 9/10
 - 2s - loss: 0.1648 - accuracy: 0.9928
Epoch 10/10
 - 2s - loss: 0.1496 - accuracy: 0.9956


<keras.callbacks.callbacks.History at 0x1a5ad84150>

In [50]:
# text = 'Best movie ever! It was great, I recommend it.'
# predict_sentiment(text, vocabulary, tokenizer, model) 

In [45]:
# text = 'Best movie ever! It was great, I recommend it.'
# percent, sentiment = predict_sentiment(text, vocabulary, tokenizer, model) 
# print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100)) 
# # test negative text
# text = 'This is a bad movie.'
# percent, sentiment = predict_sentiment(text, vocabulary, tokenizer, model) 
# print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

In [43]:
text = 'Best movie ever! It was great, I recommend it.'
cleaned_text = clean_docs(text)

cleaned_text = [w for w in cleaned_text if w in vocabulary]

line = ' '.join(cleaned_text)

encoded = tokenizer.texts_to_matrix([line], mode = 'binary')

y_hat = model.predict(encoded, verbose = 0)

percent_positive = y_hat[0,0]

if round(percent_positive) == 0:
    print ((1-percent_positive),"Negative", sep= " : ")
else:
    print(percent_positive,"Positive", sep= " : ")

1.0 : Positive


In [44]:
text = 'This is a bad movie.'
cleaned_text = clean_docs(text)

cleaned_text = [w for w in cleaned_text if w in vocabulary]

line = ' '.join(cleaned_text)

encoded = tokenizer.texts_to_matrix([line], mode = 'binary')

y_hat = model.predict(encoded, verbose = 0)

percent_positive = y_hat[0,0]

if round(percent_positive) == 0:
    print ((1-percent_positive),"Negative", sep= " : ")
else:
    print(percent_positive,"Positive", sep= " : ")

0.9999999945189377 : Negative
