In [432]:
#***************************************************************************************
#    Title: How To Perform Sentiment Analysis in Python 3 Using the Natural Language Toolkit (NLTK)
#    Author: Shaumik Daityari and Haley Mills
#    Date: 2019
#    Availability: https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
#
#   [Source code]. https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
#***************************************************************************************

#***************************************************************************************
#    Author:Justin O Barber
#    Date: 2013
#    [Source Code]. https://stackoverflow.com/a/20827919
#***************************************************************************************

#****************************************************************************************
#   @incollection{SocherEtAl2013:RNTN,
#   title = {{Parsing With Compositional Vector Grammars}},
#   author = {Richard Socher and Alex Perelygin and Jean Wu and Jason Chuang and Christopher Manning and Andrew Ng and Christopher Potts},
#   booktitle = {{EMNLP}},
#   year = {2013}
#
#   [Dataset]. https://nlp.stanford.edu/sentiment/index.html
#****************************************************************************************

In [433]:
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

In [434]:
#put any imports here
import re, nltk, math, string
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import classify, NaiveBayesClassifier
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag

In [435]:
# This function normalizes the database
def remove_noise(data, stop_words = ()):
    
    # empty list to store normalized data
    cleaned_tokens = []
    
    # passage looks like this: ('data value', 'pos tag', 'sentiment value')
    for passage in (data):
        text = ''
        # Within the if statement, if the tag starts with NN, the token is assigned as a noun. 
        # Similarly, if the tag starts with VB, the token is assigned as a verb
        for token in word_tokenize(passage[0]):
            if passage[1].startswith("NN"):
                pos = 'n'
            elif passage[1].startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'
                
            # initialize the lemmatizer
            lemmatizer = WordNetLemmatizer()
            # lemmatize the data value at index 0, hence passage[0] with it's tag
            token = lemmatizer.lemmatize(token, pos)

            #store each token in text variable to maintain original phrase
            if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
                text += token.lower() + ' '
                
        # store lemmatized phrase in cleaned_tokens variable along with sentiment value
        if text:
            cleaned_tokens.append((text.strip(), passage[-1]))
    return cleaned_tokens

In [436]:
# This function changes the list format of the data so that the first element 
# of each tuple is a dictionary of features; Creates a dictionary of vocabulary
def get_all_words(data):
    
    # create empty dictionary
    return_set = {}
    
    for passage in (data):
        
        # passage[0] gets the phrase, because the list looks like: ('your hair', 'Positive')
        for word in word_tokenize(passage[0]):
            
            # if the return_set is empty, initialize return_set
            if not return_set:
                return_set = {(word.lower())}
                
            # if return_set isn't empty, add to the dictionary
            else:
                return_set.add((word.lower()))       
    return return_set

In [437]:
# From each phrase and sentiment value
# we check if a feature or word in the dictionary of vocabulary we created above exist or not.

# For each phrase and sentiment value, return_list stores all possible words in the vocabulary
# and whether or not the words exist in the phrase, indicated by True or False
def tokenize_words(all_words, data):
    
    # create empty list
    return_list = []
    
    for d in data:
        append_val = ({word: (word in word_tokenize(d[0])) for word in all_words}, d[1])
        return_list.append(append_val)
        
    return return_list

#### Dataset:

We used the movie reviews dataset from Stanford University. This dataset has 7 different files: original_rt_snippets.txt, dictionary.txt, sentiment_labels.txt, SOStr.txt, STree.txt, datasetSentences.txt and datasetSplit.txt. From these files, we used dictionary.txt and sentiment_labels.txt.

The dictionary.txt file contains all of the processed phrases from the movie reviews with ids. For instance, "It was a good movie" | 1. This line contains the phrase "It was a good movie", and the id for it is 1. 

The sentiment_labels.txt file contains all of the ids and the sentiment values for the id's corresponding phrase. For instance, 1 | 0.5. This means that the phrase with id 1 has a sentiment value of 0.5. This dataset divided sentiment values into 5 subsets: [0, 0.2] very negative, (0.2, 0.4] negative, (0.4, 0.6] neutral, (0.6, 0.8] positive, (0.8, 1.0] very positive. To keep the project simple, we decided to change the subsets to [0, 0.5] negative and (0.5, 1.0] positive.

#### Creating database:

We first opened the two files and stored the contents into the corresponding variables, dictData and sentimentData. The first line in the sentimentData file was an irrelevant line to the data, hence we read the file starting from line 1, instead of line 0. 

The phrases in the dictionary file was not in the order of the id values, hence we first sorted the contents of this file according to the ids, to match the order of the sentimentData file.

In [438]:
# create a variable to hold the path to the file
filePath = "./dataset/dictionary.txt"

# open the file as "r" or read only and store this opened file in f
with open(filePath, "r", encoding = "utf8") as f:
    # read the data from f and store it in the string variable "data"
    dictData = f.readlines()[:]

# create a variable to hold the path to the file
filePath = "./dataset/sentiment_labels.txt"

# open the file as "r" or read only and store this opened file in f
with open(filePath, "r", encoding = "utf8") as f:
    # read the data from f and store it in the string variable "data"
    sentimentData = f.readlines()[1:]
    
# sort the data in the dictbionary data in order of the matched id
dictData.sort(key=lambda dictData : list(
    map(int, re.findall(r"\|(\d+)", dictData)))[0])

#### Cleaning data:

After putting dictData in the order of the id values, we split each phrase in dictData and only keep the data value.

Ex. We take dictData[0], which looks like ('data value | id') and change it to look like ('data value')

We then used the pos_tag function to get the tags for each phrase in dictData to determine the context for each phrase and store it in tagged_data. 
We then modified the tagged_data to store the data value, the pos tag and the sentiment value that corresponds to each phrase. The data tagged_data[0] now looks like this:

    ('data value', 'pos tag', 'sentiment value')
    
We then removed the noise using the remove_noise function.

In [439]:
# assign values to dictData to only contain phrases without id
i = 0
while i < len(dictData):
    
    text = (dictData[i].split('|')[0]).strip()
    dictData[i] = text
    i += 1

# use pos_tag function to get the tags for each phrase in dictData
tagged_data = pos_tag(dictData)

# assign values to tagged_data to include phrase, pos tag and corresponding sentiment value
# no need to check and compare id values, because sentimentData and tagged_data are already in order of id
j = 0
while j < len(tagged_data):
    text = (tagged_data[j][0], tagged_data[j][1], (sentimentData[j].split('|')[1]).strip())
    tagged_data[j] =  text
    j += 1

# call remove_noise on tagged_data to remove stop words and normalize the data
stop_words = stopwords.words('english')
cleaned_data = remove_noise(tagged_data, stop_words = ())

We then created list variable, dataBase, to store the phrases and the labels (Positive or Negative), according to the sentiment score subsets stated above: [0, 0.5] negative and (0.5, 1.0] positive.

In [440]:
i = 0
dataBase = []

# label data as positive or negative depending on the sentiment scores given
# greater than 0.5 is a positive score, else, negative
for data in cleaned_data:
    if float((data[1]).strip()) >= 0.5:
        dataBase.append((data[0], "Positive"))
    else:
        dataBase.append((data[0], "Negative"))

#### Training and testing the database:
The range of train and test data is kept to a minimum, due to the lack of computer power.
In theory, we should be able to train and test the data with the entire database with a 70% (train) to 30% (test) ratio. 

We did not need to randomize the picking of train and test dataset, because the sentiment.txt was already in mixed order. In other words, the database doesn't list all positive phrases and then negative phrases. Because we labeled the phrases in the order of the id values listed in the sentiment.txt, the sentiment values are in mixed order of positives and negatives.

Below is a snippet of the database, showing the mixed order of sentiment values.

In [441]:
print(dataBase[:10])

[('the cockettes', 'Positive'), ('the cockettes', 'Negative'), ('the cockettes provides a window into a subculture hell-bent on expressing itself in every way imaginable', 'Negative'), ('the cockettes provides a window into a subculture hell-bent on expressing itself in every way imaginable', 'Negative'), ('the cockettes provides a window into a subculture hell-bent on expressing itself in every way imaginable', 'Positive'), ('a nightmare on elm street', 'Negative'), ('a nightmare on elm street', 'Negative'), ('a nightmare on elm street or', 'Negative'), ('a nightmare on elm street or', 'Positive'), ('a nightmare on elm street or the hill', 'Negative')]


In [442]:
# Main
if __name__ == "__main__":
    # Split data into train and test sets
    train_data = dataBase[:300]
    test_data = dataBase[300:315]
    
    # Tokenize training and test data
    all_words_train = get_all_words(train_data)
    train_features = tokenize_words(all_words_train, train_data)
    
    all_words_test = get_all_words(test_data)
    test_features = tokenize_words(all_words_test, test_data)
    
    # Train Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(train_features)
    print("Accuracy is:", classify.accuracy(classifier, test_features))
    classifier.show_most_informative_features()

Accuracy is: 0.8
Most Informative Features
                    this = True           Negati : Positi =     10.5 : 1.0
            particularly = True           Negati : Positi =      9.2 : 1.0
               nightmare = True           Negati : Positi =      8.0 : 1.0
                     elm = True           Negati : Positi =      6.8 : 1.0
                  street = True           Negati : Positi =      6.8 : 1.0
                   clash = True           Negati : Positi =      5.5 : 1.0
              artificial = True           Negati : Positi =      4.3 : 1.0
                 between = True           Negati : Positi =      4.3 : 1.0
                sardonic = True           Negati : Positi =      4.3 : 1.0
                     see = True           Negati : Positi =      4.3 : 1.0


In [443]:
    # Test on custom input
    custom_review = "I didn't enjoy this at all."
    custom_features = {word: (word in word_tokenize(custom_review.lower())) for word in all_words_train}
    print(custom_review, "->", classifier.classify(custom_features))

I didn't enjoy this at all. -> Negative
