In [None]:
import csv
import math
import os
import string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
#
class Document:
    """
    This class oversees reading, cleaning and formatting of the text corpus.

    Each record of the input file is split on tabs: the first
    ``len(self.emotions)`` fields are treated as the labels and the remaining
    fields as the text. ``clean_corpus`` runs the whole pipeline and returns
    the tokenized text and the tokenized labels.
    """
    #
    def __init__(self, path=None):
        """
        Parameters:
            path: location of the corpus file, appended verbatim to the parent
                  directory of the current working directory. May be None when
                  only the emotion list / helper methods are needed.
        """
        # Always define the attribute so a path-less instance fails with a
        # clear message when asked to read, instead of an AttributeError.
        self.path = None
        if path is not None:
            BASE_DIR = os.path.dirname(os.getcwd())
            self.path = BASE_DIR + path
        # Emotion Categories (Classes)
        self.emotions = ("anger","anticipation","disgust","fear","joy","sadness","surprise","trust")
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
    #
    def __open_file_and_return_dump(self):
        """
        Opens file and returns a dump containing all the data.
        The data structure used to record the dumped data is a list of strings,
        where each string represents a single record. The comma-separated
        fields of a record are re-joined with a single space, and every field
        (including the last) is followed by a space.

        Raises:
            ValueError: if the document was created without a path.
        """
        if self.path is None:
            raise ValueError("Document was created without a path; there is no file to read.")
        # newline="" is what the csv module asks for when handed a file object.
        with open(self.path, newline="") as csvfile:
            return ["".join(str(item) + " " for item in row) for row in csv.reader(csvfile)]
    #
    def __split_labels_from_vectors(self):
        """
        Reads the corpus and splits every record into its labels and its text.
        Returns two parallel lists of tab-joined strings: (labels, text).
        """
        corpus = self.__open_file_and_return_dump()
        label_count = len(self.emotions)
        character_splitter = '\t'
        train_y, train_X = [], []
        for row in corpus:
            groups = row.split(character_splitter)
            train_y.append(character_splitter.join(groups[:label_count]))
            train_X.append(character_splitter.join(groups[label_count:]))
        return train_y, train_X
    #
    def __tokenize_corpus(self, data, delimeter=" "):
        """
        Takes a list of strings as input, and splits each one on `delimeter`.
        Returns a list of token lists.
        """
        return [row.split(delimeter) for row in data]
    #
    def __replace(self, word, symbols, placeholder=""):
        """
        Replaces every occurrence of each entry of `symbols` found in `word`
        with `placeholder` (removed by default) and returns the result.
        """
        temp_string = word
        for symbol in symbols:
            temp_string = temp_string.replace(symbol, placeholder)
        return temp_string
    #
    def __remove_stop_words(self, data):
        """
        Takes a list of token lists, and iterates over each token to
        scan and remove stop words.

        The method ensures to convert all text instances to lowercase.

        The method ensures to remove jargon symbols (#,%,&,etc) and digits.

        Tokens that end up empty are dropped.

        Not every punctuation character is stripped: '-' is kept on purpose,
        because the label tokens go through this same method (see
        clean_corpus) and binarize later looks for the "---" marker in them.
        """
        jargon_symbols = ('%','$','#','@','^','&','*','(',')','+','/','\'','!','?','.',',',':',';','~','0','1','2','3','4','5','6','7','8','9')
        # Build the stop-word lookup once; asking NLTK for the list on every
        # token dominated the run time, and a set gives O(1) membership tests.
        stop_words = set(stopwords.words('english'))
        cleaned_rows = []
        for row in data:
            kept_words = []
            for word in row:
                lowered = word.lower()
                if lowered in stop_words:
                    continue
                # NOTE: an earlier `lowered.translate(string.punctuation)` call
                # was dropped here. str.translate needs a table indexed by code
                # point, so passing the punctuation string never removed any
                # punctuation; the symbol list above does the actual stripping.
                stripped = self.__replace(lowered, symbols=jargon_symbols)
                if stripped != "":
                    kept_words.append(stripped)
            cleaned_rows.append(kept_words)
        return cleaned_rows
    #
    def __word_morhpology(self, data, stemming=0, lemmatizing=0):
        """
        Optionally stems and/or lemmatizes every token of every row.

        Parameters:
            data: list of token lists.
            stemming: 1 to apply self.stemmer.stem to each token.
            lemmatizing: 1 to apply self.lemmatizer.lemmatize to each token
                         (after stemming, when both are requested).

        Returns a new outer list; rows are passed through untouched when
        neither step is requested.
        """
        temp_list = []
        for row in data:
            filtered_words = row
            if stemming == 1:
                # Stemming
                filtered_words = [self.stemmer.stem(word) for word in filtered_words]
            #
            if lemmatizing == 1:
                # Lemmatizing
                filtered_words = [self.lemmatizer.lemmatize(word) for word in filtered_words]
            #
            temp_list.append(filtered_words)
        #
        return temp_list
    #
    def get_emotions(self):
        """Returns the tuple of emotion categories."""
        return self.emotions
    #
    def clean_corpus(self):
        """
        Runs the full pipeline on the file at self.path: read, split labels
        from text, tokenize (text on spaces, labels on tabs), clean both, then
        stem and lemmatize the text only.

        Returns (cleaned text token lists, cleaned label token lists).
        """
        train_y, train_X = self.__split_labels_from_vectors()
        tokenized_X = self.__tokenize_corpus(data=train_X)
        tokenized_y = self.__tokenize_corpus(data=train_y, delimeter="\t")
        cleaned_tokenized_X = self.__remove_stop_words(data=tokenized_X)
        cleaned_tokenized_y = self.__remove_stop_words(data=tokenized_y)
        cleaned_tokenized_X = self.__word_morhpology(cleaned_tokenized_X,1,1)
        return cleaned_tokenized_X, cleaned_tokenized_y
    #
    def binarize(self, y):
        """
        Converts rows of label tokens into rows of 0/1 flags: a token that
        contains "---" becomes 0, any other token becomes 1.
        """
        value = "---"
        return [[0 if value in element else 1 for element in row] for row in y]
#
def _shape_or_length(rows):
    """
    Returns np.shape(rows), falling back to (len(rows),) for ragged rows.

    Token lists have a different length per sentence; recent NumPy versions
    raise ValueError instead of building an array from such ragged input.
    """
    try:
        return np.shape(rows)
    except ValueError:
        return (len(rows),)
#
training_document = Document(path="\\data\\ssec-aggregated\\train-combined-0.0.csv")
cleaned_tokenized_train_X, cleaned_tokenized_train_y = training_document.clean_corpus()
#
# This will be used later for evaluation purposes
testing_document = Document(path= "\\data\\ssec-aggregated\\test-combined-0.0.csv")
cleaned_tokenized_test_X, cleaned_tokenized_test_y = testing_document.clean_corpus()
binarized_cleaned_tokenized_test_y = testing_document.binarize(cleaned_tokenized_test_y)
#
# Preview of what was loaded; each caption names the data it actually shows.
print("First 5 lines of training corpus: \n" + str(cleaned_tokenized_train_X[0:5]) + "\n")
print("First 5 lines of training labels: \n" + str(cleaned_tokenized_train_y[0:5]) + "\n")
print("First 5 lines of testing labels: \n" + str(cleaned_tokenized_test_y[0:5]) + "\n")
print("First 5 lines of binarized testing labels: \n" + str(binarized_cleaned_tokenized_test_y[0:5]) + "\n")
print("Shape of training corpus: \n" + str(_shape_or_length(cleaned_tokenized_train_X)))
print("Shape of training labels: \n" + str(_shape_or_length(cleaned_tokenized_train_y)))
print("Shape of testing labels: \n" + str(_shape_or_length(cleaned_tokenized_test_y)))
print("Shape of binarized testing labels: \n" + str(_shape_or_length(binarized_cleaned_tokenized_test_y)))

In [None]:
class NaiveBayes:
    """
    This class oversees the Naive Bayes Model Template.

    One instance is a binary classifier for a single emotion: class 1 means
    the emotion holds for a document, class 0 means it does not. Training
    builds one bag-of-words count table per class; scoring combines a class
    prior (each class's share of the training tokens) with Laplace-smoothed
    word likelihoods.
    """
    #
    def __init__(self, emotion_index):
        """
        Constructor

        Parameters:
            emotion_index: position of the target emotion in the emotion
                           tuple provided by Document.get_emotions().
        """
        d = Document()
        self.emotions = d.get_emotions()
        self.emotion_index = emotion_index
        self.alpha = 1 # Laplace Smoothener
        # Valid positions are 0 .. len - 1, so an index equal to the length is
        # already out of range.
        if self.emotion_index >= len(self.emotions):
            print("Index exceeds the current emotion list capacity. Index must be smaller than " + str(len(self.emotions)))
        self.BOW_0, self.BOW_1 = {}, {} # Separate BOW models (BOW_1 - For likely vocab, BOW_0 - For not likely vocab)
        self.word_count_category_0, self.word_count_category_1 = 0,0
    #
    def __get_sentiment(self):
        """
        Returns sentiment of this instance classifier
        """
        return self.emotions[self.emotion_index]
    #
    def __prod(self,iterable):
        """
        Product multiplier - Accepts list and returns product of all list elements
        """
        total = 1
        for number in iterable:
            total *= number
        return total
    #
    def __calculate_NB(self, liklihood_prob, prior_prob):
        """
        Returns the product of the given likelihoods times the prior.
        Not used by the scoring path, which works in log space.
        """
        return self.__prod(liklihood_prob) * prior_prob
    #
    def __safe_log(self, value):
        """
        Returns log(value), or -infinity when value is zero, so that a zero
        probability stays a zero probability instead of raising.
        """
        return math.log(value) if value > 0 else float("-inf")
    #
    def __log_scores(self, X):
        """
        Returns (log score for class 0, log score for class 1) for the token
        list X, where each score is log(prior) + sum of log(word likelihood).

        Working with sums of logs avoids the underflow to 0.0 that a long
        product of small probabilities runs into.

        Raises:
            ZeroDivisionError: if no words have been counted by fit yet.
        """
        total_words = self.word_count_category_0 + self.word_count_category_1
        # Smoothing must use the vocabulary shared by both classes, otherwise
        # the two likelihood distributions are not comparable.
        vocabulary_size = len(self.BOW_0.keys() | self.BOW_1.keys())
        scores = []
        for bag, class_words in ((self.BOW_0, self.word_count_category_0),
                                 (self.BOW_1, self.word_count_category_1)):
            score = self.__safe_log(class_words / total_words)
            if score == float("-inf"):
                # A class without training tokens has zero prior.
                scores.append(score)
                continue
            denominator = class_words + self.alpha * vocabulary_size
            for word in X:
                # Words never seen in training carry no information about
                # either class and are ignored for both.
                if word not in self.BOW_0 and word not in self.BOW_1:
                    continue
                # A word seen only in the other class still gets the smoothed
                # count here; skipping it would make evidence for one class
                # lower that class's score relative to the other.
                score += self.__safe_log((bag.get(word, 0) + self.alpha) / denominator)
            scores.append(score)
        return scores[0], scores[1]
    #
    def fit(self, X, y):
        """
        Accepts training data and training labels, which will form the basis of the Naive Bayes model.
        This function oversees construction of the bag of words model.

        Parameters:
            X: list of token lists, one per document.
            y: list of label-token lists, parallel to X. A document counts
               towards class 1 when this classifier's emotion appears among
               its label tokens, and towards class 0 otherwise.

        Counts accumulate across calls.
        """
        selected_emotion = self.__get_sentiment()
        #
        for i, labels in enumerate(y):
            emotion_holds = selected_emotion in labels
            bag = self.BOW_1 if emotion_holds else self.BOW_0
            words_seen = 0
            for word in X[i]:
                bag[word] = bag.get(word, 0) + 1
                words_seen += 1
            if emotion_holds:
                self.word_count_category_1 += words_seen
            else:
                self.word_count_category_0 += words_seen
    #
    def predict_proba(self, X):
        """
        Accepts input document (a list of tokens), and returns the estimated
        unnormalized probabilities (prob_0, prob_1) of the emotion not holding
        and holding.

        For very long documents both values can underflow to 0.0; predict
        compares the log scores instead and is not affected.

        Raises:
            ZeroDivisionError: if no words have been counted by fit yet.
        """
        log_0, log_1 = self.__log_scores(X)
        return math.exp(log_0), math.exp(log_1)
    #
    def predict(self, X):
        """
        Returns whether emotion holds or not (1 or 0) for the token list X.
        Ties are resolved in favour of 1.

        Raises:
            ZeroDivisionError: if no words have been counted by fit yet.
        """
        log_0, log_1 = self.__log_scores(X)
        if log_0 > log_1:
            return 0 # Emotion is not likely
        else:
            return 1 # Emotion is likely
#
# One binary classifier per emotion; the constructor argument is the
# emotion's index.
NB_classifier_anger = NaiveBayes(0)
NB_classifier_anticipation = NaiveBayes(1)
NB_classifier_disgust = NaiveBayes(2)
NB_classifier_fear = NaiveBayes(3)
NB_classifier_joy = NaiveBayes(4)
NB_classifier_sadness = NaiveBayes(5)
NB_classifier_surprise = NaiveBayes(6)
NB_classifier_trust = NaiveBayes(7)
#
classifiers = [
    NB_classifier_anger,
    NB_classifier_anticipation,
    NB_classifier_disgust,
    NB_classifier_fear,
    NB_classifier_joy,
    NB_classifier_sadness,
    NB_classifier_surprise,
    NB_classifier_trust,
]
#
# Every classifier is trained on the same corpus and label lists.
for emotion_classifier in classifiers:
    emotion_classifier.fit(X=cleaned_tokenized_train_X, y=cleaned_tokenized_train_y)
#
# Quick Testing
test = "If you want to live as a healer".split()
display_names = ("Anger", "Anticipation", "Disgust", "Fear", "Joy", "Sadness", "Surprise", "Trust")
for display_name, emotion_classifier in zip(display_names, classifiers):
    pred = emotion_classifier.predict(test)
    print(display_name + ": " + str(pred))

In [None]:
# Ground truth is the binarized test labels; predictions are collected as one
# row per test sentence, with one 0/1 entry per emotion classifier.
y_true = binarized_cleaned_tokenized_test_y
y_pred = []
emotion_models = (
    NB_classifier_anger,
    NB_classifier_anticipation,
    NB_classifier_disgust,
    NB_classifier_fear,
    NB_classifier_joy,
    NB_classifier_sadness,
    NB_classifier_surprise,
    NB_classifier_trust,
)
for sentence in cleaned_tokenized_test_X:
    predictions = [model.predict(sentence) for model in emotion_models]
    y_pred.append(predictions)
#
def flatten(iterable):
    """
    Returns a single list holding the elements of every row of `iterable`,
    in order (one level of nesting is removed).
    """
    return [element for row in iterable for element in row]
# Score all (sentence, emotion) decisions together as one flat binary task.
y_true = flatten(y_true)
y_pred = flatten(y_pred)
#
# Each metric is reported as a percentage.
for metric_name, metric in (("Accuracy", accuracy_score),
                            ("Precision", precision_score),
                            ("Recall", recall_score),
                            ("F-Score", f1_score)):
    print(metric_name + ": " + str(metric(y_true, y_pred) * 100))