In [37]:
##################################################################################################
# fiveguys_2a.ipynb
# Team C: Five Guys: Burgers and Fries
# Aditya Jain, Alex Sandoval, Darrell Harvey, and Fasih Ahsan
# COM SCI X 450.1 Section 362062
# UCLA Extension, Spring 2018
# IPython Notebook for Class Project 2b Submission
# Script creates an EmotiFind function predictEmotion()
# Function input is a UTF-8 .txt file for any article, tweet, etc. Must be in the same folder as this .ipynb. 
# Function output is standard stream print of predicted emotion values for 8 emotions
# Sadness, Anger, Joy, Trust, Fear, Surprise, Disgust, Anticipation
# Must also keep in same folder as .ipynb: EmoLex, TrainingSetEmotions, TrainingSetArticles .txt files.
# Unzip these files from the submission. 
# Requires installation of NLTK (plus its tokenizer, stopwords, and WordNet data), NumPy, Matplotlib, and scikit-learn. (csv is part of the Python standard library.)
# Version 0.0.2  - 05/14/2018
####################################################################################################


# Import required libraries and packages. 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
from collections import defaultdict
import csv
import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt

# Import default dictionary for dictionary lists, initialize list and dicts. 
# From collections import defaultdict
emotion_dict = defaultdict(list)
emolexdatalist= []
train_sadness = []
train_anger = []
train_joy = []
train_trust = []
train_fear = []
train_surprise = []
train_disgust = []
train_anticipation = []

# Open and read the EmoLex into a list of lists. 
with open("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt", "r") as emolexdata:
    #Skip the header
    next(emolexdata)
    #Save data list as a list of lists for each row. 
    for row in emolexdata:
        row = row.strip().split("\t")
        emolexdatalist.append(row)
    emolexdata.close()

# For each list in the list of lists 
for line in emolexdatalist:
    # If the word is included in the emotion
    if line[2] == "1":
        # And if the emotion is not a sentiment
        if (line[1] != 'negative') and (line[1] != 'positive'):
            # Add the word to the dict of lists
            emotion_dict[line[1]].append(line[0])
        else:
            # Otherwise if it is 0, and if is a sentiment, skip. 
            continue
            
#Open and import the training set emotions.txt
with open("TrainingSetEmotions.txt", "r") as trainingsetemotions:
    # Skip header
    next(trainingsetemotions)
    # Import file as tab delimited text. 
    reader = csv.reader(trainingsetemotions,delimiter='\t')
    for sadness,anger,joy,trust,fear,surprise,disgust,anticipation in reader:
        #Add each article's predicted emotion value to each list. 
        # Ex. train_sadness = [Sadness_Article1, Sadness_Article2, ..., Sadness_Article25]
        train_sadness.append(sadness)
        train_anger.append(anger)
        train_joy.append(joy)
        train_trust.append(trust)
        train_fear.append(fear)
        train_surprise.append(surprise)
        train_disgust.append(disgust)
        train_anticipation.append(anticipation)

# Converting the list to a numpy array.         
train_sadness = np.asarray(train_sadness)
train_anger = np.asarray(train_anger)
train_joy = np.asarray(train_joy)
train_trust = np.asarray(train_trust)
train_fear = np.asarray(train_fear)
train_surprise = np.asarray(train_surprise)
train_disgust = np.asarray(train_disgust)
train_anticipation = np.asarray(train_anticipation)

# @function clean_article()
# @input articleName is the .txt file of the article to clean. 
def clean_article(articleName):
    token_frequency_dic = {}
    with open(articleName,'r') as f:
        text = f.read()

        # split into words
        tokens = word_tokenize(text)

        # convert to lower case
        tokens = [w.lower() for w in tokens]

        # remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]

        # remove remaining tokens that are not alphabetic
        words = [word for word in stripped if word.isalpha()]

        # Add code to filter and extract proper nouns and named identities. 
        
        # filter out stop words and sort
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if not w in stop_words]
        words.sort()
        req = nltk.FreqDist(words)
        for k,v in req.items():
            token_frequency_dic[str(k)] = v
        
        
        # Return dictionary of frequencies per word
        # Ex. dict = {smith : 4, journey : 5, ....}
        return token_frequency_dic

    f.close()

# @function token_checker_1
# @input emotion_dict is the EmoLex ({digust: [gross,nasty,slimy], joy : [fantastic, tremendous, yuge]})
# @input token_frequency is the token_frequency_dict from cleanArticle()
# @output list of frequencies present and absent. 
# @output Ex. [{disgust: {word1 : 1, word2 : 2,...}}, {tremendous:1, yuge : 2}]
#part 1 of checking the words in emption_dict
def token_checker_1(emotion_dict, token_frequency):
    # Initialize two dictionaries
    token_frequency_present = {}
    token_frequency_absent = {}
    #creating a blank dictionary for each emotion based on emotion_dict
    for emotion in emotion_dict:
        token_frequency_present[emotion] = {}
        # dict{disgust:{}, anger:{}, joy:{},...}
        
    for word in token_frequency: #taking the word 
        present = False  #internal check to see if the word is present in any of the emotion dicts
        for emotion in emotion_dict: # going into sad in the emotion dict
            if word in emotion_dict[emotion]: # going into the list of words in sad words
                token_frequency_present[emotion][word]= token_frequency[word]
                # dict{disgust:{word:1}, anger:{}, joy:{},...}
                present = True #will change if 
        # If present is still false after running everything:
        if present == False:
            token_frequency_absent[word]= token_frequency[word] #Same as token_frequency
            #dict{tremendous : 1, yuge :2}
    return [token_frequency_present,token_frequency_absent]
        #[{disgust: {word1 : 1, word2 : 2,...}}, {tremendous:1, yuge : 2}]

# @function token_checker_2
# @input emotion_dict is the EmoLex ({digust: [gross,nasty,slimy], joy : [fantastic, tremendous, yuge]})
# @input token_frequency_present is the frequency of present words from token_checker_1.
# @input token_frequency_absent is the frequency of words absent from the initial pass in token_checker_1. 
# @output list of frequencies present and absent after lemmatization. 
# @output Ex. [{disgust: {word1 : 1, word2 : 2,...}}, {tremendous:1, yuge : 2}]
# part 2 of checking the words in the emotion_dict
def token_checker_2(token_frequency_present,token_frequency_absent,emotion_dict):
    #Initialize lemmatizer. 
    lemmatizer = WordNetLemmatizer()
    token_frequency_absent_l = {}
    for word in token_frequency_absent: #taking the word sorrow
        word_lemma = lemmatizer.lemmatize(word)
        # Check each new lemmatized word to the initial present list
        for word_p in token_frequency_present:
            # If the lemmatized word is literally an emotion (i.e. disgust, fear, joy, etc.)
            if word_lemma is word_p:
                # Append the emotion to the end of the list for said emotion
                # Ex. dict{disgust:{word1:1, word2:2, disgust:1}}
                token_frequency_present[word_p] = token_frequency_present[word_p] + token_frequency_absent[word]
            else:
                token_frequency_absent_l[word_lemma] = token_frequency_absent[word]

    [token_frequency_present_2,token_frequency_absent_2] = token_checker_1(emotion_dict,token_frequency_absent_l)

    for emotion in token_frequency_present_2:
        if emotion == {}:
            continue
        for word in token_frequency_present_2[emotion]:
            token_frequency_present[emotion][word] = token_frequency_present_2[emotion][word]
    return [token_frequency_present,token_frequency_absent_2]


#part 3 for checking the words in the synonyms:
def token_checker_3(token_frequency_present,token_frequency_absent_2,emotion_dict):    
    for word in token_frequency_absent_2:
        #create a list with all synonyms
        syns = wordnet.synsets(word)
        syns_words = []
        for n in range(len(syns)):
            syns_words.append(syns[n].lemmas()[0].name())
        #print(syns_words)
        # go in each word in syns_words to make comparison
        present = False        
        token_frequency_absent_3 = {}
        for word_s in syns_words:
            if present == True:
                break
                #check if it is token_frequency_present, 
                #if yes update the frequency and exit all the for loops except the first one
            if word_s is token_frequency_present:
                token_frequency_present[word_s] = token_frequency_present[word_s] + token_frequency_absent_2[word]
                present = True
                #print(present)
            else:
                #if is is absent in token_frequency_present, 
                #check emotion dictionary and if it is present exit all except first for
                for emotion in emotion_dict: # going into sad in the emotion dict
                    if word_s in emotion_dict[emotion]: # going into the list of words in sad words
                        token_frequency_present[emotion][word]= token_frequency_absent_2[word]
                        present = True
                        #print("present for", word_s)
        #if it is absent in emotion dictionary, add it to token_frequency_present_3
        if present == False:
            token_frequency_absent_3[word]= token_frequency_absent_2[word]
    return [token_frequency_present, token_frequency_absent_3]

def create_param(token_frequency_present,token_frequency):    
    # First parameter is total count of words in emotion. (absolute emotion)
    parameter1 = {}
    for emotion in token_frequency_present:
        parameter1[emotion] = 0
    for emotion in token_frequency_present:
        parameter1[emotion] = sum(token_frequency_present[emotion].values())

    # Second parameter is total count of words in emotion/total count of total words (frequency of emotion)
    parameter2 = {}
    total_words = sum(token_frequency.values())
    for emotion in parameter1:
        parameter2[emotion] = parameter1[emotion]/total_words    
    return [parameter1, parameter2]
    # Ex. [45, 0.4]
    print(parameter1)
    # Could have a multicollinearity issue. 
 
    #May merge this with article_parameters() if time. 
def token_checker(emotion_dict,token_frequency):
    [token_frequency_present,token_frequency_absent_1] = token_checker_1(emotion_dict,token_frequency)
    #print("part 1 token_frequency_present is", token_frequency_present, "\n")
    #print("part 1 token_frequency_absent is", token_frequency_absent_1,"\n")
    #print("part 1 params are", create_param(token_frequency_present,token_frequency))
    [token_frequency_present, token_frequency_absent_2] = token_checker_2(token_frequency_present,token_frequency_absent_1,emotion_dict)
    #print("part 2 token_frequency_present is", token_frequency_present,"\n")
    #print("part 2 token_frequency_absent is", token_frequency_absent_2, "\n")
    #print("part 2 params are", create_param(token_frequency_present,token_frequency))
    [token_frequency_present, token_frequency_absent_3] = token_checker_3(token_frequency_present,token_frequency_absent_2,emotion_dict)
    #print("part 3 token_frequency_present is", token_frequency_present, "\n")
    #print("part 3 token_frequency_absent is", token_frequency_absent_3, "\n")
    #print("part 3 params are", create_param(token_frequency_present,token_frequency))
    return create_param(token_frequency_present,token_frequency)
    # Ex. [45, 0.4]
    #print(parameter1)
    #print(parameter2)

def article_parameters(articleName):
    token_frequency = clean_article(articleName)
    return token_checker(emotion_dict,token_frequency)

# Function predictEmotion()
# Create a model by performing article_parameters() 25 times (once per article)
# Training Input per emotion: ex. Sadness = [[Article1P1,Article1P2], [Article2P1,Article2P2],...,[Article25P1,Article25P2]]
# Training Output per emotion: ex. Sadness = [Article1Emotion, Article2Emotion, Article3Emotion,...Article25Emotion]
# Based on the training output values, uses Support Vector Regression to create a best fit predictor
# Test the model to predict an emotion for a article given [test_articleP1,test_articleP2] for each emotion. 
def predictEmotion(articlename):
    # Initialize the test lists for the input test article. 
    # [test_articleP1,test_articleP2]
    trust_testlist = []
    fear_testlist = []
    sadness_testlist = []
    anger_testlist = []
    surprise_testlist = []
    disgust_testlist = []
    joy_testlist = []
    anticipation_testlist = []
    
    # Initialize the training list for the 25 training articles. 
    # Ex. Sadness = [[Article1P1,Article1P2], [Article2P1,Article2P2],...,[Article25P1,Article25P2]]
    trust_trainlist = []
    fear_trainlist = []
    sadness_trainlist = []
    anger_trainlist = []
    surprise_trainlist = []
    disgust_trainlist = []
    joy_trainlist = []
    anticipation_trainlist = []
    article_reader_list = []
    
    # Create list of articles to implement. 
    for i in range(25):
       article_reader_list.append("train_article" + str(i + 1) + ".txt")

    for article in article_reader_list:
        #train_outputs is [test_articleP1,test_articleP2]
        train_outputs = article_parameters(article)
        #Converts P1{sad:34,joy:42} to listP1[34,42]
        listofP1 = list(train_outputs[0].values())
        listofP2 = list(train_outputs[1].values())
        # [[Article1P1Sad,Article1P2Sad],[Article1P1Joy,Article1P2Joy], ..., [Article1P1Antic, Article1P2Antic]]
        param_merge = [list(each_emot) for each_emot in zip(listofP1, listofP2)]

        trust_trainlist.append(param_merge[0])
        #Ex. Trust = [[Article1P1,Article1P2], [Article2P1,Article2P2],...,[Article25P1,Article25P2]]
        fear_trainlist.append(param_merge[1])
        sadness_trainlist.append(param_merge[2])
        anger_trainlist.append(param_merge[3])
        surprise_trainlist.append(param_merge[4])
        disgust_trainlist.append(param_merge[5])
        joy_trainlist.append(param_merge[6])
        anticipation_trainlist.append(param_merge[7])
        #Ex. Sadness = [[Article1P1,Article1P2], [Article2P1,Article2P2],...,[Article25P1,Article25P2]]
        
    # Convert to NumPy array.
    trust_trainlist = np.asarray(trust_trainlist)
    fear_trainlist = np.asarray(fear_trainlist)
    sadness_trainlist = np.asarray(sadness_trainlist)
    anger_trainlist = np.asarray(anger_trainlist)
    surprise_trainlist = np.asarray(surprise_trainlist)
    disgust_trainlist = np.asarray(disgust_trainlist)
    joy_trainlist = np.asarray(joy_trainlist)
    anticipation_trainlist = np.asarray(anticipation_trainlist)
    
    # Now generate the parameters for the test article. 
    test_article = article_parameters(articlename)
    testlistofP1 = list(test_article[0].values())
    testlistofP2 = list(test_article[1].values())
    test_param_merge = [list(each_emot) for each_emot in zip(testlistofP1, testlistofP2)]

    #trust = [test_articleP1, test_articleP2]
    trust_testlist.append(test_param_merge[0])
    fear_testlist.append(test_param_merge[1])
    sadness_testlist.append(test_param_merge[2])
    anger_testlist.append(test_param_merge[3])
    surprise_testlist.append(test_param_merge[4])
    disgust_testlist.append(test_param_merge[5])
    joy_testlist.append(test_param_merge[6])
    anticipation_testlist.append(test_param_merge[7])
    
    #Convert to Numpy 
    trust_testlist = np.asarray(trust_testlist)
    fear_testlist = np.asarray(fear_testlist)
    sadness_testlist = np.asarray(sadness_testlist)
    anger_testlist = np.asarray(anger_testlist)
    surprise_testlist = np.asarray(surprise_testlist)
    disgust_testlist = np.asarray(disgust_testlist)
    joy_testlist = np.asarray(joy_testlist)
    anticipation_testlist = np.asarray(anticipation_testlist)
    
    # Create support vector regression and performed fit based on the training lists. 
    clf = SVR()
    sadnessfit = clf.fit(sadness_trainlist, train_sadness)
    trustfit = clf.fit(trust_trainlist, train_trust)
    fearfit = clf.fit(fear_trainlist, train_fear)
    angerfit = clf.fit(anger_trainlist, train_anger)
    surprisefit = clf.fit(surprise_trainlist, train_surprise)
    disgustfit = clf.fit(disgust_trainlist, train_disgust)
    joyfit = clf.fit(joy_trainlist, train_joy)
    anticipationfit = clf.fit(anticipation_trainlist, train_anticipation)
    
    # Create prediction of new value from a test_article using the support vector regression model calculated above. 
    sadval = sadnessfit.predict(sadness_testlist)
    trustval = trustfit.predict(trust_testlist)
    fearval = fearfit.predict(fear_testlist)
    angerval = angerfit.predict(anger_testlist)
    surpriseval = surprisefit.predict(surprise_testlist)
    disgustval = disgustfit.predict(disgust_testlist)
    joyval = joyfit.predict(joy_testlist)
    anticipationval = anticipationfit.predict(anticipation_testlist)
    
    # Group all values into a list. 
    valuelist = [sadval[0], angerval[0], joyval[0], trustval[0], fearval[0], surpriseval[0], disgustval[0],
                anticipationval[0]]
    # Iterate through each value in the list. 
    # If the value is less than 0, force to 0
    # If the value is greater than 1, force to 1. 
    # Else keep the value and round to two significant digits. 
    valuelist[:] = [0.00 if value < 0 else 1 if value > 1.00 else value for value in valuelist]
    
    # Print standard output. 
    print("Thank you for using EmotiFind by the Five Guys! For the article you have selected, the predicted emotions are:\n",
         "Sadness:", valuelist[0], "\n", 
         "Anger:", valuelist[1], "\n", 
          "Joy:", valuelist[2], "\n",
          "Trust:", valuelist[3], "\n",
         "Fear:", valuelist[4], "\n",
         "Surprise:", valuelist[5], "\n",
         "Disgust:", valuelist[6], "\n",
         "Anticipation:", valuelist[7], "\n",
         "Thank you! Please try us again soon!")
    

In [38]:
test_reader_list = []

for i in range(25):
    test_reader_list.append("train_article" + str(i + 1) + ".txt")

for article in test_reader_list:
    predictEmotion(article)


Thank you for using EmotiFind by the Five Guys! For the article you have selected, the predicted emotions are:
 Sadness: 0.0 
 Anger: 0.250053834601244 
 Joy: 0.09880697571931346 
 Trust: 0.23585118113484183 
 Fear: 0.09880697571931346 
 Surprise: 0.7489239132192381 
 Disgust: 0.09880697571931346 
 Anticipation: 0.10022281411770509 
 Thank you! Please try us again soon!
Thank you for using EmotiFind by the Five Guys! For the article you have selected, the predicted emotions are:
 Sadness: 0.39950707772081806 
 Anger: 0.7845356375399681 
 Joy: 0.2357051357532984 
 Trust: 0.24727227921377185 
 Fear: 0.0996744453537744 
 Surprise: 0.37323015883738736 
 Disgust: 0.24727173108234368 
 Anticipation: 0.10009121356669573 
 Thank you! Please try us again soon!
Thank you for using EmotiFind by the Five Guys! For the article you have selected, the predicted emotions are:
 Sadness: 0.0 
 Anger: 0.10000166384595949 
 Joy: 0.24563659580119843 
 Trust: 0.4000435410883715 
 Fear: 0.25272418920865497 


KeyboardInterrupt: 