In [2]:
# required imports
from sklearn.feature_extraction import text 
from textblob import TextBlob
from nltk.corpus import wordnet as wn
import pandas as pd
import random
from nltk.classify import accuracy, NaiveBayesClassifier
import numpy as np
import time

In [1]:
# the feature vector is built from the features identified
# over the tfm document; they are grouped by type and, inside
# each type, by the feeling they are associated with

global_features = {
    # origin type
    'origin': [
        # joy
        'pleasant',
        'wellness',
        # sadness
        'setback',
        # anger
        'frustration',
        'adversity',
        # fear
        'danger',
        # surprise
        'stimulus',
        'unexpected',
        'intensity',
    ],
    # effect type (left empty)
    'effect': [],
    # consequence type
    'consequence': [
        # joy
        'smile',
        'interaction',
        'creativity',
        # sadness
        'refusal',
        'misfortune',
        'weakness',
        # anger
        'destroy',
        'aggressiveness',
        'hostile',
        # fear
        'obsession',
        'panic',
        'phobia',
        'anxiety',
        # surprise
        'setback',
    ],
}

In [4]:
# some common functions 

# used to avoid accessing the global features var directly
def get_features() -> dict:
    """Return the module-level ``global_features`` dictionary.

    The dictionary object itself is returned (not a copy), so any change the
    caller makes to it is visible to every other user of the features.
    """
    return global_features

# used to remove stopwords from a phrase
def get_clean_phrase(phrase):
    """Return ``phrase`` with the English stop words removed.

    The phrase is split on whitespace and the words that are kept are joined
    back with single spaces. Words are matched against scikit-learn's
    ``ENGLISH_STOP_WORDS`` case-insensitively, because that set only holds
    lowercase entries (otherwise "The" or "I" would never be removed). Words
    that are kept retain their original casing.

    Args:
        phrase: the text to clean.

    Returns:
        The phrase without stop words ('' when no word is left).
    """
    stop_words = text.ENGLISH_STOP_WORDS
    return ' '.join(word for word in phrase.split() if word.lower() not in stop_words)

In [8]:
# dataset is loaded into different dataframes
# (the files are ';'-separated and the column names are supplied here)
DATASET_COLUMNS = ['phrase', 'feeling']


def _load_split(path):
    """Read one ';'-separated dataset file into a phrase/feeling dataframe."""
    return pd.read_csv(path, sep=";", names=DATASET_COLUMNS)


df_test = _load_split("data/test.txt")
df_train = _load_split("data/train.txt")
df_val = _load_split("data/val.txt")

# dataframes are joined; ignore_index gives the joined frame a fresh,
# unique index instead of repeating the row labels of each file
df = pd.concat([df_test, df_train, df_val], ignore_index=True)


# remove "love" feeling as we do not have features for it
data_without_love = df[df['feeling'] != 'love']

# we print the phrase count for each feeling
print(data_without_love.groupby(['feeling']).agg(['count']))


         phrase
          count
feeling        
anger      2709
fear       2373
joy        6761
sadness    5797
surprise    719


In [9]:

# since the min amount of phrases is 719 for one feeling ('surprise')
# we create a subset to avoid overtraining:
# every feeling is capped at SAMPLES_PER_FEELING phrases
# and by doing this, we balance the dataset
SAMPLES_PER_FEELING = 720
# fixed seed so that re-running the notebook selects the same subset
SAMPLING_SEED = 42

balanced_parts = []
# groupby yields the feelings in sorted order (anger, fear, joy, ...)
for _feeling, feeling_rows in data_without_love.groupby('feeling'):
    # only feelings above the cap are down-sampled; smaller ones are kept whole
    if len(feeling_rows) > SAMPLES_PER_FEELING:
        feeling_rows = feeling_rows.sample(SAMPLES_PER_FEELING, random_state=SAMPLING_SEED)
    balanced_parts.append(feeling_rows)
data_filtered = pd.concat(balanced_parts)

# to work with the full (unbalanced) data instead, use:
# data_filtered = data_without_love
print(data_filtered.groupby(['feeling']).agg(['count']))

         phrase
          count
feeling        
anger       720
fear        720
joy         720
sadness     720
surprise    719
