In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

#preprocessing
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
import string

#word embedding
import torch
import torchtext.vocab as vocab

#models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings("ignore")

In [3]:
train_set = pd.read_csv('training_set.txt', sep='\t')
dev_set = pd.read_csv('dev_set.txt', sep='\t')
test_set = pd.read_csv('test_set.txt', sep='\t')

In [4]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer('english')
exclude.remove("!")
exclude.remove("?")

# Don't removing Stop words

In [5]:
def clean(text_list, lemmatize, stemmer):
    """
    Function that a receives a list of strings and preprocesses it.
    
    :param text_list: List of strings.
    :param lemmatize: Tag to apply lemmatization if True.
    :param stemmer: Tag to apply the stemmer if True.
    """
    
    updates = []
    for j in tqdm(range(len(text_list))):
        
        text = text_list[j]
        text = text.lower()
        #REMOVE THAT IS NOT TEXT
        text = re.sub("[0-9]", ' ', text)
        
        #REMOVE STOP WORDS
        #text = " ".join([word for word in text.split() if word not in stop ])
        
        text = " ".join([word for word in text.split() if word not in exclude ])
        
        #LEMMATIZATION
        if lemmatize:
            text = " ".join(lemma.lemmatize(word) for word in text.split())
        
        #STEMMER
        if stemmer:
            text = " ".join(snowball_stemmer.stem(word) for word in text.split())
            
       
        updates.append(text)
        
    return updates

In [6]:

train_set['sentence'] = clean(train_set['sentence'], lemmatize = True, stemmer = False)
dev_set['sentence'] = clean(dev_set['sentence'], lemmatize = True, stemmer = False)

  0%|          | 0/14000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [7]:
tfid = TfidfVectorizer(max_features=14000,ngram_range=(1,2))
X = tfid.fit_transform(train_set['sentence'])
y = np.array(train_set["emotion"])
Test = tfid.transform(dev_set['sentence'])
y_hat = np.array(dev_set["emotion"])
model = SVC(C=0.85,kernel='linear')
model.fit(X,y)
predict = model.predict(Test)
labels = {"Anger": 1, "Anticipation": 2, "Disgust": 3, "Fear": 4, "Joy": 5, "Sadness": 6, "Surprise": 7, "Trust": 8}
print (classification_report(predict, y_hat, target_names=labels.keys()))

              precision    recall  f1-score   support

       Anger       0.61      0.35      0.44       369
Anticipation       0.51      0.44      0.48       196
     Disgust       0.09      0.22      0.13        32
        Fear       0.24      0.37      0.29        68
         Joy       0.38      0.45      0.41        82
     Sadness       0.29      0.37      0.32        67
    Surprise       0.18      0.39      0.24        44
       Trust       0.36      0.40      0.38       142

    accuracy                           0.38      1000
   macro avg       0.33      0.37      0.34      1000
weighted avg       0.45      0.38      0.40      1000



# Using FastText

In [24]:
def clean(text_list, lemmatize, stemmer):
    """
    Function that a receives a list of strings and preprocesses it.
    
    :param text_list: List of strings.
    :param lemmatize: Tag to apply lemmatization if True.
    :param stemmer: Tag to apply the stemmer if True.
    """
    
    updates = []
    for j in tqdm(range(len(text_list))):
        
        text = text_list[j]
        text = text.lower()
        #REMOVE THAT IS NOT TEXT
        text = re.sub("[0-9]", ' ', text)
        text = re.sub("'", ' ', text, flags=re.I)
        text = re.sub('(\[|\])',' ',text)
        
        text = re.sub(':','',text) 
        text = re.sub(';','',text) 
        text = re.sub('--','',text) 
        text = re.sub('soltáis','',text)
        text = re.sub('empathicalist','',text)
        text = re.sub('\*','',text)
        text = re.sub('-',' - ',text)
        text = re.sub('nuggies','',text)
        text = re.sub('sourcastic','',text)
        text = re.sub('buljanoff','person',text)
        text = re.sub('verua','',text)
        text = re.sub('amalgate','combine',text)
        text = re.sub('approvechamos','',text) 
        text = re.sub('dogl','dog',text) 
        text = re.sub('helaros','ice cream',text)
        text = re.sub('ruuuuuth !','ruth !!!!!',text)
        text = re.sub('radarjockeys','',text)
        text = re.sub('favrinis','favrini',text)
        text = re.sub('tadminster','',text)
        text = re.sub('buckoes','bucko',text)
        text = re.sub('päätti','',text)
        text = re.sub('teidät','',text)
        text = re.sub('tilalle','',text)
        text = re.sub('volavent','pie',text)
        text = re.sub('päiväsi on lopussa','your day is over',text)
        text = re.sub('obsolescing','obsolesce',text)
        text = re.sub('adiran','',text)
        text = re.sub('shikseh','food',text)
        text = re.sub('hundjager','animal',text)
        text = re.sub('devriess','person',text)
        text = re.sub('shalakazam','magic',text)
        text = re.sub('=','equal',text)
        text = re.sub('dovitch ','person',text)
        text = re.sub('chewgood ','',text)
        text = re.sub('kamoja ','',text)
        text = re.sub('hourses ','horses',text)
        text = re.sub('disppearance','disappearance',text)
        text = re.sub('monosyllabically','',text)
        text = re.sub('alonger','',text)
        text = re.sub('oextry','',text)
        text = re.sub('azupep', '',text)
        text = re.sub('suitcoat', '',text)
        text = re.sub('scuzzball', 'disgusting person',text)
        text = re.sub('bited', 'bit',text)
        text = re.sub('antinuke', 'anti nuke',text)
        text = re.sub('jouncing', 'jounce',text)
        text = re.sub('neverjoke', 'never joke',text)
        text = re.sub('couid', 'could',text)
        text = re.sub('lmprovise', 'improvise',text)
        text = re.sub('pecmans', '',text)
        text = re.sub('buttissimo', 'book',text)
        text = re.sub('rakonin', '',text)
        
        aux = []
        for i in text.split():
            i = re.sub(r'[\w\s]+[?.!:]+$', i.translate(str.maketrans('', '', string.punctuation)), i)
            aux.append(re.sub(r'[.!?:]+', ' '.join(i), i))

        text = ' '.join(aux)

        
        #REMOVE STOP WORDS
        #text = " ".join([word for word in text.split() if word not in stop ])
        
        #text = " ".join([word for word in text.split() if word not in exclude ])
        
        #LEMMATIZATION
        if lemmatize:
            text = " ".join(lemma.lemmatize(word) for word in text.split())
        
        #STEMMER
        if stemmer:
            text = " ".join(snowball_stemmer.stem(word) for word in text.split())
            
        text = re.sub('breechclouted','breechclout',text) 
        text = re.sub('coldcocked','coldcock',text) 
        text = re.sub('upchucked','upchuck',text) 
        
        
        updates.append(text)
        
    return updates

In [25]:
train_set = pd.read_csv('training_set.txt', sep='\t')
dev_set = pd.read_csv('dev_set.txt', sep='\t')
test_set = pd.read_csv('test_set.txt', sep='\t')

train_set['sentence'] = clean(train_set['sentence'], lemmatize = False, stemmer = False)
dev_set['sentence'] = clean(dev_set['sentence'], lemmatize = False, stemmer = False)
test_set['sentence'] = clean(test_set['sentence'], lemmatize = False, stemmer = False)

  0%|          | 0/14000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

In [10]:
#downloading the vocab from the FastText
fasttext = vocab.FastText()
print('Loaded {} words'.format(len(fasttext.itos)))

.vector_cache\wiki.en.vec: 6.60GB [05:18, 20.7MB/s]                                                                    
  0%|                                                                                      | 0/2519370 [00:00<?, ?it/s]Skipping token b'2519370' with 1-dimensional vector [b'300']; likely a header
100%|██████████████████████████████████████████████████████████████████████| 2519370/2519370 [08:03<00:00, 5208.20it/s]


Loaded 2519370 words


In [27]:


def sentenc_to_vector(df):
    '''
    Function to transform sentence to a vector of numbers using the FastText embedding 
    df: data frame with a column namede 'sentence' that will be transformed.
    To transform the sentence into a vector of numbers we took the mean
    '''
    emb = []
    for i in range(len(df)):
        aux=[]
        words = df['sentence'][i].split()
        for k in words:
            aux.append(np.array(fasttext.vectors[fasttext.stoi[k]]))
        emb.append(pd.Series(np.mean(aux,axis=0)))

    return pd.DataFrame(emb)

emb_df_train = sentenc_to_vector(train_set)
emb_df_dev = sentenc_to_vector(dev_set)
emb_df_test = sentenc_to_vector(test_set)

In [29]:


log_reg_params = [{"C":0.01},
                  {"C":0.1},
                  {"C":1},
                  {"C":1, "max_iter":10000},
                  {"C":1, "solver":'newton-cg', "max_iter":10000},
                  {"C":1, "solver":'liblinear'},
                  {"C":10, "solver":'liblinear'},
                  {"C":10, "solver":'newton-cg'},
                  {"C":10, "max_iter":10000},
                  {"C":100, "max_iter":10000}]

dec_tree_params = [{"criterion": "gini"}, {"criterion": "entropy"}]

rand_for_params = [{"criterion": "gini"}, {"criterion": "entropy"},
                   {"criterion": "gini","n_estimators":1000, "max_depth":50}
                  ]

kneighbors_params = [{"n_neighbors":3}, {"n_neighbors":5}, {"n_neighbors":10}]

naive_bayes_params = [{}]
svc_params = [{"C" :0.10, "kernel":'rbf' ,"degree": 3, "gamma":'scale'}, {"C":0.1}, {"C":1}, {"C":10}  ]

modelclasses = [
    ["log regression", LogisticRegression, log_reg_params],
    ["decision tree", DecisionTreeClassifier, dec_tree_params],
    ["random forest", RandomForestClassifier, rand_for_params],
    ["k neighbors", KNeighborsClassifier, kneighbors_params],
    ["naive bayes", GaussianNB, naive_bayes_params],
    ["support vector machines", SVC, svc_params]
]

x_train = np.array(emb_df_train)
y_train = np.array(train_set["emotion"])
x_test = np.array(emb_df_dev)
y_test = np.array(dev_set["emotion"])


insights = []
for modelname, Model, params_list in modelclasses:
    for params in params_list:
        model = Model(**params)
        model.fit(x_train, y_train)
        score = model.score(x_test, y_test)
        insights.append((modelname, model, params, score))

In [30]:
insights.sort(key=lambda x:x[-1], reverse=True)
for modelname, model, params, score in insights:
    print(modelname, params, score)

support vector machines {'C': 10} 0.429
support vector machines {'C': 1} 0.428
log regression {'C': 1} 0.421
log regression {'C': 1, 'max_iter': 10000} 0.421
log regression {'C': 1, 'solver': 'newton-cg', 'max_iter': 10000} 0.42
log regression {'C': 1, 'solver': 'liblinear'} 0.419
log regression {'C': 10, 'solver': 'liblinear'} 0.41
log regression {'C': 10, 'solver': 'newton-cg'} 0.41
log regression {'C': 10, 'max_iter': 10000} 0.41
log regression {'C': 100, 'max_iter': 10000} 0.41
log regression {'C': 0.1} 0.401
random forest {'criterion': 'gini', 'n_estimators': 1000, 'max_depth': 50} 0.359
support vector machines {'C': 0.1, 'kernel': 'rbf', 'degree': 3, 'gamma': 'scale'} 0.336
support vector machines {'C': 0.1} 0.336
random forest {'criterion': 'gini'} 0.335
random forest {'criterion': 'entropy'} 0.331
k neighbors {'n_neighbors': 10} 0.307
log regression {'C': 0.01} 0.298
k neighbors {'n_neighbors': 5} 0.287
naive bayes {} 0.28
k neighbors {'n_neighbors': 3} 0.262
decision tree {'cr

# Best Model

In [31]:
model = SVC(C= 0.9)
model.fit(x_train,y_train)
predict = model.predict(x_test)
labels = {"Anger": 1, "Anticipation": 2, "Disgust": 3, "Fear": 4, "Joy": 5, "Sadness": 6, "Surprise": 7, "Trust": 8}
print (classification_report(predict, y_test, target_names=labels.keys()))

              precision    recall  f1-score   support

       Anger       0.65      0.40      0.49       346
Anticipation       0.49      0.44      0.46       187
     Disgust       0.22      0.52      0.31        33
        Fear       0.23      0.46      0.31        52
         Joy       0.45      0.47      0.46        94
     Sadness       0.37      0.41      0.39        79
    Surprise       0.24      0.41      0.30        56
       Trust       0.44      0.46      0.45       153

    accuracy                           0.43      1000
   macro avg       0.39      0.44      0.40      1000
weighted avg       0.49      0.43      0.44      1000



In [32]:
print (classification_report(model.predict(x_train), y_train, target_names=labels.keys()))

              precision    recall  f1-score   support

       Anger       0.71      0.42      0.53      5049
Anticipation       0.55      0.48      0.51      2440
     Disgust       0.26      0.60      0.36       581
        Fear       0.25      0.61      0.35       576
         Joy       0.54      0.65      0.59      1233
     Sadness       0.41      0.49      0.45      1156
    Surprise       0.32      0.52      0.40       708
       Trust       0.48      0.45      0.46      2257

    accuracy                           0.48     14000
   macro avg       0.44      0.53      0.46     14000
weighted avg       0.55      0.48      0.49     14000



# Exporting the results form Dev set and Test set

In [98]:
dev_new=pd.concat([dev_set['sentence'],pd.Series(model.predict(x_test))], axis=1).rename(columns={0:'emotion_pred'})
dev_new.to_csv('dev_set_prep.txt',sep='\t')

In [39]:
test = np.array(emb_df_test)

test_new=pd.concat([test_set['sentence'],pd.Series(model.predict(test))], axis=1).rename(columns={0:'emotion_pred'})
test_new.to_csv('test_set_prep.txt',sep='\t')