In [40]:
import pandas as pd
import numpy as np
import sklearn.neural_network # Perceptón multicapa
import sklearn.metrics # Métricas
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

In [2]:
def txt_to_df(file_name):   
    with open(file_name, encoding="utf-8") as f:
        data = []
        for l in f.read().split("\n")[:-1]:
            o = {}
            for k, v in enumerate(l.split("\t")):
                col_name = "id text sen int".split()[k]
                o[col_name] = v
            data.append(o)
    return pd.DataFrame(data).set_index("id")

In [3]:
sentiments = "anger fear joy sadness".split()
intensities = "low medium high".split()
sentiment_map = {sentiments[x]:x for x in range(len(sentiments))}
intensity_map = {intensities[x]:x for x in range(len(intensities))}

root_data = "assignment_1/data"
file_names = {
    "train/anger": "/train/anger-train.txt",
    "train/fear": "/train/fear-train.txt",
    "train/joy": "/train/joy-train.txt",
    "train/sadness": "/train/sadness-train.txt"
}

In [4]:
df_train = pd.DataFrame()
for sen in sentiments:
    file_name = root_data+file_names[f"train/{sen}"]
    df_sen = txt_to_df(file_name)
    df_train = pd.concat([df_train, df_sen], axis=0)

In [5]:
df_train['sen_id'] = df_train['sen'].apply(lambda x: sentiment_map[x])
df_train['int_id'] = df_train['int'].apply(lambda x: intensity_map[x])
df_train

Unnamed: 0_level_0,text,sen,int,sen_id,int_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000,How the fu*k! Who the heck! moved my fridge!.....,anger,high,0,2
10001,So my Indian Uber driver just called someone t...,anger,high,0,2
10002,@DPD_UK I asked for my parcel to be delivered ...,anger,high,0,2
10003,so ef whichever butt wipe pulled the fire alar...,anger,high,0,2
10004,Don't join @BTCare they put the phone down on ...,anger,high,0,2
...,...,...,...,...,...
40855,Common app just randomly logged me out as I wa...,sadness,high,3,2
40856,"I'd rather laugh with the rarest genius, in be...",sadness,high,3,2
40857,If you #invest in my new #film I will stop ask...,sadness,medium,3,1
40858,"Just watched Django Unchained, Other people ma...",sadness,low,3,0


In [6]:
print(df_train['text'][0])

How the fu*k! Who the heck! moved my fridge!... should I knock the landlord door. #angry #mad ##


# Lemmatization

In [7]:
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import word_tokenize
import re

In [8]:
def lemmatize(tweet, steemer = EnglishStemmer()):
    tweet = re.sub(r'@[^\s]+', '', tweet)
    splited_tweet = word_tokenize(tweet.lower())
    steemed_tweet = " ".join([steemer.stem(x) for x in splited_tweet])
    return steemed_tweet

In [9]:
df = df_train
df['text'] = df['text'].apply(lemmatize)
df

Unnamed: 0_level_0,text,sen,int,sen_id,int_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000,how the fu * k ! who the heck ! move my fridg ...,anger,high,0,2
10001,so my indian uber driver just call someon the ...,anger,high,0,2
10002,i ask for my parcel to be deliv to a pick up s...,anger,high,0,2
10003,so ef whichev butt wipe pull the fire alarm in...,anger,high,0,2
10004,"do n't join they put the phone down on you , t...",anger,high,0,2
...,...,...,...,...,...
40855,common app just random log me out as i was wri...,sadness,high,3,2
40856,"i 'd rather laugh with the rarest genius , in ...",sadness,high,3,2
40857,if you # invest in my new # film i will stop a...,sadness,medium,3,1
40858,"just watch django unchain , other peopl may fr...",sadness,low,3,0


# Feature Extraction

In [10]:
def search_elongations(text):
    return {'elongations': len(re.findall(r"(.)\1{2}", text))}

In [21]:
def token_count(txt):
    return {'tokens': len(word_tokenize(txt))}

In [12]:
def vowel_count(text):
    return {'vowels': len(re.findall(r"[aeiou]", text))}

In [13]:
def const_count(text):
    return {'const': len(re.findall(r"[a-z-[aeiou]]", text))}

In [14]:
def search_negations(text):
    return {'negations': len(re.findall(r"^(?:never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint)|.*n't", text))}

In [15]:
def get_emoticons(text):
    regex = r"(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)"
    return re.findall(regex, text)

def emo_score(text):
    posCount = 0
    negCount = 0
    totalCount = 0
    totalScore = 0
    emoticons =  get_emoticons(text)
    for emo in emoticons:
        totalCount+=1
        if(emo.endswith("(") or emo.endswith("[") or emo.endswith("<") or emo.endswith("/") or emo.lower().endswith("c") or emo.startswith(")") or emo.startswith("]") or emo.startswith(">") or emo.startswith("\\") or emo.startswith("D")):
            negCount+=1
            totalScore = totalScore - 1
        if(emo.endswith(")") or emo.endswith("]") or emo.endswith(">") or emo.endswith("D") or emo.startswith("(") or emo.startswith("[") or emo.startswith("<") or emo.lower().startswith("c")):
            posCount+=1
            totalScore = totalScore + 1
            
    return {'positiveEmo': posCount, 'negativeEmo': negCount, 'totalEmo': totalCount, 'scoreEmo': totalScore }

In [16]:
def search_marks(text):
    return {'marks': len(re.findall(r"!|\?|\*|\.\.+", text))}    

In [17]:
def search_hashtags(text):
    return {'hashtags': len(re.findall(r"# \w+", text))}    


In [18]:
POSITIVE_WORDS = set(line.strip() for line in open('positive.txt'))
NEGATIVE_WORDS = set(line.strip() for line in open('negative.txt'))

def find_positive_negative_words(text):
    words = text.split(" ")
    pos = 0
    neg = 0
    for word in words:
        if word in NEGATIVE_WORDS:
            neg += 1
        elif word in POSITIVE_WORDS:
            pos += 1
    
    return {'positive': pos, 'negative': neg}

In [19]:
def create_feature_map(text):
    extractors = [search_elongations, search_negations, emo_score, search_marks, search_hashtags, find_positive_negative_words, token_count, vowel_count, const_count]
    features = {}
    for extractor in extractors:
        features.update(extractor(text))
    return features

In [28]:
datum = []
features = []
labels = [] 

for text, sen, intensity in zip(df.text, df.sen_id, df.int_id):
    d = create_feature_map(text)
    d.update({'sen': sen, 'int': intensity})
    
    datum.append(d)
    features.append(d.values())
    labels.append(sen)

proccessed_df = pd.DataFrame(datum)

In [29]:
proccessed_df = shuffle(proccessed_df)

In [30]:
proccessed_df

Unnamed: 0,elongations,negations,positiveEmo,negativeEmo,totalEmo,scoreEmo,marks,hashtags,positive,negative,tokens,vowels,const,sen,int
229,0,0,0,0,0,0,0,0,0,2,11,14,0,0,1
3591,0,0,0,0,0,0,0,1,0,3,10,8,0,3,1
1571,0,0,0,0,0,0,0,0,0,1,7,13,0,1,1
1237,0,0,0,0,1,0,1,3,1,2,26,30,0,1,1
1874,0,0,0,0,0,0,0,0,2,0,11,15,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1377,0,0,0,0,0,0,0,0,0,2,10,11,0,1,1
484,0,0,0,0,0,0,3,1,0,2,29,28,0,0,1
2848,0,0,0,0,0,0,0,0,0,0,3,7,0,2,0
2777,0,0,0,0,0,0,0,0,0,1,12,17,0,2,1


In [31]:
# Training

In [49]:
sent_proccessed_df = proccessed_df[proccessed_df['sen'] == 0]

features = sent_proccessed_df.iloc[:,:-1]
labels = sent_proccessed_df.int

data_features_train, data_features_test, data_label_train, data_label_test = sklearn.model_selection.train_test_split(
    features,
    labels,
    test_size=0.2
)



In [54]:
mlp = sklearn.neural_network.MLPClassifier(
    hidden_layer_sizes = (50, 100, 200),
    max_iter = 2000,
)

mlp.fit(
    data_features_train,
    data_label_train
)

print(sklearn.metrics.classification_report(
    data_label_test,
    mlp.predict(data_features_test)
))

              precision    recall  f1-score   support

           0       0.38      0.18      0.24        34
           1       0.67      0.89      0.76       125
           2       0.29      0.07      0.11        30

    accuracy                           0.63       189
   macro avg       0.44      0.38      0.37       189
weighted avg       0.56      0.63      0.56       189



In [55]:
clf = RandomForestClassifier()

clf.fit(
    data_features_train,
    data_label_train
)

print(sklearn.metrics.classification_report(
    data_label_test,
    clf.predict(data_features_test)
))

              precision    recall  f1-score   support

           0       0.50      0.18      0.26        34
           1       0.68      0.91      0.78       125
           2       0.22      0.07      0.10        30

    accuracy                           0.65       189
   macro avg       0.47      0.39      0.38       189
weighted avg       0.57      0.65      0.58       189

