In [2]:
import pandas as pd
import numpy as np
import sklearn.neural_network # Perceptón multicapa
import sklearn.metrics # Métricas

In [4]:
def txt_to_df(file_name):   
    with open(file_name, encoding="utf-8") as f:
        data = []
        for l in f.read().split("\n")[:-1]:
            o = {}
            for k, v in enumerate(l.split("\t")):
                col_name = "id text sen int".split()[k]
                o[col_name] = v
            data.append(o)
    return pd.DataFrame(data).set_index("id")

In [114]:
sentiments = "anger fear joy sadness".split()
intensities = "low medium high".split()
sentiment_map = {sentiments[x]:x for x in range(len(sentiments))}
intensity_map = {intensities[x]:x for x in range(len(intensities))}

root_data = "assignment_1/data"
file_names = {
    "train/anger": "/train/anger-train.txt",
    "train/fear": "/train/fear-train.txt",
    "train/joy": "/train/joy-train.txt",
    "train/sadness": "/train/sadness-train.txt"
}

In [115]:
df_train = pd.DataFrame()
for sen in sentiments:
    file_name = root_data+file_names[f"train/{sen}"]
    df_sen = txt_to_df(file_name)
    df_train = pd.concat([df_train, df_sen], axis=0)

In [116]:
df_train['sen_id'] = df_train['sen'].apply(lambda x: sentiment_map[x])
df_train['int_id'] = df_train['int'].apply(lambda x: intensity_map[x])
df_train

Unnamed: 0_level_0,text,sen,int,sen_id,int_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000,How the fu*k! Who the heck! moved my fridge!.....,anger,high,0,2
10001,So my Indian Uber driver just called someone t...,anger,high,0,2
10002,@DPD_UK I asked for my parcel to be delivered ...,anger,high,0,2
10003,so ef whichever butt wipe pulled the fire alar...,anger,high,0,2
10004,Don't join @BTCare they put the phone down on ...,anger,high,0,2
...,...,...,...,...,...
40855,Common app just randomly logged me out as I wa...,sadness,high,3,2
40856,"I'd rather laugh with the rarest genius, in be...",sadness,high,3,2
40857,If you #invest in my new #film I will stop ask...,sadness,medium,3,1
40858,"Just watched Django Unchained, Other people ma...",sadness,low,3,0


In [117]:
print(df_train['text'][0])

How the fu*k! Who the heck! moved my fridge!... should I knock the landlord door. #angry #mad ##


# Lemmatization

In [118]:
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import word_tokenize
import re

In [119]:
def lemmatize(tweet, steemer = EnglishStemmer()):
    tweet = re.sub(r'@[^\s]+', '', tweet)
    splited_tweet = word_tokenize(tweet.lower())
    steemed_tweet = " ".join([steemer.stem(x) for x in splited_tweet])
    return steemed_tweet

In [120]:
df = df_train
df['text'] = df['text'].apply(lemmatize)
df

Unnamed: 0_level_0,text,sen,int,sen_id,int_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000,how the fu * k ! who the heck ! move my fridg ...,anger,high,0,2
10001,so my indian uber driver just call someon the ...,anger,high,0,2
10002,i ask for my parcel to be deliv to a pick up s...,anger,high,0,2
10003,so ef whichev butt wipe pull the fire alarm in...,anger,high,0,2
10004,"do n't join they put the phone down on you , t...",anger,high,0,2
...,...,...,...,...,...
40855,common app just random log me out as i was wri...,sadness,high,3,2
40856,"i 'd rather laugh with the rarest genius , in ...",sadness,high,3,2
40857,if you # invest in my new # film i will stop a...,sadness,medium,3,1
40858,"just watch django unchain , other peopl may fr...",sadness,low,3,0


# Feature Extraction

In [121]:
def search_elongations(text):
    return {'elongations': len(re.findall(r"(.)\1{2}", text))}

In [122]:
def search_negations(text):
    return {'negations': len(re.findall(r"^(?:never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint)|.*n't", text))}

In [123]:
def get_emoticons(text):
    regex = r"(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)"
    return re.findall(regex, text)

def emo_score(text):
    posCount = 0
    negCount = 0
    totalCount = 0
    totalScore = 0
    emoticons =  get_emoticons(text)
    for emo in emoticons:
        totalCount+=1
        if(emo.endswith("(") or emo.endswith("[") or emo.endswith("<") or emo.endswith("/") or emo.lower().endswith("c") or emo.startswith(")") or emo.startswith("]") or emo.startswith(">") or emo.startswith("\\") or emo.startswith("D")):
            negCount+=1
            totalScore = totalScore - 1
        if(emo.endswith(")") or emo.endswith("]") or emo.endswith(">") or emo.endswith("D") or emo.startswith("(") or emo.startswith("[") or emo.startswith("<") or emo.lower().startswith("c")):
            posCount+=1
            totalScore = totalScore + 1
            
    return {'positiveEmo': posCount, 'negativeEmo': negCount, 'totalEmo': totalCount, 'scoreEmo': totalScore }

In [124]:
def search_marks(text):
    return {'marks': len(re.findall(r"!|\?|\*|\.\.+", text))}    

In [129]:
def search_hashtags(text):
    return {'hashtags': len(re.findall(r"# \w+", text))}    


In [130]:
POSITIVE_WORDS = set(line.strip() for line in open('positive.txt'))
NEGATIVE_WORDS = set(line.strip() for line in open('negative.txt'))

def find_positive_negative_words(text):
    words = text.split(" ")
    pos = 0
    neg = 0
    for word in words:
        if word in NEGATIVE_WORDS:
            neg += 1
        elif word in POSITIVE_WORDS:
            pos += 1
    
    return {'positive': pos, 'negative': neg}

In [131]:
def create_feature_map(text):
    extractors = [search_elongations, search_negations, emo_score, search_marks, search_hashtags, find_positive_negative_words]
    features = {}
    for extractor in extractors:
        features.update(extractor(text))
    return features

In [138]:
datum = []
features = []
labels = [] 

for text, sen in zip(df.text, df.sen_id):
    d = create_feature_map(text)
    d.update({'sen': sen})
    
    datum.append(d)
    features.append(d.values())
    labels.append(sen)

proccessed_df = pd.DataFrame(datum)

In [139]:
proccessed_df

Unnamed: 0,elongations,negations,positiveEmo,negativeEmo,totalEmo,scoreEmo,marks,hashtags,positive,negative,sen
0,1,0,0,0,0,0,5,2,0,3,0
1,0,1,0,0,0,0,0,1,0,1,0
2,0,0,0,0,0,0,0,2,0,1,0
3,0,0,0,0,0,0,0,7,0,3,0
4,0,1,0,0,0,0,1,1,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...
3955,1,0,0,0,0,0,0,0,0,1,3
3956,0,0,0,0,0,0,0,1,1,1,3
3957,0,0,0,0,0,0,0,7,0,3,3
3958,0,0,0,0,0,0,1,0,1,1,3


In [137]:
# Training

In [141]:


features = proccessed_df.iloc[:,:-1]

labels = proccessed_df.sen

data_features_train, data_features_test, data_label_train, data_label_test = sklearn.model_selection.train_test_split(
    features,
    labels,
    test_size=0.1
)



In [147]:
mlp = sklearn.neural_network.MLPClassifier(
    hidden_layer_sizes = (50, 100, 200),
    max_iter = 10000,
)
mlp.fit(
    data_features_train,
    data_label_train
)
print(sklearn.metrics.classification_report(
    data_label_test,
    mlp.predict(data_features_test)
))

              precision    recall  f1-score   support

           0       0.27      0.23      0.25        87
           1       0.36      0.43      0.39       134
           2       0.48      0.52      0.50        94
           3       0.25      0.20      0.22        81

    accuracy                           0.36       396
   macro avg       0.34      0.34      0.34       396
weighted avg       0.35      0.36      0.35       396

