## Emotion recognition using NLP: Affective Computing 2023

Danila Goncharenko, 2303788

Ana Ferreira, 2308587

Luca Hustiuc, 2209104

In [2]:
import pandas as pd
import numpy as np

# Load the tweet dataset from the working directory; a later cell reads its
# 'labels' column.
df = pd.read_csv("tweet_emotions.csv")

In [4]:
from transformers import AutoModelForSequenceClassification, pipeline, AutoTokenizer

def preprocess(text):
    """Replace user mentions and links in *text* with fixed placeholders.

    The text is split on single spaces. A piece that starts with ``@`` and
    has more than one character becomes ``@user``; a piece that starts with
    ``http`` becomes ``http``. All other pieces are kept as they are, and the
    pieces are joined back with single spaces.
    """
    def _mask(word):
        if len(word) > 1 and word.startswith('@'):
            return '@user'
        if word.startswith('http'):
            return 'http'
        return word

    return " ".join(_mask(word) for word in text.split(" "))

# Checkpoint name shared by the fill-mask pipeline and the tokenizer below.
model_nm = "cardiffnlp/twitter-roberta-base"
# Masked-token prediction pipeline built from that checkpoint.
fill_mask = pipeline("fill-mask", model=model_nm, tokenizer=model_nm)
# Standalone tokenizer; print_candidates() uses it to decode token ids.
tokenizer = AutoTokenizer.from_pretrained(model_nm)

def print_candidates():
    """Print the first five fill-mask candidates as ranked lines.

    Reads the module-level ``candidates`` sequence, decodes each entry's
    ``'token'`` with the module-level ``tokenizer`` and prints it next to the
    entry's ``'score'`` rounded to four decimals.
    """
    for rank in range(1, 6):
        entry = candidates[rank - 1]
        word = tokenizer.decode(entry['token'])
        prob = np.round(entry['score'], 4)
        print(f"{rank}) {word} {prob}")

# Two prompts that differ only in the trailing emoji.
texts = [
 "I am so <mask> 😊",
 "I am so <mask> 😢" 
]

for text in texts:
    t = preprocess(text)
    print(f"{'-'*30}\n{t}")
    # print_candidates() takes no arguments: it reads this module-level
    # `candidates` variable, so it must be assigned before the call.
    candidates = fill_mask(t)
    print_candidates()


In [None]:
from transformers import AutoTokenizer, AutoModel, TFAutoModel
import numpy as np

# Same checkpoint name as `model_nm` in the earlier cell; the tokenizer is
# loaded again here.
MODEL = "cardiffnlp/twitter-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Sample sentence, normalised with preprocess() before tokenisation.
text = "Good night 😊"
text = preprocess(text)

# Pytorch
model = AutoModel.from_pretrained(MODEL)
encoded_input = tokenizer(text, return_tensors='pt')
features = model(**encoded_input)
# Keep only the first model output, as a NumPy array
# (presumably the last hidden state, (batch, tokens, hidden) — TODO confirm).
features = features[0].detach().cpu().numpy() 
# Average the first batch item over its axis 0 to get one vector for the text.
features_mean = np.mean(features[0], axis=0) 
#features_max = np.max(features[0], axis=0)

# # Tensorflow
# model = TFAutoModel.from_pretrained(MODEL)
# encoded_input = tokenizer(text, return_tensors='tf')
# features = model(encoded_input)
# features = features[0].numpy()
# features_mean = np.mean(features[0], axis=0) 
# #features_max = np.max(features[0], axis=0)


In [None]:
# Emotion classes. A label's position in this list is its integer class id,
# both in the dataset's label strings and in the model's id2label map.
LABELS = ['neutral', 'worry', 'happiness', 'sadness', 'love']


def binarize_labels(labels):
    """Turn a bracketed id string into a 0/1 indicator list over ``LABELS``.

    Args:
        labels: String such as ``"[0, 2]"``. The first and last characters
            are dropped and the rest is split on commas. An empty list
            (``"[]"``) is accepted.

    Returns:
        A list of ``len(LABELS)`` ints; position ``i`` is 1 when id ``i``
        appears in *labels*, else 0.

    Raises:
        ValueError: If a non-blank item is not an integer.
    """
    # Parse once into a set. Blank pieces (from "[]" or a trailing comma)
    # are skipped instead of crashing in int('').
    active = {int(tok) for tok in labels[1:-1].split(',') if tok.strip()}
    return [int(idx in active) for idx in range(len(LABELS))]
# Try the label parser on the dataset row labelled 0.
binarize_labels(df['labels'][0])

# Rebind `model` to a sequence classifier with one output per entry in LABELS,
# configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=len(LABELS), problem_type="multi_label_classification")

# Store the name <-> id mapping on the config; get_pred() reads id2label to
# turn a predicted class id back into a label name.
model.config.label2id = {label: i for i, label in enumerate(LABELS)}
model.config.id2label = {i: label for i, label in enumerate(LABELS)}


In [None]:
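# Placeholder: prepare the train / validation / test splits here.
# NOTE(review): later cells use train_ds, val_ds, test_ds and tokz, none of
# which is defined in the visible cells — presumably they belong here; confirm.
# train
# validate
# test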


In [None]:
import torch
# Show whether PyTorch can see a CUDA device in this session.
torch.cuda.is_available()

In [None]:
from torch import nn
def get_pred(text):
    """Classify one text with the module-level ``model`` and ``tokz``.

    Args:
        text: Input handed to ``tokz`` for tokenisation.

    Returns:
        A tuple ``(percent, predicted_class_id, res)``:
        ``percent`` is a tensor of per-label probabilities with the same
        shape as the logits, ``predicted_class_id`` is the index of the
        largest logit, and ``res`` is that index's name taken from
        ``model.config.id2label``.
    """
    inputs = tokz(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # The classifier is created with problem_type="multi_label_classification",
    # so each label gets its own sigmoid probability. Softmax would force the
    # labels to compete and sum to 1. Sigmoid is monotonic, so the arg-max
    # class below is unaffected.
    percent = torch.sigmoid(logits)
    predicted_class_id = logits.argmax().item()
    res = model.config.id2label[predicted_class_id]
    return percent, predicted_class_id, res
get_pred('This text has no emotion')

In [None]:
from tqdm.auto import tqdm, trange
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support
def predict_with_model(model, dataloader):
    """Run *model* over every item of *dataloader* and collect the results.

    Each item is tokenised with the module-level ``tokz`` and scored on its
    own, with gradients disabled.

    Args:
        model: Classifier whose output exposes ``.logits``; inputs are moved
            to ``model.device``.
        dataloader: Iterable of items that provide ``item['text']`` and an
            iterable ``item['label']``.

    Returns:
        A tuple ``(facts, preds)`` of NumPy arrays. ``facts`` holds one
        boolean row per item built from ``item['label']``; ``preds`` holds
        the matching per-label sigmoid probabilities.
    """
    preds = []
    facts = []

    for batch in tqdm(dataloader):
        # Wrapped in an extra list so each item contributes one row.
        facts.append([list(map(bool, batch['label']))])
        inputs = tokz(batch['text'], return_tensors="pt").to(model.device)
        with torch.no_grad():
            logits = model(**inputs).logits
        # Multi-label head: independent sigmoid per label, so downstream
        # per-label thresholds such as `> 0.5` are meaningful. Softmax would
        # make the labels sum to 1.
        preds.append(torch.sigmoid(logits).cpu().numpy())
    facts = np.concatenate(facts)
    preds = np.concatenate(preds)
    return facts, preds
def eval_model(preds, facts):
    """Compute the ROC AUC of every label column and report the mean.

    Args:
        preds: 2-D array of scores, one column per entry in ``LABELS``.
        facts: 2-D array of ground-truth indicators with the same layout.

    Returns:
        A dict with the single key ``'accuracy'`` whose value is the mean of
        the per-label ROC AUC scores. The per-label list is also printed.
    """
    aucs = []
    for col in range(len(LABELS)):
        aucs.append(roc_auc_score(facts[:, col], preds[:, col]))
    print('aucs:', aucs)
    mean_auc = np.mean(aucs)
    return {'accuracy': mean_auc}

In [None]:
# Score the current `model` on test_ds.select(range(100))
# (presumably the first 100 test examples — confirm).
# NOTE(review): this cell sits before the trainer.train() cell, and the F1
# cells at the end reuse factswm / predwm; re-run it after training if the
# fine-tuned scores are wanted.
factswm, predwm = predict_with_model(model, test_ds.select(range(100)))
eval_model(predwm, factswm)

In [None]:
from transformers import TrainingArguments,Trainer

In [None]:
def compute_metrics(eval_pred):
    """Adapter passed to ``Trainer(compute_metrics=...)``; delegates to ``eval_model``.

    Args:
        eval_pred: Two-item iterable, unpacked as (predictions, references)
            in that order.

    Returns:
        Whatever ``eval_model`` returns for that pair.
    """
    predictions, references = eval_pred
    scores = eval_model(predictions, references)
    return scores

In [None]:
# Training configuration: checkpoints are written under "test".
# Evaluation and checkpointing both happen once per epoch, and the best
# checkpoint is reloaded when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="test",
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16,
    num_train_epochs=5, 
    weight_decay=0.01,
    evaluation_strategy="epoch", 
    save_strategy="epoch", 
    # "accuracy" is the key returned by compute_metrics(); eval_model() fills
    # it with the mean per-label ROC AUC, not classification accuracy.
    metric_for_best_model = "accuracy", 
    load_best_model_at_end=True, 
)

# NOTE(review): train_ds, val_ds and tokz are not defined in the visible
# cells — confirm where they are created.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokz,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune `model` using the settings in `training_args`.
trainer.train()

In [None]:
# Spot-check get_pred() on hand-written sentences, one per cell
# (presumably one probe per entry in LABELS — confirm).
get_pred('This text has no emotion.')

In [None]:
get_pred('Im afraid Ill fail')

In [None]:
get_pred('Glad to see you!')

In [None]:
get_pred('I want to die')

In [None]:
get_pred('OMG, so romantic')

In [None]:
from sklearn.metrics import f1_score
# Per-label F1 table: one row per entry in LABELS, one column per sklearn
# `average` mode, with scores thresholded at 0.5 and rounded to 4 decimals.
# NOTE(review): factswm / predwm are assigned in the cell that precedes
# trainer.train(); confirm they were recomputed after training.
pd.DataFrame([
    {av: f1_score(factswm[:, i], predwm[:, i] > 0.5, average=av) for av in ['binary', 'micro', 'macro']}
    for i in range(len(LABELS))
]).round(4)

In [None]:
# Same per-label F1 table as the previous cell, averaged over the labels:
# one mean score per `average` mode, rounded to 4 decimals.
pd.DataFrame([
    {av: f1_score(factswm[:, i], predwm[:, i] > 0.5, average=av) for av in ['binary', 'micro', 'macro']}
    for i in range(len(LABELS))
]).mean().round(4)