In [None]:
#Importing necessary libraries

import re
import scipy
import pandas         as pd
import io
import numpy          as np
import copy
import csv
import matplotlib.pyplot as plt
import seaborn as sns



import transformers
from transformers                     import  RobertaModel, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import XLMRobertaForSequenceClassification,AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput
import torch

from tqdm import tqdm


from sklearn.metrics                  import classification_report
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.model_selection import train_test_split


from torch                            import nn, optim
from torch.utils                      import data
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss


#Seeding for deterministic results
# The same seed is applied to NumPy and to PyTorch (CPU and, when present, every CUDA device).
RANDOM_SEED = 64
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

if torch.cuda.is_available():
   torch.cuda.manual_seed(RANDOM_SEED)
   torch.cuda.manual_seed_all(RANDOM_SEED) 
   # Ask cuDNN for deterministic kernels and turn off its auto-tuner.
   torch.backends.cudnn.deterministic = True  
   torch.backends.cudnn.benchmark = False


# Emotion names; the position of each name matches the integer returned by label_to_int.
CLASS_NAMES = ['sadness', 'joy', 'anger', 'surprise', 'disgust', 'fear', 'others']
# Token length every tweet is padded/truncated to by the dataset.
MAX_LENGTH = 200
# Batch size used for the data loaders and for the Trainer (train and eval).
BATCH_SIZE = 16
# Number of training epochs passed to TrainingArguments.
EPOCHS = 5
# Checkpoint identifier used for both the tokenizer and the classifier backbone.
MODEL = "cardiffnlp/twitter-xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL,use_fast=True)


In [None]:
#Converting labels to numbers
def label_to_int(label):
  """Translate an emotion name into its integer class id.

  The id is the position of the name in the fixed emotion ordering
  (sadness=0, joy=1, anger=2, surprise=3, disgust=4, fear=5, others=6).
  Any value that matches none of the names yields None.
  """
  emotion_order = ('sadness', 'joy', 'anger', 'surprise', 'disgust', 'fear', 'others')
  for class_id, emotion in enumerate(emotion_order):
    if label == emotion:
      return class_id
  return None

In [None]:
# Load the three EmoEvalEs splits. The files are tab-separated, and QUOTE_NONE
# makes the parser treat quote characters inside a field as ordinary text.
train = pd.read_csv('data/EmoEvalEs/train.tsv', sep='\t', quoting=csv.QUOTE_NONE)
dev = pd.read_csv('data/EmoEvalEs/dev.tsv', sep='\t', quoting=csv.QUOTE_NONE)
test = pd.read_csv('data/EmoEvalEs/emoevales_test.tsv', sep='\t', quoting=csv.QUOTE_NONE)

# Dataset Exploration

In [None]:
# Column names, dtypes and non-null counts of the training split.
train.info()

In [None]:
# Bar chart of how many training rows carry each emotion label.
train.emotion.value_counts().plot(kind='bar')

## Offensive

In [None]:
# Emotion counts restricted to rows whose 'offensive' column equals 'OFF'.
# No plot kind is given, so pandas draws its default (line) plot here.
train[train.offensive=='OFF'].emotion.value_counts().plot()

## Event

In [None]:
# One emotion-count plot per event. Events are visited in sorted order because
# iterating a set of strings gives a different order on every kernel run, which
# would shuffle the figures between executions.
for event in sorted(train.event.dropna().unique()):
    print(event)
    train[train.event==event].emotion.value_counts().plot()
    plt.show()

## Sentiment

In [None]:
from transformers import pipeline
# Sentiment classifier checkpoint used to derive an extra 'sentiment' feature.
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
# NOTE(review): the tokenizer passed here is the one loaded earlier from MODEL,
# not one loaded from model_path — presumably both share a vocabulary; confirm.
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=tokenizer)

In [None]:
# Run the sentiment pipeline over all training tweets at once.
# The following cells read a 'label' key from each returned element.
result = sentiment_task(train.tweet.tolist())

In [None]:
# Keep only the predicted sentiment label of each pipeline result.
sentiment = [prediction['label'] for prediction in result]

In [None]:
# Attach the predicted sentiment labels as a new column and preview the frame.
train['sentiment'] = sentiment
train.head()

In [None]:
# One emotion-count plot per predicted sentiment. Sorted order keeps the figure
# sequence stable across kernel runs (set iteration order of strings is not).
for sent in sorted(train.sentiment.dropna().unique()):
    print(sent)
    train[train.sentiment==sent].emotion.value_counts().plot()
    plt.show()

# Dataset Processing

In [None]:
# Re-read the raw splits so processing starts from frames that do not contain
# the 'sentiment' column added to train during exploration.
train = pd.read_csv('data/EmoEvalEs/train.tsv', sep='\t', quoting=csv.QUOTE_NONE)
dev = pd.read_csv('data/EmoEvalEs/dev.tsv', sep='\t', quoting=csv.QUOTE_NONE)
test = pd.read_csv('data/EmoEvalEs/emoevales_test.tsv', sep='\t', quoting=csv.QUOTE_NONE)

In [None]:
# One-hot encode 'event' using the categories seen in train for every split.
# Encoding each split independently yields different column sets (and order)
# whenever a split lacks an event or contains a new one, which breaks the
# fixed-width extra-feature vector the classifier head expects.
# Events that never occur in train become an all-zero row.
event_categories = sorted(train.event.dropna().unique())
train, dev, test = [
    pd.get_dummies(
        frame.assign(event=pd.Categorical(frame.event, categories=event_categories)),
        columns=['event'],
    )
    for frame in (train, dev, test)
]

In [None]:
# Replace 'offensive' with integer codes drawn from one shared category list.
# Deriving the categories per split would let the same value map to different
# codes in different splits. Values not present in train (and missing values)
# receive the code -1.
offensive_categories = sorted(train.offensive.dropna().unique())
for frame in (train, dev, test):
    frame['offensive'] = pd.Categorical(frame.offensive, categories=offensive_categories).codes

In [None]:
# Integer class ids for the labelled splits.
train['label'] = train.emotion.apply(label_to_int)
dev['label'] = dev.emotion.apply(label_to_int)
# The test split gets a constant placeholder label so it can go through the
# same dataset/Trainer code; metrics computed against it are meaningless.
test['label'] = 1
# 'emotion' is dropped from train and dev only.
# NOTE(review): assumes test has no 'emotion' column — confirm against the file.
train.drop(columns=['emotion'],inplace=True)
dev.drop(columns=['emotion'], inplace=True)


In [None]:
# Add a 'sentiment' column to every split from the sentiment pipeline's labels.
for dataset in [train,test,dev]:
    result = sentiment_task(dataset.tweet.tolist())
    sentiment = [prediction['label'] for prediction in result]
    dataset['sentiment'] = sentiment


In [None]:
# One-hot encode 'sentiment' with the categories observed in train so that all
# three splits end up with identical sentiment_* columns in identical order,
# even if one split never receives a particular sentiment label.
sentiment_categories = sorted(train.sentiment.dropna().unique())
train, dev, test = [
    pd.get_dummies(
        frame.assign(sentiment=pd.Categorical(frame.sentiment, categories=sentiment_categories)),
        columns=['sentiment'],
    )
    for frame in (train, dev, test)
]

In [None]:
# Preview the training frame after the event/offensive/sentiment encoding.
train.head()

In [None]:
from gsitk.preprocess import pprocess_twitter, Preprocessor

# Replace each split's raw tweet text with the output of gsitk's pprocess_twitter.
# NOTE(review): presumably Twitter-specific text normalisation — see the gsitk docs.
train['tweet'] = Preprocessor(pprocess_twitter).transform(train.tweet)
dev['tweet'] = Preprocessor(pprocess_twitter).transform(dev.tweet)
test['tweet'] = Preprocessor(pprocess_twitter).transform(test.tweet)

In [None]:
# Preview the training frame after tweet preprocessing.
train.head()

In [None]:
# Correlation heatmap of the engineered features and the label.
# Only numeric/boolean columns are kept: calling corr() on a frame that still
# holds text columns (id, tweet) raises on recent pandas versions.
plt.figure(figsize=(20,20))
sns.heatmap(train.select_dtypes(include=['number', 'bool']).corr(), annot=True)
plt.show()

In [None]:
# Persist the processed splits (without the index) so modelling can start from
# these files instead of re-running the sentiment pipeline and preprocessing.
train.to_csv('preprocessed_train.csv', index=False)
test.to_csv('preprocessed_test.csv', index=False)
dev.to_csv('preprocessed_dev.csv', index=False)

# Modeling

In [None]:
# Reload the processed splits written by the cell above.
train = pd.read_csv('preprocessed_train.csv')
test = pd.read_csv('preprocessed_test.csv')
dev = pd.read_csv('preprocessed_dev.csv')

## Dataset

In [None]:
#Creates a dataset which will be used to feed to RoBERTa
class EmotionDataset(data.Dataset):
  """Torch dataset pairing each tokenized tweet with its extra tabular features.

  Args:
    id: Indexable collection of tweet identifiers.
    tweet: Indexable collection of tweet texts.
    labelValue: Indexable collection of integer class ids; its length defines
      the dataset length.
    extrafeats: Indexable collection (typically a 2-D array) with one row of
      numeric/boolean extra features per tweet.
    tokenizer: Tokenizer exposing ``encode_plus``; it is stored on the instance
      and used for every item.
    max_len: Length every encoded tweet is truncated/padded to.
  """

  def __init__(self, id, tweet, labelValue, extrafeats, tokenizer, max_len):
    self.id = id
    self.tweet = tweet
    self.labelValue = labelValue
    self.extra_feats = extrafeats
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.labelValue)

  def __getitem__(self, item):
    """Return the model inputs, label, id and raw text for position ``item``."""
    tweet = str(self.tweet[item])

    # Use the tokenizer handed to this dataset. The previous code silently
    # used a module-level `tokenizer`, ignoring the constructor argument.
    encoding = self.tokenizer.encode_plus(
        tweet,
        max_length = self.max_len,
        add_special_tokens = True,
        truncation = True,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    # Force a float32 array first: a feature matrix mixing bool and int columns
    # comes out of pandas as dtype=object, which torch.tensor cannot convert.
    extra_features = np.asarray(self.extra_feats[item], dtype=np.float32)

    return {
        'tweet' : tweet,
        'tweet_id': self.id[item],
        # encode_plus returns tensors of shape (1, max_len); drop the batch axis.
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels'  : torch.tensor(self.labelValue[item], dtype=torch.long),
        'extra_features' : torch.tensor(extra_features).float()
    }


In [None]:
#Creates a data loader
def createDataLoader(dataframe, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a processed split in an EmotionDataset and return a DataLoader over it.

  Args:
    dataframe: Frame with 'id', 'tweet' and 'label' columns; every other
      column is treated as a numeric extra feature.
    tokenizer: Tokenizer forwarded to the dataset.
    max_len: Token length forwarded to the dataset.
    batch_size: Number of examples per batch.
    shuffle: Whether the loader reshuffles each epoch. Defaults to False,
      which matches the previous hard-coded behaviour.
    num_workers: Number of loader worker processes. Defaults to 4, the
      previous hard-coded value; pass 0 where worker processes are a problem.

  Returns:
    A torch DataLoader yielding the dictionaries produced by EmotionDataset.
  """
  feature_frame = dataframe.drop(columns=['tweet','label','id'])

  ds = EmotionDataset(
      id = dataframe.id.to_numpy(),
      tweet = dataframe.tweet.to_numpy(),
      labelValue = dataframe.label.to_numpy(),
      # Request float32 explicitly: mixed bool/int columns would otherwise
      # produce an object array that cannot be turned into a tensor.
      extrafeats = feature_frame.to_numpy(dtype=np.float32),
      tokenizer = tokenizer,
      max_len = max_len
  )

  return data.DataLoader(
      ds,
      batch_size = batch_size,
      shuffle = shuffle,
      num_workers = num_workers
  )

#Creating data loader for training data
trainDataLoader         = createDataLoader(train, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for development data
devDataLoader         = createDataLoader(dev, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for test data
testDataLoader         = createDataLoader(test, tokenizer, MAX_LENGTH, BATCH_SIZE)

In [None]:
# These are plain datasets (not data loaders); they are what the Trainer receives.
# Positional argument order: id, tweet, labelValue, extrafeats, tokenizer, max_len.

#Creating dataset for training data
trainDataset        = EmotionDataset(train.id.to_numpy(), train.tweet.to_numpy(), train.label.to_numpy(),
                                     train.drop(columns=['tweet','label','id']).to_numpy(), tokenizer, MAX_LENGTH)

#Creating dataset for development data
developmentDataset  = EmotionDataset(dev.id.to_numpy(),dev.tweet.to_numpy(), dev.label.to_numpy(),
                                     dev.drop(columns=['tweet','label','id']).to_numpy(), tokenizer, MAX_LENGTH)

#Creating dataset for test data
testDataset         = EmotionDataset(test.id.to_numpy(),test.tweet.to_numpy(), test.label.to_numpy(),
                                     test.drop(columns=['tweet','label','id']).to_numpy(), tokenizer, MAX_LENGTH)

In [None]:
from transformers import Trainer
class MultilabelTrainer(Trainer):
    """Trainer whose loss is a plain cross-entropy over the model logits.

    Despite the class name, the loss is single-label (one class id per
    example), not a multi-label loss.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        """Compute cross-entropy between the model logits and ``inputs['labels']``.

        Args:
            model: Model to run; called as ``model(**inputs)`` without labels.
            inputs: Batch dictionary; its ``"labels"`` entry is removed before
                the forward pass so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted (and ignored) because newer Trainer
                versions pass this keyword to ``compute_loss``; without it the
                override raises a TypeError there.
            **kwargs: Any further keywords a Trainer version may pass; ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer prediction.

    ``pred.predictions`` holds per-class scores and ``pred.label_ids`` the true
    class ids; the predicted class is the argmax over the last axis. Classes
    with no predictions contribute 0 instead of raising a warning.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    weighted = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    weighted_precision, weighted_recall, weighted_f1 = weighted[0], weighted[1], weighted[2]
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': weighted_f1,
        'precision': weighted_precision,
        'recall': weighted_recall,
    }

In [None]:
training_args = TrainingArguments(
    output_dir='./resultsEmotion-extrafeats',                   # output directory
    num_train_epochs=EPOCHS,                  # total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # batch size for evaluation
    warmup_steps=100,                         # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                        # strength of weight decay
    logging_dir='./logsEmotion-extrafeats',                     # directory for storing logs
    logging_steps=10,                         # when to print log
    load_best_model_at_end=True,
    evaluation_strategy = 'epoch',
    # load_best_model_at_end can only restore checkpoints that line up with
    # evaluations, so checkpoints are saved on the same (per-epoch) schedule.
    save_strategy = 'epoch'
)

# Size the classifier from the full class list rather than from the labels that
# happen to occur in train: a class missing from train would otherwise shrink the
# output layer below the largest label id.
num_labels = len(CLASS_NAMES)
print(f'Num labels: {num_labels}')

In [None]:
# Based on HuggingFace implementation: https://huggingface.co/transformers/_modules/transformers/models/roberta/modeling_roberta.html#RobertaForSequenceClassification

class XLMRobertaForSequenceClassificationCustom(XLMRobertaForSequenceClassification):
    """Sequence classifier whose head also consumes extra per-example features.

    The encoder output for the first token is concatenated with
    ``extra_features`` inside RobertaClassificationHead (defined in this file)
    before the final projection to ``config.num_labels`` logits.
    """

    def __init__(self, config):
        # The two-argument super() starts the lookup *after*
        # XLMRobertaForSequenceClassification, so that class's own __init__ is
        # bypassed and the encoder/head are built explicitly below.
        super(XLMRobertaForSequenceClassification,self).__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        # Encoder without the pooling layer; the head reads the raw hidden states.
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Custom head that accepts the extra feature vector.
        self.classifier = RobertaClassificationHead(config)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        extra_features=None
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        extra_features (`optional` in the signature, but required by the head):
            Per-example feature tensor that the classification head concatenates with the first-token
            hidden state along dim 1. Leaving it as ``None`` makes the head's ``torch.cat`` call fail.

        Returns a :obj:`SequenceClassifierOutput` (loss, logits, hidden_states, attentions) when ``return_dict``
        is true, otherwise a tuple starting with the loss (if labels were given) followed by the logits.
        """
        # Fall back to the config's default when the caller does not choose an output format.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # First element of the encoder output: the per-token hidden states fed to the head.
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output, extra_features)

        loss = None
        if labels is not None:
            # Infer the problem type once from num_labels and the label dtype, and cache it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Tuple output: logits followed by whatever the encoder returned from index 2 onwards.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    
class RobertaClassificationHead(nn.Module):
    """Sentence-level classification head that also consumes extra features.

    The hidden state of the first token is concatenated with a per-example
    feature vector, passed through a dense layer with tanh, and projected to
    ``config.num_labels`` logits.

    Args:
        config: Object exposing ``hidden_size``, ``hidden_dropout_prob`` and
            ``num_labels``.
        num_extra_features: Width of the extra feature vector. When omitted,
            ``config.num_extra_features`` is used if present; otherwise the
            width is read from the notebook-level ``train`` frame, which is the
            original behaviour and requires that frame to exist.
    """

    def __init__(self, config, num_extra_features=None):
        super().__init__()
        if num_extra_features is None:
            num_extra_features = getattr(config, 'num_extra_features', None)
        if num_extra_features is None:
            # Legacy fallback: depends on the global `train` DataFrame.
            num_extra_features = train.drop(columns=['tweet','label','id']).shape[1]
        self.num_extra_features = int(num_extra_features)

        self.combined_feats = config.hidden_size + self.num_extra_features
        self.dense = nn.Linear(self.combined_feats, self.combined_feats)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(self.combined_feats, config.num_labels)

    def forward(self, features, extra_features, **kwargs):
        """Return logits for ``features`` combined with ``extra_features``.

        Args:
            features: Encoder hidden states indexed as (batch, token, hidden).
            extra_features: Tensor of shape (batch, num_extra_features).

        Raises:
            TypeError: If ``extra_features`` is None.
        """
        if extra_features is None:
            raise TypeError(
                'extra_features is required: pass the per-example feature tensor '
                'to the model together with input_ids and attention_mask'
            )
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        # Align dtype/device so concatenation and the linear layers never see
        # a float64 or CPU tensor next to float32/GPU activations.
        extra_features = extra_features.to(device=x.device, dtype=x.dtype)
        x = torch.cat((x, extra_features), dim=1)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

In [None]:
# Instantiate the custom classifier from the MODEL checkpoint with num_labels outputs.
model = XLMRobertaForSequenceClassificationCustom.from_pretrained(MODEL, num_labels=num_labels)

In [None]:
# Train on the training dataset, evaluating on the development dataset with
# compute_metrics according to the strategy set in training_args.
trainer = MultilabelTrainer(
    model=model,                              # the instantiated 🤗 Transformers model to be trained
    args=training_args,                       # training arguments, defined above
    train_dataset=trainDataset,              # training dataset
    eval_dataset=developmentDataset,                 # evaluation dataset
    compute_metrics=compute_metrics 
)

trainer.train()

In [None]:
# Write the trainer's current model to disk; it is reloaded from this path further down.
trainer.save_model("./resultsEmotion-extrafeats/best_model") # save best model

In [None]:
# Evaluate on the eval_dataset given to the trainer (the development split).
trainer.evaluate()

In [None]:
# The metrics reported here are poor because every label in the test dataset is
# set to 1, so that it fits into the trainer without writing special classes.
pred = trainer.predict(testDataset)
pred

In [None]:
# Predicted class id per test row: index of the largest score along axis 1.
result = np.argmax(pred.predictions, axis=1)

In [None]:
# Build the submission frame from the ids stored on the dataset. Slicing the
# dataset (testDataset[:]) returned the same ids, but only after tokenizing the
# string form of the entire tweet array and building throwaway tensors.
df = pd.DataFrame({'id': testDataset.id})

In [None]:
# Attach the predicted class ids; they are converted to emotion names below.
df['emotion'] = result
df

In [None]:
#Converting numbers back to labels
def int_to_label(label):
  """Translate an integer class id into its emotion name.

  Inverse of label_to_int: 0=sadness, 1=joy, 2=anger, 3=surprise, 4=disgust,
  5=fear, 6=others. Any value equal to none of these ids yields None.
  """
  emotion_order = ('sadness', 'joy', 'anger', 'surprise', 'disgust', 'fear', 'others')
  return next(
      (emotion for class_id, emotion in enumerate(emotion_order) if label == class_id),
      None,
  )

In [None]:
# Replace the class ids with their emotion names and preview the result.
df['emotion'] = df.emotion.apply(int_to_label)
df.head()

In [None]:
# Distribution of predicted emotions on the test split.
df.emotion.value_counts()

In [None]:
# Write the submission as a tab-separated file with no header row and no index column.
df.to_csv('submission-extra.tsv', header =False, sep = '\t',index=False)

## Testing the saved model

In [None]:
# Reload the classifier from the directory written by trainer.save_model above.
m = XLMRobertaForSequenceClassificationCustom.from_pretrained("./resultsEmotion-extrafeats/best_model", num_labels=num_labels)

In [None]:
# Use the GPU when one is present and fall back to the CPU otherwise, so this
# cell does not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
m = m.to(device)
#This function gets the predictions from the model after it is trained.
def get_predictions(model, data_loader):
  """Run ``model`` over ``data_loader`` and collect its outputs.

  Each batch must provide 'input_ids', 'attention_mask', 'extra_features',
  'labels' and 'tweet_id'. Batches are moved to the device the model's
  parameters live on, so the model does not have to sit on a particular
  global device.

  Args:
    model: Classifier returning an object with a ``logits`` attribute and
      accepting ``extra_features`` alongside the token inputs.
    data_loader: Iterable of batches as produced by createDataLoader.

  Returns:
    Tuple ``(ids, predictions, prediction_probs, real_values)``: the list of
    tweet ids, the predicted class ids, the raw logits (unnormalised scores,
    despite the name) and the true labels. The three tensors are on the CPU.
  """
  model = model.eval()
  model_device = next(model.parameters()).device

  predictions = []
  prediction_probs = []
  real_values = []
  ids = []

  with torch.no_grad():
    for d in tqdm(data_loader):
      input_ids = d["input_ids"].to(model_device)
      attention_mask = d["attention_mask"].to(model_device)
      # The classification head concatenates these with the encoder output;
      # omitting them makes the forward pass fail.
      extra_features = d["extra_features"].to(model_device)
      labels = d["labels"].to(model_device)

      #Getting the raw logits from the model
      outputs = model(
        input_ids = input_ids,
        attention_mask = attention_mask,
        extra_features = extra_features
      )

      _, preds = torch.max(outputs.logits, dim=1)     #Determining the model predictions

      predictions.extend(preds)
      prediction_probs.extend(outputs.logits)
      real_values.extend(labels)

      # Numeric ids are collated into a tensor; convert them to plain Python
      # values so they print and serialise as ids rather than as tensor(...).
      batch_ids = d['tweet_id']
      ids.extend(batch_ids.tolist() if torch.is_tensor(batch_ids) else batch_ids)

  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()

  return ids, predictions, prediction_probs, real_values

In [None]:
#Getting model predictions on dev dataset
# Ids, predicted class ids, logits and true labels for the development split,
# obtained from the reloaded model `m`.
ids_dev, yHat_dev, predProbs_dev, yTest_dev = get_predictions(
  m,
  devDataLoader
)

In [None]:
  #Printing classification report for dev dataset (Evaluating the model on Dev set)
# Pass the label ids explicitly so target_names always lines up with the
# classes, even when a class is absent from both the dev labels and predictions.
print(classification_report(yTest_dev, yHat_dev, labels=list(range(len(CLASS_NAMES))), target_names=CLASS_NAMES))

In [None]:
#Getting model predictions on test dataset
# Uses the reloaded model `m`, the same one evaluated on the dev split above, so
# both sets of predictions come from the saved checkpoint.
# yTest_test only holds the placeholder labels assigned to the test split.
ids_test, yHat_test, predProbs_test, yTest_test = get_predictions(
  m,
  testDataLoader
)

In [None]:
# Development predictions as (id, emotion name) rows.
dev_preds = pd.DataFrame({'id': ids_dev, 'emotion': pd.Series(yHat_dev).apply(int_to_label)})
dev_preds

In [None]:
# Test predictions as (id, emotion name) rows.
test_preds = pd.DataFrame({'id': ids_test, 'emotion': pd.Series(yHat_test).apply(int_to_label)})
test_preds

In [None]:
import pickle

def save_preds(obj, name, fold):
    """Pickle ``obj`` to ``preds_<fold>/<name>.pck``.

    Args:
        obj: Any picklable object (here, a predictions DataFrame).
        name: File name without extension.
        fold: Split name used to build the target directory ``preds_<fold>``.

    The target directory is created when missing; previously the call failed
    with FileNotFoundError unless the directory had been created by hand.
    """
    import os

    path = 'preds_{}/{}.pck'.format(fold, name)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

In [None]:
# save_preds(dev_preds, 'xlmroberta', 'dev')
# Pickles `df` (the Trainer-based test predictions built earlier), not `test_preds`.
save_preds(df, 'xlmroberta-extrafeatures', 'test')

In [None]:
# Final submission from the reloaded model's test predictions: tab-separated,
# no header row, no index column.
test_preds.to_csv('submission-roberta-extra-final.tsv',header =False, sep = '\t',index=False)