In [1]:
#Importing necessary libraries

import re
import scipy
import pandas         as pd
import io
import numpy          as np
import copy
import csv

import transformers
from transformers                     import  AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

import torch

from tqdm import tqdm


from sklearn.metrics                  import classification_report
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.model_selection import train_test_split


from torch                            import nn, optim
from torch.utils                      import data

# Run-wide settings: checkpoint name, label vocabulary and training sizes.
MODEL = "cardiffnlp/twitter-xlm-roberta-base"
CLASS_NAMES = ['sadness', 'joy', 'anger', 'surprise', 'disgust', 'fear', 'others']
MAX_LENGTH = 200
BATCH_SIZE = 16
EPOCHS = 5

# Fix the NumPy and PyTorch (CPU) seeds so repeated runs draw the same numbers.
RANDOM_SEED = 64
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# On CUDA machines, also seed the GPU generators and switch cuDNN to
# deterministic kernels with auto-tuning (benchmark mode) disabled.
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Fast tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)


In [2]:
#Converting labels to numbers
def label_to_int(label):
  """Map an emotion name to its integer class id.

  The id is the position of the name in the sequence below
  ('sadness' -> 0, ..., 'others' -> 6). A value that equals none of the
  names yields None.
  """
  emotion_names = ('sadness', 'joy', 'anger', 'surprise', 'disgust', 'fear', 'others')
  for class_id, emotion in enumerate(emotion_names):
    if label == emotion:
      return class_id
  return None

In [3]:
# Read the three EmoEvalEs splits. The files are tab-separated and quote
# characters are taken literally (QUOTE_NONE) rather than as field delimiters.
train, dev, test = (
    pd.read_csv(tsv_path, sep='\t', quoting=csv.QUOTE_NONE)
    for tsv_path in (
        'data/EmoEvalEs/train.tsv',
        'data/EmoEvalEs/dev.tsv',
        'data/EmoEvalEs/emoevales_test.tsv',
    )
)

In [4]:
# Keep a copy of the test ids before the frames are modified below.
test_ids = test.id.copy()

# Training split: integer class id from the emotion name, then drop the
# columns that are not fed to the model.
train['label'] = train['emotion'].apply(label_to_int)
train.drop(['emotion', 'event', 'offensive'], axis='columns', inplace=True)

# Development split: same treatment.
dev['label'] = dev['emotion'].apply(label_to_int)
dev.drop(['emotion', 'event', 'offensive'], axis='columns', inplace=True)

# Test split: a constant placeholder label of 1 so it has the same columns
# as the labelled splits.
test['label'] = 1
test.drop(['event', 'offensive'], axis='columns', inplace=True)

In [5]:
# Preview the first training rows with the integer `label` column in place.
train.head()

Unnamed: 0,id,tweet,label
0,a0c1a858-a9b8-4cb1-8a81-1602736ff5b8,La Gran Guerra de #JuegoDeTronos nos ha dejado...,6
1,9b272817-a231-4f68-bdf4-3350d4919330,El golpe de Estado en #Venezuela está más lejo...,6
2,4bd5b1e5-4b74-440a-82f4-c2567a241011,No tengo una foto en la catedral de #NotreDame...,0
3,0bb9d7c9-d781-4684-890e-a94bfb50acc0,#NotreDame nunca llegue a visitar tan grandios...,0
4,88749098-d539-4500-9209-0bbfae2b109c,A tomar por culo mi crush 😭😭😭😭😭#JuegoDeTronos,0


In [6]:
from gsitk.preprocess import pprocess_twitter, Preprocessor

# Run each split's tweet text through gsitk's Twitter preprocessor and
# overwrite the `tweet` column with the result. A fresh Preprocessor is built
# for every split.
# NOTE(review): this cell overwrites its own input, so re-running it
# preprocesses already-preprocessed text.
train['tweet'] = Preprocessor(pprocess_twitter).transform(train.tweet)
dev['tweet'] = Preprocessor(pprocess_twitter).transform(dev.tweet)
test['tweet'] = Preprocessor(pprocess_twitter).transform(test.tweet)

In [7]:
# Preview the first training rows again to inspect the preprocessed tweet text.
train.head()

Unnamed: 0,id,tweet,label
0,a0c1a858-a9b8-4cb1-8a81-1602736ff5b8,la gran guerra de <hastag> juegodetronos nos h...,6
1,9b272817-a231-4f68-bdf4-3350d4919330,el golpe de estado en <hastag> venezuela está ...,6
2,4bd5b1e5-4b74-440a-82f4-c2567a241011,no tengo una foto en la catedral de <hastag> n...,0
3,0bb9d7c9-d781-4684-890e-a94bfb50acc0,<hastag> notredame nunca llegue a visitar tan ...,0
4,88749098-d539-4500-9209-0bbfae2b109c,a tomar por culo mi crush 😭😭😭😭😭<hastag> juegod...,0


In [8]:
#Creates a dataset which will be used to feed to RoBERTa
class EmotionDataset(data.Dataset):
  """Map-style dataset that tokenizes one tweet per item.

  Args:
    id: indexable collection of tweet identifiers.
    tweet: indexable collection of tweet texts (each item is passed through str()).
    labelValue: indexable collection of integer class labels; its length
      defines the dataset length.
    tokenizer: object exposing `encode_plus(...)`; used for every item.
    max_len: length every encoded sequence is padded/truncated to.

  Each item is a dict with keys 'tweet', 'tweet_id', 'input_ids',
  'attention_mask' and 'labels'.
  """

  def __init__(self, id, tweet, labelValue,  tokenizer, max_len):
    self.id          = id
    self.tweet       = tweet
    self.labelValue  = labelValue
    self.tokenizer   = tokenizer
    self.max_len     = max_len

  def __len__(self):
    return len(self.labelValue)

  def __getitem__(self, item):
    tweet = str(self.tweet[item])

    # Use the tokenizer handed to the constructor. The previous code reached
    # for a notebook-global `tokenizer`, silently ignoring the argument.
    encoding = self.tokenizer.encode_plus(
        tweet,
        max_length = self.max_len,
        add_special_tokens = True,
        truncation = True,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    return {
        'tweet' : tweet,
        'tweet_id': self.id[item],
        # return_tensors='pt' yields a leading batch axis; flatten() removes it.
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels'  : torch.tensor(self.labelValue[item], dtype=torch.long)
    }


In [9]:
#Creates a data loader
def createDataLoader(dataframe, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a dataframe in an EmotionDataset and return a DataLoader over it.

  Args:
    dataframe: frame with `id`, `tweet` and `label` columns.
    tokenizer: tokenizer forwarded to EmotionDataset.
    max_len: padded/truncated sequence length forwarded to EmotionDataset.
    batch_size: number of items per batch.
    shuffle: whether the loader reshuffles every epoch. Defaults to False so
      predictions come back in dataframe order.
    num_workers: number of loader worker processes. Defaults to 4, the value
      that used to be hard-coded.

  Returns:
    A torch.utils.data.DataLoader over the wrapped dataset.
  """
  ds = EmotionDataset(
      id          = dataframe.id.to_numpy(),
      tweet       = dataframe.tweet.to_numpy(),
      labelValue  = dataframe.label.to_numpy(),
      tokenizer   = tokenizer,
      max_len     = max_len
  )

  return data.DataLoader(
      ds,
      batch_size  = batch_size,
      shuffle     = shuffle,
      num_workers = num_workers
  )

# Data loader over the development split
devDataLoader = createDataLoader(
    dataframe=dev,
    tokenizer=tokenizer,
    max_len=MAX_LENGTH,
    batch_size=BATCH_SIZE,
)

# Data loader over the test split
testDataLoader = createDataLoader(
    dataframe=test,
    tokenizer=tokenizer,
    max_len=MAX_LENGTH,
    batch_size=BATCH_SIZE,
)

In [10]:
# Dataset over the training split (handed to the Trainer, which batches it itself)
trainDataset = EmotionDataset(
    id=train.id.to_numpy(),
    tweet=train.tweet.to_numpy(),
    labelValue=train.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LENGTH,
)

# Dataset over the development split
developmentDataset = EmotionDataset(
    id=dev.id.to_numpy(),
    tweet=dev.tweet.to_numpy(),
    labelValue=dev.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LENGTH,
)

# Dataset over the test split
testDataset = EmotionDataset(
    id=test.id.to_numpy(),
    tweet=test.tweet.to_numpy(),
    labelValue=test.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LENGTH,
)

In [11]:
from transformers import Trainer
class MultilabelTrainer(Trainer):
    """Trainer whose loss is cross-entropy between the logits and integer class labels.

    Despite the class name, CrossEntropyLoss over integer targets is a
    single-label (multi-class) objective, not a multi-label one.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: batch dict; its "labels" entry is removed before the forward pass.
            return_outputs: when True, return `(loss, outputs)` instead of the loss alone.
            num_items_in_batch: accepted so that Trainer versions which pass this
                keyword can call the override; it is not used, the loss is the
                default batch mean.

        Returns:
            The scalar loss, or `(loss, outputs)` when `return_outputs` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Score a prediction object with accuracy and weighted precision/recall/F1.

    `pred.predictions` is reduced to class ids with an argmax over its last
    axis and compared against `pred.label_ids`. Classes with no predicted
    samples contribute 0 (zero_division=0) instead of raising a warning.

    Returns a dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    weighted_scores = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': weighted_scores[2],
        'precision': weighted_scores[0],
        'recall': weighted_scores[1],
    }

In [11]:
training_args = TrainingArguments(
    output_dir='./resultsEmotion',            # output directory
    num_train_epochs=EPOCHS,                  # total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # batch size for evaluation
    warmup_steps=100,                         # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                        # strength of weight decay
    logging_dir='./logsEmotion',              # directory for storing logs
    logging_steps=10,                         # log every 10 steps
    load_best_model_at_end=True,              # restore the best checkpoint when training ends
    evaluation_strategy = 'epoch'             # evaluate once per epoch
)

# Size the classification head from the label vocabulary instead of from the
# labels that happen to occur in the training split: a class missing from
# `train` (or a NaN label from an unmapped emotion) would otherwise change the
# head size and break the id <-> name mapping.
num_labels = len(CLASS_NAMES)
print(f'Num labels: {num_labels}')

Num labels: 7


In [14]:
# Load the pretrained checkpoint with a sequence-classification head of `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.den

In [15]:
# Wire the model, the hyper-parameters, both labelled splits and the metric
# function into the custom trainer.
trainer = MultilabelTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=developmentDataset,
    train_dataset=trainDataset,
)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
1,0.8849,0.880064,0.703791,0.671448,0.709256,0.703791,5.3969,156.386
2,0.7404,0.787941,0.716825,0.698612,0.694039,0.716825,4.7785,176.626
3,0.5623,0.852137,0.725118,0.712473,0.70738,0.725118,4.9481,170.571
4,0.2534,0.99192,0.714455,0.703672,0.701566,0.714455,5.1384,164.253
5,0.1496,1.093736,0.7109,0.704091,0.704652,0.7109,5.0759,166.277


TrainOutput(global_step=895, training_loss=0.6287310708168499, metrics={'train_runtime': 776.3164, 'train_samples_per_second': 1.153, 'total_flos': 9547647626478000.0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 1961680896, 'init_mem_gpu_alloc_delta': 1112270848, 'init_mem_cpu_peaked_delta': 765292544, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 918429696, 'train_mem_gpu_alloc_delta': 4452847104, 'train_mem_cpu_peaked_delta': 1802530816, 'train_mem_gpu_peaked_delta': 2129602048})

In [16]:
# Persist the trainer's current model under ./resultsEmotion/best_model so it can be reloaded later.
trainer.save_model("./resultsEmotion/best_model") # save best model

In [17]:
# Evaluate on the development dataset that was passed to the trainer as eval_dataset.
trainer.evaluate()

{'eval_loss': 0.7879405617713928,
 'eval_accuracy': 0.716824644549763,
 'eval_f1': 0.6986115901816552,
 'eval_precision': 0.6940393466885716,
 'eval_recall': 0.716824644549763,
 'eval_runtime': 5.1671,
 'eval_samples_per_second': 163.34,
 'epoch': 5.0,
 'eval_mem_cpu_alloc_delta': 704512,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 65536,
 'eval_mem_gpu_peaked_delta': 131904000}

In [18]:
# The metrics reported here are poor because every row of the test dataset
# carries the placeholder label 1, which lets the test split go through the
# trainer without writing a special dataset class.
pred = trainer.predict(testDataset)
pred

PredictionOutput(predictions=array([[ 0.44199166, -0.27039844,  1.5419297 , ..., -0.36946657,
        -1.8340789 ,  1.6222807 ],
       [-1.2770096 ,  0.61582726, -0.08355317, ..., -1.6234953 ,
        -1.3008451 ,  3.5176861 ],
       [-0.8273943 ,  3.1069672 , -1.847659  , ..., -2.4050257 ,
        -1.8779653 ,  3.887411  ],
       ...,
       [-0.07271109,  1.9239235 , -1.3796693 , ..., -2.2986069 ,
        -1.9148648 ,  4.2209215 ],
       [-0.68020356,  1.5305645 , -1.1030644 , ..., -2.0653632 ,
        -1.8271672 ,  4.582106  ],
       [-0.07500023, -0.585519  , -0.27193904, ..., -1.6220309 ,
        -0.8767815 ,  4.030512  ]], dtype=float32), label_ids=array([1, 1, 1, ..., 1, 1, 1]), metrics={'test_loss': 2.735836982727051, 'test_accuracy': 0.21497584541062803, 'test_f1': 0.3538767395626243, 'test_precision': 1.0, 'test_recall': 0.21497584541062803, 'test_runtime': 9.8011, 'test_samples_per_second': 168.961, 'test_mem_cpu_alloc_delta': 278528, 'test_mem_gpu_alloc_delta': 0, 'tes

In [19]:
# Predicted class id per test row: index of the largest logit in each row.
result = pred.predictions.argmax(axis=1)

In [20]:
import pandas as pd
# Start the submission frame from the test ids copied earlier.
df = pd.DataFrame(test_ids, columns=['id'])

In [21]:
# Attach the predicted class ids (still integers at this point) and display the frame.
df['emotion'] = result
df

Unnamed: 0,id,emotion
0,16b25dfb-e284-4a58-b62c-8186fc082eb6,6
1,2a80f6bf-4750-4783-9bc8-fdb8ff2b94c4,6
2,1f477a6f-3559-41ee-8ec5-2e77aee53190,6
3,838add70-748c-4635-8133-36ff0b05aeb0,1
4,64e3dd59-ae5c-4b9b-bd29-987609eb95d8,1
...,...,...
1651,7366276a-df9f-48c9-983c-a6a9d28bfdb6,1
1652,c005c503-5957-4331-9f84-3ba92d9bc091,6
1653,256c78de-a052-4779-b71a-63db362d3ffd,6
1654,b9bcc987-4910-4669-8cf2-7c6215bddc0b,6


In [19]:
#Converting labels to numbers
def int_to_label(label):
  """Map an integer class id back to its emotion name.

  Inverse of the label encoding: 0 -> 'sadness', ..., 6 -> 'others'.
  A value that equals none of the ids 0..6 yields None.
  """
  emotion_names = ('sadness', 'joy', 'anger', 'surprise', 'disgust', 'fear', 'others')
  for class_id, emotion in enumerate(emotion_names):
    if label == class_id:
      return emotion
  return None

In [23]:
# Replace the integer class ids with emotion names.
# NOTE(review): not idempotent — running this cell a second time feeds the
# names back into int_to_label, which returns None for them.
df['emotion'] = df.emotion.apply(int_to_label)
df.head()

Unnamed: 0,id,emotion
0,16b25dfb-e284-4a58-b62c-8186fc082eb6,others
1,2a80f6bf-4750-4783-9bc8-fdb8ff2b94c4,others
2,1f477a6f-3559-41ee-8ec5-2e77aee53190,others
3,838add70-748c-4635-8133-36ff0b05aeb0,joy
4,64e3dd59-ae5c-4b9b-bd29-987609eb95d8,joy


In [24]:
# Count how many test rows were assigned to each predicted emotion.
df.emotion.value_counts()

others      796
joy         379
anger       224
sadness     205
surprise     36
fear         16
Name: emotion, dtype: int64

In [25]:
# df.to_csv('submission-preproc.csv', index=False)

Probando el modelo

In [12]:
# Reload the fine-tuned model from the directory written by trainer.save_model.
m = AutoModelForSequenceClassification.from_pretrained("./resultsEmotion/best_model", num_labels=num_labels)

In [13]:
# Prefer the GPU when one is available and fall back to the CPU otherwise, so
# this cell also runs on machines without CUDA (a hard-coded "cuda" device
# makes the .to() call fail there).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
m = m.to(device)
#This function gets the predictions from the model after it is trained.
def get_predictions(model, data_loader):
  """Run `model` over every batch of `data_loader` and collect its predictions.

  Args:
    model: a torch module called as `model(input_ids=..., attention_mask=...)`
      whose output exposes a `.logits` tensor with one row per item.
    data_loader: iterable of batch dicts with keys "input_ids",
      "attention_mask", "labels" and "tweet_id".

  Returns:
    A tuple `(ids, predictions, prediction_probs, real_values)`:
      ids: list of the batches' "tweet_id" entries, in iteration order.
      predictions: CPU tensor with the argmax class id per item.
      prediction_probs: CPU tensor of the raw logits per item. No softmax is
        applied, so despite the name these are not probabilities.
      real_values: CPU tensor with the "labels" entries per item.
    For an empty `data_loader` the three tensors are empty.
  """
  model = model.eval()

  # Send inputs to wherever the model lives instead of relying on a
  # notebook-global `device`; parameter-less models fall back to the CPU.
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device("cpu")

  batch_preds = []
  batch_logits = []
  batch_labels = []
  ids = []

  with torch.no_grad():
    for d in tqdm(data_loader):

      input_ids      = d["input_ids"].to(model_device)
      attention_mask = d["attention_mask"].to(model_device)

      # Forward pass; the output carries unnormalized class scores (logits).
      outputs = model(
        input_ids      = input_ids,
        attention_mask = attention_mask
      )

      _, preds = torch.max(outputs.logits, dim=1)     # index of the largest logit per row

      # Keep whole-batch tensors (moved to the CPU right away) rather than
      # extending a list item by item.
      batch_preds.append(preds.cpu())
      batch_logits.append(outputs.logits.cpu())
      batch_labels.append(d["labels"].cpu())
      ids.extend(d['tweet_id'])

  if not batch_preds:
    # torch.cat rejects an empty list, so return empty tensors explicitly.
    return ids, torch.empty(0, dtype=torch.long), torch.empty(0), torch.empty(0, dtype=torch.long)

  predictions = torch.cat(batch_preds)
  prediction_probs = torch.cat(batch_logits)
  real_values = torch.cat(batch_labels)

  return ids, predictions, prediction_probs, real_values

In [14]:
# Run the reloaded model over the dev loader; the call returns the tweet ids,
# the predicted class ids, the raw logits and the dev labels.
ids_dev, yHat_dev, predProbs_dev, yTest_dev = get_predictions(
  m,
  devDataLoader
)

100%|██████████| 53/53 [00:04<00:00, 13.11it/s]


In [15]:
  # Per-class precision/recall/F1 on the dev split, with rows named after CLASS_NAMES.
print(classification_report(yTest_dev, yHat_dev, target_names= CLASS_NAMES, digits=4))

              precision    recall  f1-score   support

     sadness     0.7961    0.7885    0.7923       104
         joy     0.6611    0.6575    0.6593       181
       anger     0.6707    0.6471    0.6587        85
    surprise     0.4286    0.0857    0.1429        35
     disgust     0.0000    0.0000    0.0000        16
        fear     1.0000    0.5556    0.7143         9
      others     0.7302    0.8237    0.7741       414

    accuracy                         0.7168       844
   macro avg     0.6124    0.5083    0.5345       844
weighted avg     0.6940    0.7168    0.6986       844



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# Getting model predictions on the test dataset. yTest_test only holds the
# placeholder labels assigned to the test split, not real annotations.
ids_test, yHat_test, predProbs_test, yTest_test = get_predictions(
  m,
  testDataLoader
)

100%|██████████| 104/104 [00:08<00:00, 12.90it/s]


In [31]:
# Pair each dev tweet id with the predicted emotion name and display the frame.
dev_preds = pd.DataFrame({'id': ids_dev, 'emotion': pd.Series(yHat_dev).apply(int_to_label)})
dev_preds

Unnamed: 0,id,emotion
0,d23cfa8a-dad1-45b6-90eb-b786cd21e7d3,sadness
1,5192574e-af5e-4ccb-aa1d-b801a9395b7f,sadness
2,86a2042d-4964-4e07-a02b-aa2953a86ced,anger
3,067c0c3e-459e-4b36-8223-22d8ce7f9cd9,joy
4,5243fe33-bcea-4300-8f2e-b79e63557673,others
...,...,...
839,58a876f1-b866-4555-88d9-8bd1bf237074,others
840,74590933-6e77-414e-b06c-6c51eff61452,joy
841,6c7c0d00-7834-494c-be41-581883c6d241,sadness
842,2f06b007-bfbf-45f7-b720-e8d19e9f2751,anger


In [32]:
# Pair each test tweet id with the predicted emotion name and display the frame.
test_preds = pd.DataFrame({'id': ids_test, 'emotion': pd.Series(yHat_test).apply(int_to_label)})
test_preds

Unnamed: 0,id,emotion
0,16b25dfb-e284-4a58-b62c-8186fc082eb6,others
1,2a80f6bf-4750-4783-9bc8-fdb8ff2b94c4,others
2,1f477a6f-3559-41ee-8ec5-2e77aee53190,others
3,838add70-748c-4635-8133-36ff0b05aeb0,joy
4,64e3dd59-ae5c-4b9b-bd29-987609eb95d8,joy
...,...,...
1651,7366276a-df9f-48c9-983c-a6a9d28bfdb6,joy
1652,c005c503-5957-4331-9f84-3ba92d9bc091,others
1653,256c78de-a052-4779-b71a-63db362d3ffd,others
1654,b9bcc987-4910-4669-8cf2-7c6215bddc0b,others


In [33]:
import pickle

def save_preds(obj, name, fold):
    """Pickle `obj` to ``preds_<fold>/<name>.pck``.

    Args:
        obj: any picklable object (here, a predictions DataFrame).
        name: file stem of the pickle.
        fold: split name; selects the ``preds_<fold>`` output folder.

    The output folder is created when it does not exist yet; previously a
    missing folder made `open` raise FileNotFoundError.
    """
    import os  # local import keeps this cell self-contained

    path = 'preds_{}/{}.pck'.format(fold, name)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

In [34]:
# Pickle the dev and test prediction frames to preds_dev/xlmroberta.pck and preds_test/xlmroberta.pck.
save_preds(dev_preds, 'xlmroberta', 'dev')
save_preds(test_preds, 'xlmroberta', 'test')

In [35]:
# Write the test predictions as a tab-separated file without header row or index column.
test_preds.to_csv('submission-roberta-final.tsv',header =False, sep = '\t',index=False)

In [36]:
# Display the frame built from trainer.predict — presumably to compare it by eye with test_preds.
df

Unnamed: 0,id,emotion
0,16b25dfb-e284-4a58-b62c-8186fc082eb6,others
1,2a80f6bf-4750-4783-9bc8-fdb8ff2b94c4,others
2,1f477a6f-3559-41ee-8ec5-2e77aee53190,others
3,838add70-748c-4635-8133-36ff0b05aeb0,joy
4,64e3dd59-ae5c-4b9b-bd29-987609eb95d8,joy
...,...,...
1651,7366276a-df9f-48c9-983c-a6a9d28bfdb6,joy
1652,c005c503-5957-4331-9f84-3ba92d9bc091,others
1653,256c78de-a052-4779-b71a-63db362d3ffd,others
1654,b9bcc987-4910-4669-8cf2-7c6215bddc0b,others
