In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pprint import pprint

from datasets import load_dataset
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


## Load dataset

In [3]:
dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")

Reusing dataset sem_eval2018_task1 (/home/misha/.cache/huggingface/datasets/sem_eval2018_task1/subtask5.english/1.1.0/c8af6e4accd23f95ac75d14477b0678a4f59d5da34e480ad7de8112ffab04a3d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 886
    })
})

In [5]:
dataset['train'][0]

{'ID': '2017-En-21441',
 'Tweet': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
 'anger': False,
 'anticipation': True,
 'disgust': False,
 'fear': False,
 'joy': False,
 'love': False,
 'optimism': True,
 'pessimism': False,
 'sadness': False,
 'surprise': False,
 'trust': True}

In [6]:
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'Tweet']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [3]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["Tweet"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [6]:
print(tokenizer.mask_token_id, tokenizer.mask_token)
print(tokenizer.pad_token_id, tokenizer.pad_token)
print(tokenizer.cls_token_id, tokenizer.cls_token)

103 [MASK]
0 [PAD]
101 [CLS]


In [8]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)



  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
example = encoded_dataset['train'][0]
print(list(example.keys()))

['input_ids', 'token_type_ids', 'attention_mask', 'labels']


In [10]:
tokenizer.decode(example['input_ids'])

"[CLS] “ worry is a down payment on a problem you may never have '. joyce meyer. # motivation # leadership # worry [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

In [11]:
dataset['train'][0]['Tweet']

"“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry"

In [12]:
example['labels']

[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]

In [13]:
encoded_dataset.set_format("torch")

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    problem_type="multi_label_classification", 
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Train the model

In [16]:
batch_size = 8
metric_name = "f1"

In [17]:

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)



In [18]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [21]:
np.object = object
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [22]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  1523,  4737,  2003,  1037,  2091,  7909,  2006,  1037,  3291,
         2017,  2089,  2196,  2031,  1005,  1012, 11830, 11527,  1012,  1001,
        14354,  1001,  4105,  1001,  4737,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [23]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.7382, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.3282,  0.1461, -0.1504,  0.5215, -0.0687, -0.1635, -0.0760, -0.1682,
         -0.1015,  0.4104, -0.2692]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [24]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [25]:
trainer.evaluate()

  0%|          | 0/111 [00:00<?, ?it/s]

{'eval_loss': 0.7038481831550598,
 'eval_f1': 0.19625811378388697,
 'eval_roc_auc': 0.4499745442443607,
 'eval_accuracy': 0.0,
 'eval_runtime': 5.0334,
 'eval_samples_per_second': 176.026,
 'eval_steps_per_second': 22.053}

In [26]:
trainer.train()

  0%|          | 0/4275 [00:00<?, ?it/s]

{'loss': 0.41, 'grad_norm': 2.0944104194641113, 'learning_rate': 1.7660818713450293e-05, 'epoch': 0.58}


  0%|          | 0/111 [00:00<?, ?it/s]

{'eval_loss': 0.31540846824645996, 'eval_f1': 0.6817364500385307, 'eval_roc_auc': 0.7803363515206497, 'eval_accuracy': 0.2832957110609481, 'eval_runtime': 4.6984, 'eval_samples_per_second': 188.574, 'eval_steps_per_second': 23.625, 'epoch': 1.0}
{'loss': 0.3191, 'grad_norm': 4.504338264465332, 'learning_rate': 1.5321637426900587e-05, 'epoch': 1.17}
{'loss': 0.2813, 'grad_norm': 4.2004265785217285, 'learning_rate': 1.2982456140350879e-05, 'epoch': 1.75}


  0%|          | 0/111 [00:00<?, ?it/s]

{'eval_loss': 0.30168259143829346, 'eval_f1': 0.6946663231125998, 'eval_roc_auc': 0.7873705592365653, 'eval_accuracy': 0.27313769751693, 'eval_runtime': 4.6509, 'eval_samples_per_second': 190.502, 'eval_steps_per_second': 23.866, 'epoch': 2.0}
{'loss': 0.2575, 'grad_norm': 1.8976444005966187, 'learning_rate': 1.0643274853801172e-05, 'epoch': 2.34}
{'loss': 0.2399, 'grad_norm': 1.6256463527679443, 'learning_rate': 8.304093567251463e-06, 'epoch': 2.92}


  0%|          | 0/111 [00:00<?, ?it/s]

{'eval_loss': 0.3085322976112366, 'eval_f1': 0.7108167770419426, 'eval_roc_auc': 0.8044770100514392, 'eval_accuracy': 0.2799097065462754, 'eval_runtime': 4.7296, 'eval_samples_per_second': 187.332, 'eval_steps_per_second': 23.469, 'epoch': 3.0}
{'loss': 0.2125, 'grad_norm': 1.4554675817489624, 'learning_rate': 5.964912280701755e-06, 'epoch': 3.51}


  0%|          | 0/111 [00:00<?, ?it/s]

{'eval_loss': 0.3137083351612091, 'eval_f1': 0.706837186424004, 'eval_roc_auc': 0.8016345981086785, 'eval_accuracy': 0.26636568848758463, 'eval_runtime': 4.8025, 'eval_samples_per_second': 184.487, 'eval_steps_per_second': 23.113, 'epoch': 4.0}
{'loss': 0.2057, 'grad_norm': 1.8292967081069946, 'learning_rate': 3.625730994152047e-06, 'epoch': 4.09}
{'loss': 0.1904, 'grad_norm': 1.1751137971878052, 'learning_rate': 1.2865497076023392e-06, 'epoch': 4.68}


  0%|          | 0/111 [00:00<?, ?it/s]

{'eval_loss': 0.3134128451347351, 'eval_f1': 0.7098508922023955, 'eval_roc_auc': 0.804446017783339, 'eval_accuracy': 0.2742663656884876, 'eval_runtime': 4.8231, 'eval_samples_per_second': 183.7, 'eval_steps_per_second': 23.014, 'epoch': 5.0}
{'train_runtime': 611.5073, 'train_samples_per_second': 55.911, 'train_steps_per_second': 6.991, 'train_loss': 0.2594835319295961, 'epoch': 5.0}


TrainOutput(global_step=4275, training_loss=0.2594835319295961, metrics={'train_runtime': 611.5073, 'train_samples_per_second': 55.911, 'train_steps_per_second': 6.991, 'total_flos': 2249123476753920.0, 'train_loss': 0.2594835319295961, 'epoch': 5.0})

In [27]:
trainer.evaluate()

  0%|          | 0/111 [00:00<?, ?it/s]

{'eval_loss': 0.3085322976112366,
 'eval_f1': 0.7108167770419426,
 'eval_roc_auc': 0.8044770100514392,
 'eval_accuracy': 0.2799097065462754,
 'eval_runtime': 4.7868,
 'eval_samples_per_second': 185.094,
 'eval_steps_per_second': 23.189,
 'epoch': 5.0}

## Inference


In [36]:
text = "I'm happy I can finally train a model for multi-label classification"
text = "I'm devastated"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [37]:
torch.sigmoid(outputs.logits)

tensor([[0.1163, 0.0306, 0.2675, 0.2656, 0.0371, 0.0222, 0.0207, 0.5095, 0.9631,
         0.0353, 0.0081]], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [35]:
labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']