In [1]:
from datasets import load_dataset
# Load the `go_emotions` dataset; the summary shown in the next cell lists
# train / validation / test splits with `text`, `labels` and `id` features.
dataset = load_dataset('go_emotions')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the DatasetDict summary (splits, feature names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

In [None]:
import numpy as np

# Size of the multi-hot target vector (one slot per class id).
num_labels = 28


def encode_labels(example):
    """Replace an example's list of class ids with a multi-hot float vector.

    Args:
        example: mapping with a ``"labels"`` entry holding an iterable of
            integer class ids.

    Returns:
        The same ``example`` object, with ``"labels"`` overwritten by a
        ``np.float32`` array of length ``num_labels`` that is 1.0 at every
        valid id and 0.0 elsewhere. Ids outside ``[0, num_labels)`` are
        ignored.
    """
    label_vector = np.zeros(num_labels, dtype=np.float32)

    for label in example["labels"]:
        # The lower bound matters: without it a negative id passes the
        # `< num_labels` check and NumPy's negative indexing would silently
        # switch on a class counted from the end of the vector.
        if 0 <= label < num_labels:
            label_vector[label] = 1.0

    example["labels"] = label_vector
    return example

# Apply encode_labels to every example of every split.
# NOTE(review): not idempotent. encode_labels reads "labels" as class ids and
# writes the multi-hot vector back into the same column, so re-running this
# cell on the already-encoded dataset would misread the 0/1 entries as ids.
dataset = dataset.map(encode_labels)

In [4]:
from transformers import RobertaTokenizerFast

# Tokenizer loaded from the same "roberta-base" checkpoint name that the model
# cell uses further down.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [5]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook's tokenizer.

    Sequences are truncated and padded to ``max_length=27``. Returns whatever
    the tokenizer call returns, unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        max_length=27,
        truncation=True,
        padding="max_length",
    )
    return encoded

# batched=True hands `tokenize` a batch of examples per call, which is why it
# indexes batch["text"] rather than a single example.
dataset = dataset.map(tokenize, batched=True)

Map: 100%|██████████| 5426/5426 [00:00<00:00, 6088.02 examples/s]


In [6]:
# Return the model inputs and the targets as torch tensors.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [7]:
from transformers import RobertaForSequenceClassification

# Load the "roberta-base" checkpoint into a sequence-classification model with
# `num_labels` outputs, configured for multi-label classification.
# The load report printed by this cell lists the classifier.* weights as
# MISSING, i.e. newly initialized rather than restored from the checkpoint.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=num_labels,
    problem_type="multi_label_classification"
)


Loading weights: 100%|██████████| 197/197 [00:00<00:00, 216.53it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
RobertaForSequenceClassification LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.dense.weight         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Conside

In [None]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def compute_metrics(eval_pred, threshold=0.5):
    """Compute multi-label F1 and accuracy from raw logits.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` holds one raw score
            per class; ``labels`` is the matching 0/1 indicator array.
        threshold: probability above which a class counts as predicted.
            Must lie strictly between 0 and 1. Defaults to 0.5, the value
            previously hard-coded here.

    Returns:
        Dict with ``"f1_micro"``, ``"f1_macro"`` and ``"accuracy"``.

    Raises:
        ValueError: if ``threshold`` is not in the open interval (0, 1).
    """
    import math

    if not 0.0 < threshold < 1.0:
        raise ValueError(f"threshold must be in (0, 1), got {threshold!r}")

    logits, labels = eval_pred

    # NOTE(review): presumably only needed for models that return several
    # outputs, in which case the logits are assumed to come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)

    # sigmoid(x) > t  <=>  x > log(t / (1 - t)) because the sigmoid is strictly
    # increasing. Comparing in logit space avoids np.exp(-logits), which
    # overflows (RuntimeWarning) for large-magnitude negative logits.
    cutoff = math.log(threshold) - math.log1p(-threshold)
    predictions = (logits > cutoff).astype(int)

    labels = np.asarray(labels).astype(int)

    # zero_division=0 returns the same 0.0 that scikit-learn already uses for
    # classes with no predicted/true samples, without the per-class warning.
    return {
        "f1_micro": f1_score(labels, predictions, average="micro", zero_division=0),
        "f1_macro": f1_score(labels, predictions, average="macro", zero_division=0),
        "accuracy": accuracy_score(labels, predictions),
    }

In [None]:
from transformers import TrainingArguments

import os

# The `logging_dir` argument is deprecated; the library's own warning for this
# cell names the TENSORBOARD_LOGGING_DIR environment variable as the
# replacement, so the log directory is configured through that instead.
os.environ["TENSORBOARD_LOGGING_DIR"] = "./logs"

training_args = TrainingArguments(
    output_dir="./roberta-goemotions",
    # Evaluate and checkpoint on the same schedule so the best checkpoint can
    # be selected and reloaded when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Name must match a key returned by compute_metrics.
    metric_for_best_model="f1_micro",
    fp16=True,
    save_total_limit=2,
)

`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


In [None]:
from transformers import DataCollatorWithPadding, Trainer
import torch

# Splits handed to the Trainer below: "train" for fitting, "validation" for
# evaluation during training.
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

class FloatLabelDataCollator(DataCollatorWithPadding):
    """Padding collator that also casts the batch's ``labels`` to float.

    NOTE(review): presumably needed because the multi-label loss expects
    float targets while the stored label vectors arrive as integers; confirm
    against the dataset's feature types.
    """

    def __call__(self, features):
        """Collate ``features`` via the parent class, then cast ``labels``.

        Batches without a ``"labels"`` entry (e.g. unlabelled data passed for
        prediction) are returned as the parent produced them instead of
        raising ``KeyError``.
        """
        batch = super().__call__(features)
        if "labels" in batch:
            batch["labels"] = batch["labels"].float()
        return batch

# Collator that pads each batch and casts the labels to float.
data_collator = FloatLabelDataCollator(tokenizer=tokenizer)

# Wire model, arguments, data and metrics together; all arguments are passed
# by keyword, so their order is irrelevant.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)



In [None]:
# Run fine-tuning from the current model state.
# NOTE(review): a later cell calls train() again with
# resume_from_checkpoint=True; on a clean top-to-bottom run presumably only
# one of the two calls is wanted.
trainer.train()

In [12]:
# Resume from the latest checkpoint in the output directory.
# The return value is stored under its own name: assigning it back to
# `trainer` would replace the Trainer object with the training result, so
# re-running this cell or calling trainer.evaluate() afterwards would fail.
train_result = trainer.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Accuracy
2,0.129228,0.12646,0.271765,0.159418,0.164947
3,0.121993,0.124069,0.308783,0.174548,0.194987


Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.82s/it]
  super().__init__(loader)
Writing model shards: 100%|██████████| 1/1 [00:02<00:00,  2.59s/it]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.La

In [None]:

# Write the model first, then the tokenizer, into the same directory so the
# folder holds both.
for artifact in (model, tokenizer):
    artifact.save_pretrained("roberta-goemotions-model-new")

print("✅ SUCCESS: Model and Tokenizer saved to 'roberta-goemotions-model-new'")

Writing model shards: 100%|██████████| 1/1 [00:03<00:00,  3.99s/it]


✅ SUCCESS: Model and Tokenizer saved to 'roberta-goemotions-model-new'
