Attempt to finetune DistilBERT-base-uncased for Arxiv papers multiclassification:
- Prepare dataset
- Tokenize
- Load model
- Define metrics
- Define Trainer object and TrainingArguments
- Evaluate predictions
- Error analysis


In [1]:
import pandas as pd
import numpy as np
from datasets import load_from_disk

all_stream_data = load_from_disk("data/processed/all_stream_data")


In [2]:
print(all_stream_data)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 72342
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1925
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1926
    })
})


In [3]:
from transformers import AutoTokenizer
import torch

model_id = "distilbert-base-cased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [4]:
train_dataset = all_stream_data["train"].map(tokenize, batched=True, batch_size=None)

eval_dataset = all_stream_data["validation"].map(tokenize, batched=True, batch_size=None)


Map:   0%|          | 0/72342 [00:00<?, ? examples/s]

Map:   0%|          | 0/1925 [00:00<?, ? examples/s]

In [5]:
labels = all_stream_data["train"].features["label"].names
y_valid = np.array(eval_dataset["label"])


In [6]:
from transformers import AutoModelForSequenceClassification

num_labels = len(labels)
model = (AutoModelForSequenceClassification
        .from_pretrained(model_id, num_labels=num_labels)
        .to(device))


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
# from huggingface_hub import notebook_login
# notebook_login()


In [8]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import balanced_accuracy_score, f1_score

batch_size = 64
num_train_epochs = 4
learning_rate = 2e-5
logging_steps = len(train_dataset) // batch_size
model_name = f"./models/{model_id}-finetuned-arxiv-test"
training_args = TrainingArguments(output_dir=model_name,
                                  overwrite_output_dir=True,
                                  num_train_epochs=num_train_epochs,
                                  learning_rate=learning_rate,per_device_train_batch_size=batch_size,per_device_eval_batch_size=batch_size * 2,
                                  weight_decay=0.01,
                                  warmup_steps=500,
                                  fp16=True,
                                  eval_strategy="steps",
                                  eval_steps=500,
                                  save_strategy="steps",
                                  save_total_limit=3,
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="macro_f1",
                                  greater_is_better=True,
                                  push_to_hub=True,
                                  label_smoothing_factor=0.1,
                                  log_level="error")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    macro_f1 = f1_score(labels, preds, average="macro")
    balanced_accuracy = balanced_accuracy_score(labels, preds)
    return {"macro_f1": macro_f1, "balanced_accuracy": balanced_accuracy}


In [9]:
from sklearn.utils.class_weight import compute_class_weight

# Calculate class weights
train_labels = np.array(train_dataset["label"])
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels
)

# Convert to PyTorch tensor and move to the right device
class_weights = torch.tensor(class_weights, dtype=torch.float).to(model.device)

class CustomTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Apply class weights to the loss
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss


In [10]:
trainer = CustomTrainer(
    class_weights=class_weights,
    model=model, 
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=20)]
)

trainer.train()


Step,Training Loss,Validation Loss,Macro F1,Balanced Accuracy
500,No log,1.176011,0.51325,0.620671
1000,No log,0.932846,0.574507,0.712512
1500,1.421100,0.855133,0.619049,0.714774
2000,1.421100,0.784305,0.624548,0.737193
2500,0.687700,0.692734,0.638032,0.75908
3000,0.687700,0.726968,0.644424,0.756251
3500,0.493000,0.778429,0.664025,0.761069
4000,0.493000,0.801433,0.660032,0.750493
4500,0.493000,0.773257,0.66519,0.754479


'(MaxRetryError('HTTPSConnectionPool(host=\'hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com\', port=443): Max retries exceeded with url: /repos/7c/89/7c89986a61e4f907247cf07431b2ffe05b3e4ad7d7823cdb09285b0d5259e74b/b7973a587e85de85e96e15b46dae0d7442ff11904b061a6c7bcee57c5bd1a05c?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20250522%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250522T040709Z&X-Amz-Expires=86400&X-Amz-Signature=e4086b171e9d137a904928eba73c2a0d09086a92192c357fce53ac5eb7d4a224&X-Amz-SignedHeaders=host&partNumber=1&uploadId=FFjh8a5BC.DSE5IlHbmJDJqo2IP9SB6n5GzdgOhJK3d8bIeK0GWyKoPo8Mhx91S0gf0AlvNAB3RWYQdB3vei4frMU.cf3UOyoBLH.KBPFBgBghKxVUu1tX9VdKG.A9VK&x-id=UploadPart (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f92c1800a50>: Failed to resolve \'hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com\' ([Errno -3] Temporary failure in name resolution)"))'), '(Request ID: e6ae884d-d

TrainOutput(global_step=4524, training_loss=0.7462802370969012, metrics={'train_runtime': 1228.567, 'train_samples_per_second': 235.533, 'train_steps_per_second': 3.682, 'total_flos': 3.834139652613734e+16, 'train_loss': 0.7462802370969012, 'epoch': 4.0})

In [None]:
preds_output = trainer.predict(eval_dataset)


In [None]:
preds_output.metrics


In [None]:
y_preds = np.argmax(preds_output.predictions, axis=1)


In [None]:
from src.utils import plot_confusion_matrix

plot_confusion_matrix(y_preds, y_valid, labels)


In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_valid, y_preds, target_names=labels))


In [None]:
from torch.nn.functional import cross_entropy

def forward_pass_with_label(batch):
    # Place all input tensors on the same device as the model
    inputs = {k:v.to(device) for k,v in batch.items()
              if k in tokenizer.model_input_names}
    with torch.no_grad():
        output = model(**inputs)
        pred_label = torch.argmax(output.logits, axis=-1)
        loss = cross_entropy(output.logits, batch["label"].to(device),
                         reduction="none")
    # Place outputs on CPU for compatibility with other dataset columns
    return {"loss": loss.cpu().numpy(),
            "predicted_label": pred_label.cpu().numpy()}


In [None]:
# Convert our dataset back to PyTorch tensors
data_encoded.set_format("torch",
                        columns=["input_ids", "attention_mask", "label"])
# Compute loss values
data_encoded["validation"] = data_encoded["validation"].map(
    forward_pass_with_label, batched=True, batch_size=16)


In [None]:
def label_int2str(row):
    return data["train"].features["label"].int2str(row)


In [None]:
data_encoded.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
df_test = data_encoded["validation"][:][cols]
df_test["label"] = df_test["label"].apply(label_int2str)
df_test["predicted_label"] = (df_test["predicted_label"].apply(label_int2str))


In [None]:
df_test.sort_values("loss", ascending=False).head(10)


In [None]:
df_test.sort_values("loss", ascending=True).head(10)
