In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset
dataset = load_dataset("csv", data_files={"train": "train.csv", "validation": "val.csv"})

In [3]:
# Create label mappings
label2id = {label: idx for idx, label in enumerate(dataset["train"].unique("label"))}
id2label = {idx: label for label, idx in label2id.items()}

In [4]:
# Map labels to IDs
def map_labels(example):
    example["label"] = label2id[example["label"]]
    return example

dataset = dataset.map(map_labels)

In [5]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("./tinybert_model")

In [7]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["Description"],
        padding=True,          # Dynamic padding (handled by DataCollator)
        truncation=True,       # Truncate to model's max length
        max_length=128         # Set max_length to 128
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/272 [00:00<?, ? examples/s]

Map: 100%|██████████| 272/272 [00:00<00:00, 1448.46 examples/s]
Map: 100%|██████████| 68/68 [00:00<00:00, 762.45 examples/s]


In [8]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    "./tinybert_model",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./tinybert_model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Define a custom compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": accuracy, "f1_score": f1}


In [10]:
# Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",          # Evaluate every epoch
    learning_rate=2e-5,                   # Learning rate
    per_device_train_batch_size=16,       # Batch size for training
    per_device_eval_batch_size=16,        # Batch size for evaluation
    num_train_epochs=3,                   # Number of epochs
    weight_decay=0.01,                    # Weight decay
    save_strategy="epoch",                # Save model every epoch
    logging_dir="./logs",                 # Directory for logs
    logging_steps=10,                     # Log every 10 steps
    load_best_model_at_end=True,          # Load the best model at the end
    metric_for_best_model="accuracy",     # Use accuracy to select the best model
    greater_is_better=True,               # Higher accuracy is better
    save_total_limit=2,                   # Keep only the last 2 checkpoints
    fp16=True,                            # Use mixed precision for faster training
)



In [12]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,          # Use data collator for dynamic padding
    compute_metrics=compute_metrics,      # Custom metrics function
)

  trainer = Trainer(


In [13]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,3.5278,3.528162,0.014706,0.001337
2,3.5247,3.52943,0.014706,0.003529
3,3.5219,3.529618,0.0,0.0


TrainOutput(global_step=51, training_loss=3.525196678498212, metrics={'train_runtime': 195.1011, 'train_samples_per_second': 4.182, 'train_steps_per_second': 0.261, 'total_flos': 2931432136704.0, 'train_loss': 3.525196678498212, 'epoch': 3.0})

In [14]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_tinybert")
tokenizer.save_pretrained("./fine_tuned_tinybert")

print("Model fine-tuning complete!")

Model fine-tuning complete!
