##### Imports

In [14]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import torch

##### Load the datasets

In [15]:
dataset = load_dataset("csv", data_files={"train": "train.csv", "validation": "val.csv"})

##### Map labels to integers

In [16]:
label2id = {label: idx for idx, label in enumerate(dataset["train"].unique("label"))}
id2label = {idx: label for label, idx in label2id.items()}

def map_labels(example):
    example["label"] = label2id[example["label"]]
    return example

dataset = dataset.map(map_labels)

##### Load TinyBERT tokenizer

In [17]:
tokenizer = AutoTokenizer.from_pretrained("./tinybert_model")

##### Tokenize the dataset

In [18]:
# Tokenize the dataset using the "Description" column
def tokenize_function(examples):
    return tokenizer(
        examples["Description"],
        padding="max_length",   # Pad all sequences to the maximum length
        truncation=True,        # Truncate sequences to max_length
        max_length=128          # Define maximum sequence length
    )
    
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 68/68 [00:00<00:00, 2886.88 examples/s]


##### Load TinyBERT model

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(
    "./tinybert_model",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./tinybert_model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### Set up training arguments

In [20]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)



##### Define Trainer

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

##### Fine-tune the model

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.5264,3.529198
2,3.5247,3.530645
3,3.5207,3.53063


TrainOutput(global_step=51, training_loss=3.524529503841026, metrics={'train_runtime': 323.1217, 'train_samples_per_second': 2.525, 'train_steps_per_second': 0.158, 'total_flos': 2931432136704.0, 'train_loss': 3.524529503841026, 'epoch': 3.0})

##### Save the fine-tuned model

In [23]:
model.save_pretrained("./fine_tuned_tinybert")
tokenizer.save_pretrained("./fine_tuned_tinybert")

print("Model fine-tuning complete!")

Model fine-tuning complete!


##### Evaluate the Model (Optional)

In [None]:
results = trainer.evaluate()
print(results)