In [1]:
!pip install transformers datasets accelerate -q


In [2]:
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)




In [3]:
model_name = "microsoft/deberta-v3-small"


In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [6]:
import pandas as pd
df = pd.read_csv("Customer_complaints.csv")
# Display the first 5 rows to verify it loaded correctly
display(df.head())

Unnamed: 0,complaint,category
0,Delay in bank transfer processing,Money Transfers
1,Money sent but not received,Money Transfers
2,Money sent but not received,Money Transfers
3,Delay in bank transfer processing,Money Transfers
4,Credit card payment not reflected,Credit Card


In [7]:
from datasets import Dataset, ClassLabel

dataset = Dataset.from_pandas(
    df[['complaint', 'category']]
    .rename(columns={'complaint': 'text', 'category': 'label'})
)

# Convert the 'label' column to a ClassLabel feature
# First, get the unique categories from the original DataFrame to define the class names
class_names = df['category'].unique().tolist()
# Create the ClassLabel feature
features = dataset.features.copy()
features['label'] = ClassLabel(names=class_names)
dataset = dataset.cast(features)

dataset = dataset.train_test_split(test_size=0.2, stratify_by_column='label')

tokenized_ds = dataset.map(tokenize, batched=True)
tokenized_ds.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])

Casting the dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(class_names)
)

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

In [10]:
from sklearn.metrics import f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)

    return {
        "precision": precision_score(labels, preds, average="weighted"),
        "recall": recall_score(labels, preds, average="weighted"),
        "f1": f1_score(labels, preds, average="weighted")
    }

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"




Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.430658,1.0,1.0,1.0
2,No log,0.015493,1.0,1.0,1.0
3,No log,0.008724,1.0,1.0,1.0




TrainOutput(global_step=150, training_loss=0.4458746846516927, metrics={'train_runtime': 2443.7892, 'train_samples_per_second': 0.982, 'train_steps_per_second': 0.061, 'total_flos': 79486105190400.0, 'train_loss': 0.4458746846516927, 'epoch': 3.0})