<a href="https://colab.research.google.com/github/dsmueller3760/aerospace_chatbot/blob/llm_training/scripts/finetuning_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Notebook created by John Adeojo, Chief Data Scientist & Founder at Data-Centric Solutions.
https://www.data-centric-solutions.com/


In [None]:
# Install dependencies
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install peft bitsandbytes datasets evaluate loralib transformers sentencepiece -q

In [None]:
pip install wandb

In [None]:
# Log in to Weights & Biases; the Trainer further down is configured with report_to='wandb'.
import wandb
wandb.login()

In [None]:
# Checkpoint name passed to both the tokenizer and the sequence-classification model.
MODEL = "xlm-roberta-large"

In [None]:
# Load datasets from Hugging Face Repo

from datasets import load_dataset

# Fetch the complaints corpus, forcing a fresh download instead of using a cached copy.
dataset = load_dataset(
    "DataCentric/consumer_complaints_cfpb",
    download_mode="force_redownload",
)

# Report how many rows each published split contains.
for split_name in ("train", "test"):
    print(f"{split_name.capitalize()} dataset size: {len(dataset[split_name])}")

Downloading readme:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/120M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Train dataset size: 122761
Test dataset size: 30691


In [None]:
# Held-out test split as published with the dataset.
test_dataset = dataset["test"]

# Carve a validation set (20%) out of the published training split.
# The fixed seed makes the shuffle identical after every kernel restart, so the
# train/validation membership -- and therefore the reported metrics -- are reproducible.
train_val_split = dataset["train"].train_test_split(test_size=0.2, seed=42)

# train_test_split names its two outputs 'train' and 'test'; the 'test' part is
# used as the validation set here.
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

In [None]:
# Report the sizes of the new train/validation splits.
# `dataset` was already loaded by an earlier cell, so it is not downloaded a second time here.
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

Downloading readme:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/120M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/122761 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/30691 [00:00<?, ? examples/s]

Train dataset size: 98208
Talidation dataset size: 24553


In [None]:
import numpy as np
from transformers import AutoTokenizer
from datasets import ClassLabel

# Checkpoints whose name contains "gpt", "opt" or "bloom" are padded on the left;
# everything else is padded on the right.
padding_side = "left" if any(family in MODEL for family in ("gpt", "opt", "bloom")) else "right"

# Build the tokenizer; if the checkpoint defines no pad token, fall back to the EOS token id.
tokenizer = AutoTokenizer.from_pretrained(MODEL, padding_side=padding_side)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def tokenize_function(examples):
    """Tokenize a batch of complaint narratives into fixed-length inputs.

    Reads ``examples['consumer_complaint_narrative']`` and returns the tokenizer
    output unchanged. Every sequence is truncated and padded to 512 tokens.
    """
    return tokenizer(
        examples["consumer_complaint_narrative"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

# The tokenizer configured above (padding side, pad-token fallback) is reused as-is.
# Re-creating it here with default arguments would silently discard that configuration.

# Pool the 'subissue' strings of all three splits so the label vocabulary covers each of them.
all_subissues = np.concatenate(
    [split["subissue"] for split in (train_dataset, val_dataset, test_dataset)]
)

# Distinct sub-issue names; their position in this array becomes the integer class id.
unique_subissues = np.unique(all_subissues)

# ClassLabel provides the string <-> integer conversion for those names.
label = ClassLabel(names=unique_subissues.tolist())


def _encode_subissue(examples):
    """Return a 'labels' column holding the integer id of each 'subissue' in the batch."""
    return {"labels": label.str2int(examples["subissue"])}


# Attach the integer 'labels' column to every split.
train_dataset = train_dataset.map(_encode_subissue, batched=True)
val_dataset = val_dataset.map(_encode_subissue, batched=True)
test_dataset = test_dataset.map(_encode_subissue, batched=True)

# Run the tokenizer over the narratives of every split (train, then validation, then test).
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Drop the raw string columns from the two splits that are handed to the Trainer.
# NOTE(review): test_dataset keeps its raw string columns -- confirm that is intended
# before feeding it to the Trainer.
train_dataset = train_dataset.remove_columns(["subissue", "consumer_complaint_narrative"])
val_dataset = val_dataset.remove_columns(["subissue", "consumer_complaint_narrative"])

Downloading (…)lve/main/config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/98208 [00:00<?, ? examples/s]

Map:   0%|          | 0/24553 [00:00<?, ? examples/s]

Map:   0%|          | 0/30691 [00:00<?, ? examples/s]

Map:   0%|          | 0/98208 [00:00<?, ? examples/s]

Map:   0%|          | 0/24553 [00:00<?, ? examples/s]

Map:   0%|          | 0/30691 [00:00<?, ? examples/s]

In [None]:
# Class names, in id order, taken from the ClassLabel object built earlier.
label_names = label.names

# Name -> id lookup. The loop variable is called `name` so it is not confused with
# the `label` ClassLabel object.
label2id = {name: idx for idx, name in enumerate(label_names)}

# Id -> name lookup (the inverse mapping).
id2label = dict(enumerate(label_names))


In [None]:
# Show the first preprocessed training record.
first_example = train_dataset[0]
print(f"Example from train_dataset: {first_example}")


Example from train_dataset: {'labels': 204, 'input_ids': [0, 24372, 25975, 26518, 10696, 23, 14214, 64, 42918, 64, 205378, 4, 87, 26859, 161474, 759, 9836, 111628, 7, 23, 70, 6817, 111, 450, 6602, 5, 3311, 31576, 111, 14214, 64, 42918, 64, 205378, 4, 87, 80723, 161474, 1829, 142, 360, 45738, 44978, 71, 853, 21284, 674, 5586, 5, 80799, 15363, 66, 1902, 24209, 759, 137474, 12349, 42, 5, 88949, 87, 444, 6, 205378, 136, 3249, 10, 4552, 19336, 4559, 1294, 4, 759, 81997, 7, 3542, 10666, 4369, 37509, 8152, 4, 136, 87, 2806, 8783, 3060, 33946, 77253, 538, 5, 87, 509, 11343, 190238, 100, 70, 1528, 221, 1902, 110, 2967, 5, 360, 14214, 64, 42918, 64, 205378, 87, 37842, 47, 101089, 47, 10, 627, 72565, 46445, 1774, 678, 80799, 15363, 66, 4, 3129, 1556, 161549, 11522, 27289, 4, 136, 442, 1556, 2809, 115839, 1810, 111, 6226, 5, 87, 80723, 47, 53299, 2304, 18403, 31667, 117729, 4, 6044, 237, 450, 87, 36, 24243, 1672, 10666, 4369, 11663, 37509, 8152, 10, 31150, 4, 3129, 83, 23552, 759, 176735, 91763, 4

In [None]:
# Index the row first and the column second: `train_dataset[0]` reads a single record,
# whereas `train_dataset['input_ids'][0]` would load the entire column just to read one value.
first_row = train_dataset[0]
print(f"Number of examples in the dataset: {len(train_dataset)}")
print(f"Length of input_ids for the first example: {len(first_row['input_ids'])}")
print(f"Label for the first example: {first_row['labels']}")

Number of examples in the dataset: 98208
Length of input_ids for the first example: 512
Label for the first example: 204


In [None]:
# Setting up the LoRA model
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for a sequence-classification task.
config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["query", "value"],  # adapt the attention query/value projections
    modules_to_save=["classifier"],  # keep the classification head trainable as well
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Base checkpoint with a classification head sized to the sub-issue vocabulary.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(unique_subissues),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

# Wrap the base model with the LoRA adapters and report how many parameters will be trained.
lora_model = get_peft_model(model, config)
lora_model.print_trainable_parameters()

Downloading model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,110,764 || all params: 562,951,596 || trainable%: 0.7302162440267778


In [None]:
import transformers
import evaluate
from datasets import load_dataset, load_metric
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Load the four scoring modules, in the same order as before: f1, recall, accuracy, precision.
f1_metric, recall_metric, accuracy_metric, precision_metric = (
    evaluate.load(metric_name)
    for metric_name in ("f1", "recall", "accuracy", "precision")
)


def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each row is
            the argmax of ``logits`` over the last axis.

    Returns:
        dict: The merged outputs of the f1, recall, accuracy and precision metrics.
        F1, recall and precision are macro-averaged over the classes.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    results = {}
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    # zero_division=0 yields the same score the default setting falls back to when a class
    # has no predicted (precision) or no true (recall) samples, but without emitting an
    # undefined-metric warning on every evaluation.
    results.update(
        recall_metric.compute(
            predictions=predictions, references=labels, average="macro", zero_division=0
        )
    )
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(
        precision_metric.compute(
            predictions=predictions, references=labels, average="macro", zero_division=0
        )
    )

    return results

# Hyper-parameters for the run.
# One optimizer step consumes 8 (batch) * 2 (gradient accumulation) = 16 examples per device,
# so with the 98,208-row training split max_steps=49104 works out to 8 epochs.
training_args = transformers.TrainingArguments(
    # where checkpoints and logs are written
    output_dir="model_outputs",
    logging_dir="model_outputs",
    report_to="wandb",  # enable logging to W&B
    # optimisation schedule
    learning_rate=2e-4,
    warmup_steps=100,
    max_steps=49104,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    fp16=True,
    # evaluate, log and checkpoint on the same 4000-step cadence
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=4000,
    logging_steps=4000,
    save_steps=4000,
    per_device_eval_batch_size=32,
    # restore the checkpoint with the best "f1" once training finishes
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # pass every dataset column through to the model untouched
    remove_unused_columns=False,
)

trainer = transformers.Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Kept as the cell's last expression so the returned TrainOutput is displayed.
trainer.train()

Step,Training Loss,Validation Loss,F1,Recall,Accuracy,Precision
4000,2.0568,1.898691,0.131587,0.139869,0.495581,0.170082
8000,1.8702,1.795238,0.172111,0.173243,0.520873,0.204907


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss,Validation Loss,F1,Recall,Accuracy,Precision
4000,2.0568,1.898691,0.131587,0.139869,0.495581,0.170082
8000,1.8702,1.795238,0.172111,0.173243,0.520873,0.204907
12000,1.7818,1.739861,0.186763,0.193219,0.529915,0.23602
16000,1.656,1.684328,0.206615,0.215833,0.541441,0.249443
20000,1.6113,1.661528,0.216771,0.22731,0.548894,0.247211
24000,1.5428,1.655188,0.211382,0.221287,0.549872,0.248735
28000,1.475,1.626001,0.235068,0.238976,0.558425,0.2787
32000,1.4385,1.611011,0.235,0.234852,0.558425,0.271663
36000,1.3971,1.595749,0.245446,0.252106,0.564534,0.281493
40000,1.3306,1.590253,0.244853,0.249233,0.56486,0.275978


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

TrainOutput(global_step=49104, training_loss=1.556853218861689, metrics={'train_runtime': 29613.5662, 'train_samples_per_second': 26.531, 'train_steps_per_second': 1.658, 'total_flos': 7.395682772362199e+17, 'train_loss': 1.556853218861689, 'epoch': 8.0})