# Fine-Tuning LLMs: Use Case Examples

https://pub.towardsai.net/fine-tuning-llms-use-case-examples-2042d924c5b2

Install requirements

In [None]:
!pip3 install torch --index-url https://download.pytorch.org/whl/cu128
!pip install -q transformers peft accelerate bitsandbytes datasets

## Machine Translation

In [1]:
from datasets import load_dataset

# Language pair: selects the dataset configuration below and is reused later in
# the notebook to build the output directory name of the fine-tuned model.
source_lang = "en"
target_lan = "fr"
# Load the "kde4" dataset for the chosen language pair.
# NOTE(review): trust_remote_code=True lets the dataset's loading script run
# locally — confirm the source is trusted before executing this cell.
dataset = load_dataset(
    "kde4", 
    lang1=source_lang, 
    lang2=target_lan, 
    trust_remote_code=True
)

# Quick inspection of what was loaded: container type, splits, column names.
print(f"DATASET TYPE: {type(dataset)}")
print(f"DATASET INFO: {dataset.items()}")
print(f"DATASET COL NAMES: {dataset.column_names}")

  from .autonotebook import tqdm as notebook_tqdm


DATASET TYPE: <class 'datasets.dataset_dict.DatasetDict'>
DATASET INFO: dict_items([('train', Dataset({
    features: ['id', 'translation'],
    num_rows: 210173
}))])
DATASET COL NAMES: {'train': ['id', 'translation']}


In [25]:
# Take three consecutive examples (indexes 10, 11 and 12) to inspect the record format.
dataset["train"][10:13]

{'id': ['10', '11', '12'],
 'translation': [{'en': 'translate', 'fr': 'traduction'},
  {'en': 'The Babel & konqueror; plugin',
   'fr': 'Le module externe Babel pour & konqueror;'},
  {'en': 'Using the Babelfish plugin',
   'fr': 'Utilisation du module externe Babelfish'}]}

In [2]:
# Keep 90% of the examples for training and hold out the rest; the seed is fixed
# so that re-running the cell yields the same split.
split_datasets = dataset["train"].train_test_split(train_size=0.9, seed=20)
# train_test_split names the held-out part "test"; re-key it as "validation".
split_datasets["validation"] = split_datasets.pop("test")
# Last expression: display the resulting splits and their sizes.
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [3]:
from transformers import AutoTokenizer

# Checkpoint used as the starting point for the translation fine-tuning.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"

# NOTE(review): return_tensors is normally an argument of the tokenizer *call*
# (as in the inference cells of this notebook), not of from_pretrained — it
# looks like a no-op here; confirm and consider dropping it.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")



In [4]:
max_length: int = 128


def preprocess_function(examples):
    """Tokenize one batch of English/French translation pairs.

    Args:
        examples: Batch as passed by ``Dataset.map(batched=True)``; its
            ``"translation"`` entry is a list of dicts with ``"en"`` and
            ``"fr"`` keys.

    Returns:
        The tokenizer output for the batch: the English texts are encoded as
        model inputs and the French texts (``text_target``) as labels.
    """
    sources, references = [], []
    for pair in examples["translation"]:
        sources.append(pair["en"])
        references.append(pair["fr"])

    # Sequences longer than `max_length` tokens are cut; shorter ones are left as is.
    return tokenizer(
        sources,
        text_target=references,
        max_length=max_length,
        truncation=True,
    )

# Map preprocess function to every dataset (train/validation)
tokenized_datasets = split_datasets.map(
    preprocess_function,
    # batched=True: preprocess_function is called with a batch of examples
    # (column name -> list of values) instead of one example at a time.
    batched=True,
    remove_columns=split_datasets["train"].column_names, # Remove additional columns (try to run map function with and without remove_columns param to explore the results).
)

# Last expression: display the tokenized splits and their columns.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21018
    })
})

In [5]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM, Seq2SeqTrainer


# Starting checkpoint for fine-tuning (same value as in the tokenizer cell).
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) 

# Data collators are objects that will form a batch by using a list of dataset
# elements as input. These elements are of the same type as the elements of 
# train_dataset or eval_dataset.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

# Create an object for setting training arguments.
model_args = Seq2SeqTrainingArguments(
    f"{model_checkpoint}-finetuned-{source_lang}-to-{target_lan}",  # Output directory; doubles as the finetuned model name.
    eval_strategy="epoch",    # Defines when to run evaluation (here: once per epoch).
    learning_rate=2e-4,             # Learning rate.
    per_device_train_batch_size=8,  # Batch size per GPU for training if GPU is available, else per CPU core.
    per_device_eval_batch_size=8,   # Same thing but for evaluation.
    weight_decay=0.02,              # The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in AdamW optimizer.
    save_total_limit=3,             # Limits total amount of checkpoints. In this case, after training you will have only three checkpoints.
    num_train_epochs=3,             # Total number of training epochs.
    predict_with_generate=True      # Whether to use generate to calculate generative metrics like ROUGE or BLEU.
)

# Create an object for training the LLM.
# NOTE(review): the cell output echoes the `trainer = Seq2SeqTrainer(` line the
# way a Python warning does — possibly a deprecation of `tokenizer=` in favour
# of `processing_class=`; confirm against the installed transformers version.
trainer = Seq2SeqTrainer(
    model,
    model_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics  # This is a function that is used for computing metrics available at the github repo.
)

# This will start the training process. Be patient :)
# Without compute_metrics only the training/validation loss is reported per epoch.
trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,1.3399,1.292833
2,1.0288,1.048172
3,0.7249,0.927809




TrainOutput(global_step=70935, training_loss=1.1670829886419456, metrics={'train_runtime': 9808.588, 'train_samples_per_second': 57.854, 'train_steps_per_second': 7.232, 'total_flos': 6060553336848384.0, 'train_loss': 1.1670829886419456, 'epoch': 3.0})

In [6]:
# Path to the checkpoint written by the training run above: the output
# directory followed by "checkpoint-<global_step>" (70935 in the recorded run).
# Your model is probably saved to the same folder from which you ran the training.
my_model_checkpoint = "Helsinki-NLP/opus-mt-en-fr-finetuned-en-to-fr/checkpoint-70935"
my_model = AutoModelForSeq2SeqLM.from_pretrained(my_model_checkpoint)
# Rebinds the notebook-level `tokenizer` to the one stored with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(my_model_checkpoint, return_tensors="pt")

# Translate a single sentence: tokenize to PyTorch tensors, generate, decode.
text = "Hello, my name is Erick!"
tokenized_text = tokenizer(text, return_tensors="pt")
result = my_model.generate(**tokenized_text)
# result[0] is the first (and only) generated sequence; special tokens are dropped.
print(tokenizer.decode(result[0], skip_special_tokens=True))



Bonjour, mon nom est Erick & #160;!


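The translation above contains a stray `& #160;` artifact, most likely an HTML non-breaking-space entity (`&#160;`) leaking from the training data. To solve this issue, we need to clean the data before tokenizing it: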

In [7]:
# Function for cleaning the input texts for the training.
import re

def clean_text(text: str) -> str:
    """Remove ``&#160;`` HTML entities from ``text``.

    An opening guillemet (``«``) that directly precedes the entity is removed
    together with it, exactly as before; the entity is now also removed when
    it appears on its own (e.g. before ``!``), which is the artifact visible
    in the translation printed earlier in this notebook.

    Args:
        text: Raw source or target sentence.

    Returns:
        The sentence with every ``&#160;`` (and a directly preceding ``«``)
        stripped; any other text is returned unchanged.
    """
    return re.sub(r'«?&#160;', '', text) # You can define your expression/s here.

def preprocess_function(examples):
    """Clean and tokenize one batch of English/French translation pairs.

    Args:
        examples: Batch whose ``"translation"`` entry is a list of dicts with
            ``"en"`` and ``"fr"`` keys.

    Returns:
        The tokenizer output for the batch, with the cleaned English texts as
        inputs and the cleaned French texts (``text_target``) as labels.
    """
    pairs = examples["translation"]
    cleaned_sources = list(map(clean_text, (pair["en"] for pair in pairs)))
    cleaned_references = list(map(clean_text, (pair["fr"] for pair in pairs)))

    encoded = tokenizer(
        cleaned_sources,
        text_target=cleaned_references,
        max_length=max_length,
        truncation=True,
    )
    return encoded

## Text Classification

In [8]:
# Load the "imdb" dataset. This rebinds the notebook-level `dataset` name that
# previously held the translation data.
# NOTE(review): trust_remote_code=True lets the dataset's loading script run
# locally — confirm the source is trusted before executing this cell.
dataset = load_dataset("imdb", trust_remote_code=True)
# Drop the "unsupervised" split in place; only "train" and "test" are used below.
dataset.pop("unsupervised")

# Quick inspection: container type, splits, column names and one raw example.
print(f"DATASET TYPE: {type(dataset)}")
print(f"DATASET INFO: {dataset.items()}")
print(f"DATASET COL NAMES: {dataset.column_names}")
print(f"DATASET EXAMPLE: \n {dataset['train'][2]}")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 165930.20 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 531867.11 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 478410.80 examples/s]


DATASET TYPE: <class 'datasets.dataset_dict.DatasetDict'>
DATASET INFO: dict_items([('train', Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})), ('test', Dataset({
    features: ['text', 'label'],
    num_rows: 25000
}))])
DATASET COL NAMES: {'train': ['text', 'label'], 'test': ['text', 'label']}
DATASET EXAMPLE: 
 {'text': "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />", 'label': 0}


As you can observe from the output above, once again we have a dataset with many HTML tags like “<br /><br />”. We will create a `clear_text` function to remove those tags and run the tokenization process:

In [9]:
from datasets import Dataset

def clear_text(text: str) -> str:
    """Return ``text`` with every ``<...>`` markup tag (e.g. ``<br />``) removed.

    A tag is an opening ``<``, at least one character other than ``<``, and the
    nearest closing ``>``; matched tags are replaced with the empty string.
    """
    tag_pattern = re.compile('<[^<]+?>')
    return tag_pattern.sub('', text)

def preprocess_function(examples):
    """Strip markup tags from a batch of reviews and tokenize them.

    Args:
        examples: Batch whose ``"text"`` entry is a list of raw review strings.

    Returns:
        The tokenizer output for the cleaned reviews, with truncation enabled.
    """
    stripped_reviews = list(map(clear_text, examples["text"]))
    return tokenizer(stripped_reviews, truncation=True)

# Switch to the classification checkpoint. This rebinds the notebook-level
# `model_checkpoint`, `tokenizer` and `tokenized_datasets` names that the
# translation section used.
model_checkpoint: str = "distilbert-base-uncased"
# NOTE(review): return_tensors is normally an argument of the tokenizer *call*,
# not of from_pretrained — it looks like a no-op here; confirm.
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
# Tokenize both splits batch by batch. No columns are removed, so the original
# "text" and "label" columns are kept next to the tokenizer output.
# NOTE(review): `dataset` was printed as a DatasetDict in the previous cell, so
# the `Dataset` annotation on the mapped result looks inaccurate — confirm.
tokenized_datasets: Dataset = dataset.map(
    preprocess_function,
    batched=True,
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 25000/25000 [00:04<00:00, 6125.40 examples/s]
Map: 100%|██████████| 25000/25000 [00:04<00:00, 5698.69 examples/s]


In [10]:
# Human-readable names for the two class ids; label2id is the inverse lookup.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: class_id for class_id, name in id2label.items()}

In [13]:
%%time

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

# We will train 5 models with different parameters to find the best one.
finetuned_ids: list[int] = [0, 1, 2, 3, 4]
learning_rates: list[float] = [2e-3, 2e-4, 2e-5, 2e-7, 2e-9]
n_epochs: list[int] = [2, 3, 3, 5, 7]

model_checkpoint: str = "distilbert-base-uncased"

for i, (lr, epoch) in enumerate(zip(learning_rates, n_epochs)):
    
    print(f"ID: {i}, LEARNING_RATE: {lr}, N_EPOCHS: {epoch}")

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, 
        num_labels=2,        # We have only two labels, 0 and 1. 
        id2label=id2label, 
        label2id=label2id
    )    

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

    data_collator: DataCollatorWithPadding = DataCollatorWithPadding(tokenizer=tokenizer)
    
    training_args = TrainingArguments(
        output_dir=f"{model_checkpoint}-finetuned-CLASSIFICATION-{i}",
        learning_rate=lr,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=epoch,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True, 
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        #compute_metrics=compute_metrics,
    )
    
    trainer.train()

ID: 0, LEARNING_RATE: 0.002, N_EPOCHS: 2


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6934,0.693151
2,0.6929,0.693202


ID: 1, LEARNING_RATE: 0.0002, N_EPOCHS: 3


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6034,0.645581
2,0.6927,0.694243
3,0.6909,0.692175


ID: 2, LEARNING_RATE: 2e-05, N_EPOCHS: 3


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2647,0.273405
2,0.1592,0.251801
3,0.0905,0.317244


ID: 3, LEARNING_RATE: 2e-07, N_EPOCHS: 5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.598,0.549153
2,0.3677,0.33761
3,0.3058,0.295876
4,0.2885,0.28589
5,0.2784,0.283783


ID: 4, LEARNING_RATE: 2e-09, N_EPOCHS: 7


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.7034,0.701709


KeyboardInterrupt: 

Execution was interrupted manually, since the best results (lowest validation loss, 0.2518 after epoch 2) were obtained with `ID: 2, LEARNING_RATE: 2e-05, N_EPOCHS: 3`

In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Checkpoint directory written by run ID 2 of the sweep above.
# NOTE(review): with 25000 training examples and a batch size of 8 on a single
# device, step 9375 would be the end of epoch 3, whose validation loss (0.317)
# is higher than epoch 2's (0.252) — an earlier checkpoint may be the better
# choice; confirm which checkpoints exist on disk.
my_checkpoint: str = "distilbert-base-uncased-finetuned-CLASSIFICATION-2/checkpoint-9375"

# Load the tokenizer stored with the checkpoint and encode one review as PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained(my_checkpoint)
text = "This was great movie!"
inputs = tokenizer(text, return_tensors="pt")


model = AutoModelForSequenceClassification.from_pretrained(my_checkpoint)
# Inference only: no gradients are needed.
with torch.no_grad():
    logits = model(**inputs).logits
    
# argmax() without a dim works on the flattened logits, which is fine for the
# single input used here; the id is then mapped to its label name.
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'POSITIVE'