# Fine-Tuning LLMs: Use Case Examples

https://pub.towardsai.net/fine-tuning-llms-use-case-examples-2042d924c5b2

Install requirements

In [None]:
!pip3 install torch --index-url https://download.pytorch.org/whl/cu128
!pip install -q transformers peft accelerate bitsandbytes datasets

## Machine Translation

In [1]:
from datasets import load_dataset

# Language pair for the translation task; both codes are handed to the loader
# as lang1 / lang2 and reused later when naming the fine-tuned model.
source_lang, target_lan = "en", "fr"

# Load the "kde4" corpus for that pair. trust_remote_code=True is passed so the
# dataset's own loading code is allowed to run.
dataset = load_dataset("kde4", lang1=source_lang, lang2=target_lan, trust_remote_code=True)

# Quick inspection: container type, available splits, and the columns per split.
print(f"DATASET TYPE: {type(dataset)}")
print(f"DATASET INFO: {dataset.items()}")
print(f"DATASET COL NAMES: {dataset.column_names}")

  from .autonotebook import tqdm as notebook_tqdm


DATASET TYPE: <class 'datasets.dataset_dict.DatasetDict'>
DATASET INFO: dict_items([('train', Dataset({
    features: ['id', 'translation'],
    num_rows: 210173
}))])
DATASET COL NAMES: {'train': ['id', 'translation']}


In [25]:
# Peek at three consecutive examples (indexes 10, 11 and 12). This is a fixed
# slice, not a random sample, so the same rows are shown on every run.
dataset["train"][10:13]

{'id': ['10', '11', '12'],
 'translation': [{'en': 'translate', 'fr': 'traduction'},
  {'en': 'The Babel & konqueror; plugin',
   'fr': 'Le module externe Babel pour & konqueror;'},
  {'en': 'Using the Babelfish plugin',
   'fr': 'Utilisation du module externe Babelfish'}]}

In [2]:
# Carve the single "train" split into 90% training / 10% held-out data.
# The fixed seed makes the shuffle (and therefore the split) reproducible.
split_datasets = dataset["train"].train_test_split(seed=20, train_size=0.9)

# train_test_split names the held-out part "test"; re-register it under
# "validation", which is the key the training cell reads.
split_datasets["validation"] = split_datasets.pop("test")

split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [3]:
from transformers import AutoTokenizer

# Pretrained English-to-French translation checkpoint used as the starting point.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"

# Load the tokenizer that belongs to the checkpoint. `return_tensors` is an
# argument of the tokenizer *call* (tokenizer(text, return_tensors="pt")), not of
# `from_pretrained`, so it is intentionally not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [4]:
# Maximum number of tokens kept per source / target text; longer texts are truncated.
max_length: int = 128


def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: A batch from the dataset whose "translation" entry is a list
            of dicts holding the "en" and "fr" versions of each sentence.

    Returns:
        The tokenizer output for the batch: the English texts are encoded as
        the model inputs and the French texts, passed as ``text_target``, are
        encoded as the labels.
    """
    pairs = examples["translation"]
    return tokenizer(
        [pair["en"] for pair in pairs],               # Source texts (model inputs).
        text_target=[pair["fr"] for pair in pairs],   # Target texts (become "labels").
        max_length=max_length,
        truncation=True,                              # Cut anything longer than max_length.
    )

# The raw columns are dropped after tokenization so that only the tokenizer
# output remains (try running map with and without remove_columns to compare).
raw_column_names = split_datasets["train"].column_names

# Apply preprocess_function to every split (train / validation), one batch of
# examples per call.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_column_names,
)

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21018
    })
})

In [5]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM, Seq2SeqTrainer


# Same checkpoint the tokenizer was loaded from; the model weights start here.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Data collators are objects that form a batch from a list of dataset elements.
# These elements are of the same type as the elements of train_dataset or
# eval_dataset.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

# Training configuration.
model_args = Seq2SeqTrainingArguments(
    output_dir=f"{model_checkpoint}-finetuned-{source_lang}-to-{target_lan}",  # Where checkpoints are written.
    eval_strategy="epoch",          # Run evaluation at the end of every epoch.
    learning_rate=2e-4,             # Learning rate.
    per_device_train_batch_size=8,  # Training batch size per device.
    per_device_eval_batch_size=8,   # Evaluation batch size per device.
    weight_decay=0.02,              # Weight decay applied to all layers except bias and LayerNorm weights in AdamW.
    save_total_limit=3,             # Keep at most three checkpoints on disk; older ones are deleted.
    num_train_epochs=3,             # Total number of training epochs.
    predict_with_generate=True,     # Use generate() during evaluation so generative metrics (ROUGE, BLEU) can be computed.
)

# Build the trainer. The tokenizer is passed as `processing_class`; the older
# `tokenizer=` keyword is deprecated and triggers a warning on this call.
# To report translation metrics, additionally pass a `compute_metrics` function
# (one is available in the GitHub repo accompanying the article).
trainer = Seq2SeqTrainer(
    model=model,
    args=model_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

# This will start the training process. Be patient :)
trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,1.3399,1.292833
2,1.0288,1.048172
3,0.7249,0.927809




TrainOutput(global_step=70935, training_loss=1.1670829886419456, metrics={'train_runtime': 9808.588, 'train_samples_per_second': 57.854, 'train_steps_per_second': 7.232, 'total_flos': 6060553336848384.0, 'train_loss': 1.1670829886419456, 'epoch': 3.0})

In [6]:
# my_model_checkpoint is the path to the fine-tuned model: the training output
# directory plus the "checkpoint-<global_step>" folder of the last saved step.
# It is relative to the folder from which the training was run; adjust it if
# your run saved the model elsewhere or stopped at a different step.
my_model_checkpoint = "Helsinki-NLP/opus-mt-en-fr-finetuned-en-to-fr/checkpoint-70935"
my_model = AutoModelForSeq2SeqLM.from_pretrained(my_model_checkpoint)
# `return_tensors` belongs to the tokenizer call below, not to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(my_model_checkpoint)

# Translate one sentence: tokenize to PyTorch tensors, generate, decode.
text = "Hello, my name is Erick!"
tokenized_text = tokenizer(text, return_tensors="pt")
result = my_model.generate(**tokenized_text)
print(tokenizer.decode(result[0], skip_special_tokens=True))



Bonjour, mon nom est Erick & #160;!


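The translation above contains a stray `&#160;` sequence: an HTML entity for a non-breaking space that the model learned from the training data. To solve this issue, we need to clean the data before tokenizing it: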

In [7]:
# Functions for cleaning the input texts before training.
import re

# Matches the HTML non-breaking-space entity "&#160;", together with an
# immediately preceding "Â«" when there is one (the sequence the original
# pattern targeted). Extend this expression to strip further artifacts.
_NBSP_ENTITY_RE = re.compile(r"(?:Â«)?&#160;")


def clean_text(text: str) -> str:
    """Remove "&#160;" artifacts from a text.

    The previous pattern only matched the exact sequence "Â«&#160;", so a
    standalone entity such as the one in "Erick&#160;!" was left in place.
    Every "&#160;" is now removed, with or without the leading "Â«".

    Args:
        text: The raw source or target sentence.

    Returns:
        The text with every match deleted; all other characters are unchanged.
    """
    return _NBSP_ENTITY_RE.sub("", text)

def preprocess_function(examples):
    """Clean and tokenize a batch of translation pairs.

    Replaces the earlier definition of the same name: each "en" / "fr" text is
    passed through clean_text before tokenization.

    Args:
        examples: A batch whose "translation" entry is a list of dicts with
            "en" and "fr" texts.

    Returns:
        The tokenizer output for the cleaned English texts, with the cleaned
        French texts supplied as ``text_target``.
    """
    source_texts, target_texts = [], []
    for pair in examples["translation"]:
        source_texts.append(clean_text(pair["en"]))
        target_texts.append(clean_text(pair["fr"]))

    model_inputs = tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )
    return model_inputs