In [1]:
import torch
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model
import evaluate
from transformers import DataCollatorForSeq2Seq, AutoTokenizer, BitsAndBytesConfig, AutoModelForSeq2SeqLM

import matplotlib.pyplot as plt
from utils import prepare_prompt, print_trainable_parameters
import transformers
from tqdm import tqdm

import numpy as np
import json

## Dataset

In [2]:
dataset_raw = load_dataset('json', data_files='dataset_llm_generated.json')
dataset = dataset_raw.select_columns(["text", "summary"])
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 5000
    })
})


In [3]:

dataset['train'] = dataset['train'].filter(lambda x: len(x['text'].split()) <= 2500)
dataset['train'] = dataset['train'].filter(lambda x: len(x['text'].split()) >= 200)
dataset['train'] = dataset['train'].filter(lambda x: len(x['summary'].split()) >= 40)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 4426
    })
})


In [4]:
split_train_temp = dataset["train"].train_test_split(test_size=0.4, seed=42)

split_valid_test = split_train_temp["test"].train_test_split(test_size=0.5, seed=42)

dataset_split = DatasetDict({
    "train": split_train_temp["train"],        
    "validation": split_valid_test["train"],      
    "test": split_valid_test["test"]              
})

print(dataset_split)

DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 2655
    })
    validation: Dataset({
        features: ['text', 'summary'],
        num_rows: 885
    })
    test: Dataset({
        features: ['text', 'summary'],
        num_rows: 886
    })
})


## Fine Tune

In [5]:
cache_dir = "/Data/gabriel-mercier/slm_models"

tokenizer = AutoTokenizer.from_pretrained("google/mt5-base", cache_dir=cache_dir)

bnb_config = BitsAndBytesConfig(load_in_4bit=True, 
                                bnb_4bit_use_double_quant=True,
                                bnb_4bit_compute_dtype=torch.bfloat16,
                                bnb_4bit_quant_type='nf4',
                            )
model_raw = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base", 
                                              cache_dir=cache_dir,
                                              trust_remote_code=True,
                                              quantization_config=bnb_config,
                                              device_map="auto")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
lora_finetune = True

if lora_finetune:
    lora_config = LoraConfig(r=16, 
                            lora_alpha=32,
                            target_modules=["q", "k", "v", "o"],
                            lora_dropout=0.05,
                            bias='none',
                            task_type="SEQ_2_SEQ_LM")

    model = get_peft_model(model_raw, lora_config)
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


In [7]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id
generation_config.do_sample = True

print_trainable_parameters(model)

trainable params: 3538944 || all params: 505724160 || trainable%: 0.6997775229880258


### Example


In [8]:
assistant_start = "Résumé concis et structuré (100 mots maximum) :"
summary_data = dataset_split['train'][1]['summary']
prompt = prepare_prompt(dataset_split['train'][1], summary_included=False)
print('=== PROMPT ===')
print(prompt)

encoding = tokenizer(prompt, return_tensors="pt").to(device)

with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

start_index = prediction.find(assistant_start)
if start_index != -1:
    response_start = start_index + len(assistant_start)
else:
    response_start = -1 

print("=== GENERATED SUMMARY ===")
print(prediction[response_start+1:])
print(len(prediction[response_start+1:].split()))

print("=== LABEL SUMMARY ===")
print(summary_data)
print(len(summary_data.split()))

=== PROMPT ===
Résume précisément le texte suivant en français en 100 mots maximum. Concentre-toi sur les points essentiels sans ajouter d'opinions ni de commentaires. Évite les phrases inutiles et reformule les idées clairement.

Texte :
Le 24 août 1991, des Moscovites accompagnent les dépouilles des trois victimes du putsch, tuées trois jours plus tôt. Gueorgui Pinkhassov/Magnum C’était encore le temps des vacances, du repos dans les datchas. Le major-colonel du KGB Valeri Chiriaïev dormait dans la sienne, une bicoque sans eau, quelque part dans un trou perdu de la région de Iaroslav, quand la « boîte » fixée au mur s’est mise à grésiller. « On ne savait même plus si ce machin marchait encore », raconte-t-il. L’antique haut-parleur avait été installé dans toutes les demeures soviétiques pour prévenir d’une guerre nucléaire. Le message contenait aussi un code secret pour toutes les forces de sécurité. Et ce matin du 19 août 1991, il y a vingt-cinq ans, Valeri Chiriaïev découvre, stupé

In [9]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    labels = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=128)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [10]:
dataset_train = dataset_split["train"].map(preprocess_function)
dataset_val = dataset_split["validation"].map(preprocess_function)

dataset_train = dataset_train.remove_columns(["text", "summary"])
dataset_val = dataset_val.remove_columns(["text", "summary"])

print(dataset_train)
print(dataset_val)

Map:   0%|          | 0/2655 [00:00<?, ? examples/s]

Map:   0%|          | 0/885 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2655
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 885
})


In [11]:

# Create a data collator for seq2seq tasks
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [33]:
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    bf16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir="./mt5_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=dataset_train.select(range(50)),
    eval_dataset=dataset_val.select(range(10)),
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.7602,2.792951
2,4.4456,2.770083


TrainOutput(global_step=36, training_loss=3.6252304216225943, metrics={'train_runtime': 35.0735, 'train_samples_per_second': 4.277, 'train_steps_per_second': 1.026, 'total_flos': 42347121868800.0, 'train_loss': 3.6252304216225943, 'epoch': 2.8})

In [22]:
trainer.save_model("./encoder_decoder")

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained("./autoregressive_model")
model.to(device)

## Evaluation

In [14]:
assistant_start = "Résumé concis et structuré (100 mots maximum) :"
summary_data = dataset_split['train'][1]['summary']
prompt = prepare_prompt(dataset_split['train'][1], summary_included=False)
print('=== PROMPT ===')
print(prompt)

encoding = tokenizer(prompt, return_tensors="pt").to(device)

with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

start_index = prediction.find(assistant_start)
if start_index != -1:
    response_start = start_index + len(assistant_start)
else:
    response_start = -1 

print("=== GENERATED SUMMARY ===")
print(prediction[response_start+1:])
print(len(prediction[response_start+1:].split()))

print("=== LABEL SUMMARY ===")
print(summary_data)
print(len(summary_data.split()))

=== PROMPT ===
Résume précisément le texte suivant en français en 100 mots maximum. Concentre-toi sur les points essentiels sans ajouter d'opinions ni de commentaires. Évite les phrases inutiles et reformule les idées clairement.

Texte :
Le 24 août 1991, des Moscovites accompagnent les dépouilles des trois victimes du putsch, tuées trois jours plus tôt. Gueorgui Pinkhassov/Magnum C’était encore le temps des vacances, du repos dans les datchas. Le major-colonel du KGB Valeri Chiriaïev dormait dans la sienne, une bicoque sans eau, quelque part dans un trou perdu de la région de Iaroslav, quand la « boîte » fixée au mur s’est mise à grésiller. « On ne savait même plus si ce machin marchait encore », raconte-t-il. L’antique haut-parleur avait été installé dans toutes les demeures soviétiques pour prévenir d’une guerre nucléaire. Le message contenait aussi un code secret pour toutes les forces de sécurité. Et ce matin du 19 août 1991, il y a vingt-cinq ans, Valeri Chiriaïev découvre, stupé

In [15]:
dataset_test = dataset_split['test']

In [16]:
rouge = evaluate.load("rouge")
bert_score = evaluate.load("bertscore")


In [17]:
def evaluate_model(model, dataset):
    summaries = [data_point['summary'] for data_point in dataset]
    predictions = []

    for data_point in tqdm(dataset):
        prompt = prepare_prompt(data_point, summary_included=False)
        encoding = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.inference_mode():
            output = model.generate(
                input_ids=encoding.input_ids,
                attention_mask=encoding.attention_mask,
                generation_config=generation_config,
            )
            
        prediction = tokenizer.decode(output[0], skip_special_tokens=True)
        response_start = prediction.find(assistant_start)
        predictions.append(prediction[response_start:])

    rouge_results = rouge.compute(predictions=predictions, references=summaries)
    bert_results = bert_score.compute(predictions=predictions, references=summaries, lang="fr")

    bert_precision = np.mean(bert_results['precision'])
    bert_recall = np.mean(bert_results['recall'])
    bert_f1 = np.mean(bert_results['f1'])

    print(f"BERTScore - Precision: {bert_precision:.4f}, Recall: {bert_recall:.4f}, F1: {bert_f1:.4f}")
    print(f"ROUGEScores - {rouge_results}")
    print('\n')
    
    return rouge_results, {'Precision':bert_precision, 'Recall':bert_recall, 'F1':bert_f1}


In [18]:
dataset_test_1 = dataset_test.select(range(10))

In [19]:
rouges_results_finetune, bert_results_finetune = evaluate_model(model, dataset_test_1)

100%|██████████| 10/10 [00:23<00:00,  2.40s/it]


BERTScore - Precision: 0.5700, Recall: 0.4051, F1: 0.4732
ROUGEScores - {'rouge1': np.float64(0.0), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.0), 'rougeLsum': np.float64(0.0)}






In [20]:
results_finetune = {
    "rouge": rouges_results_finetune,
    "bert": bert_results_finetune
}

with open("mt5_evaluation_results_finetune.json", "w") as f:
    json.dump(results_finetune, f, indent=4)


In [21]:
rouges_results_raw, bert_results_raw = evaluate_model(model_raw, dataset_test_1)

results_raw = {
    "rouge": rouges_results_raw,
    "bert": bert_results_raw
}

with open("mt5_evaluation_results_raw.json", "w") as f:
    json.dump(results_raw, f, indent=4)

100%|██████████| 10/10 [00:20<00:00,  2.04s/it]

BERTScore - Precision: 0.6403, Recall: 0.4503, F1: 0.5285
ROUGEScores - {'rouge1': np.float64(0.0), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.0), 'rougeLsum': np.float64(0.0)}





