In [1]:
from datasets import load_dataset

import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

from peft import LoraConfig, get_peft_model

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")
nltk.download('stopwords')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
crop=0.1
num_train_epochs = 1
r=8

batch_size = 2

max_input_length = 1024
max_target_length = 64
model_checkpoint = "bigscience/mt0-large"



model_name = f"{model_checkpoint}-finetune"
out_dir = f"/Data/{model_name}"
rouge_score = evaluate.load("rouge")



2024-03-15 17:55:47.143899: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-15 17:55:47.143931: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-15 17:55:47.144524: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-15 17:55:47.148148: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to
[nltk_data] 

In [2]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map="auto", torch_dtype=torch.float16)

In [3]:
dataset = load_dataset('csv', data_files={'train': '../data/train.csv', 'validation': '../data/validation.csv'})
train_sample_size = int(crop * len(dataset['train']))
validation_sample_size = int(crop * len(dataset['validation']))
dataset['train'] = dataset['train'].shuffle().select(range(train_sample_size))
dataset['validation'] = dataset['validation'].shuffle().select(range(validation_sample_size))

def prompt(text):
    return f"Créez un titre: \n{text}"

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def preprocess_function(examples):
    model_inputs = tokenizer(
        [prompt(text) for text in examples["text"]],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["titles"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2140 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [4]:
import re
pattern = r'\((\w+)\): Linear'
linear_layers = re.findall(pattern, str(model.modules))
target_modules = list(set(linear_layers))

lora_config = LoraConfig(
    r=r,
    target_modules = target_modules,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 11151360 || all params: 1240732672 || trainable%: 0.8987721732212111


In [8]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {k: round(v, 4) for k, v in result.items()}
    print(result)
    return result

args = Seq2SeqTrainingArguments(
    output_dir=out_dir,
    evaluation_strategy="steps",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=100
)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [6]:
trainer.evaluate()



{'rouge1': 0.2132, 'rouge2': 0.0829, 'rougeL': 0.1734, 'rougeLsum': 0.1792}


{'eval_loss': 1.8427734375,
 'eval_rouge1': 0.2132,
 'eval_rouge2': 0.0829,
 'eval_rougeL': 0.1734,
 'eval_rougeLsum': 0.1792,
 'eval_runtime': 77.1402,
 'eval_samples_per_second': 1.945,
 'eval_steps_per_second': 0.972}

In [9]:
trainer.train()

Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 62.00 MiB. GPU 0 has a total capacity of 23.66 GiB of which 25.00 MiB is free. Including non-PyTorch memory, this process has 23.52 GiB memory in use. Of the allocated memory 23.16 GiB is allocated by PyTorch, and 104.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def predict(text, model):
    tokens = tokenizer(text, return_tensors='pt').to(device)
    output_tokens=model.generate(**tokens, max_new_tokens=max_target_length)
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

def print_summary(idx, model):
    review = dataset["validation"][idx]["text"]
    title = dataset["validation"][idx]["titles"]
    text = prompt(dataset["validation"][idx]['text'])
    summary = predict(text, model)
    print(f"'>>> Text: {review}'")
    print(f"\n>>> Title: {title}")
    print(f"\n>>> predicted: {summary}")

print_summary(1, model)

'>>> Text: Carlos Ghosn a clamé son "innocence" dans une vidéo diffusée par ses avocats mardi 9 avril. Celle-ci avait été enregistrée juste avant son arrestation le 4 avril sur de nouveaux soupçons de malversations financières. Sur ces images, Carlos Ghosn accuse les dirigeants de Nissan de trahison. Toutefois, l'ancien PDG de Renault-Nissan, ne cite aucun nom. Les noms des responsables désignés par Carlos Ghosn ont été coupés au montage, sur demande de ses avocats. "Ce n'est pas une histoire de cupidité, de dictature d'un homme. C'est une histoire de complot, de conspiration, de trahison", a-t-il déclaré dans ce message.Un recours devant la Cour suprême Pour Carlos Ghosn, les dirigeants de Nissan ont craint la perte d'autonomie du constructeur japonais. "On peut se demander pourquoi un tel complot a eu lieu. Et bien, parce qu'il y a eu de la peur liée à la convergence, à une alliance, à une fusion plus profonde. Certaines personnes ont eu peur par rapport à l'autonomie de Nissan". Son