# Imports et installation de bibliothèques necéssaires au projet

In [1]:
%pip install accelerate -U
%pip install datasets evaluate transformers transformers[torch] torch torcheval torchmetrics mosaicml[nlp]
%pip install mlflow wandb pyngrok



In [2]:
import re
import os
import glob
import evaluate
import numpy as np
import pandas as pd
from google.colab import drive
from tqdm.notebook import trange, tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## imports pour le suivi d'expériences
import mlflow
from mlflow import MlflowClient

import wandb

from pyngrok import ngrok


## imports venant de torch
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LinearLR


## imports venant de tranformers
import transformers
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import AutoModelForSeq2SeqLM
from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification


## imports venant de datasets
import datasets
from datasets import load_dataset, load_metric
from datasets import Dataset, DatasetDict

## imports venant de mosaic ml
from composer import Trainer
from composer.core import Callback
from composer.loggers import WandBLogger
from composer.models import HuggingFaceModel
from composer.algorithms import GradientClipping
from composer.optim import LinearWithWarmupScheduler
from composer.metrics import CrossEntropy, LanguageCrossEntropy
from composer.callbacks import LRMonitor, CheckpointSaver, EarlyStopper, OOMObserver

In [3]:
## Informations sur les cpu et gpu
from multiprocessing import cpu_count

torch.cuda.empty_cache()

print(torch.cuda.device_count())      # GPU
print(cpu_count())                    # CPU

1
12


In [4]:
## Variables d'environnement pour accéder aux différentes APIs

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

# Connexion aux différents outils de monitoring, etc

In [5]:
## MlFlow via ngrok

# ngrok.kill()
# NGROK_AUTH_TOKEN = "2ixcAblHEmYTRtDyUOxZBO8nR2p_3Zq8P9bXN4wTBwLTuB23A"
# ngrok.set_auth_token(NGROK_AUTH_TOKEN)
# public_url = ngrok.connect(addr="5000", proto="http", bind_tls=True)
# get_ipython().system_raw("mlflow ui --port 5000 &")
# print("MLflow Tracking UI:", public_url)

# client = MlflowClient(tracking_uri="http://127.0.0.1:5000")
# experiment_desc = "Poem generation"
# experiment_tags = {
#     "team_lead": "Emeline",
#     "department": "dst",
#     "project": "poem_gen",
#     "mlflow.note.content": experiment_desc
# }

# client.create_experiment("Poem Generation Project", tags=experiment_tags)

In [6]:
## WandD login
wandb.init(project="poem_gen_ft")

[34m[1mwandb[0m: Currently logged in as: [33memeline-caruana[0m ([33mcaruana[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Initialisation des variables pour le modèle

In [7]:
## Récupération du modèle à fine-tune (checkpoint)
checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
# model = T5ForConditionalGeneration.from_pretrained(checkpoint, do_sample=True)

datacollator = DataCollatorWithPadding(tokenizer = tokenizer)

### Petit test du modèle avant Fine-tuning

In [8]:
## Définir les paramètres de génération
max_length = 128
num_beams = 4
temperature = 0.1

## Définir le thème ou le style du poème
theme = "Can you write a poem about dogs"

## Préparatin de l'input
encoding = tokenizer.encode_plus(theme,
                                 add_special_tokens=True,
                                 max_length=max_length,
                                 padding='max_length',
                                 truncation=True,
                                 return_attention_mask=True,
                                 return_tensors='pt')

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

## Générer le poème
output = model.generate(input_ids,
                        attention_mask=attention_mask,
                        max_length=max_length,
                        num_beams=num_beams,
                        temperature=temperature)

## Afficher le poème généré
print(tokenizer.decode(output[0], skip_special_tokens=True))



i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love


# Récupération du dataset

In [9]:
## Récupération des fichiers via le drive
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/projet/poems_dataset'

Mounted at /content/drive


In [9]:
## Importation des données sous forme de fichier .json
df = pd.read_json(r"/content/drive/MyDrive/projet/poems_dataset_data_v2.json")

In [10]:
display(df)

Unnamed: 0,path,type,topic,text
0,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My love is pure as honey, made of selective ne..."
1,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,The earth speaks of your discerning and stern ...
2,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My dreams stood naked, behind the burning desi..."
3,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Spring we started planting, after tilling the ..."
4,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Saving the environment, saving the nature\nWe ..."
...,...,...,...,...
10236,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Sleep has not visited me the whole night,\nWil..."
10237,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Love-cradling Night, lit by the lucent moon,\n..."
10238,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Bells overbrim with sound\nAnd spread from cup...
10239,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Come Sleep; O Sleep! the certain knot of peace...


### Transformation des données en dataset HF

In [11]:
## Choix de la caractériqtique pour le fine-tuning à faire : topic ou type, à modifier en fonction du choix

## On garde uniquement les données nécessaires au FT (topic ou type)
df_topic = df[df.topic != 'no_topic']
# display(df_topic)

# df_type = df[df.topic != 'no_type']


## Supression de la colonnes avec la caractéristique non utilisées pour le FT
df_topic = df_topic.drop('type', axis=1)
# display(df_topic)

# df_type = df_type.drop('topic', axis=1)
# display(df_type)


## Supression de la colonne "path" afin de ne plus avoir de valeurs str
## Mais création d'un autre dataframe avec les ids des données pour avoir les paths si besoin
df_topic.reset_index(inplace=True)
df_topic.rename(columns={'index': 'id'}, inplace=True)
display(df_topic)

paths = df_topic["path"].to_frame()
paths.reset_index(inplace=True)
paths.rename(columns={'index': 'id'}, inplace=True)
paths["id"] = df_topic["id"]

df_topic = df_topic.drop('path', axis=1)

## Séparation des données en train et test
train_data, test_data = train_test_split(df_topic, test_size=0.2)

## Transformation en HF Datasets
tds = Dataset.from_pandas(train_data)
vds = Dataset.from_pandas(test_data)

ds = DatasetDict()

ds['train'] = tds
ds['test'] = vds

dataset = ds.remove_columns(["__index_level_0__"])

Unnamed: 0,id,path,topic,text
0,2771,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,"There once was a sister,\nwho loved to kiss he..."
1,2772,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒\nScratch here to r...
2,2773,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,I find it kinda funny\nI find it kinda sad\nTh...
3,2774,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,A server was a man with drinks\nA Notebook was...
4,2775,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,"isn't it funny,\nhow the world likes to be?\ni..."
...,...,...,...,...
7465,10236,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,"Sleep has not visited me the whole night,\nWil..."
7466,10237,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,"Love-cradling Night, lit by the lucent moon,\n..."
7467,10238,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,Bells overbrim with sound\nAnd spread from cup...
7468,10239,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,Come Sleep; O Sleep! the certain knot of peace...


### Préparation des données pour les utiliser dans le fine-tuning

In [12]:
def tokenize_data(examples):
    inputs = tokenizer(examples['text'], padding=True, truncation=True)
    targets = tokenizer(examples['topic'], padding=True, truncation=True)

    return {'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'labels': targets['input_ids']}


train_data = dataset['train'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])
test_data = dataset['test'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])


print(train_data)
print(train_data[10])
print(len(train_data[10]['input_ids']))

Map:   0%|          | 0/5976 [00:00<?, ? examples/s]

Map:   0%|          | 0/1494 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5976
})
{'id': 3098, 'input_ids': [1537, 145, 507, 770, 2764, 5452, 3152, 1054, 21, 25, 5, 3259, 2865, 13, 39, 4999, 1076, 11, 887, 6792, 4632, 12, 39, 11122, 5023, 275, 7533, 25, 24951, 5, 4534, 6, 25, 43, 3, 9, 600, 384, 5, 148, 33, 913, 11, 3, 60, 18, 9933, 892, 5, 20855, 11, 3259, 10213, 26000, 15, 26, 71, 1207, 4843, 13, 2827, 7, 81, 69, 750, 3, 102, 2242, 5, 2106, 537, 6, 12738, 6, 24967, 11, 717, 752, 8, 1782, 91, 275, 66, 5573, 10861, 14314, 10, 105, 667, 115, 265, 9, 6, 25, 43, 3, 9, 600, 384, 1141, 13644, 27463, 19, 3, 9, 207, 388, 117, 5301, 3, 9, 207, 21982, 11, 7162, 117, 216, 225, 2367, 16, 8, 7819, 22, 7, 177, 304, 199, 25, 22417, 11, 4106, 100, 248, 684, 5, 216, 19, 3, 9, 207, 4719, 288, 117, 216, 19, 352, 12, 734, 5, 4534, 6, 25, 43, 3, 9, 600, 384, 5, 37, 829, 797, 384, 19, 1187, 25, 6, 886, 724, 131, 333, 4220, 60, 75, 63, 5, 148, 214, 149, 34, 19, 117, 17516, 186, 333, 25, 5, 4534

In [13]:
df_train = pd.DataFrame(train_data.to_dict())
display(df_train.head(5))

Unnamed: 0,id,input_ids,attention_mask,labels
0,3978,"[86, 8, 1015, 13, 27046, 6, 1013, 18, 12425, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3879, 1, 0]"
1,9151,"[71, 786, 239, 3068, 22, 7, 23215, 6, 6331, 33...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[5796, 1, 0]"
2,6408,"[3, 11251, 459, 9525, 30, 6869, 17643, 7, 13, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[5431, 1, 0]"
3,7341,"[37, 2621, 13, 2129, 3, 18, 3, 19003, 34, 3, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[579, 1, 0]"
4,7094,"[216, 1219, 140, 304, 36, 3385, 1225, 282, 274...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[7966, 1, 0]"


In [14]:
## Affichage d'un poème et sa tokenisation

print(ds['train'][10]["text"])
print(train_data[10]["input_ids"])

More than 18 million proud Americans
Voted for you.
Billions of your fellow men and women
Listened to your acceptance speech
And watched you
Tonight.
Obama, you have a big family.
You are writing and re-writing history.
Hillary and Bill Clinton unleashed
A barrage of truths about our current plight.
Biden, Dean, Kerry and others let the dog out
And all eligible voters shout:
“Obama, you have a big family”.
Senator McCain is a good man;
Being a good soldier and decent;
He should remain in the Senate’s den
To help you govern and command
This great country. He is a good combatant;
He is going to understand.
Obama, you have a big family.
The whole American family is behind you,
Some members just love secrecy.
You know how it is; nevertheless many love you.
Obama, you have a big family.
“The dream shall never die”, said once, Ted.Kennedy.
Bama, the entire world is your family.
King’s dream is alive, that’s the American dream.
We are strong and united. You are like a beam
Of hope for a bette

In [15]:
## Création de data loader
def custom_collate(batch):
    inputs = [example['input_ids'] for example in batch]
    targets = [example['labels'] for example in batch]

    max_len_inputs = max(len(input) for input in inputs)
    max_len_targets = max(len(target) for target in targets)

    padded_inputs = torch.zeros(len(batch), max_len_inputs, dtype=torch.long)
    padded_targets = torch.zeros(len(batch), max_len_targets, dtype=torch.long)

    for i in range(len(batch)):
        padded_inputs[i, :len(inputs[i])] = torch.tensor(inputs[i])
        padded_targets[i, :len(targets[i])] = torch.tensor(targets[i])
    print("\nCollate\nInput IDs shape:",padded_inputs.shape)
    print("\nLabels shape:",padded_targets.shape)
    return {'input_ids': padded_inputs, 'labels': padded_targets}

# def custom_collate(batch):
#     inputs = [example['input_ids'] for example in batch]
#     targets = [example['labels'] for example in batch]

#     max_len_inputs = max(len(input) for input in inputs)
#     max_len_targets = max(len(target) for target in targets)

#     padded_inputs = torch.full((len(batch), max_len_inputs), tokenizer.pad_token_id, dtype=torch.long)
#     padded_targets = torch.full((len(batch), max_len_targets), tokenizer.pad_token_id, dtype=torch.long)

#     for i in range(len(batch)):
#         padded_inputs[i, :len(inputs[i])] = torch.tensor(inputs[i])
#         padded_targets[i, :len(targets[i])] = torch.tensor(targets[i])

#     print("\nCollate\nInput IDs shape:",padded_inputs.shape)
#     print("\nLabels shape:",padded_targets.shape)
#     return {'input_ids': padded_inputs, 'labels': padded_targets}

print(custom_collate([train_data[0],train_data[5]]))

train_loader = DataLoader(train_data, collate_fn = custom_collate, batch_size = 16, shuffle = True, num_workers = cpu_count()-1)
test_loader = DataLoader(test_data, collate_fn = custom_collate, batch_size = 16, shuffle = True, num_workers = cpu_count()-1)


Collate
Input IDs shape: torch.Size([2, 512])

Labels shape: torch.Size([2, 3])
{'input_ids': tensor([[   86,     8,  1015,  ...,     0,     0,     0],
        [20477,   120,  1482,  ...,     0,     0,     0]]), 'labels': tensor([[3879,    1,    0],
        [ 706,    1,    0]])}


# Fine-tuning du modèle

### Fine-tuning

In [16]:
## Trainer
## Choix des métriques
# bleu_metric = load_metric('bleu')
# rouge_metric = load_metric('rouge')

metrics = [CrossEntropy()] #, LanguageCrossEntropy()]

class PrintMetricsCallback(Callback):
    def eval_end(self, state, logger):
        metrics = state.eval_metrics['eval']
        for metric_name, metric in metrics.items():
            print(f"{metric_name}: {metric.compute()}")

## Définition du modèle
model_composer = HuggingFaceModel(model, use_logits = True, tokenizer = tokenizer, metrics = metrics)

## Ajustement des paramètres
opt = AdamW(params = model_composer.parameters(), lr = 5e-5,weight_decay = 0.01, betas = (0.0, 0.99))

gc = GradientClipping(clipping_type='norm', clipping_threshold=0.1)

lrscheduler=LinearLR(opt, start_factor=0.1, total_iters=100)

# early_stopping = EarlyStopper('CrossEntropy', 'my_evaluator', patience=1)


## Fine-tuning du modèle
trainer = Trainer(model= model_composer,
                  train_dataloader= train_loader,
                  eval_dataloader= test_loader,
                  max_duration= '2ep',
                  optimizers=opt,
                  schedulers=[lrscheduler],
                  device= 'gpu' if torch.cuda.is_available() else 'cpu',
                  train_subset_num_batches= 100,
                  callbacks= [LRMonitor(), CheckpointSaver(save_interval='1ep'), OOMObserver(max_entries=100)],
                  loggers= WandBLogger(project="poem_gen_ft"),
                  seed= 20,
                  algorithms= [gc],
                  precision='amp_fp16')

# Training loop or function
try:
    trainer.fit()
finally:
    # Ensure the run is finalized
    wandb.finish()


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

******************************
Config:
composer_commit_hash: None
composer_version: 0.23.5
enabled_algorithms/GradientClipping: true
node_name: unknown because NODENAME environment variable not set
num_gpus_per_node: 1
num_nodes: 1
rank_zero_seed: 20

******************************
  self.pid = os.fork()



Collate
Input IDs shape:
Collate
Input IDs shape: torch.Size([16, 512]) 
torch.Size([16, 512])
Labels shape:
 
Labels shape:torch.Size([16, 3]) 
torch.Size([16, 3])

Collate
Input IDs shape:
Collate
Input IDs shape:
Collate
Input IDs shape:
Collate
Input IDs shape:
Collate
Input IDs shape:
Collate
Input IDs shape:
Collate
Input IDs shape:
Collate
Input IDs shape:      torch.Size([16, 512])torch.Size([16, 512])torch.Size([16, 512]) torch.Size([16, 512])torch.Size([16, 512])torch.Size([16, 512])


torch.Size([16, 512])
Collate
Input IDs shape: 



Labels shape:
Labels shape:
 torch.Size([16, 512])
Labels shape:
Labels shape:
Labels shape: 
Labels shape: 
Labels shape:
 torch.Size([16, 512])torch.Size([16, 3])  torch.Size([16, 3]) 
Labels shape:
Collate
Input IDs shape: torch.Size([16, 3])
torch.Size([16, 3])
torch.Size([16, 3])
 
torch.Size([16, 3]) 
Labels shape:

torch.Size([16, 3])
Collate
Input IDs shape:torch.Size([16, 512])
 torch.Size([16, 512])torch.Size([16, 3])

Labels shape:


train          Epoch   0:    0%|| 0/100 [00:00<?, ?ba/s]         


Collate
Input IDs shape: torch.Size([16, 512])
Collate
Input IDs shape: 
torch.Size([16, 512])
Labels shape:
 
Collate
Input IDs shape:torch.Size([16, 3])

Labels shape:
Collate
Input IDs shape:  
Collate
Input IDs shape: 
Collate
Input IDs shape:
Collate
Input IDs shape:torch.Size([16, 512])torch.Size([16, 3])  
Collate
Input IDs shape: torch.Size([16, 512])
 torch.Size([16, 512])
torch.Size([16, 512])

Collate
Input IDs shape:torch.Size([16, 512])torch.Size([16, 512])


Labels shape:
Labels shape:
Labels shape:
Labels shape:
 

Collate
Input IDs shape: torch.Size([16, 512])
Labels shape:   
 
Labels shape:torch.Size([16, 3])torch.Size([16, 3])torch.Size([16, 3]) 
Labels shape:
torch.Size([16, 512])
Collate
Input IDs shape:torch.Size([16, 3]) 

torch.Size([16, 3])  

torch.Size([16, 3])torch.Size([16, 512])

Labels shape:

 torch.Size([16, 3])

Labels shape:torch.Size([16, 3])

Collate
Input IDs shape:
Collate
Input IDs shape:
Collate
Input IDs shape:   
Collate
Input IDs shape: torc

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
time/batch,▁
time/batch_in_epoch,▁
time/epoch,▁
time/sample,▁
time/sample_in_epoch,▁
time/token,▁
time/token_in_epoch,▁

0,1
time/batch,0
time/batch_in_epoch,0
time/epoch,0
time/sample,0
time/sample_in_epoch,0
time/token,0
time/token_in_epoch,0


RuntimeError: Expected target size [16, 32128], got [16, 3]