# Imports et installation de bibliothèques necéssaires au projet

In [1]:
%pip install accelerate -U
%pip install datasets evaluate transformers transformers[torch] torch torcheval torchmetrics mosaicml[nlp]
%pip install mlflow wandb pyngrok

Collecting mlflow
  Using cached mlflow-2.15.1-py3-none-any.whl.metadata (29 kB)
Collecting wandb
  Using cached wandb-0.17.5-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting pyngrok
  Using cached pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Collecting mlflow-skinny==2.15.1 (from mlflow)
  Using cached mlflow_skinny-2.15.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting pyarrow<16,>=4.0.0 (from mlflow)
  Using cached pyarrow-15.0.2-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting querystring-parser<2 (from mlflow)
  Using cached querystring_parser-1.2.4-py2.py3-none-any.whl.metadata

In [2]:
import re
import os
import glob
import evaluate
import numpy as np
import pandas as pd
from google.colab import drive
from tqdm.notebook import trange, tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## imports pour le suivi d'expériences
import mlflow
from mlflow import MlflowClient

import wandb

from pyngrok import ngrok


## imports venant de torch
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LinearLR


## imports venant de tranformers
import transformers
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification


## imports venant de datasets
import datasets
from datasets import load_dataset, load_metric
from datasets import Dataset, DatasetDict

## imports venant de mosaic ml
from composer import Trainer
from composer.core import Callback
from composer.loggers import WandBLogger
from composer.models import HuggingFaceModel
from composer.algorithms import GradientClipping
from composer.optim import LinearWithWarmupScheduler
from composer.metrics import CrossEntropy, LanguageCrossEntropy
from composer.callbacks import LRMonitor, CheckpointSaver, EarlyStopper, OOMObserver

In [3]:
## Informations sur les cpu et gpu
from multiprocessing import cpu_count

torch.cuda.empty_cache()

print(torch.cuda.device_count())      # GPU
print(cpu_count())                    # CPU

1
12


In [4]:
## Variables d'environnement pour accéder aux différentes APIs

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

# Connexion aux différents outils de monitoring, etc

In [5]:
## MlFlow via ngrok

# ngrok.kill()
# NGROK_AUTH_TOKEN = "2ixcAblHEmYTRtDyUOxZBO8nR2p_3Zq8P9bXN4wTBwLTuB23A"
# ngrok.set_auth_token(NGROK_AUTH_TOKEN)
# public_url = ngrok.connect(addr="5000", proto="http", bind_tls=True)
# get_ipython().system_raw("mlflow ui --port 5000 &")
# print("MLflow Tracking UI:", public_url)

# client = MlflowClient(tracking_uri="http://127.0.0.1:5000")
# experiment_desc = "Poem generation"
# experiment_tags = {
#     "team_lead": "Emeline",
#     "department": "dst",
#     "project": "poem_gen",
#     "mlflow.note.content": experiment_desc
# }

# client.create_experiment("Poem Generation Project", tags=experiment_tags)

In [6]:
## WandD login
wandb.init(project="poem_gen_ft")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Initialisation des variables pour le modèle

In [7]:
## Récupération du modèle à fine-tune (checkpoint)
checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
datacollator = DataCollatorWithPadding(tokenizer = tokenizer)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Petit test du modèle avant Fine-tuning

In [8]:
## Définir les paramètres de génération
max_length = 128
num_beams = 4
temperature = 0.1

## Définir le thème ou le style du poème
theme = "Can you write a poem about dogs"

## Préparatin de l'input
encoding = tokenizer.encode_plus(theme,
                                 add_special_tokens=True,
                                 max_length=max_length,
                                 padding='max_length',
                                 truncation=True,
                                 return_attention_mask=True,
                                 return_tensors='pt')

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

## Générer le poème
output = model.generate(input_ids,
                        attention_mask=attention_mask,
                        max_length=max_length,
                        num_beams=num_beams,
                        temperature=temperature)

## Afficher le poème généré
print(tokenizer.decode(output[0], skip_special_tokens=True))



i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love


# Récupération du dataset

In [9]:
## Récupération des fichiers via le drive
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/projet/poems_dataset'

Mounted at /content/drive


In [11]:
## Importation des données sous forme de fichier .json
df = pd.read_json(r"/content/drive/MyDrive/projet/poems_dataset_data_v2.json")

In [12]:
display(df)

Unnamed: 0,path,type,topic,text
0,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My love is pure as honey, made of selective ne..."
1,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,The earth speaks of your discerning and stern ...
2,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My dreams stood naked, behind the burning desi..."
3,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Spring we started planting, after tilling the ..."
4,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Saving the environment, saving the nature\nWe ..."
...,...,...,...,...
10236,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Sleep has not visited me the whole night,\nWil..."
10237,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Love-cradling Night, lit by the lucent moon,\n..."
10238,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Bells overbrim with sound\nAnd spread from cup...
10239,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Come Sleep; O Sleep! the certain knot of peace...


### Transformation des données en dataset HF

In [13]:
## Choix de la caractériqtique pour le fine-tuning à faire : topic ou type, à modifier en fonction du choix

## On garde uniquement les données nécessaires au FT (topic ou type)
df_topic = df[df.topic != 'no_topic']
# display(df_topic)

# df_type = df[df.topic != 'no_type']


## Supression de la colonnes avec la caractéristique non utilisées pour le FT
df_topic = df_topic.drop('type', axis=1)
# display(df_topic)

# df_type = df_type.drop('topic', axis=1)
# display(df_type)


## Supression de la colonne "path" afin de ne plus avoir de valeurs str
## Mais création d'un autre dataframe avec les ids des données pour avoir les paths si besoin
df_topic.reset_index(inplace=True)
df_topic.rename(columns={'index': 'id'}, inplace=True)
display(df_topic)

paths = df_topic["path"].to_frame()
paths.reset_index(inplace=True)
paths.rename(columns={'index': 'id'}, inplace=True)
paths["id"] = df_topic["id"]

df_topic = df_topic.drop('path', axis=1)

## Séparation des données en train et test
train_data, test_data = train_test_split(df_topic, test_size=0.2)

## Transformation en HF Datasets
tds = Dataset.from_pandas(train_data)
vds = Dataset.from_pandas(test_data)

ds = DatasetDict()

ds['train'] = tds
ds['test'] = vds

dataset = ds.remove_columns(["__index_level_0__"])

Unnamed: 0,id,path,topic,text
0,2771,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,"There once was a sister,\nwho loved to kiss he..."
1,2772,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒\nScratch here to r...
2,2773,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,I find it kinda funny\nI find it kinda sad\nTh...
3,2774,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,A server was a man with drinks\nA Notebook was...
4,2775,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,"isn't it funny,\nhow the world likes to be?\ni..."
...,...,...,...,...
7465,10236,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,"Sleep has not visited me the whole night,\nWil..."
7466,10237,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,"Love-cradling Night, lit by the lucent moon,\n..."
7467,10238,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,Bells overbrim with sound\nAnd spread from cup...
7468,10239,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,Come Sleep; O Sleep! the certain knot of peace...


### Préparation des données pour les utiliser dans le fine-tuning

In [32]:
def tokenize_data(examples):
    inputs = tokenizer(examples['text'], padding=True, truncation=True)
    targets = tokenizer(examples['topic'], padding=True, truncation=True)

    return {'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'labels': targets['input_ids']}


train_data = dataset['train'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])
test_data = dataset['test'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])


print(train_data)
print(train_data[10])
print(len(train_data[10]['input_ids']))

Map:   0%|          | 0/5976 [00:00<?, ? examples/s]

Map:   0%|          | 0/1494 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5976
})
{'id': 9592, 'input_ids': [37, 336, 28195, 13, 8, 853, 2801, 6, 214, 72, 150, 13, 333, 1937, 6, 770, 648, 1943, 780, 470, 13006, 6, 8, 2829, 2053, 7424, 53, 70, 2829, 5682, 6, 3, 9, 626, 286, 12, 2561, 11, 1350, 6, 46, 681, 13, 9675, 7, 6, 30135, 7, 11, 6612, 6, 3, 9, 626, 1565, 31, 7, 8173, 12, 90, 152, 116, 8, 12795, 1590, 6, 1633, 2140, 116, 8, 2034, 15825, 124, 7, 7, 31, 30170, 6, 3, 2741, 6399, 11, 3, 9, 720, 13, 31274, 3, 9, 723, 17040, 31, 7, 21691, 6, 4335, 9812, 11, 73, 14577, 179, 1234, 66, 44, 337, 286, 6, 3, 7, 2685, 115, 7428, 8950, 21, 8, 126, 2170, 11603, 7, 6, 30957, 7, 13, 19408, 14193, 1327, 68, 8, 17040, 31, 7, 564, 6, 3, 9, 11237, 1144, 16, 334, 563, 1202, 6, 8, 163, 625, 4999, 113, 734, 7, 4192, 7, 6, 3, 9, 2594, 24, 54, 470, 36, 14244, 45, 82, 842, 55, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [33]:
df_train = pd.DataFrame(train_data.to_dict())
display(df_train.head(5))

Unnamed: 0,id,input_ids,attention_mask,labels
0,8187,"[886, 8352, 16, 1919, 71, 3, 24867, 11949, 5, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[333, 1, 0]"
1,4392,"[411, 2129, 6, 2164, 2586, 6, 438, 39, 6765, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2586, 1, 0]"
2,3827,"[461, 3, 9, 3, 26019, 2608, 27, 361, 7420, 82,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2608, 1, 0]"
3,9864,"[3, 25794, 1796, 140, 3, 9, 1554, 13, 8, 1997,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2827, 1, 0]"
4,3253,"[1138, 2498, 31649, 15, 26, 16, 8, 3533, 44, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[23997, 1, 0]"


In [34]:
## Affichage d'un poème et sa tokenisation

print(ds['train'][10]["text"])
print(train_data[10]["input_ids"])

The last benches of the class rooms,
know more no of love stories,
million times heard yet never boring,
the secret eyes gazing their secret loves,
a perfect place to sit and talk,
an industry of imaginations, fascinations and dreams,
a perfect friend's  shoulder to lean when the tears fall,
cool spot when the window breeze  caress' loneliness,
earphone and a bit of solitude a music lover's paradise,
broken hearts and unbreakable words all at same place,
scribbling pad for the new born poets,
engravings of timeless poem nothing but the lover's name,
a silent member in every group photo,
the only old fellow who understands youths,
a memory that can never be stolen from my heart!
[37, 336, 28195, 13, 8, 853, 2801, 6, 214, 72, 150, 13, 333, 1937, 6, 770, 648, 1943, 780, 470, 13006, 6, 8, 2829, 2053, 7424, 53, 70, 2829, 5682, 6, 3, 9, 626, 286, 12, 2561, 11, 1350, 6, 46, 681, 13, 9675, 7, 6, 30135, 7, 11, 6612, 6, 3, 9, 626, 1565, 31, 7, 8173, 12, 90, 152, 116, 8, 12795, 1590, 6, 1633, 214

In [35]:
## Création de data loader
def custom_collate(batch):
    inputs = [example['input_ids'] for example in batch]
    targets = [example['labels'] for example in batch]

    max_len_inputs = max(len(input) for input in inputs)
    max_len_targets = max(len(target) for target in targets)

    padded_inputs = torch.zeros(len(batch), max_len_inputs, dtype=torch.long)
    padded_targets = torch.zeros(len(batch), max_len_targets, dtype=torch.long)

    for i in range(len(batch)):
        padded_inputs[i, :len(inputs[i])] = torch.tensor(inputs[i])
        padded_targets[i, :len(targets[i])] = torch.tensor(targets[i])
    print("\nCollate\nInput IDs shape:",padded_inputs.shape)
    print("\nLabels shape:",padded_targets.shape)
    return {'input_ids': padded_inputs, 'labels': padded_targets}

# def custom_collate(batch):
#     inputs = [example['input_ids'] for example in batch]
#     targets = [example['labels'] for example in batch]

#     max_len_inputs = max(len(input) for input in inputs)
#     max_len_targets = max(len(target) for target in targets)

#     padded_inputs = torch.full((len(batch), max_len_inputs), tokenizer.pad_token_id, dtype=torch.long)
#     padded_targets = torch.full((len(batch), max_len_targets), tokenizer.pad_token_id, dtype=torch.long)

#     for i in range(len(batch)):
#         padded_inputs[i, :len(inputs[i])] = torch.tensor(inputs[i])
#         padded_targets[i, :len(targets[i])] = torch.tensor(targets[i])

#     print("\nCollate\nInput IDs shape:",padded_inputs.shape)
#     print("\nLabels shape:",padded_targets.shape)
#     return {'input_ids': padded_inputs, 'labels': padded_targets}

print(custom_collate([train_data[0],train_data[5]]))

train_loader = DataLoader(train_data, collate_fn = custom_collate, batch_size = 16, shuffle = True, num_workers = cpu_count()-1)
test_loader = DataLoader(test_data, collate_fn = custom_collate, batch_size = 16, shuffle = True, num_workers = cpu_count()-1)


Collate
Input IDs shape: torch.Size([2, 512])

Labels shape: torch.Size([2, 3])
{'input_ids': tensor([[ 886, 8352,   16,  ...,    0,    0,    0],
        [ 156,   34,  130,  ...,    0,    0,    0]]), 'labels': tensor([[  333,     1,     0],
        [21567,     1,     0]])}


# Fine-tuning du modèle

### Fine-tuning

In [36]:
## Trainer
## Choix des métriques
# bleu_metric = load_metric('bleu')
# rouge_metric = load_metric('rouge')

metrics = [CrossEntropy(), LanguageCrossEntropy()]

class PrintMetricsCallback(Callback):
    def eval_end(self, state, logger):
        metrics = state.eval_metrics['eval']
        for metric_name, metric in metrics.items():
            print(f"{metric_name}: {metric.compute()}")

## Définition du modèle
model_composer = HuggingFaceModel(model, use_logits = True, tokenizer = tokenizer, metrics = metrics)

## Ajustement des paramètres
opt = AdamW(params = model_composer.parameters(), lr = 5e-5,weight_decay = 0.01, betas = (0.0, 0.99))

gc = GradientClipping(clipping_type='norm', clipping_threshold=0.1)

lrscheduler=LinearLR(opt, start_factor=0.1, total_iters=100)

# early_stopping = EarlyStopper('BinaryF1Score', 'my_evaluator', patience=1)


## Fine-tuning du modèle
trainer = Trainer(model= model_composer,
                  train_dataloader= train_loader,
                  eval_dataloader= test_loader,
                  max_duration= '2ep',
                  optimizers=opt,
                  schedulers=[lrscheduler],
                  device= 'gpu' if torch.cuda.is_available() else 'cpu',
                  train_subset_num_batches= 100,
                  callbacks= [LRMonitor(), CheckpointSaver(save_interval='1ep'), OOMObserver(max_entries=100)],
                  loggers= WandBLogger(project="poem_gen_ft"),
                  seed= 20,
                  algorithms= [gc], #early_stopping],
                  precision='amp_fp16')

# Training loop or function
try:
    trainer.fit()
finally:
    # Ensure the run is finalized
    wandb.finish()


******************************
Config:
composer_commit_hash: None
composer_version: 0.23.5
enabled_algorithms/GradientClipping: true
node_name: unknown because NODENAME environment variable not set
num_gpus_per_node: 1
num_nodes: 1
rank_zero_seed: 20

******************************
  self.pid = os.fork()



Collate
Input IDs shape:

Collate
Input IDs shape:  torch.Size([16, 512])torch.Size([16, 512])

Labels shape:
Labels shape: torch.Size([16, 3]) torch.Size([16, 3])


Collate
Input IDs shape: 
Collate
Input IDs shape:
Collate
Input IDs shape:
Collate
Input IDs shape:
Collate
Input IDs shape: torch.Size([16, 512]) 
Collate
Input IDs shape: torch.Size([16, 512])  
torch.Size([16, 512])torch.Size([16, 512])torch.Size([16, 512])
Collate
Input IDs shape:
torch.Size([16, 512])

 


Labels shape:
Labels shape:
Collate
Input IDs shape:
Labels shape: 
Labels shape:
Labels shape: 
Labels shape: torch.Size([16, 3])   torch.Size([16, 512])torch.Size([16, 512]) 
torch.Size([16, 3])torch.Size([16, 3])torch.Size([16, 3])

torch.Size([16, 3])torch.Size([16, 3])



Collate
Input IDs shape:


Labels shape: 
Labels shape: torch.Size([16, 3]) 

Collate
Input IDs shape:torch.Size([16, 512])torch.Size([16, 3])

Collate
Input IDs shape: 
Labels shape:  torch.Size([16, 512])torch.Size([16, 512])torch.Size([16

train          Epoch   0:    0%|| 0/100 [00:00<?, ?ba/s]         


Collate
Input IDs shape:
Collate
Input IDs shape: 
Collate
Input IDs shape:  torch.Size([16, 512])torch.Size([16, 512])
torch.Size([16, 512])
Labels shape:
Collate
Input IDs shape:
 
Labels shape:torch.Size([16, 3])  
torch.Size([16, 3])torch.Size([16, 512])



Collate
Input IDs shape:
Collate
Input IDs shape:  
Labels shape:
Labels shape:torch.Size([16, 512]) torch.Size([16, 512])
torch.Size([16, 3])
 
Labels shape:

Collate
Input IDs shape:torch.Size([16, 3]) 
Labels shape:torch.Size([16, 512])
  
torch.Size([16, 3])
Collate
Input IDs shape:
Labels shape:
torch.Size([16, 3])  
torch.Size([16, 3])torch.Size([16, 512])
Collate
Input IDs shape:
Collate
Input IDs shape: 
Collate
Input IDs shape: 

Collate
Input IDs shape:torch.Size([16, 512])
 torch.Size([16, 512])

Collate
Input IDs shape:
Labels shape: 
Labels shape:torch.Size([16, 512]) 
  torch.Size([16, 512])torch.Size([16, 3])torch.Size([16, 3])

Labels shape:torch.Size([16, 512])


Labels shape: torch.Size([16, 3])
Collate
Input 

VBox(children=(Label(value='0.002 MB of 0.014 MB uploaded\r'), FloatProgress(value=0.15074912547026598, max=1.…

0,1
time/batch,▁
time/batch_in_epoch,▁
time/epoch,▁
time/sample,▁
time/sample_in_epoch,▁
time/token,▁
time/token_in_epoch,▁

0,1
time/batch,0
time/batch_in_epoch,0
time/epoch,0
time/sample,0
time/sample_in_epoch,0
time/token,0
time/token_in_epoch,0


RuntimeError: Expected target size [16, 32128], got [16, 3]