# Imports et installation de bibliothèques necéssaires au projet

In [1]:
%pip install accelerate -U
%pip install datasets evaluate transformers[torch] torch torcheval torchmetrics  mosaicml[nlp]
%pip install mlflow wandb pyngrok



In [6]:
import mlflow
from mlflow import MlflowClient

import wandb

from pyngrok import ngrok

import numpy as np
import evaluate

import re
import os
import glob
import numpy as np
import pandas as pd
from google.colab import drive
from tqdm.notebook import trange, tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# imports venant de torch
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LinearLR


# imports venant de tranformers
import transformers
from transformers import get_scheduler
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification


# imports venant de datasets
import datasets
from datasets import load_dataset
from datasets import Dataset, DatasetDict

# imports venant de mosaic ml
from composer import Trainer
from composer.core import Callback
from composer.loggers import WandBLogger
from composer.models import HuggingFaceModel
from composer.metrics import BinaryF1Score, CrossEntropy
from composer.callbacks import LRMonitor, CheckpointSaver
from torchmetrics.classification import MulticlassAccuracy

In [7]:
# Informations sur les cpu et gpu
from multiprocessing import cpu_count

torch.cuda.empty_cache()

print(torch.cuda.device_count())      # GPU
print(cpu_count())                    # CPU

1
12


In [8]:
## Variables d'environnement pour accéder aux différentes APIs

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

# Connexion aux différents outils de monitoring, etc

In [9]:
## MlFlow via ngrok

# ngrok.kill()
# NGROK_AUTH_TOKEN = "2ixcAblHEmYTRtDyUOxZBO8nR2p_3Zq8P9bXN4wTBwLTuB23A"
# ngrok.set_auth_token(NGROK_AUTH_TOKEN)
# public_url = ngrok.connect(addr="5000", proto="http", bind_tls=True)
# get_ipython().system_raw("mlflow ui --port 5000 &")
# print("MLflow Tracking UI:", public_url)

# client = MlflowClient(tracking_uri="http://127.0.0.1:5000")
# experiment_desc = "Poem generation"
# experiment_tags = {
#     "team_lead": "Emeline",
#     "department": "dst",
#     "project": "poem_gen",
#     "mlflow.note.content": experiment_desc
# }

# client.create_experiment("Poem Generation Project", tags=experiment_tags)

In [10]:
## WandD login
wandb.init(project="poem_gen_ft")

[34m[1mwandb[0m: Currently logged in as: [33memeline-caruana[0m ([33mcaruana[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Initialisation des variables pour le modèle

In [11]:
## Récupération du modèle à fine-tune (checkpoint)

checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
datacollator = DataCollatorWithPadding(tokenizer = tokenizer)

### Petit test du modèle avant Fine-tuning

In [12]:
## Définir les paramètres de génération
max_length = 128
num_beams = 4
temperature = 0.1

## Définir le thème ou le style du poème
theme = "Can you write a poem about dogs"

## Préparatin de l'input
encoding = tokenizer.encode_plus(theme,
                                 add_special_tokens=True,
                                 max_length=max_length,
                                 padding='max_length',
                                 truncation=True,
                                 return_attention_mask=True,
                                 return_tensors='pt')

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

## Générer le poème
output = model.generate(input_ids,
                        attention_mask=attention_mask,
                        max_length=max_length,
                        num_beams=num_beams,
                        temperature=temperature)

## Afficher le poème généré
print(tokenizer.decode(output[0], skip_special_tokens=True))



i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love


# Récupération du dataset

In [13]:
## Récupération des fichiers via le drive
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/projet/poems_dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
## Importation des données sous forme de fichier .json

df = pd.read_json(r"/content/drive/MyDrive/projet/poems_dataset_data_v2.json")

In [15]:
display(df)

Unnamed: 0,path,type,topic,text
0,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My love is pure as honey, made of selective ne..."
1,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,The earth speaks of your discerning and stern ...
2,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My dreams stood naked, behind the burning desi..."
3,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Spring we started planting, after tilling the ..."
4,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Saving the environment, saving the nature\nWe ..."
...,...,...,...,...
10236,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Sleep has not visited me the whole night,\nWil..."
10237,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Love-cradling Night, lit by the lucent moon,\n..."
10238,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Bells overbrim with sound\nAnd spread from cup...
10239,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Come Sleep; O Sleep! the certain knot of peace...


### Transformation des données en dataset HF

In [16]:
## Choix de la caractériqtique pour le fine-tuning à faire : topic ou type, à modifier en fonction du choix

## On garde uniquement les données nécessaires au FT (topic ou type)
df_topic = df[df.topic != 'no_topic']
# display(df_topic)

# df_type = df[df.topic != 'no_type']


## Supression de la colonnes avec la caractéristique non utilisées pour le FT
df_topic = df_topic.drop('type', axis=1)
# display(df_topic)

# df_type = df_type.drop('topic', axis=1)
# display(df_type)


## Supression de la colonne "path" afin de ne plus avoir de valeurs str
df_topic.reset_index(inplace=True)
df_topic.rename(columns={'index': 'id'}, inplace=True)
display(df_topic)

paths = df_topic["path"].to_frame()
paths.reset_index(inplace=True)
paths.rename(columns={'index': 'id'}, inplace=True)
paths["id"] = df_topic["id"]

df_topic = df_topic.drop('path', axis=1)

## Séparation des données en train et test
train_data, test_data = train_test_split(df_topic, test_size=0.2)

## Transformation en HF Datasets
tds = Dataset.from_pandas(train_data)
vds = Dataset.from_pandas(test_data)

ds = DatasetDict()

ds['train'] = tds
ds['test'] = vds

dataset = ds.remove_columns(["__index_level_0__"])

Unnamed: 0,id,path,topic,text
0,2771,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,"There once was a sister,\nwho loved to kiss he..."
1,2772,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒\nScratch here to r...
2,2773,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,I find it kinda funny\nI find it kinda sad\nTh...
3,2774,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,A server was a man with drinks\nA Notebook was...
4,2775,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,"isn't it funny,\nhow the world likes to be?\ni..."
...,...,...,...,...
7465,10236,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,"Sleep has not visited me the whole night,\nWil..."
7466,10237,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,"Love-cradling Night, lit by the lucent moon,\n..."
7467,10238,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,Bells overbrim with sound\nAnd spread from cup...
7468,10239,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,Come Sleep; O Sleep! the certain knot of peace...


DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'text'],
        num_rows: 5976
    })
    test: Dataset({
        features: ['id', 'topic', 'text'],
        num_rows: 1494
    })
})


### Préparation des données pour les utiliser dans le fine-tuning

In [17]:
print(tokenizer.bos_token)
print(tokenizer.pad_token)

None
<pad>


In [18]:
def tokenize_data(examples):
    if tokenizer.bos_token is None:
        bos_token = str(tokenizer.vocab_size + 1)
        tokenizer.add_special_tokens({'bos_token': bos_token})
        tokenizer.bos_token = bos_token

    inputs = tokenizer(examples['text'], padding=True, truncation=True, return_tensors="pt").input_ids.to("cuda")
    targets = tokenizer(examples['topic'], padding=True, truncation=True, return_tensors="pt").input_ids.to("cuda")

    decoder_input_ids = []
    for target in targets:
        decoder_input_ids.append([tokenizer.bos_token_id] + target.tolist())

    return {'input_ids': inputs,
            'attention_mask': inputs.ne(0).int(),
            'decoder_input_ids': decoder_input_ids,
            'labels': targets}


train_data = dataset['train'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])
test_data = dataset['test'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])


print(train_data)
print(train_data[10])

Map:   0%|          | 0/5976 [00:00<?, ? examples/s]

Map:   0%|          | 0/1494 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'decoder_input_ids', 'labels'],
    num_rows: 5976
})
{'id': 9489, 'input_ids': [366, 4272, 2983, 7, 1286, 3, 189, 63, 7069, 1268, 7, 6, 275, 2515, 3481, 13, 1246, 3, 547, 107, 3, 29, 23, 102, 17, 3, 189, 63, 3652, 1084, 6, 366, 2164, 1522, 1727, 3, 189, 63, 239, 24, 470, 964, 7, 6, 275, 66, 7797, 28, 49, 31, 26, 24, 47, 1213, 78, 12537, 6, 37, 29, 240, 48, 1554, 84, 27, 270, 915, 8, 15, 6, 10908, 29, 31, 26, 28, 3, 9, 13966, 59, 66, 73, 20139, 10, 947, 217, 8, 5504, 24, 601, 11, 7913, 3, 6987, 8, 15, 117, 947, 608, 3, 189, 63, 1044, 6, 11, 125, 27, 5696, 31, 26, 21, 8, 15, 5, 100, 164, 2367, 3, 189, 63, 10139, 11832, 6, 4073, 16725, 10836, 485, 164, 28051, 117, 506, 2602, 28, 3, 189, 63, 3, 28977, 33, 59, 1869, 117, 506, 164, 2367, 6, 116, 3, 17, 9492, 11, 27, 1522, 399, 1273, 5, 156, 79, 2367, 6, 258, 3, 17, 9492, 3, 7, 5019, 619, 3, 12550, 117, 328, 56, 2367, 6, 11, 78, 3, 17, 9492, 54, 7, 17, 59, 67, 5, 1, 0, 0, 0, 0, 0,

In [19]:
df_train = pd.DataFrame(train_data.to_dict())
display(df_train.head(5))

Unnamed: 0,id,input_ids,attention_mask,decoder_input_ids,labels
0,7136,"[9259, 13, 7202, 18250, 13, 7966, 5565, 344, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 7966, 1, 0]","[7966, 1, 0]"
1,10003,"[366, 25, 3658, 6, 3, 1007, 258, 7673, 6, 62, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 1997, 1, 0]","[1997, 1, 0]"
2,8886,"[2900, 6, 23958, 2900, 6, 369, 11, 769, 1259, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 1269, 1, 0]","[1269, 1, 0]"
3,4410,"[37, 12737, 17, 9288, 7, 17, 63, 1537, 19224, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 2586, 1, 0]","[2586, 1, 0]"
4,7360,"[37, 8114, 27536, 7, 16, 1442, 387, 5, 1945, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 4033, 1, 0]","[4033, 1, 0]"


In [20]:
## Affichage de quelques poèmes et leur tokenisation

print(ds['train'][10]["text"])
print(len(train_data[10]["input_ids"]))
print(train_data[10]["input_ids"])

When Winter snows upon thy golden hairs,
And frost of age hath nipt thy flowers near,
When dark shall seem thy day that never clears,
And all lies wither'd that was held so dear,
Then take this picture which I here present thee,
Limn'd with a pencil not all unworthy:
Here see the gifts that God and Nature lent thee;
Here read thy self, and what I suffer'd for thee.
This may remain thy lasting monument,
Which happily posterity may cherish;
These colors with thy fading are not spent;
These may remain, when thou and I shall perish.
If they remain, then thou shalt live thereby;
They will remain, and so thou canst not die.
512
[366, 4272, 2983, 7, 1286, 3, 189, 63, 7069, 1268, 7, 6, 275, 2515, 3481, 13, 1246, 3, 547, 107, 3, 29, 23, 102, 17, 3, 189, 63, 3652, 1084, 6, 366, 2164, 1522, 1727, 3, 189, 63, 239, 24, 470, 964, 7, 6, 275, 66, 7797, 28, 49, 31, 26, 24, 47, 1213, 78, 12537, 6, 37, 29, 240, 48, 1554, 84, 27, 270, 915, 8, 15, 6, 10908, 29, 31, 26, 28, 3, 9, 13966, 59, 66, 73, 20139, 1

In [25]:
## Création de data loader
def custom_collate(batch):
    inputs = [example['input_ids'] for example in batch]
    targets = [example['labels'] for example in batch]

    max_len_inputs = max(len(input) for input in inputs)
    max_len_targets = max(len(target) for target in targets)

    padded_inputs = torch.zeros(len(batch), max_len_inputs, dtype=torch.long)
    padded_targets = torch.zeros(len(batch), max_len_targets, dtype=torch.long)

    for i in range(len(batch)):
        padded_inputs[i, :len(inputs[i])] = torch.tensor(inputs[i])
        padded_targets[i, :len(targets[i])] = torch.tensor(targets[i])

    return {'input_ids': padded_inputs, 'labels': padded_targets}


train_loader = DataLoader(train_data, collate_fn = custom_collate, batch_size = 16, shuffle = True, num_workers = cpu_count()-1)
test_loader = DataLoader(test_data, collate_fn = custom_collate, batch_size = 16, shuffle = True, num_workers = cpu_count()-1)

# Fine-tuning du modèle

### Fine-tuning

In [31]:
## Trainer mais de la lib Transformers
from composer.callbacks import CheckpointSaver
from composer.algorithms import GradientClipping

## Choix des métriques
metrics=[CrossEntropy()] #, MulticlassAccuracy(num_classes=2, average='micro')]

class PrintMetricsCallback(Callback):
    def eval_end(self, state, logger):
        metrics = state.eval_metrics['eval']
        for metric_name, metric in metrics.items():
            print(f"{metric_name}: {metric.compute()}")

## Définition du modèle
model_composer = HuggingFaceModel(model, use_logits = True, tokenizer = tokenizer, metrics = metrics)

## Ajustement des paramètres
opt = AdamW(params = model_composer.parameters(), lr = 5e-5,weight_decay = 0.01, betas = (0.0, 0.99))

gc = GradientClipping(clipping_type='norm', clipping_threshold=0.1)

## Fine-tuning du modèle
trainer = Trainer(model= model_composer,
                  train_dataloader= train_loader,
                  eval_dataloader= test_loader,
                  max_duration= '2ep',
                  device= 'gpu' if torch.cuda.is_available() else 'cpu',
                  train_subset_num_batches= 100,
                  callbacks= [LRMonitor(), CheckpointSaver(save_interval='1ep')],
                  loggers= WandBLogger(project="poem_gen_ft"),
                  seed= 20,
                  algorithms= [gc],
                  precision='amp_fp16')

# Training loop or function
try:
    trainer.fit()
finally:
    # Ensure the run is finalized
    wandb.finish()




******************************
Config:
composer_commit_hash: None
composer_version: 0.23.5
enabled_algorithms/GradientClipping: true
node_name: unknown because NODENAME environment variable not set
num_gpus_per_node: 1
num_nodes: 1
rank_zero_seed: 20

******************************
  self.pid = os.fork()


train          Epoch   0:    0%|| 0/100 [00:00<?, ?ba/s]         

VBox(children=(Label(value='0.002 MB of 0.014 MB uploaded\r'), FloatProgress(value=0.15113816834303864, max=1.…

0,1
time/batch,▁
time/batch_in_epoch,▁
time/epoch,▁
time/sample,▁
time/sample_in_epoch,▁
time/token,▁
time/token_in_epoch,▁

0,1
time/batch,0
time/batch_in_epoch,0
time/epoch,0
time/sample,0
time/sample_in_epoch,0
time/token,0
time/token_in_epoch,0


RuntimeError: Expected target size [16, 32128], got [16, 3]