# Imports et installation de bibliothèques necéssaires au projet

In [1]:
%pip install accelerate -U
%pip install datasets evaluate transformers[torch] torch torcheval torchmetrics  mosaicml[nlp]
%pip install mlflow wandb pyngrok



In [2]:
import mlflow
from mlflow import MlflowClient

from pyngrok import ngrok

import numpy as np
import evaluate

import re
import os
import glob
import numpy as np
import pandas as pd
from google.colab import drive
from tqdm.notebook import trange, tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# imports venant de torch
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LinearLR


# imports venant de tranformers
import transformers
from transformers import get_scheduler
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification


# imports venant de datasets
import datasets
from datasets import load_dataset
from datasets import Dataset, DatasetDict

# imports venant de mosaic ml
from composer import Trainer
from composer.core import Callback
from composer.callbacks import LRMonitor
from composer.loggers import WandBLogger
from composer.models import HuggingFaceModel
from composer.metrics import BinaryF1Score, CrossEntropy
from torchmetrics.classification import MulticlassAccuracy

In [3]:
# Informations sur les cpu et gpu
from multiprocessing import cpu_count

print(torch.cuda.device_count())      # GPU
print(cpu_count())                    # CPU

1
8


In [4]:
## Variables d'environnement pour accéder aux différentes APIs

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"
os.environ["COMET_LOG_ASSETS"] = "True"
os.environ["COMET_API_KEY"] = "g9Um8JaLLAjkjVKYPZjYLXvcP"
os.environ["COMET_PROJECT_NAME"] = "poem-generation"

# Connexion aux différents outils de monitoring, etc

In [5]:
## MlFlow via ngrok

# ngrok.kill()
# NGROK_AUTH_TOKEN = "2ixcAblHEmYTRtDyUOxZBO8nR2p_3Zq8P9bXN4wTBwLTuB23A"
# ngrok.set_auth_token(NGROK_AUTH_TOKEN)
# public_url = ngrok.connect(addr="5000", proto="http", bind_tls=True)
# get_ipython().system_raw("mlflow ui --port 5000 &")
# print("MLflow Tracking UI:", public_url)

# client = MlflowClient(tracking_uri="http://127.0.0.1:5000")
# experiment_desc = "Poem generation"
# experiment_tags = {
#     "team_lead": "Emeline",
#     "department": "dst",
#     "project": "poem_gen",
#     "mlflow.note.content": experiment_desc
# }

# client.create_experiment("Poem Generation Project", tags=experiment_tags)

In [6]:
## WandD login


# Initialisation des variables pour le modèle

In [7]:
## Récupération du modèle à fine-tune (checkpoint)

checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
datacollator = DataCollatorWithPadding(tokenizer = tokenizer)

### Petit test du modèle avant Fine-tuning

In [8]:
## Définir les paramètres de génération
max_length = 128
num_beams = 4
temperature = 0.1

## Définir le thème ou le style du poème
theme = "Can you write a poem about dogs"

## Préparatin de l'input
encoding = tokenizer.encode_plus(theme,
                                 add_special_tokens=True,
                                 max_length=max_length,
                                 padding='max_length',
                                 truncation=True,
                                 return_attention_mask=True,
                                 return_tensors='pt')

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

## Générer le poème
output = model.generate(input_ids,
                        attention_mask=attention_mask,
                        max_length=max_length,
                        num_beams=num_beams,
                        temperature=temperature)

## Afficher le poème généré
print(tokenizer.decode(output[0], skip_special_tokens=True))



i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love


# Récupération du dataset

In [9]:
## Récupération des fichiers via le drive
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/projet/poems_dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
## Importation des données sous forme de fichier .json

df = pd.read_json(r"/content/drive/MyDrive/projet/poems_dataset_data_v2.json")

In [11]:
display(df)

Unnamed: 0,path,type,topic,text
0,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My love is pure as honey, made of selective ne..."
1,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,The earth speaks of your discerning and stern ...
2,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My dreams stood naked, behind the burning desi..."
3,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Spring we started planting, after tilling the ..."
4,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Saving the environment, saving the nature\nWe ..."
...,...,...,...,...
10236,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Sleep has not visited me the whole night,\nWil..."
10237,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Love-cradling Night, lit by the lucent moon,\n..."
10238,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Bells overbrim with sound\nAnd spread from cup...
10239,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Come Sleep; O Sleep! the certain knot of peace...


### Transformation des données en dataset HF

In [15]:
## Choix de la caractériqtique pour le fine-tuning à faire : topic ou type, à modifier en fonction du choix

## On garde uniquement les données nécessaires au FT (topic ou type)
df_topic = df[df.topic != 'no_topic']
# display(df_topic)

# df_type = df[df.topic != 'no_type']


## Supression de la colonnes avec la caractéristique non utilisées pour le FT
df_topic = df_topic.drop('type', axis=1)
# display(df_topic)

# df_type = df_type.drop('topic', axis=1)
# display(df_type)


## Supression de la colonne "path" afin de ne plus avoir de valeurs str
df_topic.reset_index(inplace=True)
df_topic.rename(columns={'index': 'id'}, inplace=True)
display(df_topic)

paths = df_topic["path"].to_frame()
paths.reset_index(inplace=True)
paths.rename(columns={'index': 'id'}, inplace=True)
paths["id"] = df_topic["id"]

df_topic = df_topic.drop('path', axis=1)

## Séparation des données en train et test
train_data, test_data = train_test_split(df_topic, test_size=0.2)

tds = Dataset.from_pandas(train_data)
vds = Dataset.from_pandas(test_data)

ds = DatasetDict()

ds['train'] = tds
ds['test'] = vds

dataset = ds.remove_columns(["__index_level_0__"])
print(dataset)

Unnamed: 0,id,path,topic,text
0,2771,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,"There once was a sister,\nwho loved to kiss he..."
1,2772,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒\nScratch here to r...
2,2773,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,I find it kinda funny\nI find it kinda sad\nTh...
3,2774,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,A server was a man with drinks\nA Notebook was...
4,2775,/content/drive/MyDrive/POEI/projet/poems_datas...,funny,"isn't it funny,\nhow the world likes to be?\ni..."
...,...,...,...,...
7465,10236,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,"Sleep has not visited me the whole night,\nWil..."
7466,10237,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,"Love-cradling Night, lit by the lucent moon,\n..."
7467,10238,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,Bells overbrim with sound\nAnd spread from cup...
7468,10239,/content/drive/MyDrive/POEI/projet/poems_datas...,summer,Come Sleep; O Sleep! the certain knot of peace...


DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'text'],
        num_rows: 5976
    })
    test: Dataset({
        features: ['id', 'topic', 'text'],
        num_rows: 1494
    })
})


### Préparation des données pour les utiliser dans le fine-tuning

In [16]:
print(tokenizer.bos_token)

None


In [17]:
def tokenize_data(examples):
  if tokenizer.bos_token is None:
    bos_token = str(tokenizer.vocab_size + 1)
    tokenizer.add_special_tokens({'bos_token': bos_token})
    tokenizer.bos_token = bos_token

  inputs = tokenizer(examples['text'], padding=True, truncation=True)
  targets = tokenizer(examples['topic'], padding=True, truncation=True)

  decoder_input_ids = []
  for target in targets['input_ids']:
    decoder_input_ids.append([tokenizer.bos_token_id] + target)

  return {'input_ids': inputs['input_ids'],
          'attention_mask': inputs['attention_mask'],
          'decoder_input_ids': decoder_input_ids,
          'labels': targets['input_ids']}


train_data = dataset['train'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])
test_data = dataset['test'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])


print(train_data)
print(train_data[10])

Map:   0%|          | 0/5976 [00:00<?, ? examples/s]

Map:   0%|          | 0/1494 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'decoder_input_ids', 'labels'],
    num_rows: 5976
})
{'id': 7941, 'input_ids': [37, 4903, 13, 333, 27, 7, 116, 8, 2335, 148, 31, 60, 28, 27, 7, 8, 163, 2335, 86, 8, 296, 5, 37, 4903, 13, 14210, 27, 7, 116, 8, 2335, 148, 31, 60, 28, 27, 7, 334, 2335, 86, 8, 296, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [18]:
df_train = pd.DataFrame(train_data.to_dict())
display(df_train.head(5))

Unnamed: 0,id,input_ids,attention_mask,decoder_input_ids,labels
0,7869,"[3385, 565, 764, 44, 1379, 333, 733, 406, 414,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 897, 1, 0]","[897, 1, 0]"
1,7778,"[96, 2962, 2176, 17, 9, 3, 354, 4348, 76, 17, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 160, 32, 1]","[160, 32, 1]"
2,7953,"[94, 7797, 16, 66, 13, 178, 6, 13190, 53, 6, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 14210, 1, 0]","[14210, 1, 0]"
3,7465,"[2194, 2452, 34, 3, 2, 7, 118, 3, 9, 307, 614,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 21567, 1, 0]","[21567, 1, 0]"
4,4000,"[6887, 15, 13, 82, 180, 3801, 49, 6, 113, 78, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 3879, 1, 0]","[3879, 1, 0]"


In [19]:
## Affichage de quelques poèmes et leur tokenisation

print(ds['train'][10]["text"])
print(len(train_data[10]["input_ids"]))
print(train_data[10]["input_ids"])

print("\n",ds['train'][15]["text"])
print(len(train_data[15]["input_ids"]))
print(train_data[15]["input_ids"])

The definition of love
Is when the woman
You're with
Is the only woman
In the world.
The definition of romance
Is when the woman
You're with
Is every woman
In the world.
512
[37, 4903, 13, 333, 27, 7, 116, 8, 2335, 148, 31, 60, 28, 27, 7, 8, 163, 2335, 86, 8, 296, 5, 37, 4903, 13, 14210, 27, 7, 116, 8, 2335, 148, 31, 60, 28, 27, 7, 334, 2335, 86, 8, 296, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [24]:
## Création de data loader

train_loader = DataLoader(train_data, collate_fn = datacollator, batch_size = 32,
                          shuffle = True, num_workers = cpu_count()-1)
test_loader = DataLoader(test_data, collate_fn = datacollator, batch_size = 32,
                         shuffle = True, num_workers = cpu_count()-1)

# Fine-tuning du modèle

### Fine-tuning

In [25]:
## Trainer mais de la lib Transformers
from torchmetrics.classification import MulticlassAccuracy

## Choix des métriques
metrics=[CrossEntropy(), MulticlassAccuracy(num_classes=2, average='micro')]

class PrintMetricsCallback(Callback):
    def eval_end(self, state, logger):
        metrics = state.eval_metrics['eval']
        for metric_name, metric in metrics.items():
            print(f"{metric_name}: {metric.compute()}")

## Définition du modèle
model_composer = HuggingFaceModel(model, use_logits = True, tokenizer = tokenizer, metrics = metrics)

## Ajustement des paramètres
opt = AdamW(params = model_composer.parameters(), lr = 5e-5,weight_decay = 0.01, betas = (0.0, 0.99))

## Fine-tuning du modèle
trainer = Trainer(model = model_composer,
                  train_dataloader = train_loader,
                  eval_dataloader = test_loader,
                  max_duration = '2ep',
                  device='gpu' if torch.cuda.is_available() else 'cpu',
                  train_subset_num_batches=150,
                  callbacks=[LRMonitor()],
                  loggers = WandBLogger(project = "poem_gen_ft"),
                  seed = 20)

trainer.fit()



VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
time/batch,▁
time/batch_in_epoch,▁
time/epoch,▁
time/sample,▁
time/sample_in_epoch,▁
time/token,▁
time/token_in_epoch,▁

0,1
time/batch,0
time/batch_in_epoch,0
time/epoch,0
time/sample,0
time/sample_in_epoch,0
time/token,0
time/token_in_epoch,0


******************************
Config:
composer_commit_hash: None
composer_version: 0.23.5
node_name: unknown because NODENAME environment variable not set
num_gpus_per_node: 1
num_nodes: 1
rank_zero_seed: 20

******************************
  self.pid = os.fork()


train          Epoch   0:    0%|| 0/150 [00:00<?, ?ba/s]         

OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 

In [None]:
# mlflow.set_tracking_uri("http://127.0.0.1:5000")  # si ngrok : http://127.0.0.1:5000 # si databricks : databricks
# mlflow.set_experiment("Poem Generation")  # si databricks : /Users/emelinecaruana@gmail.com/

# with mlflow.start_run(): #with mlflow.start_run(run_i)
#     trainer.train()
#     mlflow.log_metric("accuracy", trainer.evaluate()["eval_accuracy"])
#     mlflow.pytorch.log_model(model, "model")