# Imports et installation de bibliothèques necéssaires au projet

In [1]:
%pip install accelerate -U
%pip install datasets evaluate transformers[torch] torch torcheval torchmetrics rouge_score
%pip install mlflow databricks-sdk pyngrok --quiet
%pip install comet-ml



In [2]:
## imports pour le suivi d'expériences
from pyngrok import ngrok
from mlflow import MlflowClient

import comet_ml
from transformers.integrations import CometCallback

## imports
import re
import os
import glob
import evaluate
import numpy as np
import pandas as pd
from google.colab import drive
from tqdm.notebook import trange, tqdm
from sklearn.model_selection import train_test_split


## imports venant de torch
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LinearLR
from torchmetrics.classification import MulticlassAccuracy


## imports venant de tranformers
import transformers
from transformers import get_scheduler, Trainer, TrainingArguments
from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq


## imports venant de datasets
import datasets
from datasets import load_dataset, load_metric
from datasets import Dataset, DatasetDict

In [3]:
# Informations sur les cpu et gpu
from multiprocessing import cpu_count

print(torch.cuda.device_count())      # GPU
print(cpu_count())                    # CPU

1
12


In [5]:
## Variables d'environnement pour accéder aux différentes APIs
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

os.environ["COMET_LOG_ASSETS"] = "True"
os.environ["COMET_API_KEY"] = "g9Um8JaLLAjkjVKYPZjYLXvcP"
os.environ["COMET_PROJECT_NAME"] = "poem-generation"

os.environ['COMET_GITLAB_URL'] = "https://gitlab.com/emeline-caruana"
os.environ['COMET_GITLAB_TOKEN'] = "glpat-_9gZQ2586KsFr67vbEjp"
os.environ['COMET_GITLAB_PROJECT_ID'] = "60538231"

# Connexion aux différents outils de monitoring, etc

In [6]:
## Comet ML
comet_ml.login(api_key="g9Um8JaLLAjkjVKYPZjYLXvcP")

[1;38;5;39mCOMET INFO:[0m Valid Comet API Key saved in /content/drive/MyDrive/.comet.config (set COMET_CONFIG to change where it is saved).


# Initialisation des variables pour le modèle

In [7]:
## Récupération du modèle à fine-tune (checkpoint)

checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
datacollator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

# model = T5ForConditionalGeneration.from_pretrained(checkpoint, do_sample=True)
# datacollator = DataCollatorWithPadding(tokenizer = tokenizer)

# checkpoint = "google/flan-t5-base"

# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# checkpoint = "gpt2"
# tokenizer = GPT2TokenizerFast.from_pretrained(checkpoint)
# model = GPT2LMHeadModel.from_pretrained(checkpoint)

### Petit test du modèle avant Fine-tuning

In [8]:
## Définir les paramètres de génération
max_length = 128
num_beams = 4
temperature = 0.1

## Définir le thème ou le style du poème
theme = "Can you write a poem about dogs"

## Préparatin de l'input
encoding = tokenizer.encode_plus(theme,
                                 add_special_tokens=True,
                                 max_length=max_length,
                                 padding='max_length',
                                 truncation=True,
                                 return_attention_mask=True,
                                 return_tensors='pt')

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

## Générer le poème
output = model.generate(input_ids,
                        attention_mask=attention_mask,
                        max_length=max_length,
                        num_beams=num_beams,
                        temperature=temperature)

## Afficher le poème généré
print(tokenizer.decode(output[0], skip_special_tokens=True))



i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love dogs i love


# Récupération du dataset

In [9]:
## Récupération des fichiers via le drive
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/projet/poems_dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
## Récupérations de tous les paths des fichiers puis des topics et des types pour créer un dataframe
types_poems = []
for f in glob.glob(folder_path+"/forms/*"):
  t = re.sub(folder_path+"/forms/",'',f)
  if t not in types_poems :
    types_poems.append(t)
print("Types of poems :",types_poems)
print(len(types_poems))

topics_poems = []
for f in glob.glob(folder_path+"/topics/*"):
  t = re.sub(folder_path+"/topics/",'',f)
  if t not in topics_poems :
    topics_poems.append(t)
print("\nTopics in poems :",topics_poems)
print(len(topics_poems))

files = []
for f in glob.glob(folder_path+"/*/*/*"):
  files.append(f)
print("\nFile names :",files[:10])
print(len(files))

### Récupération des informations des données et création d'un dataframe

In [None]:
## Création d'un dataframe contenant toutes les données avec comme colonnes : path, topic, type, text
list_types, list_topics = [], []

for f in files :
  if "poems_dataset/forms/" in f :
    for typ in types_poems :
      if str("poems_dataset/forms/"+typ) in f :
        list_types.append(typ)
  else :
    list_types.append("no_type")

  if "poems_dataset/topics/" in f :
    for top in topics_poems :
      if str("poems_dataset/topics/"+top) in f :
        list_topics.append(top)
  else :
    list_topics.append("no_topic")

print(len(list_types), len(list_topics))
print(list_types[13803:])

10241 10341
[]


In [None]:
dict_data = {"path" : files, "type" : list_types[:10241], "topic" : list_topics[:10241]}

print(len(dict_data['path']))
print(len(dict_data['topic']))
print(len(dict_data['type']))

df = pd.DataFrame.from_dict(dict_data)

texts = []
for f in tqdm(files) :
  t = open(f, "r")
  txt = t.read()
  texts.append(txt)
df['text'] = texts

10241
10241
10241


  0%|          | 0/10241 [00:00<?, ?it/s]

In [10]:
## Exportation ou importation des données sous forme de fichier .json

# df.to_json(r"/content/drive/MyDrive/projet/poems_dataset_data_v2.json")
df = pd.read_json(r"/content/drive/MyDrive/projet/poems_dataset_data_v2.json")

In [11]:
display(df)

Unnamed: 0,path,type,topic,text
0,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My love is pure as honey, made of selective ne..."
1,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,The earth speaks of your discerning and stern ...
2,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My dreams stood naked, behind the burning desi..."
3,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Spring we started planting, after tilling the ..."
4,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Saving the environment, saving the nature\nWe ..."
...,...,...,...,...
10236,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Sleep has not visited me the whole night,\nWil..."
10237,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Love-cradling Night, lit by the lucent moon,\n..."
10238,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Bells overbrim with sound\nAnd spread from cup...
10239,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Come Sleep; O Sleep! the certain knot of peace...


### Transformation des données en dataset HF

In [12]:
## Choix de la caractériqtique pour le fine-tuning à faire : topic ou type, à modifier en fonction du choix

## On garde uniquement les données nécessaires au FT (topic ou type)
df_topic = df[df.topic != 'no_topic']
# display(df_topic)

# df_type = df[df.topic != 'no_type']


## Supression de la colonnes avec la caractéristique non utilisées pour le FT
df_topic = df_topic.drop('type', axis=1)
# display(df_topic)

# df_type = df_type.drop('topic', axis=1)
# display(df_type)


## Supression de la colonne "path" afin de ne plus avoir de valeurs str
## Mais création d'un autre dataframe avec les ids des données pour avoir les paths si besoin
df_topic.reset_index(inplace=True)
df_topic.rename(columns={'index': 'id'}, inplace=True)

paths = df_topic["path"].to_frame()
paths.reset_index(inplace=True)
paths.rename(columns={'index': 'id'}, inplace=True)
paths["id"] = df_topic["id"]

df_topic = df_topic.drop('path', axis=1)
display(df_topic)

## Séparation des données en train et test
train_data, test_data = train_test_split(df_topic, test_size=0.2)

## Transformation en HF Datasets
tds = Dataset.from_pandas(train_data)
vds = Dataset.from_pandas(test_data)

ds = DatasetDict()

ds['train'] = tds
ds['test'] = vds

dataset = ds.remove_columns(["__index_level_0__"])

Unnamed: 0,id,topic,text
0,2771,funny,"There once was a sister,\nwho loved to kiss he..."
1,2772,funny,▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒ ▒\nScratch here to r...
2,2773,funny,I find it kinda funny\nI find it kinda sad\nTh...
3,2774,funny,A server was a man with drinks\nA Notebook was...
4,2775,funny,"isn't it funny,\nhow the world likes to be?\ni..."
...,...,...,...
7465,10236,summer,"Sleep has not visited me the whole night,\nWil..."
7466,10237,summer,"Love-cradling Night, lit by the lucent moon,\n..."
7467,10238,summer,Bells overbrim with sound\nAnd spread from cup...
7468,10239,summer,Come Sleep; O Sleep! the certain knot of peace...


### Préparation des données pour les utiliser dans le fine-tuning

In [13]:
def tokenize_data(examples):
    inputs = tokenizer(examples['text'], padding=True, truncation=True)
    targets = tokenizer(examples['topic'], padding=True, truncation=True)

    return {'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'labels': targets['input_ids']}


train_data = dataset['train'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])
test_data = dataset['test'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])


print(train_data)
print(train_data[10])

Map:   0%|          | 0/5976 [00:00<?, ? examples/s]

Map:   0%|          | 0/1494 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5976
})
{'id': 5680, 'input_ids': [71, 286, 57, 8, 1472, 16, 82, 625, 2939, 3533, 6, 27, 183, 2238, 28, 150, 80, 82, 1472, 12, 698, 5, 37, 5261, 697, 13, 4303, 7, 9706, 2756, 6, 7762, 32, 697, 140, 16, 82, 62, 1208, 3, 102, 2242, 5, 18136, 7, 1556, 30, 5020, 11, 1481, 6, 17644, 7, 657, 11, 915, 1107, 12, 580, 5, 17857, 8, 2943, 149, 40, 7, 11, 2288, 3247, 5, 419, 14481, 53, 140, 13, 82, 3, 12076, 12432, 5, 27, 183, 16, 8, 7411, 13, 140, 1618, 3995, 120, 1659, 6, 1142, 657, 14380, 3249, 27, 228, 59, 453, 5, 71, 9111, 53, 15937, 13, 8800, 307, 657, 6, 438, 8, 20747, 31, 7, 13, 1234, 11518, 396, 1006, 5, 71, 14788, 16, 8, 2034, 12, 659, 8, 194, 6, 304, 136, 1513, 3668, 24, 2746, 12, 1049, 5, 432, 33, 2222, 12, 3, 9, 286, 57, 82, 1472, 6, 100, 19, 82, 1663, 82, 80, 3667, 5, 301, 782, 6972, 7, 19, 125, 27, 2971, 167, 6, 27, 183, 4403, 12, 577, 8, 12084, 2290, 5, 4589, 590, 10995, 57, 82, 14788, 31, 7, 659

In [14]:
df_train = pd.DataFrame(train_data.to_dict())
display(df_train.head(5))

Unnamed: 0,id,input_ids,attention_mask,labels
0,10158,"[3, 4, 15086, 2686, 18, 4059, 935, 13190, 6, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1248, 1, 0]"
1,5861,"[275, 2887, 3, 9, 418, 2035, 8602, 7, 6, 18003...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 10008, 1]"
2,9665,"[3, 9, 460, 424, 2662, 2837, 15, 7, 15, 9360, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2049, 1, 0]"
3,8075,"[333, 25, 6, 11, 278, 2, 17, 214, 149, 12, 333...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2987, 1, 0]"
4,10058,"[571, 6080, 11, 3902, 438, 20, 210, 15946, 7, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[544, 1, 0]"


In [15]:
## Affichage d'un poème et sa tokenisation
print(ds['train'][10]["text"])
print(train_data[10]["input_ids"])

A place by the fire in my old arm chair,
I am alone with no one my fire to share.
The crackling of logs burning bright,
Consoling me in my weary plight.
Shadows playing on ceiling and wall,
Ghosts past and present coming to call.
Outside the wind howls and moans.
Reminding me of my aching bones.
I am in the grip of melancholy deep,
Just past rendezvous I could not keep.
A fleeting glimpse of happiness long past,
With the echo's of words spoken too fast.
A candle in the window to light the way,
To any lost soul that wants to stay.
All are welcome to a place by my fire,
This is my wish my one desire.
Loneliness is what I fear most,
I am willing to play the welcoming host.
Come along guided by my candle's light,
And help me pass this lonely night
[71, 286, 57, 8, 1472, 16, 82, 625, 2939, 3533, 6, 27, 183, 2238, 28, 150, 80, 82, 1472, 12, 698, 5, 37, 5261, 697, 13, 4303, 7, 9706, 2756, 6, 7762, 32, 697, 140, 16, 82, 62, 1208, 3, 102, 2242, 5, 18136, 7, 1556, 30, 5020, 11, 1481, 6, 17644, 7

# Fine-tuning du modèle

### Fine-tuning

In [19]:
## Choix des métriques
accuracy = evaluate.load("accuracy")
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE metric between the decoded predictions and decoded labels.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [24]:
## Trainer mais de la lib Transformers
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model="accuracy", #"rouge1",
    greater_is_better=True,
    report_to=["comet_ml"],
    predict_with_generate=True,
    fp16=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    data_collator=datacollator,
    compute_metrics=accuracy, #compute_metrics,
    callbacks=[CometCallback()]
)

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_data,
#     eval_dataset=test_data,
#     compute_metrics=metrics,
#     callbacks=[CometCallback()]
# )

You are adding a <class 'transformers.integrations.integration_utils.CometCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
CometCallback


In [25]:
trainer.train()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : tomato_chimpanzee_8729
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/emeline-caruana/poem-generation/dc7d1dbd1c6d49b8a4a320c4e8da7a1b
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     loss : nan
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     args/_n_gpu                                  : 1
[1;38;5;39mCOMET INFO:[0m     args/_no_sync_in_gradient_accumulation       : True
[1;38;5;39mCOMET INFO:[0m     args/_setup_devices                          : cuda:0
[1;38;5;39mCOMET INF

Epoch,Training Loss,Validation Loss




TypeError: 'Accuracy' object is not callable

In [None]:
experiment.end()

In [None]:
%pip install nvidia-ml-py3

import nvidia
!nvidia-smi

Wed Jul 17 13:27:12 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   46C    P8              17W /  72W |      4MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# mlflow.set_tracking_uri("http://127.0.0.1:5000")  # si ngrok : http://127.0.0.1:5000 # si databricks : databricks
# mlflow.set_experiment("Poem Generation")  # si databricks : /Users/emelinecaruana@gmail.com/

# with mlflow.start_run(): #with mlflow.start_run(run_i)
#     trainer.train()
#     mlflow.log_metric("accuracy", trainer.evaluate()["eval_accuracy"])
#     mlflow.pytorch.log_model(model, "model")