# Imports et installation de bibliothèques necéssaires au projet

In [None]:
%pip install accelerate -U
%pip install datasets evaluate transformers[torch] torch torcheval torchmetrics
%pip install comet-ml mlflow databricks-sdk pyngrok --quiet

Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.10.0->

In [1]:
import mlflow
from mlflow import MlflowClient

import comet_ml
from transformers.integrations import CometCallback

from pyngrok import ngrok

import numpy as np
import evaluate

import re
import os
import glob
import numpy as np
import pandas as pd
from google.colab import drive
from tqdm.notebook import trange, tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# imports venant de torch
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LinearLR
from torchmetrics.classification import MulticlassAccuracy


# imports venant de tranformers
from transformers import Trainer
from transformers import TrainingArguments

import transformers
from transformers import get_scheduler
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification


# imports venant de datasets
import datasets
from datasets import load_dataset
from datasets import Dataset, DatasetDict

ModuleNotFoundError: No module named 'mlflow'

In [None]:
# Informations sur les cpu et gpu
from multiprocessing import cpu_count

print(torch.cuda.device_count())      # GPU
print(cpu_count())                    # CPU

In [None]:
## Variables d'environnement pour accéder aux différentes APIs

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"
os.environ["COMET_LOG_ASSETS"] = "True"
os.environ["COMET_API_KEY"] = "g9Um8JaLLAjkjVKYPZjYLXvcP"
os.environ["COMET_PROJECT_NAME"] = "poem-generation"

# Connexion aux différents outils de monitoring, etc

In [None]:
## MlFlow via ngrok

# ngrok.kill()
# NGROK_AUTH_TOKEN = "2ixcAblHEmYTRtDyUOxZBO8nR2p_3Zq8P9bXN4wTBwLTuB23A"
# ngrok.set_auth_token(NGROK_AUTH_TOKEN)
# public_url = ngrok.connect(addr="5000", proto="http", bind_tls=True)
# get_ipython().system_raw("mlflow ui --port 5000 &")
# print("MLflow Tracking UI:", public_url)

# client = MlflowClient(tracking_uri="http://127.0.0.1:5000")
# experiment_desc = "Poem generation"
# experiment_tags = {
#     "team_lead": "Emeline",
#     "department": "dst",
#     "project": "poem_gen",
#     "mlflow.note.content": experiment_desc
# }

# client.create_experiment("Poem Generation Project", tags=experiment_tags)

In [None]:
## Comet ML

comet_ml.login(api_key="g9Um8JaLLAjkjVKYPZjYLXvcP")

# Initialisation des variables pour le modèle

In [None]:
## Récupération du modèle à fine-tune (checkpoint)

checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
datacollator = DataCollatorWithPadding(tokenizer = tokenizer)

# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# checkpoint = "google/flan-t5-base"
# tokenizer = T5Tokenizer.from_pretrained(checkpoint)
# model = T5ForConditionalGeneration.from_pretrained(checkpoint, do_sample=True)


# checkpoint = "gpt2"
# tokenizer = GPT2TokenizerFast.from_pretrained(checkpoint)
# model = GPT2LMHeadModel.from_pretrained(checkpoint)

### Petit test du modèle avant Fine-tuning

In [None]:
## Définir les paramètres de génération
max_length = 128
num_beams = 4
temperature = 0.1

## Définir le thème ou le style du poème
theme = "Can you write a poem about dogs"

## Préparatin de l'input
encoding = tokenizer.encode_plus(theme,
                                 add_special_tokens=True,
                                 max_length=max_length,
                                 padding='max_length',
                                 truncation=True,
                                 return_attention_mask=True,
                                 return_tensors='pt')

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

## Générer le poème
output = model.generate(input_ids,
                        attention_mask=attention_mask,
                        max_length=max_length,
                        num_beams=num_beams,
                        temperature=temperature)

## Afficher le poème généré
print(tokenizer.decode(output[0], skip_special_tokens=True))

# Récupération du dataset

In [None]:
## Récupération des fichiers via le drive
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/projet/poems_dataset'

In [None]:
## Récupérations de tous les paths des fichiers puis des topics et des types pour créer un dataframe
types_poems = []
for f in glob.glob(folder_path+"/forms/*"):
  t = re.sub(folder_path+"/forms/",'',f)
  if t not in types_poems :
    types_poems.append(t)
print("Types of poems :",types_poems)
print(len(types_poems))

topics_poems = []
for f in glob.glob(folder_path+"/topics/*"):
  t = re.sub(folder_path+"/topics/",'',f)
  if t not in topics_poems :
    topics_poems.append(t)
print("\nTopics in poems :",topics_poems)
print(len(topics_poems))

files = []
for f in glob.glob(folder_path+"/*/*/*"):
  files.append(f)
print("\nFile names :",files[:10])
print(len(files))

### Récupération des informations des données et création d'un dataframe

In [None]:
## Création d'un dataframe contenant toutes les données avec comme colonnes : path, topic, type, text
list_types, list_topics = [], []

for f in files :
  if "poems_dataset/forms/" in f :
    for typ in types_poems :
      if str("poems_dataset/forms/"+typ) in f :
        list_types.append(typ)
  else :
    list_types.append("no_type")

  if "poems_dataset/topics/" in f :
    for top in topics_poems :
      if str("poems_dataset/topics/"+top) in f :
        list_topics.append(top)
  else :
    list_topics.append("no_topic")

print(len(list_types), len(list_topics))
print(list_types[13803:])

10241 10341
[]


In [None]:
dict_data = {"path" : files, "type" : list_types[:10241], "topic" : list_topics[:10241]}

print(len(dict_data['path']))
print(len(dict_data['topic']))
print(len(dict_data['type']))

df = pd.DataFrame.from_dict(dict_data)

texts = []
for f in tqdm(files) :
  t = open(f, "r")
  txt = t.read()
  texts.append(txt)
df['text'] = texts

10241
10241
10241


  0%|          | 0/10241 [00:00<?, ?it/s]

In [None]:
## Exportation ou importation des données sous forme de fichier .json

# df.to_json(r"/content/drive/MyDrive/POEI/projet/poems_dataset_data_v2.json")
df = pd.read_json(r"/content/drive/MyDrive/POEI/projet/poems_dataset_data_v2.json")

In [None]:
display(df)

Unnamed: 0,path,type,topic,text
0,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My love is pure as honey, made of selective ne..."
1,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,The earth speaks of your discerning and stern ...
2,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"My dreams stood naked, behind the burning desi..."
3,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Spring we started planting, after tilling the ..."
4,/content/drive/MyDrive/POEI/projet/poems_datas...,alexandrine,no_topic,"Saving the environment, saving the nature\nWe ..."
...,...,...,...,...
10236,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Sleep has not visited me the whole night,\nWil..."
10237,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Love-cradling Night, lit by the lucent moon,\n..."
10238,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Bells overbrim with sound\nAnd spread from cup...
10239,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Come Sleep; O Sleep! the certain knot of peace...


### Transformation des données en dataset HF

In [1]:
## Encoding des données textuelles 'topic' et 'type' en données numériques
# encoded_df = df

# le = LabelEncoder()
# le.fit(df['type'])
# df_type_encoded = le.transform(df['type'])
# print(df_type_encoded)
# encoded_df['type'] = df_type_encoded

# le.fit(df['topic'])
# df_topic_encoded = le.transform(df['topic'])
# print(df_topic_encoded)
# encoded_df['topic'] = df_topic_encoded

# display(encoded_df.head(5))

NameError: name 'df' is not defined

In [None]:
## Séparation des données en train et test

# On garde uniquement les données nécessaires au FT (topic ou type)
df_topic = df[df.topic != 'no_topic']
df_type = df[df.topic != 'no_type']

train_data, test_data = train_test_split(df_topic, test_size=0.2)

tds = Dataset.from_pandas(train_data)
vds = Dataset.from_pandas(test_data)

ds = DatasetDict()

ds['train'] = tds
ds['test'] = vds

dataset = ds.remove_columns(["__index_level_0__"])
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['path', 'type', 'topic', 'text'],
        num_rows: 5976
    })
    test: Dataset({
        features: ['path', 'type', 'topic', 'text'],
        num_rows: 1494
    })
})


### Préparation des données pour les utiliser dans le fine-tuning

In [None]:
print(tokenizer.bos_token)

None


In [None]:
def tokenize_data(examples):
  if tokenizer.bos_token is None:
    bos_token = str(tokenizer.vocab_size + 1)
    tokenizer.add_special_tokens({'bos_token': bos_token})
    tokenizer.bos_token = bos_token

  inputs = tokenizer(examples['text'], padding=True, truncation=True)
  targets = tokenizer(examples['topic'], padding=True, truncation=True)

  decoder_input_ids = []
  for target in targets['input_ids']:
    decoder_input_ids.append([tokenizer.bos_token_id] + target)

  return {'input_ids': inputs['input_ids'],
          'attention_mask': inputs['attention_mask'],
          'decoder_input_ids': decoder_input_ids,
          'labels': targets['input_ids']}


train_data = dataset['train'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])
test_data = dataset['test'].map(tokenize_data, batched=True, remove_columns=['text', 'topic'])


print(train_data)
print(train_data[10])

Map:   0%|          | 0/5976 [00:00<?, ? examples/s]

Map:   0%|          | 0/1494 [00:00<?, ? examples/s]

Dataset({
    features: ['path', 'type', 'input_ids', 'attention_mask', 'decoder_input_ids', 'labels'],
    num_rows: 5976
})
{'path': '/content/drive/MyDrive/POEI/projet/poems_dataset/topics/hope/HopePoemsLamentationsOfJeremiahIiiHopeOfReliefThroughGodsMercyPoembyProphetJeremiah.txt', 'type': 'no_type', 'input_ids': [209, 27, 183, 8, 388, 24, 3, 547, 107, 894, 3, 4127, 2176, 1575, 57, 8, 6102, 13, 112, 3, 210, 1795, 107, 5, 204, 216, 3, 547, 107, 2237, 140, 6, 11, 1940, 140, 139, 14882, 6, 68, 59, 139, 659, 5, 220, 3, 28186, 581, 140, 19, 3, 88, 2120, 117, 3, 88, 919, 15, 189, 112, 609, 581, 140, 66, 8, 239, 5, 314, 499, 15634, 11, 82, 1133, 3, 547, 107, 3, 88, 263, 625, 117, 3, 88, 3, 547, 107, 4335, 82, 12432, 5, 305, 216, 3, 547, 107, 918, 15, 26, 581, 140, 6, 11, 2890, 3974, 26, 140, 28, 12486, 11, 2954, 5, 431, 216, 3, 547, 107, 356, 140, 16, 2164, 1747, 6, 38, 79, 24, 36, 3654, 13, 625, 5, 489, 216, 3, 547, 107, 18179, 26, 140, 81, 6, 24, 27, 1178, 129, 91, 10, 3, 88, 3, 547, 10

In [None]:
df_train = pd.DataFrame(train_data.to_dict())
display(df_train.head(5))

Unnamed: 0,path,type,input_ids,attention_mask,decoder_input_ids,labels
0,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,"[314, 1808, 26818, 44, 5190, 318, 159, 793, 31...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 706, 1, 0]","[706, 1, 0]"
1,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,"[328, 228, 470, 734, 125, 3, 76, 356, 91, 204,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 2324, 1, 0]","[2324, 1, 0]"
2,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,"[9438, 28, 140, 2321, 82, 609, 3197, 140, 885,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 2595, 1, 0]","[2595, 1, 0]"
3,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,"[37, 1969, 65, 3, 7483, 182, 23147, 275, 34, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 5796, 1, 0]","[5796, 1, 0]"
4,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,"[19451, 4262, 9083, 55, 3645, 3, 10770, 1655, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[32100, 4033, 1, 0]","[4033, 1, 0]"


In [None]:
## Affichage de quelques poèmes et leur tokenisation

print(ds['train'][10]["text"])
print(len(train_data[10]["input_ids"]))
print(train_data[10]["input_ids"])

print("\n",ds['train'][15]["text"])
print(len(train_data[15]["input_ids"]))
print(train_data[15]["input_ids"])

All creatures deep inside are quite aware
that it's the heart and, with it, its sweet beat
only its never-ending flutter lets us share
a bit of time upon this planet, where we meet
so many creatures and, above all real humans
who all do strive to stay as long as they're allowed
they fiddle with their health and look at cardiac lumens
and hope the gods remember what the sheep have vowed.
There is a little thing that's often mentioned
it does concern our interpersonal communication
no matter, highly motivated or so well-intentioned
it's what may make us  give enough consideration
to one small word, and that is, vaguely, called respect.
Respect for the law
and for Ma and Pa,
for the crooked Police
for Canadian geese,
for the Judges and Preachers
and for needlework teachers
for the butchers and bakers
and the skilled coffinmakers,
for the nurses and bitches
and burglars and snitches
for all druggies and whores
and the owners of stores
thus, the list could continue
to a different venue
but 

# Fine-tuning du modèle

### Fine-tuning

In [None]:
## Trainer mais de la lib Transformers
%pip install comet-ml
import comet_ml
from transformers.integrations import CometCallback

metrics = evaluate.load("accuracy")

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to=["comet_ml"]
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=metrics,
    callbacks=[CometCallback()]
)

# ## Define a custom forward function with gradient checkpointing
# def custom_forward(model, inputs):
#     def custom_forward_fn(*inputs):
#         inputs = {k: v for k, v in inputs.items()}
#         outputs = model(**inputs, output_hidden_states=True)
#         return outputs.last_hidden_state

#     return checkpoint(custom_forward_fn, *inputs)

# ## Update the model's forward function to use gradient checkpointing
# model.forward = custom_forward

trainer.train()



You are adding a <class 'transformers.integrations.integration_utils.CometCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
CometCallback
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/emeline-caruana/poem-generation/ab4929c42b3a4c23b7618019cabe3000

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : energetic_rasp_9062
[1;38;5;39mCOMET

OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 

In [None]:
%pip install nvidia-ml-py3

import nvidia
!nvidia-smi

Wed Jul 17 13:27:12 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   46C    P8              17W /  72W |      4MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# mlflow.set_tracking_uri("http://127.0.0.1:5000")  # si ngrok : http://127.0.0.1:5000 # si databricks : databricks
# mlflow.set_experiment("Poem Generation")  # si databricks : /Users/emelinecaruana@gmail.com/

# with mlflow.start_run(): #with mlflow.start_run(run_i)
#     trainer.train()
#     mlflow.log_metric("accuracy", trainer.evaluate()["eval_accuracy"])
#     mlflow.pytorch.log_model(model, "model")

In [None]:
# ## Trainer mais de la lib MosaicML
# from composer import Trainer

# # model.resize_token_embeddings(len(tokenizer))

# model_composer = HuggingFaceModel(model, use_logits = True, tokenizer = tokenizer, metrics = metrics_entropy)


# ## Ajustement des paramètres
# opt = AdamW(params = model_composer.parameters(), lr = 5e-5,weight_decay = 0.01, betas = (0.0, 0.99))

# ## Fine-tuning du modèle
# trainer = Trainer(model = model_composer,
#                   train_dataloader = train_loader,
#                   eval_dataloader = eval_loader,
#                   max_duration = '2ep')

# trainer.fit()