# Imports et installation de bibliothèques necéssaires au projet

In [1]:
%pip install accelerate -U
%pip install datasets evaluate transformers[torch] torch torcheval torchmetrics rouge_score
%pip install comet-ml



In [2]:
## imports pour le suivi d'expériences
import comet_ml
from comet_ml import API
from comet_ml import Experiment

## imports
import re
import os
import glob
import evaluate
import numpy as np
import pandas as pd
from google.colab import drive
from tqdm.notebook import trange, tqdm
from sklearn.model_selection import train_test_split


## imports venant de torch
import torch
from torch.optim import AdamW


## imports venant de tranformers
import transformers
from transformers import pipeline, GenerationConfig
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import get_scheduler, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding, DataCollatorForSeq2Seq


## imports venant de datasets
import datasets
from datasets import load_metric
from datasets import Dataset, DatasetDict

In [3]:
## Informations sur les cpu et gpu
from multiprocessing import cpu_count

print(torch.cuda.device_count())      # GPU
print(cpu_count())                    # CPU

1
12


In [4]:
## Variables d'environnement pour accéder aux différentes APIs
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

os.environ["COMET_LOG_ASSETS"] = "True"
os.environ["COMET_API_KEY"] = "g9Um8JaLLAjkjVKYPZjYLXvcP"
os.environ["COMET_PROJECT_NAME"] = "poem-gen-ft-v2-2"

os.environ['COMET_GITLAB_URL'] = "https://gitlab.com/emeline-caruana"
os.environ['COMET_GITLAB_TOKEN'] = "glpat-_9gZQ2586KsFr67vbEjp"
os.environ['COMET_GITLAB_PROJECT_ID'] = "60538231"

# Connexion aux différents outils de monitoring, etc

In [5]:
## Comet ML
comet_ml.login(api_key="g9Um8JaLLAjkjVKYPZjYLXvcP")

[1;38;5;39mCOMET INFO:[0m Valid Comet API Key saved in /content/drive/MyDrive/.comet.config (set COMET_CONFIG to change where it is saved).


# Initialisation des variables pour le modèle

In [7]:
## Récupération du modèle à fine-tune (checkpoint)
# checkpoint = "google/flan-t5-base"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = T5ForConditionalGeneration.from_pretrained(checkpoint, do_sample=True)

model = T5ForConditionalGeneration.from_pretrained("/tmp/tmpegx4cr_p/t5-finetuned")
tokenizer = T5Tokenizer.from_pretrained("/tmp/tmpegx4cr_p/t5-finetuned")
print(model)


# checkpoint = "gpt2"
# tokenizer = GPT2TokenizerFast.from_pretrained(checkpoint)
# model = GPT2LMHeadModel.from_pretrained(checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

### Petit test du modèle avant Fine-tuning

In [8]:
test_pip = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0)

result = test_pip("Write a haiku about dogs", max_new_tokens=120)
print("\nPoème généré :\n", result[0]['generated_text'])


Poème généré :
 In winter the spetticoats wilt: The whites of my lips have writ, and the fanned horn gleams in the light. A hive at once nigh, It is no harp, nor a whip, For the hath the twitching cocky face, Nor in the twigs that wear, Let light the horns blow; Some of these are humming the sound of a song, As they sing the gentle breeze, They can


# Récupération du dataset

In [9]:
## Récupération des fichiers via le drive
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/projet/poems_dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##### Récupération des paths, "metadas" pour tout mettre dans un dataframe

In [None]:
## Récupérations de tous les paths des fichiers puis des topics et des types pour créer un dataframe
types_poems = []
for f in glob.glob(folder_path+"/forms/*"):
  t = re.sub(folder_path+"/forms/",'',f)
  if t not in types_poems :
    types_poems.append(t)
print("Types of poems :",types_poems)
print(len(types_poems))

topics_poems = []
for f in glob.glob(folder_path+"/topics/*"):
  t = re.sub(folder_path+"/topics/",'',f)
  if t not in topics_poems :
    topics_poems.append(t)
print("\nTopics in poems :",topics_poems)
print(len(topics_poems))

files = []
for f in glob.glob(folder_path+"/*/*/*"):
  files.append(f)
print("\nFile names :",files[:10])
print(len(files))

Types of poems : []
0

Topics in poems : []
0

File names : []
0


##### Récupération des informations des données et création d'un dataframe

In [None]:
## Création d'un dataframe contenant toutes les données avec comme colonnes : path, topic, type, text
list_types, list_topics = [], []

for f in files :
  if "poems_dataset/forms/" in f :
    for typ in types_poems :
      if str("poems_dataset/forms/"+typ) in f :
        list_types.append(typ)
  else :
    list_types.append("no_type")

  if "poems_dataset/topics/" in f :
    for top in topics_poems :
      if str("poems_dataset/topics/"+top) in f :
        list_topics.append(top)
  else :
    list_topics.append("no_topic")

print(len(list_types), len(list_topics))
print(list_types[13803:])

0 0
[]


In [None]:
dict_data = {"path" : files, "type" : list_types[:10241], "topic" : list_topics[:10241]}

print(len(dict_data['path']))
print(len(dict_data['topic']))
print(len(dict_data['type']))

df = pd.DataFrame.from_dict(dict_data)

texts = []
for f in tqdm(files) :
  t = open(f, "r")
  txt = t.read()
  texts.append(txt)
df['text'] = texts

0
0
0


0it [00:00, ?it/s]

##### Enregistrement ou récupération du dataframe sous forme .json

In [10]:
## Exportation ou importation des données sous forme de fichier .json
# df.to_json(r"/content/drive/MyDrive/projet/poems_dataset_data_v2.json")
df = pd.read_json(r"/content/drive/MyDrive/projet/poems_dataset_data.json")

In [11]:
display(df)

Unnamed: 0,path,type,topic,text
0,/content/drive/MyDrive/POEI/projet/poems_datas...,abecedarian,no_topic,Always Be Chaste\nDesire Encourages Fornicatio...
1,/content/drive/MyDrive/POEI/projet/poems_datas...,abecedarian,no_topic,Precambrian Era (4600 to 542.0 million years a...
2,/content/drive/MyDrive/POEI/projet/poems_datas...,abecedarian,no_topic,Angry at you because you did not really pay at...
3,/content/drive/MyDrive/POEI/projet/poems_datas...,abecedarian,no_topic,"Introspecting Life - Abecedarian\nDecember 4, ..."
4,/content/drive/MyDrive/POEI/projet/poems_datas...,abecedarian,no_topic,Aye I call you a pig and hog! Don’t worry same...
...,...,...,...,...
13798,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Sleep has not visited me the whole night,\nWil..."
13799,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,"Love-cradling Night, lit by the lucent moon,\n..."
13800,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Bells overbrim with sound\nAnd spread from cup...
13801,/content/drive/MyDrive/POEI/projet/poems_datas...,no_type,summer,Come Sleep; O Sleep! the certain knot of peace...


### Transformation des données en dataset HF

In [12]:
## Supression de la colonne "path" afin de ne plus avoir de valeurs str
## Mais création d'un autre dataframe avec les ids des données pour avoir les paths si besoin
df.reset_index(inplace=True)
df.rename(columns={'index': 'id'}, inplace=True)

paths = df["path"].to_frame()
paths.reset_index(inplace=True)
paths.rename(columns={'index': 'id'}, inplace=True)
paths["id"] = df["id"]

df = df.drop('path', axis=1)
display(df)

## Séparation des données en train et test
train_data, test_data = train_test_split(df, test_size=0.2)

## Transformation en HF Datasets
tds = Dataset.from_pandas(train_data)
vds = Dataset.from_pandas(test_data)

ds = DatasetDict()

ds['train'] = tds
ds['test'] = vds

dataset = ds.remove_columns(["__index_level_0__"])

Unnamed: 0,id,type,topic,text
0,0,abecedarian,no_topic,Always Be Chaste\nDesire Encourages Fornicatio...
1,1,abecedarian,no_topic,Precambrian Era (4600 to 542.0 million years a...
2,2,abecedarian,no_topic,Angry at you because you did not really pay at...
3,3,abecedarian,no_topic,"Introspecting Life - Abecedarian\nDecember 4, ..."
4,4,abecedarian,no_topic,Aye I call you a pig and hog! Don’t worry same...
...,...,...,...,...
13798,13798,no_type,summer,"Sleep has not visited me the whole night,\nWil..."
13799,13799,no_type,summer,"Love-cradling Night, lit by the lucent moon,\n..."
13800,13800,no_type,summer,Bells overbrim with sound\nAnd spread from cup...
13801,13801,no_type,summer,Come Sleep; O Sleep! the certain knot of peace...


### Préparation des données pour les utiliser dans le fine-tuning

In [13]:
## Tokenisation des données : transformation des textes en liste d'id de mots
## labels : poèmes
## inputs : topic ou type du poème

def tokenize_data(examples):
    if examples['type'] is None or examples['text'] is None:
        return {'input_ids': [], 'attention_mask': [], 'targets': []}

    inputs = tokenizer(examples['type'], padding=True, max_length=128, truncation=True)
    targets = tokenizer(examples['text'], padding=True, max_length=256, truncation=True)

    return {
        'input_ids': inputs.get('input_ids', []),
        'attention_mask': inputs.get('attention_mask', []),
        'labels': targets.get('input_ids', [])
    }


train_data = dataset['train'].map(tokenize_data, batched=True, remove_columns=['id','text', 'topic', 'type'])
test_data = dataset['test'].map(tokenize_data, batched=True, remove_columns=['id', 'text', 'topic', 'type'])


print(train_data)
print(train_data[10])

Map:   0%|          | 0/11042 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 11042
})
{'input_ids': [150, 834, 6137, 1, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0], 'labels': [96, 279, 3216, 6, 6019, 6, 3, 17, 9492, 2265, 2943, 535, 71, 1343, 45, 270, 6, 275, 27, 1522, 22428, 3, 189, 63, 5792, 6522, 6404, 3, 9, 11482, 5, 27, 103, 59, 333, 3, 189, 63, 2983, 11, 3, 7, 109, 15, 17, 955, 3, 17686, 14428, 117, 366, 27, 398, 4418, 42, 7334, 12, 1978, 499, 19275, 12, 15, 7, 5, 242, 572, 225, 27, 36, 1095, 42, 262, 31, 35, 36, 3, 935, 651, 6, 86, 1969, 163, 9695, 21, 6176, 42, 1276, 1208, 5, 499, 2053, 33, 1131, 6, 82, 7524, 33, 1692, 499, 11581, 2515, 3481, 720, 17, 31, 29, 117, 5791, 63, 3, 29, 5937, 53, 17387, 103, 189, 3, 15, 31, 35, 4285, 3, 11889, 32, 31, 82, 3, 20348, 5, 27, 183, 2107, 6, 150, 1052, 149, 27, 1978, 955, 10366, 15, 140, 117, 411, 4272, 6, 2123, 1207, 26, 7, 43, 3, 7, 425, 27, 6899, 9, 532, 8, 15, 55, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
df_train = pd.DataFrame(train_data.to_dict())
display(df_train.head(5))

Unnamed: 0,input_ids,attention_mask,labels
0,"[150, 834, 6137, 1, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 0, 0, 0, 0, 0]","[12574, 30, 8, 1442, 5956, 6, 275, 3412, 30, 8..."
1,"[150, 834, 6137, 1, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 0, 0, 0, 0, 0]","[465, 2102, 4054, 131, 149, 27, 473, 6, 955, 1..."
2,"[5839, 17, 1, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 0, 0, 0, 0, 0, 0]","[5620, 15, 26, 57, 46, 5087, 31, 7, 25039, 655..."
3,"[150, 834, 6137, 1, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 0, 0, 0, 0, 0]","[3, 11944, 9710, 6, 9023, 318, 232, 2042, 55, ..."
4,"[19033, 1607, 1, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 0, 0, 0, 0, 0, 0]","[947, 31, 7, 3, 9, 15708, 5413, 26, 109, 21, 2..."


In [15]:
## Affichage d'un poème et sa tokenisation
print(ds['train'][10]["text"])
print(train_data[10]["labels"])

"Blow, blow, thou winter wind."
Away from here,
And I shall greet thy passing breath
Without a tear.
I do not love thy snow and sleet
Or icy flows;
When I must jump or stamp to warm
My freezing toes.
For why should I be happy or
E'en be merry,
In weather only fitted for
Cook or Peary.
My eyes are red, my lips are blue
My ears frost bitt'n;
Thy numbing kiss doth e'en extend
Thro' my mitten.
I am cold, no matter how I warm
Or clothe me;
O Winter, greater bards have sung
I loathe thee!
[96, 279, 3216, 6, 6019, 6, 3, 17, 9492, 2265, 2943, 535, 71, 1343, 45, 270, 6, 275, 27, 1522, 22428, 3, 189, 63, 5792, 6522, 6404, 3, 9, 11482, 5, 27, 103, 59, 333, 3, 189, 63, 2983, 11, 3, 7, 109, 15, 17, 955, 3, 17686, 14428, 117, 366, 27, 398, 4418, 42, 7334, 12, 1978, 499, 19275, 12, 15, 7, 5, 242, 572, 225, 27, 36, 1095, 42, 262, 31, 35, 36, 3, 935, 651, 6, 86, 1969, 163, 9695, 21, 6176, 42, 1276, 1208, 5, 499, 2053, 33, 1131, 6, 82, 7524, 33, 1692, 499, 11581, 2515, 3481, 720, 17, 31, 29, 117, 5791, 

In [16]:
decoded_text_topic = tokenizer.decode(train_data[10]["input_ids"], skip_special_tokens=True)
decoded_text = tokenizer.decode(train_data[10]["labels"], skip_special_tokens=True)
print(decoded_text_topic)
print(decoded_text)

no_type
"Blow, blow, thou winter wind." Away from here, And I shall greet thy passing breath Without a tear. I do not love thy snow and sleet Or icy flows; When I must jump or stamp to warm My freezing toes. For why should I be happy or E'en be merry, In weather only fitted for Cook or Peary. My eyes are red, my lips are blue My ears frost bitt'n; Thy numbing kiss doth e'en extend Thro' my mitten. I am cold, no matter how I warm Or clothe me; O Winter, greater bards have sung I loathe thee!


# Fine-tuning du modèle

### Fine-tuning

In [17]:
## Data collator
datacollator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## Optimisation
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

num_training_steps = len(train_data) * 25 #25 = num_train_epoch des training arguments
scheduler = get_scheduler(name="cosine", optimizer=optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)


## Choix des métriques
accuracy = evaluate.load("accuracy")
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def clean_up_special_tokens(decoded_texts):
    return [re.sub(r'<[^>]+>', '', text) for text in decoded_texts]


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE metric between the decoded predictions and decoded labels.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [18]:
## Trainer mais de la lib Transformers
from transformers import GenerationConfig

# Create a GenerationConfig object
gen_config = GenerationConfig(
    do_sample=True,
    max_length=50,
    num_beams=5,
    temperature=0.7
)

training_args = Seq2SeqTrainingArguments(
    output_dir='./results/comet_ft/ft/run3',
    num_train_epochs=25,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    report_to=["comet_ml"],
    predict_with_generate=True,
    fp16=False
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    data_collator=datacollator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

In [19]:
trainer.train()

# Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file
# (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead.
# This warning will be raised to an exception in v4.41.
# Non-default generation parameters: {'do_sample': True}
# There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/emeline-caruana/poem-gen-ft-v2-2/930ba5daac53481e9b157538b6620b62

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,2.5128,2.41964,0.0709,0.0034,0.0558,0.0558,18.9073
2,2.4809,2.414553,0.0709,0.0033,0.0569,0.0569,18.9268
4,2.4536,2.411908,0.0712,0.0033,0.0569,0.0568,18.9402
6,2.463,2.408804,0.0721,0.0037,0.0571,0.0571,18.9265
8,2.4498,2.406216,0.072,0.0035,0.0575,0.0575,18.9529
10,2.4518,2.403694,0.0707,0.0033,0.0566,0.0567,18.9124
12,2.4248,2.402289,0.0714,0.0037,0.0567,0.0567,18.9044
14,2.424,2.400268,0.0714,0.0034,0.057,0.057,18.9221
16,2.4359,2.397915,0.071,0.0032,0.0566,0.0566,18.8957
18,2.4138,2.395446,0.0728,0.0037,0.0575,0.0574,18.8913


Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-default generation parameters: {'do_sample': True}
Non-defaul

TrainOutput(global_step=17250, training_loss=2.4371810745018117, metrics={'train_runtime': 19650.7539, 'train_samples_per_second': 14.048, 'train_steps_per_second': 0.878, 'total_flos': 5518866078056448.0, 'train_loss': 2.4371810745018117, 'epoch': 24.98189717595945})

### Enregistrement du modèle dans Comet

In [20]:
# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("/content/results/comet_ft/ft/run3/checkpoint-5524")
tokenizer = T5Tokenizer.from_pretrained("/content/results/comet_ft/ft/run3/checkpoint-5524")

# Create a Comet experiment
experiment = Experiment(api_key="g9Um8JaLLAjkjVKYPZjYLXvcP", project_name="poem-gen-ft-v2-2")

# Create a directory to save the model
model_dir = "t5-finetuned"
os.makedirs(model_dir, exist_ok=True)

# Save the model and tokenizer to the directory
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Log the model to Comet
experiment.log_model("t5-finetuned", model_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/emeline-caruana/poem-gen-ft-v2-2/fc2d3caec86243ef90f288667ec4c377

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
Non-default generation parameters: {'do_sample': True}


[('generation_config.json',
  {'web': 'https://www.comet.com/api/asset/download?assetId=606b32aeb14c4be2a6b7be6ea7127db2&experimentKey=fc2d3caec86243ef90f288667ec4c377',
   'api': 'https://www.comet.com/api/rest/v2/experiment/asset/get-asset?assetId=606b32aeb14c4be2a6b7be6ea7127db2&experimentKey=fc2d3caec86243ef90f288667ec4c377',
   'assetId': '606b32aeb14c4be2a6b7be6ea7127db2'}),
 ('added_tokens.json',
  {'web': 'https://www.comet.com/api/asset/download?assetId=f309deabc8364c7ebfb28c3e996329c5&experimentKey=fc2d3caec86243ef90f288667ec4c377',
   'api': 'https://www.comet.com/api/rest/v2/experiment/asset/get-asset?assetId=f309deabc8364c7ebfb28c3e996329c5&experimentKey=fc2d3caec86243ef90f288667ec4c377',
   'assetId': 'f309deabc8364c7ebfb28c3e996329c5'}),
 ('model.safetensors',
  {'web': 'https://www.comet.com/api/asset/download?assetId=f676c1317b2e406e849ea81d14ff3fd5&experimentKey=fc2d3caec86243ef90f288667ec4c377',
   'api': 'https://www.comet.com/api/rest/v2/experiment/asset/get-asset?

In [23]:
api=API()
experiment=api.get("emeline-caruana/poem-gen-ft-v2-2/tight_archipelago_986")
experiment.register_model("t5-finetuned")

[1;38;5;39mCOMET INFO:[0m Successfully registered 't5-finetuned', version None in workspace 'emeline-caruana'


### Test post FT

In [24]:
model = api.get_model("emeline-caruana", "t5-finetuned")
md= model.download("1.2.0")

[1;38;5;39mCOMET INFO:[0m Remote Model 'emeline-caruana/t5-finetuned:1.2.0' download has been started asynchronously.
[1;38;5;39mCOMET INFO:[0m Still downloading 7 file(s), remaining 945.25 MB/945.25 MB
[1;38;5;39mCOMET INFO:[0m Still downloading 1 file(s), remaining 765.47 MB/945.25 MB, Throughput 11.97 MB/s, ETA ~64s
[1;38;5;39mCOMET INFO:[0m Still downloading 1 file(s), remaining 550.47 MB/945.25 MB, Throughput 14.32 MB/s, ETA ~39s
[1;38;5;39mCOMET INFO:[0m Still downloading 1 file(s), remaining 328.47 MB/945.25 MB, Throughput 14.78 MB/s, ETA ~23s
[1;38;5;39mCOMET INFO:[0m Still downloading 1 file(s), remaining 121.47 MB/945.25 MB, Throughput 13.78 MB/s, ETA ~9s
[1;38;5;39mCOMET INFO:[0m Remote Model 'emeline-caruana/t5-finetuned:1.2.0' has been successfully downloaded.
[1;38;5;39mCOMET INFO:[0m Downloaded asset files is in '/tmp/tmpwupgh0sm' folder.


In [26]:
model_ft = T5ForConditionalGeneration.from_pretrained("/tmp/tmpwupgh0sm/t5-finetuned")
tokenizer_ft = T5Tokenizer.from_pretrained("/tmp/tmpwupgh0sm/t5-finetuned")

pip = pipeline("text2text-generation", model=model_ft, tokenizer=tokenizer_ft, device=0)

result = pip("Write a poem about dogs", max_new_tokens=120)
print("\nPoème généré :\n", result[0]['generated_text'])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Poème généré :
 The grass lays on the soil, Grass and wild flowers do play And the trees are green and green And equatorial, and blue, green equatorial, and orange, And the leaves of flowers do wander, and the grass is ripen. In the land lies an equatorial plain, A forest of bare trees stands proudly on its branches Like a child's ear. The children play and play, and the men play on their shirts As they stand proudly on the rocks And the wood is red
