
# FineTune Using CoT

- Chain of thougth transcribiendo y traduciendo.
- Lora using Querys and Keys.


## 1  Instalación de dependencias

In [1]:
! pip install backoff transformers==4.49 peft
! pip install sacrebleu unbabel-comet



In [2]:
from google.colab import drive
import os

# Montar Google Drive
drive.mount('/content/drive')

# Rutas de los archivos zip en Google Drive
test_zip_path = '/content/drive/MyDrive/test.zip'
dev_zip_path = '/content/drive/MyDrive/dev.zip'

# Directorio de destino en Colab
colab_content_path = '/content/'

# Copiar los archivos zip a Colab
!cp "{test_zip_path}" "{colab_content_path}"
!cp "{dev_zip_path}" "{colab_content_path}"

print(f"Archivos copiados a {colab_content_path}")
print(os.listdir(colab_content_path))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Archivos copiados a /content/
['.config', 'checkpoints', 'dev.zip', 'dev', 'test.zip', 'test', 'drive', 'sample_data']


In [3]:
!unzip /content/test.zip -d /content/test
!unzip /content/dev.zip -d /content/dev

Archive:  /content/test.zip
replace /content/test/test.lst? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/test/606/slides.pptx? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/test/606/606.m4a? [y]es, [n]o, [A]ll, [N]one, [r]ename: Archive:  /content/dev.zip
replace /content/dev/dev.lst? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

## 2  Configuración principal

In [1]:
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

import torch

model_name = 'microsoft/Phi-4-multimodal-instruct'
processor = AutoProcessor.from_pretrained(model_name,trust_remote_code = True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    # flash_attention_2 eager sdpa
    _attn_implementation='sdpa',
)

generation_config = GenerationConfig.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
  lambda i: encoder_checkpoint_wrapper(


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### 3.1  LoRA

In [2]:

from peft import LoraConfig, get_peft_model


lora_config = LoraConfig(
    r=8,                         # rango LoRA
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj","k_proj"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 995,328 || all params: 5,575,455,552 || trainable%: 0.0179




### 3.2 Data

In [3]:
from pathlib import Path
import subprocess, soundfile as sf, datetime as dt

FFMPEG = "ffmpeg"          # o la ruta absoluta si no está en PATH

def m4a_to_wav(path_m4a, sr_out=16_000):
    """Convierte 1 × .m4a → .wav (mono, 16 kHz) y devuelve la ruta del WAV."""
    path_m4a = Path(path_m4a)
    wav_path = path_m4a.with_suffix(".wav")
    subprocess.run(
        [FFMPEG, "-loglevel", "error", "-y", "-i", str(path_m4a),
         "-ac", "1", "-ar", str(sr_out), str(wav_path)],
        check=True
    )
    return wav_path



def get_files(folder='test'):
  with open(f'/content/{folder}/{folder}.lst', 'r', encoding='utf-8') as f:
    files = f.read()
    files = files.splitlines()
    print(files)
    return files



In [4]:
import datasets as ds, soundfile as sf, datetime as dt, numpy as np
from pathlib import Path


def build_dataset(folder="train", language="es", files= None ):
    rows = []
    files = get_files(folder) if files is None else files
    for name in files:
        # ── leer anotaciones (NO cargamos audio) ────────────────────────
        with open(f"/content/{folder}/{name}/manual_translations/{language}/{name}.lst") as f:
            times = [(float(a),float(b)) for a,b in [ln.split() for ln in f.read().splitlines()]]
        with open(f"/content/{folder}/{name}/manual_translations/{language}/{name}.en") as f:
            src = f.read().splitlines()
        with open(f"/content/{folder}/{name}/manual_translations/{language}/{name}.{language}") as f:
            tgt = f.read().splitlines()

        assert len(times) == len(src) == len(tgt)
        wav_path = str(m4a_to_wav(f"/content/{folder}/{name}/{name}.m4a"))

        for (s,e), txt_in, txt_out in zip(times, src, tgt):
            rows.append({
                "wav_path": wav_path,
                "start":    s,
                "end":      e,
                "source_text": txt_in,
                "target_text": txt_out,
            })

    return ds.Dataset.from_list(rows)
train_ds = build_dataset("dev", files=['500011', '624000', '609'])
val_ds   = build_dataset("dev", files=['550000','592'])
test_ds   = build_dataset("test")

['404', '596001', '605000', '606', '545006']


In [5]:
print(train_ds)
print(val_ds)
print(test_ds)

Dataset({
    features: ['wav_path', 'start', 'end', 'source_text', 'target_text'],
    num_rows: 708
})
Dataset({
    features: ['wav_path', 'start', 'end', 'source_text', 'target_text'],
    num_rows: 742
})
Dataset({
    features: ['wav_path', 'start', 'end', 'source_text', 'target_text'],
    num_rows: 1405
})


In [6]:
import torch, soundfile as sf
from torch.nn.utils.rnn import pad_sequence

class CollatorPhi4Audio:
    def __init__(self, processor, answer_suffix="<|end|><|endoftext|>",
                 sr=16_000, ignore_idx=-100, cache=True, supervise_cot=False):
        self.proc, self.sr = processor, sr
        self.pad_id, self.suffix = processor.tokenizer.pad_token_id, answer_suffix
        self.ignore, self.cache_fd = ignore_idx, {} if cache else None
        self.supervise_cot = supervise_cot

    # util para no reabrir el mismo WAV a cada segmento
    def _read_segment(self, path, start, end):
        if self.cache_fd is not None:
            if path not in self.cache_fd:
                self.cache_fd[path] = sf.SoundFile(path)
            f = self.cache_fd[path]
        else:
            f = sf.SoundFile(path)

        frames = int((end - start) * self.sr)
        f.seek(int(start * self.sr))
        audio = f.read(frames, dtype="float32")
        return audio

    def __call__(self, batch):
        prompts, answers, wavs = [], [], []

        for ex in batch:
            prompts.append(

                self.proc.tokenizer.apply_chat_template(
                    [{"role":"user",
                      "content":"<|audio_1|>\nTranscribe the audio to text, and then translate the audio to es-ES. Use <sep> as a separator between the original transcript and the translation"}],
                    tokenize=False, add_generation_prompt=True)
            )
            answers.append(ex["source_text"]+ '<sep>'+ ex["target_text"] + self.suffix)

            wavs.append(
                (self._read_segment(ex["wav_path"], ex["start"], ex["end"]), self.sr)
            )

        # ---- Texto ----------------------------------------------------
        tok_in   = self.proc.tokenizer(prompts, return_tensors="pt", padding=True)
        tok_ans  = self.proc.tokenizer(answers, return_tensors="pt", padding=True)


        # ---- Audio → embeddings (pad internamente) -------------------
        # aud_feats = self.proc(
        #     wavs, sampling_rate=self.sr, return_tensors="pt", padding=True
        # )  # devuelve dict con 'input_audio_embeds', 'audio_attention_mask'


        input_ids = torch.cat([tok_in.input_ids, tok_ans.input_ids], dim=1)
        attn_mask = (input_ids != self.pad_id).long()

        # Inicialmente ignoramos todo
        labels = torch.full_like(input_ids, self.ignore)

        # Colocamos como labels toda la respuesta (ans_len)
        ans_len = tok_ans.input_ids.size(1)
        offset  = input_ids.size(1) - ans_len
        labels[:, offset:] = tok_ans.input_ids

        # ─── ENMASCARAR SOLO EL COT (antes de <sep>) ──────────────────────────────
        if not self.supervise_cot:
            # tokens que codifican "<sep>"
            sep_tok = self.proc.tokenizer("<sep>", add_special_tokens=False).input_ids
            sep_len = len(sep_tok)

            for i in range(input_ids.size(0)):       # por cada ejemplo del batch
                ans_tokens = tok_ans.input_ids[i]    # solo la parte de respuesta
                # buscamos la primera aparición de la subsecuencia sep_tok
                idx = -1
                for j in range(ans_tokens.size(0) - sep_len + 1):
                    if torch.equal(ans_tokens[j:j+sep_len], torch.tensor(sep_tok, device=ans_tokens.device)):
                        idx = j
                        break

                if idx != -1:
                    # Índices absolutos dentro de labels/input_ids
                    start_cot = offset                # arranque de la respuesta en input_ids
                    end_cot   = offset + idx          # justo antes de <sep>
                    labels[i, start_cot:end_cot] = self.ignore
                audio=self.proc(text=prompts, audios=wavs, return_tensors="pt", padding=True)

        return {
            "input_ids":            input_ids,
            "labels":               labels,
            "attention_mask":       attn_mask,
            "input_audio_embeds":   audio["input_audio_embeds"],
            "audio_attention_mask": audio.get("audio_attention_mask"),
            "input_mode":           torch.ones(len(batch), dtype=torch.long)*2,
        }


In [7]:
train_dataloader = torch.utils.data.DataLoader(
    train_ds,
    shuffle=True,
    batch_size=4,
    collate_fn=CollatorPhi4Audio(processor),
    pin_memory=True,
)
val_dataloader = torch.utils.data.DataLoader(
    val_ds,
    shuffle=False,
    batch_size=4,
    collate_fn=CollatorPhi4Audio(processor),
    pin_memory=True,
)

## 3  Entrenamiento

### 3.1 Helpers

In [8]:
import math, os, gc, torch
from tqdm import tqdm
from typing import Dict, List, Tuple

def move_batch(batch, device):
    return {
        k: v.to(device, dtype=torch.bfloat16 if v.dtype == torch.float else torch.long)
        for k, v in batch.items()
    }

@torch.no_grad()
def avg_loss_on_loader(model, dataloader, device):
    model.eval()
    losses, n = 0.0, 0
    for batch in dataloader:
        batch = move_batch(batch, device)
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            out = model(**batch)
        losses += out.loss.item() * batch[list(batch.keys())[0]].size(0)
        n += batch[list(batch.keys())[0]].size(0)
    model.train()
    return losses / n


### 3.2 Eval Definition

In [9]:
import soundfile as sf
from collections import defaultdict

_open_wavs = {}                  # caché: path → SoundFile

def read_segment(path, start, end, sr=16_000):
    """Lee [start,end] s de un WAV grande sin cargarlo entero."""
    if path not in _open_wavs:
        _open_wavs[path] = sf.SoundFile(path)
    f = _open_wavs[path]
    frames = int((end - start) * sr)
    f.seek(int(start * sr))
    return f.read(frames, dtype="float32")
def segments_by_audio(ds):
    groups = defaultdict(list)
    for ex in ds:
        groups[ex["wav_path"]].append(ex)
    # ordenamos cada audio por inicio temporal
    for segs in groups.values():
        segs.sort(key=lambda x: x["start"])
    return groups


In [10]:

val_groups  = segments_by_audio(val_ds)

In [11]:
from collections import defaultdict
from itertools import chain
import torch, gc
from tqdm import tqdm

device        = torch.device("cuda:0")
batch_size    = 4

user_prompt      = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix    = '<|end|>'
speech_prompt = 'Transcribe the audio to text, and then translate the audio to es-ES. Use <sep> as a separator between the original transcript and the translation.'
CHAT_PROMPT = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'



def generate_audio(groups):
    hyps_all, refs_all, srcs_all = [], [], []

    for wav_path, segs in tqdm(groups.items(), desc="Audios"):
        hyps, refs, srcs = [], [], []

        # dividimos en lotes (batch_size) para GPU
        for i in range(0, len(segs), batch_size):
            chunk = segs[i:i+batch_size]

            wavs   = [(read_segment(s["wav_path"], s["start"], s["end"]),16_000) for s in chunk]
            prompts= [CHAT_PROMPT] * len(chunk)

            inputs = processor(text=prompts, audios=wavs,
                               return_tensors="pt", padding=True
                               ).to(device)

            with torch.inference_mode(), torch.autocast("cuda"):
                gen_ids = model.generate(**inputs, generation_config=generation_config, max_new_tokens=1000, num_logits_to_keep=0)

            # quitamos el prompt textual
            gen_ids = gen_ids[:, inputs["input_ids"].shape[1]:]

            hyps.extend(
                processor.batch_decode(gen_ids.cpu(),
                                       skip_special_tokens=True,
                                       clean_up_tokenization_spaces=False)
            )

            # limpieza VRAM
            del inputs, gen_ids, wavs
            torch.cuda.empty_cache()
            gc.collect()

            refs.extend([s["target_text"] for s in chunk])
            srcs.extend([s["source_text"] for s in chunk])

        print(hyps)
        print(refs)
        print(srcs)

        aux=[]
        for b in hyps:
          if '<sep>' in b:
            aux.append(b.split('<sep>')[1].strip())
          else:
            aux.append(b)
            print(b)

        hyps_all.append(aux)
        refs_all.append(refs)
        srcs_all.append(srcs)

    return hyps_all, refs_all, srcs_all



In [12]:
from __future__ import annotations

from itertools import chain
from typing import Callable, Dict, List, Tuple

import sacrebleu
from comet import download_model, load_from_checkpoint

# ────────────────────────────────────────────────────────────────────────────────
#  Cargamos (y cacheamos) el modelo COMET‑22 una sola vez
# ────────────────────────────────────────────────────────────────────────────────
_COMET_MODEL = None
_COMET_PATH = None
_MODEL_NAME = "Unbabel/wmt22-comet-da"


def _get_comet_model(gpus: int = 0):
    """Devuelve un modelo COMET‑22 listo para `predict` (cacheado)."""
    global _COMET_MODEL, _COMET_PATH

    if _COMET_MODEL is None:
        _COMET_PATH = download_model(_MODEL_NAME)  # se guarda en ~/.cache
        _COMET_MODEL = load_from_checkpoint(_COMET_PATH)

    # *Nota*: el parámetro `gpus` se pasa a `.predict` y **no** aquí, pero
    # exponemos el arg para quien quiera forzar CPU en la firma pública.
    return _COMET_MODEL


# ────────────────────────────────────────────────────────────────────────────────
#  Función principal
# ────────────────────────────────────────────────────────────────────────────────

def bleu_comet_by_audio(
    refs_audio: List[List[str]],
    hyps_audio: List[List[str]],
    srcs_audio: List[List[str]],
    transform: Callable[[str], str] = lambda x: x,
    comet_gpus: int = 0,
    comet_batch_size: int = 8,
) -> Tuple[Dict[str, float], List[Dict[str, float]]]:
    """Calcula BLEU y COMET‑22, **global** y **por audio**.

    Parameters
    ----------
    refs_audio, hyps_audio, srcs_audio : list[list[str]]
        Listas anidadas con el mismo nº de audios y segmentos.
    transform : callable
        Función de normalización por frase (identidad por defecto).
    comet_gpus : int
        Nº de GPUs a usar en `model.predict` (0 ⇒ CPU).
    comet_batch_size : int
        Tamaño de lote para COMET (trade‑off velocidad / memoria).

    Returns
    -------
    (global_metrics, per_audio_metrics)
        global_metrics    = {"bleu": float, "comet22": float}
        per_audio_metrics = [
            {"audio_id": i, "bleu": float, "comet22": float},
            ...
        ]
    """

    # ── Comprobaciones básicas ──────────────────────────────────────────────
    assert len(refs_audio) == len(hyps_audio) == len(srcs_audio), (
        "refs, hyps y srcs deben tener la misma longitud (nº audios)"
    )

    per_audio: List[Dict[str, float]] = []

    # ── Recorremos audio por audio ──────────────────────────────────────────
    comet_model = _get_comet_model(gpus=comet_gpus)

    for idx, (ref_seg, hyp_seg, src_seg) in enumerate(
        zip(refs_audio, hyps_audio, srcs_audio)
    ):
        assert len(ref_seg) == len(hyp_seg) == len(src_seg), (
            f"El audio {idx} contiene diferente nº de segmentos"
        )

        ref_seg = [transform(r) for r in ref_seg]
        hyp_seg = [transform(h) for h in hyp_seg]
        src_seg = [transform(s) for s in src_seg]

        # ── BLEU corpus‑level para el audio ────────────────────────────────
        bleu_score = sacrebleu.corpus_bleu(hyp_seg, [ref_seg]).score

        # ── COMET‑22 ───────────────────────────────────────────────────────
        samples = [  # una entrada por segmento
            {"src": s, "mt": h, "ref": r}
            for s, h, r in zip(src_seg, hyp_seg, ref_seg)
        ]
        comet_out = comet_model.predict(
            samples,
            batch_size=comet_batch_size,
            gpus=comet_gpus,
            progress_bar=False,
        )
        comet_score = comet_out["system_score"]  # media ya calculada

        per_audio.append(
            {
                "audio_id": idx,
                "bleu": bleu_score,
                "comet22": comet_score,
            }
        )

    # ── Métricas globales ──────────────────────────────────────────────────
    refs_all = list(chain.from_iterable(refs_audio))
    hyps_all = list(chain.from_iterable(hyps_audio))
    srcs_all = list(chain.from_iterable(srcs_audio))

    bleu_global = sacrebleu.corpus_bleu(hyps_all, [refs_all]).score

    comet_samples = [
        {"src": s, "mt": h, "ref": r}
        for s, h, r in zip(srcs_all, hyps_all, refs_all)
    ]
    comet_global = comet_model.predict(
        comet_samples,
        batch_size=comet_batch_size,
        gpus=comet_gpus,
        progress_bar=False,
    )["system_score"]

    return {"bleu": bleu_global, "comet22": comet_global}, per_audio





In [13]:
def eval_bleu_comet_fn_val(groups_val, processor, model, generation_config, device,
                           batch_size=8, transform=lambda x: x):
    # 1) Generar hipótesis
    hyps, refs, srcs = generate_audio(groups_val)  # usa tu función tal cual

    # 2) Calcular métricas
    global_metrics, per_audio = bleu_comet_by_audio(
        refs_audio=refs,
        hyps_audio=hyps,
        srcs_audio=srcs,
        transform=transform,
        comet_gpus=1,        # o 0 si prefieres CPU
        comet_batch_size=8
    )
    return global_metrics, per_audio


### 3.3 Train Val

In [14]:


def train(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    acc_steps=2,
    max_epochs=5,
    clip_grad=1.0,
    scheduler=None,
    scheduler_type=None,
    ckpt_dir="checkpoints",
    patience=3,                       # early stopping
    eval_bleu_comet_fn=None,          # función que calcule métricas de traducción (ver sección 3)
    eval_bleu_args=None,              # dict opcional para pasar a esa función
    verbose_every=20
):
    os.makedirs(ckpt_dir, exist_ok=True)
    best_metric = -math.inf
    no_improve = 0

    model.to(device)
    model.train()
    optimizer.zero_grad(set_to_none=True)

    scaler = torch.cuda.amp.GradScaler(enabled=False)  # bfloat16 no necesita scaler; activarlo si usas fp16

    for epoch in range(1, max_epochs+1):
        print(f"\nEpoch {epoch}/{max_epochs}")
        running = 0.0
        model.train()
        for step, batch in enumerate(tqdm(train_loader)):
            batch = move_batch(batch, device)

            with torch.autocast("cuda", dtype=torch.bfloat16):
                outputs = model(**batch)
                loss = outputs.loss / acc_steps

            loss.backward()

            running += loss.item() * acc_steps

            if (step + 1) % acc_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)

            if step % verbose_every == 0:
                print(f" step {step:<4} | loss = {loss.item() * acc_steps:.4f}")

        # ─── VALIDACIÓN ───────────────────────────────────────────
        model.eval()

        # Temporarily disable gradient checkpointing for evaluation
        for module in model.modules():
            if hasattr(module, 'gradient_checkpointing'):
                module.gradient_checkpointing = False

        val_loss = avg_loss_on_loader(model, val_loader, device)
        print(f"  -> val_loss: {val_loss:.4f}")

        # Métricas de traducción opcionales (BLEU/COMET)
        extra_metrics = {}
        if eval_bleu_comet_fn is not None:
            metrics_global, metrics_per_audio = eval_bleu_comet_fn(**(eval_bleu_args or {}))
            print(f"  -> BLEU: {metrics_global['bleu']:.2f} | COMET22: {metrics_global['comet22']:.3f}")
            extra_metrics = metrics_global
            current_key_metric = metrics_global["comet22"]  # decide cuál usar para “mejor modelo”
        else:
            current_key_metric = -val_loss  # si no hay métricas externas, usa la loss

        # Re-enable gradient checkpointing
        for module in model.modules():
            if hasattr(module, 'gradient_checkpointing') and eval_bleu_comet_fn is not None: # Only re-enable if it was originally True and we used the eval function
                 module.gradient_checkpointing = True


         # ---------- Scheduler step ----------
        if scheduler is not None:
            if scheduler_type == "plateau":
                scheduler.step(val_loss)               # en función de la loss
            else:
                scheduler.step()
        # ─── Checkpoint / Early stopping ─────────────────────────
        if current_key_metric > best_metric:
            best_metric = current_key_metric
            no_improve = 0
            path = os.path.join(ckpt_dir, f"best_epoch{epoch}.pt")
            torch.save({"model_state": model.state_dict(),
                        "optimizer_state": optimizer.state_dict(),
                        "epoch": epoch,
                        "val_loss": val_loss,
                        **extra_metrics}, path)
            print(f"  ✔ Nuevo mejor modelo guardado en {path}")
        else:
            no_improve += 1
            print(f"  (sin mejora {no_improve}/{patience})")
            if no_improve >= patience:
                print("  ✖ Early stopping activado.")
                break

        # limpieza VRAM
        torch.cuda.empty_cache(); gc.collect()

In [15]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
eval_args = dict(
    groups_val=val_groups,                # dict {wav_path: [segs...]}
    processor=processor,
    model=model,
    generation_config=generation_config,
    device=device,
    batch_size=8,
    transform=lambda x: x
)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)


scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=1, min_lr=1e-7)

train(model, train_dataloader, val_dataloader, optimizer, device,
      scheduler=scheduler, scheduler_type="plateau",
      acc_steps=8, max_epochs=10, patience=3,)
      # eval_bleu_comet_fn=eval_bleu_comet_fn_val,
      # eval_bleu_args=eval_args)


  scaler = torch.cuda.amp.GradScaler(enabled=False)  # bfloat16 no necesita scaler; activarlo si usas fp16



Epoch 1/10


  1%|          | 1/177 [00:01<03:35,  1.22s/it]

 step 0    | loss = 5.7365


 12%|█▏        | 21/177 [00:16<02:03,  1.27it/s]

 step 20   | loss = 3.2231


 23%|██▎       | 41/177 [00:33<01:45,  1.29it/s]

 step 40   | loss = 1.9532


 34%|███▍      | 61/177 [00:48<01:31,  1.27it/s]

 step 60   | loss = 1.6880


 46%|████▌     | 81/177 [01:03<01:17,  1.24it/s]

 step 80   | loss = 1.7281


 57%|█████▋    | 101/177 [01:17<01:04,  1.19it/s]

 step 100  | loss = 1.2726


 68%|██████▊   | 121/177 [01:32<00:40,  1.38it/s]

 step 120  | loss = 1.0562


 80%|███████▉  | 141/177 [01:47<00:26,  1.37it/s]

 step 140  | loss = 1.3944


 91%|█████████ | 161/177 [02:01<00:09,  1.69it/s]

 step 160  | loss = 1.6915


100%|██████████| 177/177 [02:12<00:00,  1.33it/s]


  -> val_loss: 1.2995
  ✔ Nuevo mejor modelo guardado en checkpoints/best_epoch1.pt

Epoch 2/10


  1%|          | 1/177 [00:00<01:54,  1.54it/s]

 step 0    | loss = 1.5800


 12%|█▏        | 21/177 [00:17<02:06,  1.23it/s]

 step 20   | loss = 0.9760


 23%|██▎       | 41/177 [00:32<01:39,  1.37it/s]

 step 40   | loss = 0.9818


 34%|███▍      | 61/177 [00:47<01:25,  1.36it/s]

 step 60   | loss = 1.4539


 46%|████▌     | 81/177 [01:02<01:01,  1.56it/s]

 step 80   | loss = 1.6254


 57%|█████▋    | 101/177 [01:16<00:51,  1.48it/s]

 step 100  | loss = 1.2842


 68%|██████▊   | 121/177 [01:32<00:41,  1.35it/s]

 step 120  | loss = 1.1286


 80%|███████▉  | 141/177 [01:47<00:28,  1.26it/s]

 step 140  | loss = 1.6337


 91%|█████████ | 161/177 [02:02<00:14,  1.12it/s]

 step 160  | loss = 1.5065


100%|██████████| 177/177 [02:15<00:00,  1.31it/s]


  -> val_loss: 1.2290
  ✔ Nuevo mejor modelo guardado en checkpoints/best_epoch2.pt

Epoch 3/10


  1%|          | 1/177 [00:00<02:18,  1.27it/s]

 step 0    | loss = 1.4007


 12%|█▏        | 21/177 [00:16<01:57,  1.33it/s]

 step 20   | loss = 0.7251


 23%|██▎       | 41/177 [00:33<01:49,  1.25it/s]

 step 40   | loss = 1.4228


 34%|███▍      | 61/177 [00:47<01:16,  1.51it/s]

 step 60   | loss = 1.3641


 46%|████▌     | 81/177 [01:01<00:54,  1.77it/s]

 step 80   | loss = 1.4574


 57%|█████▋    | 101/177 [01:16<01:00,  1.25it/s]

 step 100  | loss = 1.1650


 68%|██████▊   | 121/177 [01:31<00:48,  1.16it/s]

 step 120  | loss = 1.1996


 80%|███████▉  | 141/177 [01:46<00:24,  1.47it/s]

 step 140  | loss = 1.3971


 91%|█████████ | 161/177 [02:01<00:12,  1.32it/s]

 step 160  | loss = 0.9644


100%|██████████| 177/177 [02:12<00:00,  1.33it/s]


  -> val_loss: 1.2154
  ✔ Nuevo mejor modelo guardado en checkpoints/best_epoch3.pt

Epoch 4/10


  1%|          | 1/177 [00:00<01:34,  1.86it/s]

 step 0    | loss = 1.1966


 12%|█▏        | 21/177 [00:15<02:01,  1.29it/s]

 step 20   | loss = 1.1285


 23%|██▎       | 41/177 [00:31<01:43,  1.32it/s]

 step 40   | loss = 1.0456


 34%|███▍      | 61/177 [00:45<01:17,  1.50it/s]

 step 60   | loss = 1.1337


 46%|████▌     | 81/177 [00:58<00:54,  1.75it/s]

 step 80   | loss = 1.0100


 57%|█████▋    | 101/177 [01:14<01:01,  1.24it/s]

 step 100  | loss = 1.1804


 68%|██████▊   | 121/177 [01:28<00:40,  1.39it/s]

 step 120  | loss = 1.2734


 80%|███████▉  | 141/177 [01:45<00:33,  1.07it/s]

 step 140  | loss = 1.4076


 91%|█████████ | 161/177 [02:00<00:09,  1.65it/s]

 step 160  | loss = 1.0007


100%|██████████| 177/177 [02:10<00:00,  1.35it/s]


  -> val_loss: 1.2166
  (sin mejora 1/3)

Epoch 5/10


  1%|          | 1/177 [00:00<01:48,  1.62it/s]

 step 0    | loss = 1.1653


 12%|█▏        | 21/177 [00:14<02:06,  1.23it/s]

 step 20   | loss = 1.2715


 23%|██▎       | 41/177 [00:31<01:49,  1.24it/s]

 step 40   | loss = 1.1410


 34%|███▍      | 61/177 [00:43<01:10,  1.65it/s]

 step 60   | loss = 1.3180


 46%|████▌     | 81/177 [00:59<01:03,  1.52it/s]

 step 80   | loss = 0.9360


 57%|█████▋    | 101/177 [01:13<00:46,  1.63it/s]

 step 100  | loss = 0.7267


 68%|██████▊   | 121/177 [01:29<00:39,  1.41it/s]

 step 120  | loss = 1.1127


 80%|███████▉  | 141/177 [01:44<00:24,  1.44it/s]

 step 140  | loss = 0.8982


 91%|█████████ | 161/177 [01:59<00:11,  1.40it/s]

 step 160  | loss = 0.9038


100%|██████████| 177/177 [02:11<00:00,  1.35it/s]


  -> val_loss: 1.2203
  (sin mejora 2/3)

Epoch 6/10


  1%|          | 1/177 [00:00<02:40,  1.09it/s]

 step 0    | loss = 0.8885


 12%|█▏        | 21/177 [00:15<01:50,  1.42it/s]

 step 20   | loss = 0.6405


 23%|██▎       | 41/177 [00:30<01:54,  1.18it/s]

 step 40   | loss = 1.2439


 34%|███▍      | 61/177 [00:45<01:33,  1.24it/s]

 step 60   | loss = 1.0152


 46%|████▌     | 81/177 [01:00<01:09,  1.37it/s]

 step 80   | loss = 1.0582


 57%|█████▋    | 101/177 [01:14<01:00,  1.25it/s]

 step 100  | loss = 0.6389


 68%|██████▊   | 121/177 [01:29<00:34,  1.64it/s]

 step 120  | loss = 1.4308


 80%|███████▉  | 141/177 [01:44<00:29,  1.24it/s]

 step 140  | loss = 1.0865


 91%|█████████ | 161/177 [02:00<00:13,  1.18it/s]

 step 160  | loss = 0.8389


100%|██████████| 177/177 [02:11<00:00,  1.34it/s]


  -> val_loss: 1.2200
  (sin mejora 3/3)
  ✖ Early stopping activado.


## 4  Evaluación  (BLEU y COMET)

### Evaluacion completa

In [16]:

torch.cuda.empty_cache(); gc.collect()


# Cargar checkpoint
ckpt = torch.load("checkpoints/best_epoch3.pt", map_location=device)
model.load_state_dict(ckpt["model_state"], strict=False)


model.gradient_checkpointing_disable()          # importante para evitar el bug/anular GC
model.eval()
model.config.use_cache = True                   # más memoria, pero más rápido


In [17]:
# Carga mejor checkpoint si quieres

groups  = segments_by_audio(test_ds)
print(f"{len(groups)=} audios")
hyps, refs, srcs = generate_audio(groups)



len(groups)=5 audios


Audios:  20%|██        | 1/5 [12:43<50:55, 763.95s/it]

['Buenos días, buenas tardes, buenas noches.', 'Les habla la presentadora de la conferencia.', 'La Escuela Europea de Oncología les da la bienvenida a su sesión virtual 404.', 'La sesión virtual de hoy está dedicada al impacto de la cirugía oncológica en los resultados.', 'Le recordamos que esta actividad está acreditada por el CME.', 'Al final de la presentación, al cerrar la ventana de retransmisión, se le redirigirá a la evaluación de CME y al test de opciones múltiples.', 'Se pedirá a los Mastermind competitors que empiecen el test una vez finalizada la evaluación.', 'Como recordatorio durante la sesión en directo, todos los participantes pueden formular preguntas en directo en cualquier momento, simplemente pulsando el botón correspondiente en la parte superior de la página.', 'Esto es un extracto de la política que se publica en nuestro sitio web en su totalidad.', 'Las preguntas serán recibidas por el profesor Riccardo Audisio del Western Hospital, Universidad de Liverpool, Pres

Audios:  40%|████      | 2/5 [27:56<42:34, 851.60s/it]

['Hello everybody. <sep> Hola a todos.', "So it's a pleasure to be here and to have been invited by Professor Franco to share with you my experience on the evaluation of treatment plans in radiotherapy. <sep> Es un placer estar aquí y haber sido invitado por el Profesor Franco a compartir con ustedes mi experiencia sobre la evaluación de los planes de tratamiento en radioterapia.", 'I am Nuria Jorret. I am consultant medical physicist, clinical head of radiation physics in the hospital of La Santa Creu i Sant Pau in Barcelona. <sep> Soy Nuria Jorret. Soy consultora médica, física clínica, jefa de física de radiación en el hospital de La Santa Creu i Sant Pau en Barcelona.', 'So the learning objectives of this lecture will be to give you an overview of the radiation therapy treatment process. <sep> Así que los objetivos de aprendizaje de esta conferencia serán darles una visión general del proceso de tratamiento de radioterapia.', 'We will analyze the different angles of treatment plan 

Audios:  60%|██████    | 3/5 [47:54<33:39, 1009.66s/it]

['Thank you very much. <sep> Muchas gracias.', "It's a pleasure to talk to you today. <sep> Es un placer hablar con usted hoy.", "and hopefully we'll have some interesting questions that come up as well. <sep>Y espero que tengamos algunas preguntas interesantes que surjan también.", "Let me just i'm gonna share my slides and put them on. <sep> Déjame compartir mis diapositivas y ponerlas.", 'Okay, great. <sep>De acuerdo, genial.', 'so our topic today is things that about advanced breast cancer that keep us awake at night <sep> Así que nuestro tema de hoy son las cosas que sobre el cáncer de mama avanzado que nos mantienen despiertos por las noches.', "And indeed, there are many different topics that we could discuss here because I was thinking about it as I was finishing up my slides and thinking that there are so many things that can keep us awake and taking care of our patients, but we're gonna focus on just a few of them. <sep> Y, de hecho, hay muchos temas diferentes que podríamos 

Audios:  80%|████████  | 4/5 [1:00:48<15:16, 916.66s/it]

['Hello everyone. <sep> Hola a todos.', "My name is Ramon de Mello and I'm very glad here to coordinate this multi-disciplinary session on rectal cancer. <sep> Mi nombre es Ramon de Mello y estoy muy contento aquí de coordinar esta sesión multidisciplinaria sobre el cáncer de recto.", 'Today we have here experts from all the world that will discuss multidisciplinary how can we better approach head and neck cancer in order to improve the outcomes. <sep> Hoy tenemos aquí expertos de todo el mundo que discutirán la multidisciplinariedad, cómo podemos abordar mejor el cáncer de cabeza y cuello para mejorar los resultados.', 'and we have two experts that will give their presentation a medical oncologist dr katia peris and a radiation oncologist professor maria antonieta gambacorta <sep> y tenemos dos expertos que darán su presentación, un oncólogo médico Dr. Katia Peris y un oncólogo radiactivo, la profesora Maria Antonieta Gambacorta.', "So we start with professor with Dr. Katia Perez that

Audios: 100%|██████████| 5/5 [1:07:18<00:00, 807.63s/it]

["So i'm gonna talk about the importance of multidisciplinary care. <sep> Así que voy a hablar sobre la importancia de la atención multidisciplinaria.", 'As you heard, my name is Anne Partridge. <sep> Como escucharon, mi nombre es Anne Partridge.', "I'm a medical oncologist at Dana-farber where I focus on breast cancer in young women in particular. <sep> Soy una oncóloga médica en Dana-farber, donde me concentro en el cáncer de mama en mujeres jóvenes en particular.", "And I can't emphasize enough the importance of multidisciplinary care for all patients with breast cancer, but particularly for young patients. <sep> Y no puedo enfatizar lo suficiente la importancia de la atención multidisciplinaria para todos los pacientes con cáncer de mama, pero particularmente para los pacientes jóvenes.", "and i'll give you an example of why from my own practice as well as discuss a little bit of the history and what it entails <sep> y te daré un ejemplo de por qué de mi propia práctica, así como d




In [18]:
global_metrics, per_audio = bleu_comet_by_audio(refs, hyps, srcs,
                                                comet_gpus=1, comet_batch_size=8)





Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

README.md: 0.00B [00:00, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

LICENSE: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

checkpoints/model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/sta

In [19]:

print("BLEU  global :", global_metrics["bleu"])
print("COMET-22    :", global_metrics["comet22"])

BLEU  global : 37.49498782740432
COMET-22    : 0.8080160766097575


In [20]:
per_audio

[{'audio_id': 0, 'bleu': 42.693276751950094, 'comet22': 0.8358393742080115},
 {'audio_id': 1, 'bleu': 42.647418746510155, 'comet22': 0.8175117313172776},
 {'audio_id': 2, 'bleu': 29.04410057197415, 'comet22': 0.7953448302703774},
 {'audio_id': 3, 'bleu': 35.68932236947614, 'comet22': 0.7896460175290982},
 {'audio_id': 4, 'bleu': 42.639127476791444, 'comet22': 0.8297280061001681}]