## Import

In [None]:
!unzip /content/test.zip -d /content/test

Archive:  /content/test.zip
   creating: /content/test/404/
   creating: /content/test/545006/
   creating: /content/test/596001/
   creating: /content/test/605000/
   creating: /content/test/606/
  inflating: /content/test/test.lst  
   creating: /content/test/606/manual_transcription/
   creating: /content/test/606/manual_translations/
  inflating: /content/test/606/slides.pptx  
  inflating: /content/test/606/606.m4a  
   creating: /content/test/606/manual_transcription/sentence_segmented/
  inflating: /content/test/606/manual_transcription/ie606.srt  
  inflating: /content/test/606/manual_transcription/ie606.txt  
  inflating: /content/test/606/manual_transcription/sentence_segmented/ie606.srt  
   creating: /content/test/606/manual_translations/de/
   creating: /content/test/606/manual_translations/es/
   creating: /content/test/606/manual_translations/fr/
   creating: /content/test/606/manual_translations/sl/
  inflating: /content/test/606/manual_translations/fr/606.lst  
  infla

In [1]:
! pip install  backoff srt  transformers==4.49
! pip install sacrebleu unbabel-comet

Collecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting srt
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers==4.49
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m110.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading backoff-2.2.1-py3-none-any.whl (15 kB)
Building wheels for collected packages: srt
  Building wheel for srt (setup.py) ... [?25l[?25hdone
  Created wheel for srt: filename=srt-3.5.3-py3-none-any.whl size=22427 sha256=afd18937173651398f5facae6ab47d60f7f5b32d6cde4190c6d5c89d7351293c
  Stored in directory: /root/.cache/pip/wheels/1f/43/f1/23ee9119497fcb57d9f7046fbf34c6d9027c46a1fa7824cf08
Succes

## Load

### Load example

In [None]:

# Read the whole manual transcription. The context manager closes the file
# handle (the previous version left it open), and the explicit encoding matches
# the other text reads in this notebook.
with open('/content/test/404/manual_transcription/ie404.txt', 'r', encoding='utf-8') as f:
  transcription = f.read()


In [None]:
import srt
srt_file_path = '/content/test/404/manual_transcription/sentence_segmented/ie404.srt'

with open(srt_file_path, 'r', encoding='utf-8') as f:
  content = f.read()

parser = srt.parse(content)
trans = list(parser)
print(f"Se encontraron {len(trans)} transcripciones.")

Se encontraron 182 transcripciones.


In [None]:
# Dump every parsed subtitle (sequence number, timing and text) for a visual check.
for subtitle in trans:
        print("-" * 20)
        print(f"Número de secuencia: {subtitle.index}")
        print(f"Inicio: {subtitle.start}") # A datetime.timedelta object
        print(f"Fin: {subtitle.end}")   # A datetime.timedelta object
        # The subtitle text as returned by the parser.
        # NOTE(review): the original comment said srt.parse strips HTML tags by
        # default; nothing in this notebook shows that — verify before relying on it.
        print(f"Texto:\n{subtitle.content}")

--------------------
Número de secuencia: 1
Inicio: 0:00:00
Fin: 0:00:02.130000
Texto:
Good morning, afternoon and evening.
--------------------
Número de secuencia: 2
Inicio: 0:00:02.430000
Fin: 0:00:04.160000
Texto:
This is the conference operator.
--------------------
Número de secuencia: 3
Inicio: 0:00:04.310000
Fin: 0:00:08.950000
Texto:
The European School of Oncology welcomes you to their four hundred and forth e-session.
--------------------
Número de secuencia: 4
Inicio: 0:00:09.250000
Fin: 0:00:13.670000
Texto:
Today's e-session is the impact of oncological surgery on the outcomes.
--------------------
Número de secuencia: 5
Inicio: 0:00:13.980000
Fin: 0:00:16.930000
Texto:
Please note that this activity is CME accredited.
--------------------
Número de secuencia: 6
Inicio: 0:00:17.230000
Fin: 0:00:24.870000
Texto:
At the end of the presentation by closing the webcast window you will be directed to the CME evaluation and multiple choice test.
--------------------
Número de se

In [None]:
from pathlib import Path
import subprocess, soundfile as sf, datetime as dt

FFMPEG = "ffmpeg"          # o la ruta absoluta si no está en PATH

def m4a_to_wav(path_m4a, sr_out=16_000):
    """Convert one audio file to a mono WAV next to it and return the WAV path.

    Args:
        path_m4a: Path (str or Path) of the input file handed to ffmpeg.
        sr_out: Output sample rate in Hz, passed to ffmpeg's ``-ar`` option.

    Returns:
        pathlib.Path of the input with its suffix replaced by ``.wav``.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    source = Path(path_m4a)
    target = source.with_suffix(".wav")
    command = [
        FFMPEG,
        "-loglevel", "error",   # only report errors
        "-y",                   # overwrite an existing output file
        "-i", str(source),
        "-ac", "1",             # single channel
        "-ar", str(sr_out),
        str(target),
    ]
    subprocess.run(command, check=True)
    return target


def _to_seconds(t):
    """Convert a time value to seconds as a float.

    Accepted inputs:
      * int / float: already expressed in seconds.
      * datetime.timedelta: converted with ``total_seconds()``.
      * str: ``'H:M:S'``, ``'M:S'`` or ``'S'`` with an optional fractional
        part; both ``.`` and the SRT-style ``,`` decimal separator are accepted.

    Raises:
        TypeError: for any other input type.
        ValueError: if a string has more than three ``:``-separated fields
            or a field is not numeric.
    """
    if isinstance(t, (int, float)):
        return float(t)
    if isinstance(t, dt.timedelta):
        return t.total_seconds()
    if isinstance(t, str):
        # SRT timestamps use a comma before the milliseconds.
        fields = t.strip().replace(",", ".").split(":")
        if len(fields) > 3:
            raise ValueError(f"Formato de tiempo no soportado: {t!r}")
        # Left-pad with zeros so 'M:S' and 'S' are handled like 'H:M:S'.
        h, m, s = ["0"] * (3 - len(fields)) + fields
        return int(h) * 3600 + int(m) * 60 + float(s)
    raise TypeError(f"Tipo de tiempo no soportado: {type(t)}")


def cut_segments(wav_path, transcripciones, sr=16_000):
    """Slice a WAV file into one clip per segment.

    Args:
        wav_path: Path of the WAV file to read.
        transcripciones: Iterable of objects exposing ``.start`` and ``.end``
            in any form ``_to_seconds`` accepts (timedelta, str or number).
        sr: Sample rate in Hz the file is expected to have.

    Returns:
        ``(clips, sr)`` where ``clips`` holds one float32 array per segment,
        in input order, and ``sr`` is the sample rate passed in.

    Raises:
        ValueError: if the file's sample rate differs from ``sr`` or a
            segment ends before it starts.
    """
    clips = []
    with sf.SoundFile(str(wav_path)) as f:
        if f.samplerate != sr:
            raise ValueError(f"El WAV está a {f.samplerate} Hz, no a {sr}")
        for idx, seg in enumerate(transcripciones):
            start_s = _to_seconds(seg.start)
            end_s = _to_seconds(seg.end)
            frames = int((end_s - start_s) * sr)
            # soundfile reads the whole rest of the file when the frame count
            # is negative, so a reversed segment would silently become a huge clip.
            if frames < 0:
                raise ValueError(
                    f"El segmento {idx} termina antes de empezar "
                    f"({start_s} s → {end_s} s)"
                )
            f.seek(int(start_s * sr))
            clips.append(f.read(frames, dtype="float32"))
    return clips, sr                             # sr is returned once, not per clip




In [None]:
wav_file = m4a_to_wav("/content/test/404/404.m4a")
clips, sr = cut_segments(wav_file, trans)

print(f"{len(clips)=}, {sr=}, {clips[0].shape=}")

len(clips)=182, sr=16000, clips[0].shape=(34080,)


In [None]:
clips[0]

array([ 0.        ,  0.        ,  0.        , ..., -0.00665283,
       -0.00775146, -0.0088501 ], dtype=float32)

In [None]:

with open('/content/test/404/manual_translations/es/404.es', 'r', encoding='utf-8') as f:
  content = f.read()

In [None]:
content = content.splitlines()
content[:10]

['Buenos días, buenas tardes, buenas noches.',
 'Les habla la presentadora de la conferencia.',
 'La Escuela Europea de Oncología les da la bienvenida a su sesión virtual 404.',
 'La sesión virtual de hoy está dedicada al impacto de la cirugía oncológica en los resultados.',
 'Le recordamos que esta actividad está acreditada por el CME.',
 'Al final de la presentación, al cerrar la ventana de retransmisión, se le redirigirá a la evaluación de CME y al test de opciones múltiples.',
 'Se pedirá a los Mastermind competitors que empiecen el test una vez finalizada la evaluación.',
 'Como recordatorio durante la sesión en directo, todos los participantes pueden formular preguntas en directo en cualquier momento, simplemente pulsando el botón correspondiente en la parte superior de la página.',
 'Esto es un extracto de la política que se publica en nuestro sitio web en su totalidad.',
 'Las preguntas serán recibidas por el profesor Riccardo Audisio del Western Hospital, Universidad de Liverp

### Load Test data

In [None]:
from pathlib import Path
import subprocess, soundfile as sf, datetime as dt

FFMPEG = "ffmpeg"          # o la ruta absoluta si no está en PATH

def m4a_to_wav(path_m4a, sr_out=16_000):
    """Convert one audio file to a mono WAV next to it and return the WAV path.

    Args:
        path_m4a: Path (str or Path) of the input file handed to ffmpeg.
        sr_out: Output sample rate in Hz, passed to ffmpeg's ``-ar`` option.

    Returns:
        pathlib.Path of the input with its suffix replaced by ``.wav``.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    source = Path(path_m4a)
    target = source.with_suffix(".wav")
    command = [
        FFMPEG,
        "-loglevel", "error",   # only report errors
        "-y",                   # overwrite an existing output file
        "-i", str(source),
        "-ac", "1",             # single channel
        "-ar", str(sr_out),
        str(target),
    ]
    subprocess.run(command, check=True)
    return target


def _to_seconds(t):
    """Convert a time value to seconds as a float.

    Accepted inputs:
      * int / float: already expressed in seconds.
      * datetime.timedelta: converted with ``total_seconds()``.
      * str: ``'H:M:S'``, ``'M:S'`` or ``'S'`` with an optional fractional
        part; both ``.`` and the SRT-style ``,`` decimal separator are accepted.

    Raises:
        TypeError: for any other input type.
        ValueError: if a string has more than three ``:``-separated fields
            or a field is not numeric.
    """
    if isinstance(t, (int, float)):
        return float(t)
    if isinstance(t, dt.timedelta):
        return t.total_seconds()
    if isinstance(t, str):
        # SRT timestamps use a comma before the milliseconds.
        fields = t.strip().replace(",", ".").split(":")
        if len(fields) > 3:
            raise ValueError(f"Formato de tiempo no soportado: {t!r}")
        # Left-pad with zeros so 'M:S' and 'S' are handled like 'H:M:S'.
        h, m, s = ["0"] * (3 - len(fields)) + fields
        return int(h) * 3600 + int(m) * 60 + float(s)
    raise TypeError(f"Tipo de tiempo no soportado: {type(t)}")


def cut_segments(wav_path, transcripciones, sr=16_000):
    """Slice a WAV file into one clip per segment.

    Args:
        wav_path: Path of the WAV file to read.
        transcripciones: Iterable of segments. Each item is either an object
            exposing ``.start`` and ``.end`` (anything ``_to_seconds``
            accepts), or a text line whose first two whitespace-separated
            fields are the start and end times in seconds.
        sr: Sample rate in Hz the file is expected to have.

    Returns:
        ``(clips, sr)`` where ``clips`` holds one float32 array per segment,
        in input order, and ``sr`` is the sample rate passed in.

    Raises:
        ValueError: if the file's sample rate differs from ``sr``, a time
            field is not numeric, or a segment ends before it starts.
    """
    clips = []
    with sf.SoundFile(str(wav_path)) as f:
        if f.samplerate != sr:
            raise ValueError(f"El WAV está a {f.samplerate} Hz, no a {sr}")
        for idx, seg in enumerate(transcripciones):
            if hasattr(seg, "start") and hasattr(seg, "end"):
                # Subtitle-like object, as documented; keeps earlier cells
                # working after this redefinition.
                start_s = _to_seconds(seg.start)
                end_s = _to_seconds(seg.end)
            else:
                # Text line "start end ...": split once, use the first two fields.
                fields = seg.split()
                start_s = float(fields[0])
                end_s = float(fields[1])
            frames = int((end_s - start_s) * sr)
            # soundfile reads the whole rest of the file when the frame count
            # is negative, so a reversed segment would silently become a huge clip.
            if frames < 0:
                raise ValueError(
                    f"El segmento {idx} termina antes de empezar "
                    f"({start_s} s → {end_s} s)"
                )
            f.seek(int(start_s * sr))
            clips.append(f.read(frames, dtype="float32"))
    return clips, sr                             # sr is returned once, not per clip


In [None]:
import srt
with open('/content/test/test.lst', 'r', encoding='utf-8') as f:
  folders = f.read()
  folders = folders.splitlines()
  print(folders)

['404', '596001', '605000', '606', '545006']


In [None]:
folders[3:]

['606', '545006']

In [None]:
def get_audio_segments(name, folder='test', language='es'):
  """Load the text files and the audio clips of one recording.

  Reads, from ``/content/{folder}/{name}/manual_translations/{language}/``,
  the ``.lst`` segment list, the ``.en`` lines and the ``.{language}`` lines,
  converts ``{name}.m4a`` to WAV and cuts it with the segment list.

  Returns:
    ``(trans, traduccion, clips, sr)``: the ``.en`` lines, the
    ``.{language}`` lines, the clips from ``cut_segments`` and their sample rate.
  """
  print(f'/content/{folder}/{name}/{language}')

  def _leer_lineas(extension):
    # The three files share one stem and differ only in their extension.
    ruta = f'/content/{folder}/{name}/manual_translations/{language}/{name}.{extension}'
    with open(ruta, 'r', encoding='utf-8') as handle:
      return handle.read().splitlines()

  times = _leer_lineas('lst')
  print(f"Se encontraron {len(times)} secciones de frases.")

  trans = _leer_lineas('en')
  print(f"Se encontraron {len(trans)} transcripciones.")

  traduccion = _leer_lineas(language)
  print(f"Se encontraron {len(traduccion)} traducciones.")

  # Convert the recording, then slice it using the segment list.
  wav_file = m4a_to_wav(f"/content/{folder}/{name}/{name}.m4a")
  clips, sr = cut_segments(wav_file, times)

  print(f"{len(clips)=}, {sr=}, {clips[0].shape=}")
  return trans, traduccion, clips, sr

### Load Model

In [None]:
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

import torch

model_name = 'microsoft/Phi-4-multimodal-instruct'
# trust_remote_code=True runs the Python files shipped in the model repository;
# the download log below recommends pinning a revision to avoid silent updates.
processor = AutoProcessor.from_pretrained(model_name,trust_remote_code = True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    # Attention is already set to "eager" here.
    # NOTE(review): the previous hint ("change attention to eager on pre-Ampere
    # GPUs") suggests another implementation was used originally — confirm
    # before switching it.
    _attn_implementation='eager',
)

# Generation defaults loaded from the same checkpoint; passed to model.generate later.
generation_config = GenerationConfig.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

processing_phi4mm.py:   0%|          | 0.00/32.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- processing_phi4mm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


preprocessor_config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.91M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.63k [00:00<?, ?B/s]

configuration_phi4mm.py:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- configuration_phi4mm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi4mm.py:   0%|          | 0.00/116k [00:00<?, ?B/s]

vision_siglip_navit.py:   0%|          | 0.00/78.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- vision_siglip_navit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


speech_conformer_encoder.py:   0%|          | 0.00/111k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- speech_conformer_encoder.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- modeling_phi4mm.py
- vision_siglip_navit.py
- speech_conformer_encoder.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/240k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

  lambda i: encoder_checkpoint_wrapper(


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Basic Inference

In [None]:
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
speech_prompt = 'Transcribe the audio to text, and then translate the audio to es-ES. Use <sep> as a separator between the original transcript and the translation.'
prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
print(f'>>> Prompt\n{prompt}')

>>> Prompt
<|user|><|audio_1|>Transcribe the audio to text, and then translate the audio to es-ES. Use <sep> as a separator between the original transcript and the translation.<|end|><|assistant|>


In [None]:
# Download and open an audio file (unused alternative, kept for reference):
# audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))

# Build the model inputs from the prompt and the first clip cut earlier,
# then move them to the first CUDA device.
inputs = processor(text=prompt, audios=[(clips[0], sr)], return_tensors='pt').to('cuda:0')


In [None]:
generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
    generation_config=generation_config,
    num_logits_to_keep=0
)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(f'>>> Response\n{response}')

>>> Response
Good morning, afternoon and evening. <sep> Buenos días, tardes y noches.


In [None]:
trans[0].content

'Good morning, afternoon and evening.'

In [None]:
content[0]

'Buenos días, buenas tardes, buenas noches.'

In [None]:
response.split('<sep>')[1].strip()

'Buenos días, tardes y noches.'

## Baseline

### RUN

In [None]:
import torch, gc
from tqdm import tqdm                   # barra de progreso opcional

batch_size=2
device = torch.device("cuda:0")
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
speech_prompt = 'Transcribe the audio to text, and then translate the audio to es-ES. Use <sep> as a separator between the original transcript and the translation.'
prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
@torch.inference_mode()                      # no autograd bookkeeping during generation
def transcribe_folder(folder, prompt=prompt):
    """Translate every audio clip of one folder with the speech model.

    Args:
        folder: Name of the recording, forwarded to ``get_audio_segments``.
        prompt: Prompt text repeated once per clip in each batch.

    Returns:
        ``(hyps, trans, trad)``: one hypothesis per clip (the text after
        ``<sep>`` when the model emitted the separator, otherwise the raw
        output), plus the two text lists returned by ``get_audio_segments``.

    Uses the notebook-level ``batch_size``, ``device``, ``processor``,
    ``model`` and ``generation_config``.
    """
    trans, trad, clips, sr = get_audio_segments(folder)

    batch_hyps = []
    # Slice by index instead of testing `c is clips[-1]`: the identity test
    # only flushes the last batch correctly while every clip is a distinct object.
    for start in range(0, len(clips), batch_size):
        batch = [(c, sr) for c in clips[start:start + batch_size]]

        inputs = processor(
            text=[prompt] * len(batch),               # one prompt per clip
            audios=batch, return_tensors="pt", padding=True
        ).to(device, dtype=torch.float16)

        gen_ids = model.generate(
            **inputs,
            max_new_tokens=1000,
            generation_config=generation_config,
            num_logits_to_keep=0
        )[:, inputs["input_ids"].shape[1]:]           # drop the prompt tokens

        # Decode from CPU copies so the GPU tensors can be released right after.
        batch_hyps.extend(
            processor.batch_decode(
                gen_ids.cpu(), skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )
        )

        # Drop the references first, then collect, then return cached blocks:
        # emptying the cache before collecting cannot free what is still referenced.
        del inputs, gen_ids
        gc.collect()
        torch.cuda.empty_cache()

    aux = []
    for b in batch_hyps:
        if '<sep>' in b:
            aux.append(b.split('<sep>')[1].strip())
        else:
            # No separator: keep the raw output and print it for inspection.
            aux.append(b)
            print(b)

    return aux, trans, trad

# ───────────────────── bucle principal ───────────────────────────

# Create the accumulators on a fresh kernel (they were commented out, which
# raised NameError after a restart); keep them when they already exist so an
# interrupted run can resume.
if 'hypotesis' not in globals():
    hypotesis, transcriptions, traducciones = [], [], []

# Continue after the folders that already have results (all of them on a fresh
# kernel), so the three lists stay aligned with `folders`.
for f in tqdm(folders[len(hypotesis):]):
    h, t, trad = transcribe_folder(f)
    print(h)
    print(t)
    print(trad)
    print()
    hypotesis.append(h)
    transcriptions.append(t)
    traducciones.append(trad)

  0%|          | 0/2 [00:00<?, ?it/s]

/content/test/606/es
Se encontraron 267 secciones de frases.
Se encontraron 267 transcripciones.
Se encontraron 267 traducciones.
len(clips)=267, sr=16000, clips[0].shape=(17599,)


 50%|█████     | 1/2 [1:05:30<1:05:30, 3930.80s/it]

and in my experience i meet patients with with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with the with

100%|██████████| 2/2 [1:39:47<00:00, 2993.97s/it]

['Así que voy a hablar sobre la importancia de la atención multidisciplinar.', 'Como escucharon, mi nombre es Anne Partridge.', "I'm a medical oncologist at Dana-farber where I focus on breast cancer in young women in particular.", 'Y no puedo enfatizar lo suficiente la importancia de la atención multidisciplinar para todos los pacientes con cáncer de mama, pero particularmente para los pacientes jóvenes.', 'y te daré un ejemplo de por qué de mi propia práctica, así como discutir un poco de la historia y lo que implica', 'Hemos sabido durante décadas que la atención multidisciplinar hace una diferencia.', 'A finales de los años ochenta, el gobierno del Reino Unido encargó un informe, el informe forestal, que proponía nuevos principios de detección de cáncer de mama.', 'Among these principles were that patients with a high clinical index of suspicion for a cancer diagnosis should receive the triple assessment and that included a clinical exam radiologic review of images and once a biops




In [None]:
for h,t, trad in zip(hypotesis, transcriptions, traducciones):
  print(h)
  print(t)
  print(trad)
  print()

['Buenos días, tardes y noches.', 'Este es el operador de la conferencia.', 'La Escuela Europea de Oncología le da la bienvenida a su cuartacientos décima sesión.', 'La sesión de hoy es "El impacto de la cirugía oncológica en los resultados".', 'Tenga en cuenta que esta actividad está acreditada por CMI.', 'Al final de la presentación, al cerrar la ventana de la transmisión web, se le dirigirá a la evaluación y prueba de opción múltiple de CME.', 'Los competidores de mastermind serán invitados a iniciar el cuestionario después de haber completado la evaluación.', 'Como recordatorio, durante la sesión en vivo todos los participantes pueden hacer preguntas en vivo en cualquier momento simplemente presionando el botón relevante en la parte superior de la página.', 'Aquí hay un extracto de la política, que se publica en su totalidad en nuestro sitio web.', 'Las preguntas serán recibidas por el Profesor Ricardo Ordizia del Whiston Hospital, Universidad de Liverpool, Prescott, Reino Unido, q

In [4]:
# NOTE(review): `out` is not defined anywhere in the visible notebook, so this
# cell fails on a fresh kernel — add the cell that loads it or remove this one.
# It is read here as a flat sequence of repeating triples:
# hypotheses, transcriptions, reference translations.
hypotesis, transcriptions, traducciones = [], [], []
for i in range(0,len(out),3):
    hypotesis.append(out[i])
    transcriptions.append(out[i+1])
    traducciones.append(out[i+2])

### Evaluation

In [6]:
from __future__ import annotations

from itertools import chain
from typing import Callable, Dict, List, Tuple

import sacrebleu
from comet import download_model, load_from_checkpoint

In [7]:


# ────────────────────────────────────────────────────────────────────────────────
#  Cargamos (y cacheamos) el modelo COMET‑22 una sola vez
# ────────────────────────────────────────────────────────────────────────────────
_COMET_MODEL = None
_COMET_PATH = None
_MODEL_NAME = "Unbabel/wmt22-comet-da"


def _get_comet_model(gpus: int = 0):
    """Return the COMET model, downloading and loading it on first use.

    The model and its checkpoint path are stored in the module-level
    ``_COMET_MODEL`` / ``_COMET_PATH`` so later calls reuse them.

    Args:
        gpus: Unused here; the GPU count is given to ``.predict`` by the
            caller. The argument only exists in the signature.
    """
    global _COMET_MODEL, _COMET_PATH

    # Fast path: already loaded by an earlier call.
    if _COMET_MODEL is not None:
        return _COMET_MODEL

    _COMET_PATH = download_model(_MODEL_NAME)
    _COMET_MODEL = load_from_checkpoint(_COMET_PATH)
    return _COMET_MODEL


# ────────────────────────────────────────────────────────────────────────────────
#  Función principal
# ────────────────────────────────────────────────────────────────────────────────

def bleu_comet_by_audio(
    refs_audio: List[List[str]],
    hyps_audio: List[List[str]],
    srcs_audio: List[List[str]],
    transform: Callable[[str], str] = lambda x: x,
    comet_gpus: int = 0,
    comet_batch_size: int = 8,
) -> Tuple[Dict[str, float], List[Dict[str, float]]]:
    """Compute BLEU and COMET-22, both globally and per audio.

    Parameters
    ----------
    refs_audio, hyps_audio, srcs_audio : list[list[str]]
        Nested lists (audio -> segments) of references, hypotheses and
        sources. All three must have the same number of audios and, for
        each audio, the same number of segments.
    transform : callable
        Per-sentence normalisation applied to every reference, hypothesis
        and source before scoring (identity by default). It is applied to
        the per-audio AND the global metrics.
    comet_gpus : int
        Number of GPUs passed to ``model.predict`` (0 means CPU).
    comet_batch_size : int
        Batch size passed to ``model.predict``.

    Returns
    -------
    (global_metrics, per_audio_metrics)
        global_metrics    = {"bleu": float, "comet22": float}
        per_audio_metrics = [
            {"audio_id": i, "bleu": float, "comet22": float},
            ...
        ]

    Raises
    ------
    AssertionError
        If the three inputs differ in number of audios or segments.
    """
    assert len(refs_audio) == len(hyps_audio) == len(srcs_audio), (
        "refs, hyps y srcs deben tener la misma longitud (nº audios)"
    )

    comet_model = _get_comet_model()

    def _comet_system_score(srcs, hyps, refs):
        # One COMET sample per segment; "system_score" is the value the model
        # reports for the whole set.
        samples = [
            {"src": s, "mt": h, "ref": r}
            for s, h, r in zip(srcs, hyps, refs)
        ]
        return comet_model.predict(
            samples,
            batch_size=comet_batch_size,
            gpus=comet_gpus,
            progress_bar=False,
        )["system_score"]

    per_audio: List[Dict[str, float]] = []
    # Transformed segments of every audio, accumulated for the global metrics.
    # Previously the global scores were computed on the raw inputs, so they
    # disagreed with the per-audio scores whenever `transform` was not the identity.
    refs_all: List[str] = []
    hyps_all: List[str] = []
    srcs_all: List[str] = []

    for idx, (ref_seg, hyp_seg, src_seg) in enumerate(
        zip(refs_audio, hyps_audio, srcs_audio)
    ):
        assert len(ref_seg) == len(hyp_seg) == len(src_seg), (
            f"El audio {idx} contiene diferente nº de segmentos"
        )

        ref_seg = [transform(r) for r in ref_seg]
        hyp_seg = [transform(h) for h in hyp_seg]
        src_seg = [transform(s) for s in src_seg]

        refs_all.extend(ref_seg)
        hyps_all.extend(hyp_seg)
        srcs_all.extend(src_seg)

        per_audio.append(
            {
                "audio_id": idx,
                # Corpus-level BLEU restricted to this audio.
                "bleu": sacrebleu.corpus_bleu(hyp_seg, [ref_seg]).score,
                "comet22": _comet_system_score(src_seg, hyp_seg, ref_seg),
            }
        )

    # Global metrics over all segments of all audios.
    bleu_global = sacrebleu.corpus_bleu(hyps_all, [refs_all]).score
    comet_global = _comet_system_score(srcs_all, hyps_all, refs_all)

    return {"bleu": bleu_global, "comet22": comet_global}, per_audio





### Results

In [8]:
global_m, per_audio_m = bleu_comet_by_audio(traducciones, hypotesis, transcriptions, comet_gpus=1)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

.gitattributes: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

LICENSE: 0.00B [00:00, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

checkpoints/model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning m

In [9]:
print("── Métricas globales ──")
for k, v in global_m.items():
    print(f"{k:8s}: {v:.2f}")

print("\n── Métricas por audio ──")
for m in per_audio_m:
    print(
        f"Audio {m['audio_id']}: BLEU={m['bleu']:.2f}, COMET-22={m['comet22']:.2f}"
    )

── Métricas globales ──
bleu    : 25.75
comet22 : 0.76

── Métricas por audio ──
Audio 0: BLEU=41.60, COMET-22=0.82
Audio 1: BLEU=22.85, COMET-22=0.77
Audio 2: BLEU=19.66, COMET-22=0.73
Audio 3: BLEU=22.38, COMET-22=0.75
Audio 4: BLEU=20.72, COMET-22=0.76
