# HuBERT Pre-training Pipeline (Colab Pro)

Pipeline completo: clone → datos → chunk index → assign labels → training.

Todo queda en Google Drive para persistencia entre sesiones.

**Prerequisitos:**
- Colab Pro con GPU (A100 o V100)
- Token de HuggingFace con acceso al dataset `MLCommons/unsupervised_peoples_speech`
- Suficiente espacio en Drive (~50+ GB para tars)

## 0. Setup: Drive + Repo + Deps

In [None]:
# Mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Directorio base en Drive — todo persistente aquí
DRIVE_BASE = '/content/drive/MyDrive/ups-challenge'
DRIVE_DATA = f'{DRIVE_BASE}/data'
DRIVE_TAR_CACHE = f'{DRIVE_DATA}/tar_cache'
DRIVE_CHECKPOINTS = f'{DRIVE_BASE}/checkpoints'

os.makedirs(DRIVE_DATA, exist_ok=True)
os.makedirs(DRIVE_TAR_CACHE, exist_ok=True)
os.makedirs(DRIVE_CHECKPOINTS, exist_ok=True)

print('Drive dirs ready:')
!ls -la {DRIVE_BASE}/

In [None]:
# Clonar repo (si no está ya clonado)
REPO_DIR = '/content/ups-challenge-baselines'

if not os.path.exists(REPO_DIR):
    !git clone https://github.com/dannersm/ups-challenge-baselines.git {REPO_DIR}
else:
    !cd {REPO_DIR} && git pull

os.chdir(REPO_DIR)
!git log --oneline -5

In [None]:
# Symlinks: data/ y checkpoints/ apuntan a Drive
import os

for local, drive_path in [('data', DRIVE_DATA), ('checkpoints', DRIVE_CHECKPOINTS)]:
    local_path = os.path.join(REPO_DIR, local)
    if os.path.islink(local_path):
        os.unlink(local_path)
    elif os.path.exists(local_path):
        # Si existe un dir real, moverlo a Drive primero
        !mv {local_path}/* {drive_path}/ 2>/dev/null; rm -rf {local_path}
    os.symlink(drive_path, local_path)

print('Symlinks:')
!ls -la data checkpoints

In [None]:
# Install dependencies
# Note: Colab may ship Python 3.10/3.11, so the `requires-python` constraint
# in pyproject.toml is relaxed from >=3.12 to >=3.10 before installing.
!sed -i 's/requires-python = ">=3.12"/requires-python = ">=3.10"/' pyproject.toml
!pip install -e . 2>&1 | tail -5
!pip install webdataset
print('\n--- Verificación ---')
!python -c "import torch; print(f'PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')"
!python -c "import torchcodec; print(f'torchcodec OK')"
!python -c "import webdataset; print(f'webdataset OK')"
!python -c "import transformers; print(f'transformers {transformers.__version__}')"

In [None]:
# HuggingFace token: store it as a Colab secret or paste it below
import os

# Option 1: Colab Secrets (recommended)
try:
    from google.colab import userdata
    _hf_secret = userdata.get('HF_TOKEN')
except Exception as err:
    # Best-effort: fall through to the manual option, but say why the secret
    # could not be read so that a failure of the final check is explainable.
    print(f'No se pudo leer HF_TOKEN desde Colab Secrets: {err!r}')
else:
    if _hf_secret:
        os.environ['HF_TOKEN'] = _hf_secret
        print('HF_TOKEN cargado desde Colab Secrets')

# Option 2: Manual (uncomment and paste)
# os.environ['HF_TOKEN'] = 'hf_XXXXX'

# Explicit raise instead of `assert`, which is skipped under `python -O`
if not os.environ.get('HF_TOKEN'):
    raise RuntimeError('HF_TOKEN no configurado!')

In [None]:
# Verificar GPU
!nvidia-smi
import torch
print(f'\nGPU: {torch.cuda.get_device_name(0)}')
print(f'VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

## 1. Descargar archivos base (JSONL)

Necesitamos `vad_results.jsonl` y `lang_id_results.jsonl` del dataset de HuggingFace.

**Estos archivos son grandes (~varios GB).** Se guardan en Drive para no re-descargar.

In [None]:
import os

HF_TOKEN = os.environ['HF_TOKEN']
BASE_URL = 'https://huggingface.co/datasets/MLCommons/unsupervised_peoples_speech/resolve/main'

# Descargar vad_results.jsonl
vad_path = 'data/vad_results.jsonl'
if not os.path.exists(vad_path):
    print('Descargando vad_results.jsonl (puede tomar varios minutos)...')
    !curl -L -o {vad_path} -H "Authorization:Bearer {HF_TOKEN}" \
        "{BASE_URL}/vad_results.jsonl"
    !ls -lh {vad_path}
else:
    print(f'vad_results.jsonl ya existe ({os.path.getsize(vad_path)/1e9:.2f} GB)')

# lang_id_results.jsonl se descarga automáticamente en el paso de build_lid_index
lid_path = 'data/lang_id_results.jsonl'
if not os.path.exists(lid_path):
    print('Descargando lang_id_results.jsonl...')
    !curl -L -o {lid_path} -H "Authorization:Bearer {HF_TOKEN}" \
        "{BASE_URL}/lang_id_results.jsonl"
    !ls -lh {lid_path}
else:
    print(f'lang_id_results.jsonl ya existe ({os.path.getsize(lid_path)/1e6:.1f} MB)')

## 2. Construir VAD shards

Parsea `vad_results.jsonl` → `data/vad_shards/{NNNNNN}.pkl` (uno por tar).

In [None]:
import os

# Verificar si ya hay shards
shard_dir = 'data/vad_shards'
if os.path.exists(shard_dir) and len(os.listdir(shard_dir)) > 50:
    print(f'VAD shards ya existen: {len(os.listdir(shard_dir))} archivos')
    print('Saltando. Si quieres regenerar, borra data/vad_shards/')
else:
    !python -m ups_challenge.vad_analysis.vad_lookup

## 3. Construir VAD density index

Calcula la densidad de speech por tar → `data/vad_density_index.pkl`.

In [None]:
import os

if os.path.exists('data/vad_density_index.pkl'):
    print('vad_density_index.pkl ya existe, saltando.')
else:
    !python -m ups_challenge.examples.build_vad_density_index \
        --vad_base_dir ./data/vad_shards \
        --output ./data/vad_density_index.pkl

## 4. Construir LID index

Parsea `lang_id_results.jsonl` → `data/lid_index.pkl` + train/test splits.

In [None]:
import os

# Build the LID index only when the output file is missing
if not os.path.exists('data/lid_index.pkl'):
    from ups_challenge.dataloaders.build_index import build_lid_index
    build_lid_index('./data/lid_index.pkl', hf_token=os.environ['HF_TOKEN'])
else:
    print('lid_index.pkl ya existe, saltando.')

## 5. Construir chunk index (Phase 1)

Selecciona ~100h de chunks de 10s con alta densidad de speech, priorizando idiomas escasos.

Output: `data/chunk_index_100h.pkl`

In [None]:
import os

TOTAL_HOURS = 100  # Ajusta según necesidad

chunk_index_path = f'data/chunk_index_{TOTAL_HOURS}h.pkl'
if os.path.exists(chunk_index_path):
    import pickle
    with open(chunk_index_path, 'rb') as f:
        idx = pickle.load(f)
    print(f'chunk_index ya existe: {len(idx):,} entries ({len(idx)*10/3600:.1f}h)')
    print('Saltando. Si quieres regenerar, borra el archivo.')
else:
    !python -m ups_challenge.examples.build_chunk_index \
        --total_hours {TOTAL_HOURS} \
        --min_vad_ratio 0.5 \
        --min_chunk_density 0.8 \
        --vad_base_dir ./data/vad_shards \
        --vad_density_index ./data/vad_density_index.pkl \
        --lid_index_path ./data/lid_index.pkl \
        --lang_hours_path ./ups_challenge/examples/lang_speech_hours.json \
        --output ./data/chunk_index_{TOTAL_HOURS}h.pkl

## 6. Assign labels (Phase 2)

Descarga tars necesarios, extrae MFCCs, fit k-means (incremental partial_fit), asigna labels.

**Este paso es resumable:** si se cae, al re-ejecutar retoma desde el último checkpoint.

Los tars quedan cacheados en `data/tar_cache/` para re-uso en training.

In [None]:
import os
import glob

# Verificar si ya existe un pretraining index
existing = glob.glob('data/pretraining_index_*.pkl')
if existing:
    print(f'Pretraining index ya existe: {existing}')
    print('Saltando assign_labels. Si quieres regenerar, borra los archivos.')
else:
    # Resumable: si se cae, al re-ejecutar retoma desde el checkpoint
    !python -m ups_challenge.examples.assign_labels \
        --index ./data/chunk_index_{TOTAL_HOURS}h.pkl \
        --n_clusters 100 \
        --output_dir ./data \
        --cache_dir ./data/tar_cache \
        --target_sr 16000 \
        --save_every_tars 5

In [None]:
# Summarize every generated pretraining index
import pickle
import glob

idx_files = sorted(glob.glob('data/pretraining_index_*.pkl'))
for path in idx_files:
    with open(path, 'rb') as fh:
        idx = pickle.load(fh)
    n_entries = len(idx)
    # Entries without a 'language' key are grouped under '?'
    languages = {entry.get('language', '?') for entry in idx}
    tar_numbers = {entry['tar_number'] for entry in idx}
    # Hours assume 10 s per entry
    print(f'{path}: {n_entries:,} entries, {n_entries * 10 / 3600:.1f}h, {len(languages)} langs, {len(tar_numbers)} tars')

## 7. HuBERT Pre-training (4 epochs)

**Resumable:** guarda `training_state.pt` cada 500 optimizer steps. Si se cae, vuelve a ejecutar la celda de entrenamiento añadiendo `--resume` (en la celda viene comentado por defecto) para retomar desde el último estado guardado.

**Guía de batch_size por GPU:**
- T4 (16GB): `batch_size=8`, `grad_accum=4` → effective 32
- V100 (16GB): `batch_size=12-16`, `grad_accum=2-4` → effective 32-64
- A100 (40GB): `batch_size=32-48`, `grad_accum=1-2` → effective 32-96
- A100 (80GB): `batch_size=64`, `grad_accum=1` → effective 64

In [None]:
import glob
import os

import torch

# Auto-detect the batch size from the total memory of GPU 0
vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
gpu_name = torch.cuda.get_device_name(0)

if vram_gb >= 70:      # A100-80GB
    BATCH_SIZE = 64
    GRAD_ACCUM = 1
elif vram_gb >= 35:    # A100-40GB
    BATCH_SIZE = 32
    GRAD_ACCUM = 1
elif vram_gb >= 14:    # V100/T4
    BATCH_SIZE = 12
    GRAD_ACCUM = 3
else:
    BATCH_SIZE = 8
    GRAD_ACCUM = 4

EFFECTIVE_BATCH = BATCH_SIZE * GRAD_ACCUM
print(f'GPU: {gpu_name} ({vram_gb:.0f} GB VRAM)')
print(f'batch_size={BATCH_SIZE}, grad_accum={GRAD_ACCUM}, effective={EFFECTIVE_BATCH}')

# Choose the training index. Set INDEX_OVERRIDE to pin a specific file;
# otherwise the most recently modified pretraining index is used. Selection is
# by mtime because a lexicographic sort would rank e.g. '97h' after '100h'.
INDEX_OVERRIDE = None  # e.g. 'data/pretraining_index_97h.pkl'

if INDEX_OVERRIDE is not None:
    INDEX_PATH = INDEX_OVERRIDE
else:
    _index_candidates = glob.glob('data/pretraining_index_*.pkl')
    if not _index_candidates:
        raise FileNotFoundError(
            'No se encontró data/pretraining_index_*.pkl; ejecuta primero el paso 6 (assign labels).'
        )
    INDEX_PATH = max(_index_candidates, key=os.path.getmtime)
print(f'Index: {INDEX_PATH}')

In [None]:
# Train (resumable: if it crashes, re-run this cell).
# NOTE(review): `--resume` is commented out at the end of the command below;
# presumably it must be re-enabled for a re-run to continue from the saved
# training state -- confirm against ups_challenge.examples.hubert_pretraining.
!python -m ups_challenge.examples.hubert_pretraining \
    --index_path {INDEX_PATH} \
    --num_clusters 100 \
    --batch_size {BATCH_SIZE} \
    --grad_accum_steps {GRAD_ACCUM} \
    --num_epochs 4 \
    --learning_rate 5e-5 \
    --warmup_steps 500 \
    --max_grad_norm 1.0 \
    --cache_dir ./data/tar_cache \
    --output_dir ./checkpoints/aligned \
    --save_every_steps 500 \
    --projection_warmup_epochs 1 \
    --projection_lr 5e-4 #\
    #--resume

In [None]:
# Verificar checkpoints guardados en Drive
!ls -lh checkpoints/aligned/

# Mostrar loss curve si existe
import os
if os.path.exists('checkpoints/aligned/loss_curve.png'):
    from IPython.display import Image, display
    display(Image('checkpoints/aligned/loss_curve.png'))

## 8. Espacio en Drive

Utilidades para monitorear y limpiar espacio.

In [None]:
# Disk space used per component (errors for missing paths are silenced,
# except on the final total)
!echo '--- tar_cache ---' && du -sh data/tar_cache/ 2>/dev/null
!echo '--- vad_shards ---' && du -sh data/vad_shards/ 2>/dev/null
!echo '--- indices/pkl ---' && du -sh data/*.pkl 2>/dev/null
!echo '--- checkpoints ---' && du -sh checkpoints/ 2>/dev/null
!echo '--- JSONL ---' && ls -lh data/*.jsonl 2>/dev/null
!echo '--- TOTAL ---' && du -sh data/ checkpoints/

In [None]:
# OPTIONAL: delete vad_results.jsonl once the shards have been generated (saves ~GB).
# Uncomment the lines below to run it.
# import os
# if os.path.exists('data/vad_shards') and len(os.listdir('data/vad_shards')) > 50:
#     os.remove('data/vad_results.jsonl')
#     print('vad_results.jsonl borrado (shards ya generados)')