# HuBERT Pre-training Pipeline (Colab Pro)

Full pipeline: clone → data → chunk index → assign labels → training.

Everything is stored on Google Drive for persistence across sessions.

**Prerequisites:**
- Colab Pro with GPU (A100 or V100)
- HuggingFace token with access to the `MLCommons/unsupervised_peoples_speech` dataset
- Enough Drive space (50+ GB for the tar cache)

## 0. Setup: Drive + Repo + Deps

In [None]:
# Mount Drive
# Makes Google Drive available under /content/drive; the Drive paths used by
# the rest of the notebook (/content/drive/MyDrive/...) live below this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Base directory on Drive — everything persists here
DRIVE_BASE = '/content/drive/MyDrive/ups-challenge'
DRIVE_DATA = f'{DRIVE_BASE}/data'
DRIVE_TAR_CACHE = f'{DRIVE_DATA}/tar_cache'
DRIVE_CHECKPOINTS = f'{DRIVE_BASE}/checkpoints'

os.makedirs(DRIVE_DATA, exist_ok=True)
os.makedirs(DRIVE_TAR_CACHE, exist_ok=True)
os.makedirs(DRIVE_CHECKPOINTS, exist_ok=True)

print('Drive dirs ready:')
!ls -la {DRIVE_BASE}/

In [None]:
# Clone repo (if not already cloned)
REPO_DIR = '/content/ups-hubert-continuous-pretraining'

if not os.path.exists(REPO_DIR):
    !git clone https://github.com/dannersm/ups-hubert-continuous-pretraining.git {REPO_DIR}
else:
    !cd {REPO_DIR} && git pull

os.chdir(REPO_DIR)
!git log --oneline -5

In [None]:
# Symlinks: data/ and checkpoints/ point to Drive
import os

for local, drive_path in [('data', DRIVE_DATA), ('checkpoints', DRIVE_CHECKPOINTS)]:
    local_path = os.path.join(REPO_DIR, local)
    if os.path.islink(local_path):
        os.unlink(local_path)
    elif os.path.exists(local_path):
        # If a real dir exists, move it to Drive first
        !mv {local_path}/* {drive_path}/ 2>/dev/null; rm -rf {local_path}
    os.symlink(drive_path, local_path)

print('Symlinks:')
!ls -la data checkpoints

In [None]:
# Install dependencies
# Note: Colab may have Python 3.10/3.11, so we relax the version requirement
# (sed -i edits pyproject.toml in place, so the checkout is modified locally)
!sed -i 's/requires-python = ">=3.12"/requires-python = ">=3.10"/' pyproject.toml
# Editable install of the project; only the last 5 lines of pip output are shown
!pip install -e . 2>&1 | tail -5
# webdataset is installed explicitly, in addition to the project install above
!pip install webdataset
print('\n--- Verification ---')
# Each check runs in a fresh interpreter (python -c), so it reflects what is
# installed rather than what is already imported in this kernel.
!python -c "import torch; print(f'PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')"
!python -c "import torchcodec; print(f'torchcodec OK')"
!python -c "import webdataset; print(f'webdataset OK')"
!python -c "import transformers; print(f'transformers {transformers.__version__}')"

In [None]:
# HuggingFace token — set it as a Colab secret or paste it here.
# After this cell os.environ['HF_TOKEN'] is guaranteed to be non-empty;
# otherwise the cell stops with a RuntimeError.
import os

# Option 1: Colab Secrets (recommended)
try:
    from google.colab import userdata
    secret_token = userdata.get('HF_TOKEN')
except Exception as err:
    # Not running on Colab, secret not defined, or notebook access not granted.
    # Report why instead of failing silently; a token that is already in the
    # environment (or set via Option 2) is still accepted below.
    print(f'Could not read HF_TOKEN from Colab Secrets ({type(err).__name__}: {err})')
else:
    if secret_token and secret_token.strip():
        # strip() guards against a trailing newline from copy/paste
        os.environ['HF_TOKEN'] = secret_token.strip()
        print('HF_TOKEN loaded from Colab Secrets')

# Option 2: Manual (uncomment and paste)
# os.environ['HF_TOKEN'] = 'hf_XXXXX'

# Explicit raise rather than assert: assertions are skipped under `python -O`
if not os.environ.get('HF_TOKEN'):
    raise RuntimeError('HF_TOKEN not configured!')

In [None]:
# Check GPU
!nvidia-smi
import torch
print(f'\nGPU: {torch.cuda.get_device_name(0)}')
print(f'VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

## 1. Download base files (JSONL)

We need `vad_results.jsonl` and `lang_id_results.jsonl` from the HuggingFace dataset.

**These files are large (several GB).** They are saved on Drive to avoid re-downloading.

In [None]:
import os

HF_TOKEN = os.environ['HF_TOKEN']
BASE_URL = 'https://huggingface.co/datasets/MLCommons/unsupervised_peoples_speech/resolve/main'

# Download vad_results.jsonl
vad_path = 'data/vad_results.jsonl'
if not os.path.exists(vad_path):
    print('Downloading vad_results.jsonl (may take several minutes)...')
    !curl -L -o {vad_path} -H "Authorization:Bearer {HF_TOKEN}" \
        "{BASE_URL}/vad_results.jsonl"
    !ls -lh {vad_path}
else:
    print(f'vad_results.jsonl already exists ({os.path.getsize(vad_path)/1e9:.2f} GB)')

# lang_id_results.jsonl is downloaded automatically in the build_lid_index step
lid_path = 'data/lang_id_results.jsonl'
if not os.path.exists(lid_path):
    print('Downloading lang_id_results.jsonl...')
    !curl -L -o {lid_path} -H "Authorization:Bearer {HF_TOKEN}" \
        "{BASE_URL}/lang_id_results.jsonl"
    !ls -lh {lid_path}
else:
    print(f'lang_id_results.jsonl already exists ({os.path.getsize(lid_path)/1e6:.1f} MB)')

## 2. Build VAD shards

Parses `vad_results.jsonl` → `data/vad_shards/{NNNNNN}.pkl` (one per tar).

In [None]:
import os

# Check if shards already exist
shard_dir = 'data/vad_shards'
if os.path.exists(shard_dir) and len(os.listdir(shard_dir)) > 50:
    print(f'VAD shards already exist: {len(os.listdir(shard_dir))} files')
    print('Skipping. To regenerate, delete data/vad_shards/')
else:
    !python -m ups_challenge.preprocessing.vad_lookup

## 3. Build VAD density index

Computes speech density per tar → `data/vad_density_index.pkl`.

In [None]:
import os

if os.path.exists('data/vad_density_index.pkl'):
    print('vad_density_index.pkl already exists, skipping.')
else:
    !python -m ups_challenge.preprocessing.build_vad_density_index \
        --vad_base_dir ./data/vad_shards \
        --output ./data/vad_density_index.pkl

## 4. Build LID index

Parses `lang_id_results.jsonl` → `data/lid_index.pkl` + train/test splits.

In [None]:
import os

# Build the LID index once; later runs reuse the pickle found on disk
if not os.path.exists('data/lid_index.pkl'):
    from ups_challenge.preprocessing.build_lang_index import build_lid_index

    build_lid_index('./data/lid_index.pkl', hf_token=os.environ['HF_TOKEN'])
else:
    print('lid_index.pkl already exists, skipping.')

## 5. Build chunk index (Phase 1)

Selects ~100h of 10-second chunks with high speech density, prioritizing scarce languages.

Output: `data/chunk_index_100h.pkl`

In [None]:
import os

TOTAL_HOURS = 100  # Adjust as needed

chunk_index_path = f'data/chunk_index_{TOTAL_HOURS}h.pkl'
if os.path.exists(chunk_index_path):
    import pickle
    with open(chunk_index_path, 'rb') as f:
        idx = pickle.load(f)
    print(f'chunk_index already exists: {len(idx):,} entries ({len(idx)*10/3600:.1f}h)')
    print('Skipping. To regenerate, delete the file.')
else:
    !python -m ups_challenge.preprocessing.build_chunk_index \
        --total_hours {TOTAL_HOURS} \
        --min_vad_ratio 0.5 \
        --min_chunk_density 0.8 \
        --vad_base_dir ./data/vad_shards \
        --vad_density_index ./data/vad_density_index.pkl \
        --lid_index_path ./data/lid_index.pkl \
        --lang_hours_path ./ups_challenge/preprocessing/lang_speech_hours.json \
        --output ./data/chunk_index_{TOTAL_HOURS}h.pkl

## 6. Assign labels (Phase 2)

Downloads necessary tars, extracts MFCCs, fits k-means (incremental partial_fit), and assigns labels.

**This step is resumable:** if it crashes, re-running resumes from the last checkpoint.

The tars are cached in `data/tar_cache/` for reuse during training.

In [None]:
import os
import glob

# Check if a pretraining index already exists
existing = glob.glob('data/pretraining_index_*.pkl')
if existing:
    print(f'Pretraining index already exists: {existing}')
    print('Skipping assign_labels. To regenerate, delete the files.')
else:
    # Resumable: if it crashes, re-running resumes from the checkpoint
    !python -m ups_challenge.inference.assign_labels \
        --index ./data/chunk_index_{TOTAL_HOURS}h.pkl \
        --n_clusters 100 \
        --output_dir ./data \
        --cache_dir ./data/tar_cache \
        --target_sr 16000 \
        --save_every_tars 5

In [None]:
# Print a one-line summary for every pretraining index found on disk
import glob
import pickle

idx_files = sorted(glob.glob('data/pretraining_index_*.pkl'))
for index_file in idx_files:
    with open(index_file, 'rb') as handle:
        idx = pickle.load(handle)
    n_entries = len(idx)
    # Entries without a 'language' key are counted together under '?'
    languages = {entry.get('language', '?') for entry in idx}
    tar_numbers = {entry['tar_number'] for entry in idx}
    # Duration is computed as 10 seconds per entry
    print(
        f'{index_file}: {n_entries:,} entries, {n_entries * 10 / 3600:.1f}h, '
        f'{len(languages)} langs, {len(tar_numbers)} tars'
    )

## 7. HuBERT Pre-training (4 epochs)

**Resumable at epoch level:** saves `training_state.pt` at the end of each epoch. If Colab crashes, re-run the training cell below — it already passes `--resume` — to continue from the last completed epoch.

**Batch size guide by GPU:**
- T4 (16GB): `batch_size=8`, `grad_accum=4` → effective 32
- V100 (16GB): `batch_size=12-16`, `grad_accum=2-4` → effective 32-64
- A100 (40GB): `batch_size=32-48`, `grad_accum=1-2` → effective 32-96
- A100 (80GB): `batch_size=64`, `grad_accum=1` → effective 64

In [None]:
import glob
import os

import torch

# Fail early with a clear message when the runtime has no GPU attached
if not torch.cuda.is_available():
    raise RuntimeError('No CUDA GPU detected — select a GPU runtime before training.')

# Auto-detect batch size based on VRAM (total_memory is in bytes → decimal GB)
vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
gpu_name = torch.cuda.get_device_name(0)

if vram_gb >= 70:      # A100-80GB
    BATCH_SIZE = 64
    GRAD_ACCUM = 1
elif vram_gb >= 35:    # A100-40GB
    BATCH_SIZE = 32
    GRAD_ACCUM = 1
elif vram_gb >= 14:    # V100/T4
    # NOTE(review): the batch-size guide in this notebook lists 8/4 for a T4,
    # but any GPU with >= 14 GB lands here — confirm which is intended.
    BATCH_SIZE = 12
    GRAD_ACCUM = 3
else:
    BATCH_SIZE = 8
    GRAD_ACCUM = 4

EFFECTIVE_BATCH = BATCH_SIZE * GRAD_ACCUM
print(f'GPU: {gpu_name} ({vram_gb:.0f} GB VRAM)')
print(f'batch_size={BATCH_SIZE}, grad_accum={GRAD_ACCUM}, effective={EFFECTIVE_BATCH}')

# Pretraining index to train on.
# Leave INDEX_OVERRIDE as None to use the most recently written index, or set
# it to a path (e.g. 'data/pretraining_index_485h.pkl') to pin a specific one.
INDEX_OVERRIDE = None

if INDEX_OVERRIDE is not None:
    INDEX_PATH = INDEX_OVERRIDE
else:
    index_candidates = glob.glob('data/pretraining_index_*.pkl')
    if not index_candidates:
        raise FileNotFoundError(
            'No data/pretraining_index_*.pkl found — run the assign_labels step first.'
        )
    # Newest by modification time; a name sort would rank '50h' after '100h'
    INDEX_PATH = max(index_candidates, key=os.path.getmtime)

if not os.path.exists(INDEX_PATH):
    raise FileNotFoundError(f'Pretraining index not found: {INDEX_PATH}')
print(f'Index: {INDEX_PATH}')

In [None]:
# Train. --resume is already passed below, so after a crash simply re-run this
# cell unchanged. INDEX_PATH, BATCH_SIZE and GRAD_ACCUM come from the previous cell.
# NOTE(review): --resume is also passed on the very first run — presumably it
# starts from scratch when no training state exists yet; confirm.
!python -m ups_challenge.inference.hubert_pretraining \
    --index_path {INDEX_PATH} \
    --num_clusters 100 \
    --batch_size {BATCH_SIZE} \
    --grad_accum_steps {GRAD_ACCUM} \
    --num_epochs 4 \
    --learning_rate 5e-5 \
    --warmup_steps 500 \
    --max_grad_norm 1.0 \
    --cache_dir ./data/tar_cache \
    --output_dir ./checkpoints/aligned \
    --projection_warmup_epochs 1 \
    --projection_lr 5e-4 \
    --resume

In [None]:
# Check checkpoints saved on Drive
!ls -lh checkpoints/aligned/

# Show loss curve if it exists
import os
if os.path.exists('checkpoints/aligned/loss_curve.png'):
    from IPython.display import Image, display
    display(Image('checkpoints/aligned/loss_curve.png'))

## 8. Drive space

Utilities for monitoring and cleaning up space.

In [None]:
# Space used by component
# Paths are relative to the repo root. du/ls errors are hidden (2>/dev/null),
# so a header with nothing under it means that path does not exist yet.
!echo '--- tar_cache ---' && du -sh data/tar_cache/ 2>/dev/null
!echo '--- vad_shards ---' && du -sh data/vad_shards/ 2>/dev/null
!echo '--- indices/pkl ---' && du -sh data/*.pkl 2>/dev/null
!echo '--- checkpoints ---' && du -sh checkpoints/ 2>/dev/null
!echo '--- JSONL ---' && ls -lh data/*.jsonl 2>/dev/null
# The total is the only line that does not hide errors
!echo '--- TOTAL ---' && du -sh data/ checkpoints/

In [None]:
# OPTIONAL: Delete vad_results.jsonl after generating the shards (frees several GB)
# import os
# if os.path.exists('data/vad_shards') and len(os.listdir('data/vad_shards')) > 50:
#     os.remove('data/vad_results.jsonl')
#     print('vad_results.jsonl deleted (shards already generated)')