In [1]:
%pip install google-cloud-aiplatform transformers[torch] datasets torchaudio soundfile 

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os

# Set BEFORE any torch import
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from datasets import load_dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer, TrainerCallback
import numpy as np
import torch
from google.cloud import storage
from dataclasses import dataclass
from typing import Any, Dict, List
import gc

class MemoryMonitorCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 5 == 0:
            allocated = torch.cuda.memory_allocated() / 1024**3
            reserved = torch.cuda.memory_reserved() / 1024**3
            print(f"Step {state.global_step}: Allocated={allocated:.2f}GB, Reserved={reserved:.2f}GB")

@dataclass
class AudioDataCollator:
    """Custom data collator for audio token sequences with dynamic padding."""
    tokenizer: Any
    pad_token_id: int = 128263
    
    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        max_length = max(len(f["input_ids"]) for f in features)
        
        batch_input_ids = []
        batch_labels = []
        batch_attention_mask = []
        
        for feature in features:
            input_ids = feature["input_ids"]
            labels = feature.get("labels", input_ids)
            padding_length = max_length - len(input_ids)
            
            padded_input_ids = input_ids + [self.pad_token_id] * padding_length
            batch_input_ids.append(padded_input_ids)
            
            padded_labels = labels + [-100] * padding_length
            batch_labels.append(padded_labels)
            
            attention_mask = [1] * len(input_ids) + [0] * padding_length
            batch_attention_mask.append(attention_mask)
        
        return {
            "input_ids": torch.tensor(batch_input_ids, dtype=torch.long),
            "labels": torch.tensor(batch_labels, dtype=torch.long),
            "attention_mask": torch.tensor(batch_attention_mask, dtype=torch.long),
        }


def main():
    # Clear memory at start
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        gc.collect()
        print(f"Starting GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
    
    model_dir = os.environ.get("AIP_MODEL_DIR", "/tmp/model")
    checkpoint_dir = os.environ.get("AIP_CHECKPOINT_DIR", "/tmp/checkpoints")

    dsn = "cubbk/audio_swedish_2_dataset_cleaned"
    model_name = "canopylabs/orpheus-tts-0.1-pretrained"

    # Optimized settings
    epochs = 3
    batch_size = 24
    pad_token = 128263
    save_steps = 1000
    learning_rate = 5.0e-5

    bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    dtype = torch.bfloat16 if bf16_supported else torch.float32
    if not bf16_supported:
        print("bfloat16 not supported on this device; using float32.")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = pad_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
    )
    
    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()

    raw_ds = load_dataset(dsn, split="train", data_dir="8sidor_tokenized")
    raw_ds = raw_ds.select(range(1000))
    
    # Filter long sequences
    raw_ds = raw_ds.filter(lambda x: len(x["input_ids"]) <= 1000)
    print(f"Dataset: {len(raw_ds)} samples")

    split = raw_ds.train_test_split(test_size=0.05, seed=42)
    train_ds, eval_ds = split["train"], split["test"]

    data_collator = AudioDataCollator(
        tokenizer=tokenizer,
        pad_token_id=pad_token
    )

    training_args = TrainingArguments(
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        logging_steps=10,
        bf16=bf16_supported,
        output_dir=checkpoint_dir,
        save_steps=save_steps,
        remove_unused_columns=False,
        learning_rate=learning_rate,
        save_total_limit=1,  # Reduced from 2 to save memory
        logging_dir=f"{checkpoint_dir}/logs",
        warmup_ratio=0.1,
        gradient_checkpointing=True,
        max_grad_norm=1.0,
        dataloader_pin_memory=False,  # Disable pin memory
        dataloader_num_workers=0,  # Disable multiprocessing
        logging_first_step=True,
        logging_nan_inf_filter=False,  # Reduce logging overhead
    )

    # Use custom trainer with memory management
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=data_collator,
        callbacks=[MemoryMonitorCallback()],
    )

    print("Starting training...")
    trainer.train()
    
    trainer.save_model(model_dir)
    tokenizer.save_pretrained(model_dir)

    print(f"Training completed. Model saved to {model_dir}")


if __name__ == "__main__":
    main()

Starting GPU Memory: 0.00 GB


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Dataset: 990 samples


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting training...


Step,Training Loss
1,5.615
10,5.4529


Step 5: Allocated=18.46GB, Reserved=78.40GB
Step 10: Allocated=18.46GB, Reserved=78.40GB
Step 15: Allocated=18.46GB, Reserved=77.60GB


OutOfMemoryError: CUDA out of memory. Tried to allocate 18.39 GiB. GPU 0 has a total capacity of 79.14 GiB of which 17.33 GiB is free. Process 667108 has 61.80 GiB memory in use. Of the allocated memory 61.03 GiB is allocated by PyTorch, and 269.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import IPython
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

: 

In [2]:
!nvidia-smi

Wed Nov  5 14:54:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.05             Driver Version: 550.127.05     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  |   00000000:00:05.0 Off |                    0 |
| N/A   69C    P0             99W /  300W |   32871MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                