In [1]:
# Import only the Kubeflow Trainer SDK names this notebook uses. Explicit
# imports (instead of `from kubeflow.trainer import *`) make it clear where
# each name in the later cells comes from and avoid polluting the namespace.
from kubeflow.trainer import (
    BuiltinTrainer,
    DataFormat,
    HuggingFaceDatasetInitializer,
    HuggingFaceModelInitializer,
    Initializer,
    TorchTuneConfig,
    TorchTuneInstructDataset,
    TrainerClient,
)
from kubeflow_trainer_api import models

# List all available Kubeflow Training Runtimes.
# `client` is reused by every later cell in this notebook.
client = TrainerClient()
for runtime in client.list_runtimes():
    print(runtime)

Runtime mpi-distributed must have trainer.kubeflow.org/framework label.
Runtime torchtune-llama3.2-1b must have trainer.kubeflow.org/framework label.
Runtime torchtune-llama3.2-3b must have trainer.kubeflow.org/framework label.


Runtime(name='deepspeed-distributed', trainer=RuntimeTrainer(trainer_type=<TrainerType.CUSTOM_TRAINER: 'CustomTrainer'>, framework='deepspeed', num_nodes=1, device='Unknown', device_count='1'), pretrained_model=None)
Runtime(name='mlx-distributed', trainer=RuntimeTrainer(trainer_type=<TrainerType.CUSTOM_TRAINER: 'CustomTrainer'>, framework='mlx', num_nodes=1, device='Unknown', device_count='1'), pretrained_model=None)
Runtime(name='torch-distributed', trainer=RuntimeTrainer(trainer_type=<TrainerType.CUSTOM_TRAINER: 'CustomTrainer'>, framework='torch', num_nodes=1, device='Unknown', device_count='Unknown'), pretrained_model=None)
Runtime(name='torchtune-qwen2.5-1.5b', trainer=RuntimeTrainer(trainer_type=<TrainerType.BUILTIN_TRAINER: 'BuiltinTrainer'>, framework='torchtune', num_nodes=1, device='gpu', device_count='2.0'), pretrained_model=None)


In [2]:
# Create a PersistentVolumeClaim for the TorchTune Qwen 2.5 1.5B model.
# The manifest is assembled piece by piece and then submitted in one call.
pvc_metadata = models.IoK8sApimachineryPkgApisMetaV1ObjectMeta(
    name="torchtune-qwen2.5-1.5b"
)

# Request 20Gi of storage with a single-writer access mode.
pvc_storage_request = models.IoK8sApiCoreV1VolumeResourceRequirements(
    requests={"storage": models.IoK8sApimachineryPkgApiResourceQuantity("20Gi")}
)
pvc_spec = models.IoK8sApiCoreV1PersistentVolumeClaimSpec(
    accessModes=["ReadWriteOnce"],
    resources=pvc_storage_request,
)

# The core API call receives the manifest as a plain dict.
pvc_manifest = models.IoK8sApiCoreV1PersistentVolumeClaim(
    apiVersion="v1",
    kind="PersistentVolumeClaim",
    metadata=pvc_metadata,
    spec=pvc_spec,
).to_dict()

# Last expression of the cell, so the API response is shown as the output.
client.backend.core_api.create_namespaced_persistent_volume_claim(
    namespace="default",
    body=pvc_manifest,
)

{'api_version': 'v1',
 'kind': 'PersistentVolumeClaim',
 'metadata': {'annotations': None,
              'creation_timestamp': datetime.datetime(2025, 11, 7, 18, 47, 23, tzinfo=tzutc()),
              'deletion_grace_period_seconds': None,
              'deletion_timestamp': None,
              'finalizers': ['kubernetes.io/pvc-protection'],
              'generate_name': None,
              'generation': None,
              'labels': None,
              'managed_fields': [{'api_version': 'v1',
                                  'fields_type': 'FieldsV1',
                                  'fields_v1': {'f:spec': {'f:accessModes': {},
                                                           'f:resources': {'f:requests': {'.': {},
                                                                                          'f:storage': {}}},
                                                           'f:volumeMode': {}}},
                                  'manager': 'OpenAPI-Generator',
    

In [3]:
# Submit a TorchTune fine-tuning TrainJob on the torchtune-qwen2.5-1.5b runtime.
selected_runtime = client.get_runtime(name="torchtune-qwen2.5-1.5b")

# Dataset and model are both referenced by Hugging Face storage URIs.
job_initializer = Initializer(
    dataset=HuggingFaceDatasetInitializer(storage_uri="hf://tatsu-lab/alpaca/data"),
    model=HuggingFaceModelInitializer(storage_uri="hf://Qwen/Qwen2.5-1.5B-Instruct"),
)

# Instruct-style dataset read from Parquet, restricted to the `train[:1000]`
# split, with the per-node resource request for the training step.
torchtune_config = TorchTuneConfig(
    dataset_preprocess_config=TorchTuneInstructDataset(
        source=DataFormat.PARQUET,
        split="train[:1000]",
    ),
    resources_per_node={"memory": "4G", "gpu": 1},
)

# `job_name` is used by the following cells to wait for and delete the job.
job_name = client.train(
    runtime=selected_runtime,
    initializer=job_initializer,
    trainer=BuiltinTrainer(config=torchtune_config),
)

In [4]:
# Wait for the TrainJob submitted above to report the "Running" status.
# The call's return value (a TrainJob, per the recorded output) is the cell's
# last expression and is therefore displayed.
client.wait_for_job_status(name=job_name, status={"Running"})

TrainJob(name='s26e057fe396', creation_timestamp=datetime.datetime(2025, 11, 7, 18, 48, 36, tzinfo=TzInfo(0)), runtime=Runtime(name='torchtune-qwen2.5-1.5b', trainer=RuntimeTrainer(trainer_type=<TrainerType.BUILTIN_TRAINER: 'BuiltinTrainer'>, framework='torchtune', num_nodes=1, device='gpu', device_count='2'), pretrained_model=None), steps=[Step(name='dataset-initializer', status='Succeeded', pod_name='s26e057fe396-dataset-initializer-0-0-46zs5', device='Unknown', device_count='Unknown'), Step(name='model-initializer', status='Succeeded', pod_name='s26e057fe396-model-initializer-0-0-5z4pl', device='Unknown', device_count='Unknown'), Step(name='node-0', status='Running', pod_name='s26e057fe396-node-0-0-xnmlv', device='gpu', device_count='1')], num_nodes=1, status='Running')

In [6]:
# Clean up: delete the TrainJob identified by `job_name`.
# NOTE(review): in the recorded run this call raised
# "RuntimeError: Failed to delete TrainJob: default/s26e057fe396". The cause is
# not visible from this notebook — confirm deletion works on a fresh
# Restart & Run All before sharing.
client.delete_job(name=job_name)

RuntimeError: Failed to delete TrainJob: default/s26e057fe396