In [None]:
from pathlib import Path
from typing import Optional, Union, List

import kfp
import kfp.typing
from kfp import dsl, compiler, kubernetes, client

In [None]:
# Name of the Kubernetes Secret whose `username` and `key` entries the
# pipeline exposes to the download step as environment variables.
kaggle_secret = "kaggle-secret"

# Service-account directory; its `namespace` file is read below.
root = Path("/")
sa = root/Path("var/run/secrets/kubernetes.io/serviceaccount")

# `read_text` opens and closes the file itself (the bare `open(...).read()`
# left the handle open); surrounding whitespace is dropped so it cannot end
# up in the namespace passed to the API calls.
ns = (sa/"namespace").read_text().strip()

# Instantiate through the `kfp` package instead of the imported `client`
# module: this assignment rebinds the name `client`, so `client.Client()`
# would raise AttributeError the second time the cell is executed.
client = kfp.Client()

In [None]:
@dsl.component(packages_to_install=['kaggle==1.6.14'])
def download_data(competition: str, data_path: Optional[str] = "/data") -> None:
    """Download a Kaggle competition archive and extract it into ``data_path``.

    The Kaggle credentials are taken from the ``KAGGLE_USERNAME`` and
    ``KAGGLE_KEY`` environment variables and written to
    ``~/.kaggle/kaggle.json`` before the ``kaggle`` package is imported.

    Args:
        competition: Kaggle competition identifier; also the stem of the
            ``<competition>.zip`` archive that is extracted.
        data_path: Directory that receives the archive and its contents.

    Raises:
        KeyError: If ``KAGGLE_USERNAME`` or ``KAGGLE_KEY`` is not set.
    """
    import os
    import json
    import zipfile

    def init_kaggle() -> None:
        """Write the Kaggle credentials file with owner-only permissions."""
        # create the Kaggle config directory; `expanduser` still resolves a
        # home directory when $HOME is unset, whereas `expandvars('$HOME')`
        # would return the literal string "$HOME"
        kaggle_config_dir = os.path.join(os.path.expanduser('~'), '.kaggle')
        os.makedirs(kaggle_config_dir, exist_ok=True)
        kaggle_config_file = os.path.join(kaggle_config_dir, "kaggle.json")

        api_dict = {
            "username": os.environ['KAGGLE_USERNAME'],
            "key": os.environ['KAGGLE_KEY']}

        # write the `kaggle.json` config file; it is created with mode 600 so
        # the API key is never readable by other users, not even briefly
        fd = os.open(
            kaggle_config_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w", encoding='utf-8') as f:
            json.dump(api_dict, f)

        # a pre-existing file keeps its old mode on open, so enforce 600
        # explicitly (no shell command, hence no path-splitting problems)
        os.chmod(kaggle_config_file, 0o600)

    init_kaggle()

    # imported only after the credentials file exists
    # NOTE(review): presumably because the package authenticates at import
    # time — confirm against the kaggle package.
    import kaggle

    # download the competition files
    kaggle.api.competition_download_files(competition, path=data_path)
    with zipfile.ZipFile(os.path.join(data_path, f"{competition}.zip"), 'r') as zip_ref:
        zip_ref.extractall(data_path)

In [None]:
@dsl.component(packages_to_install=["kubeflow-training==1.8.0"])
def launch_training(
    run_name: str,
    namespace: str,
    data_vol: str,
    logs_vol: str,
    image: str,
    image_cmd: Optional[List[str]] = list(),
    image_args: Optional[List[str]] = list(),
    data_mount_path: Optional[str] = "/data",
    logs_mount_path: Optional[str] = "/logs",
) -> None:
    """Submit a PyTorchJob with one Master and one Worker replica.

    Both replicas share a single replica spec: one ``pytorch`` container
    limited to one ``nvidia.com/gpu``, with the data PVC, the logs PVC and a
    memory-backed ``/dev/shm`` volume mounted. The function returns as soon
    as ``create_job`` returns.

    Args:
        run_name: Name given to the PyTorchJob.
        namespace: Namespace the job is created in.
        data_vol: PVC claim name for the dataset; also used as volume name.
        logs_vol: PVC claim name for the logs; also used as volume name.
        image: Container image run by every replica.
        image_cmd: Container ``command``.
        image_args: Container ``args``.
        data_mount_path: Mount path of the data volume in the container.
        logs_mount_path: Mount path of the logs volume in the container.
    """
    from kubeflow.training import TrainingClient, constants
    from kubernetes.client import (V1ObjectMeta,
                                   V1PodTemplateSpec,
                                   V1PodSpec,
                                   V1Volume,
                                   V1PersistentVolumeClaimVolumeSource,
                                   V1EmptyDirVolumeSource,
                                   V1Container,
                                   V1VolumeMount,
                                   V1ResourceRequirements)
    from kubeflow.training.models import (KubeflowOrgV1PyTorchJob,
                                          KubeflowOrgV1PyTorchJobSpec,
                                          KubeflowOrgV1ReplicaSpec,
                                          KubeflowOrgV1RunPolicy)

    job_client = TrainingClient(job_kind=constants.PYTORCHJOB_KIND)

    def claim_volume(claim: str) -> V1Volume:
        # a pod volume backed by the PVC of the same name
        return V1Volume(
            name=claim,
            persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                claim_name=claim))

    # volumes available to the pod: both PVCs plus in-memory shared memory
    pod_volumes = [
        claim_volume(data_vol),
        claim_volume(logs_vol),
        V1Volume(
            name="dshm",
            empty_dir=V1EmptyDirVolumeSource(medium="Memory", size_limit="2Gi")),
    ]

    # where each of those volumes appears inside the container
    container_mounts = [
        V1VolumeMount(name=data_vol, mount_path=data_mount_path),
        V1VolumeMount(name=logs_vol, mount_path=logs_mount_path),
        V1VolumeMount(name="dshm", mount_path="/dev/shm"),
    ]

    trainer = V1Container(
        name="pytorch",
        image=image,
        command=image_cmd,
        args=image_args,
        resources=V1ResourceRequirements(limits={"nvidia.com/gpu": "1"}),
        volume_mounts=container_mounts)

    # pod template; the annotation turns Istio sidecar injection off
    pod_template = V1PodTemplateSpec(
        metadata=V1ObjectMeta(
            annotations={"sidecar.istio.io/inject": "false"}),
        spec=V1PodSpec(volumes=pod_volumes, containers=[trainer]))

    # the very same spec object serves both replica types
    replica = KubeflowOrgV1ReplicaSpec(
        replicas=1,
        restart_policy="OnFailure",
        template=pod_template)

    job = KubeflowOrgV1PyTorchJob(
        api_version="kubeflow.org/v1",
        kind="PyTorchJob",
        metadata=V1ObjectMeta(name=run_name),
        spec=KubeflowOrgV1PyTorchJobSpec(
            pytorch_replica_specs={"Master": replica, "Worker": replica},
            run_policy=KubeflowOrgV1RunPolicy()))

    job_client.create_job(job, namespace=namespace)

In [None]:
@dsl.pipeline
def isic_pipeline(
    namespace: str,
    competition_name: str,
    dist_run_name: str,
    data_vol: str,
    logs_vol: str,
    dist_run_image: str,
    data_path: Optional[str] = "/data",
    dist_image_cmd: Optional[List[str]] = list(),
    dist_image_args: Optional[List[str]] = list(),
    data_mount_path: Optional[str] = "/data",
    logs_mount_path: Optional[str] = "/logs",
) -> None:
    """Create the PVCs, download the competition data, then launch training.

    Args:
        namespace: Namespace the training job is created in.
        competition_name: Kaggle competition to download.
        dist_run_name: Name of the training job.
        data_vol: Claim name of the data PVC handed to the training job.
        logs_vol: Claim name of the logs PVC handed to the training job.
        dist_run_image: Container image of the training job.
        data_path: Directory the download step writes the dataset to.
        dist_image_cmd: Container command of the training job.
        dist_image_args: Container arguments of the training job.
        data_mount_path: Mount path of the data PVC in the training job.
        logs_mount_path: Mount path of the logs PVC in the training job.
    """
    # create a PVC to store the dataset
    # NOTE(review): the PVC names below are hard-coded, while the training
    # step receives `data_vol` / `logs_vol` as claim names — the run
    # arguments have to match 'isic-data' / 'isic-logs'.
    isic_data_pvc = kubernetes.CreatePVC(
        pvc_name='isic-data',
        access_modes=['ReadWriteMany'],
        size='8.0Gi',
        storage_class_name='longhorn'
    )

    # create a PVC to log the training progress
    isic_logs_pvc = kubernetes.CreatePVC(
        pvc_name='isic-logs',
        access_modes=['ReadWriteMany'],
        size='2.0Gi',
        storage_class_name='longhorn'
    )

    download_data_step = download_data(
        competition=competition_name,
        data_path=data_path).after(isic_data_pvc)
    download_data_step.set_caching_options(enable_caching=True)

    # The training job mounts both PVCs. Nothing else depends on the logs
    # PVC task, so it is listed here explicitly; otherwise the job could be
    # submitted without the logs PVC creation having been waited for.
    launch_training_step = launch_training(
        run_name=dist_run_name,
        namespace=namespace,
        data_vol=data_vol,
        logs_vol=logs_vol,
        image=dist_run_image,
        image_cmd=dist_image_cmd,
        image_args=dist_image_args,
        data_mount_path=data_mount_path,
        logs_mount_path=logs_mount_path).after(download_data_step, isic_logs_pvc)
    launch_training_step.set_caching_options(enable_caching=False)

    # NOTE(review): the mount path is fixed to '/data'; a different
    # `data_path` would make the download step write outside the PVC.
    kubernetes.mount_pvc(
        download_data_step,
        pvc_name=isic_data_pvc.outputs['name'],
        mount_path='/data')

    # expose both Kaggle credentials from the one secret in a single call
    kubernetes.use_secret_as_env(
        download_data_step,
        secret_name=kaggle_secret,
        secret_key_to_env={
            'username': 'KAGGLE_USERNAME',
            'key': 'KAGGLE_KEY'})

In [None]:
# Compile `isic_pipeline` into the `pipeline.yaml` package file.
compiler.Compiler().compile(
    pipeline_func=isic_pipeline,
    package_path='pipeline.yaml')

In [None]:
# Experiment the run below is submitted under, in the notebook's namespace.
experiment = client.create_experiment(
    "isic-experiment",
    namespace=ns,
    description="Skin Cancer Detection with 3D-TBP",
)

In [None]:
# Values for the `isic_pipeline` parameters that have no default.
run_arguments = {
    "namespace": ns,
    "competition_name": "isic-2024-challenge",
    "dist_run_name": "pytorch-dist-isic-efficientnet",
    "data_vol": "isic-data",
    "logs_vol": "isic-logs",
    "dist_run_image": "dpoulopoulos/pytorch-dist-isic:61a89cd",
}

# Submit the compiled package as a run of the experiment created above.
pipeline = client.create_run_from_pipeline_package(
    pipeline_file="pipeline.yaml",
    arguments=run_arguments,
    run_name="isic-run",
    experiment_name=experiment.display_name,
    namespace=ns,
)