# In [None]:

import kfp
from kfp import dsl, kubernetes
from kfp.dsl import Dataset, Input, Output, Model
from typing import NamedTuple


# Step 1: Load the data and save it as a Dataset artifact
@dsl.component(
    base_image="ghcr.io/kubeflow/kubeflow/notebook-servers/jupyter-tensorflow-full:v1.10.0",
    packages_to_install=["pandas", "scikit-learn"]
)
def load_data(
        dataset_output: Output[Dataset]
):
    """Load the Iris dataset and write it as CSV to the output Dataset artifact.

    Before loading, the container's CPU quota, memory limit and visible GPUs
    are printed so the resource settings applied to this task can be checked
    in the step logs.

    Args:
        dataset_output: Output artifact. The feature columns plus a
            ``target`` column are written as CSV to ``dataset_output.path``.
    """
    from sklearn.datasets import load_iris
    import pandas as pd

    # The helper is defined inside the component so that the function body
    # stays self-contained (all imports and helpers live inside the function).
    def check_resource():
        """Print the container's CPU/memory limits and GPUs (best effort).

        This is purely diagnostic: a missing cgroup file or a missing
        ``nvidia-smi`` binary is reported in the log instead of failing
        the step.
        """
        import subprocess

        def print_limit(label, candidate_paths):
            # Try the cgroup v2 file first, then the cgroup v1 equivalent;
            # nodes still on cgroup v1 do not have cpu.max / memory.max.
            for path in candidate_paths:
                try:
                    with open(path, 'r') as f:
                        value = f.read().strip()
                except OSError:
                    continue
                print(f"{label}: {value}")
                return
            print(f"{label}: unavailable (no readable cgroup file)")

        # CPU / memory limits
        print_limit(
            "CPU Quota Info",
            ['/sys/fs/cgroup/cpu.max', '/sys/fs/cgroup/cpu/cpu.cfs_quota_us'],
        )
        print_limit(
            "Memory Limit Info",
            ['/sys/fs/cgroup/memory.max',
             '/sys/fs/cgroup/memory/memory.limit_in_bytes'],
        )

        # GPUs visible to the container
        try:
            res = subprocess.check_output(["nvidia-smi", "-L"]).decode("utf-8")
        except (OSError, subprocess.SubprocessError):
            # nvidia-smi is missing or exited non-zero: no GPU is usable here.
            print("GPU limit doesn't set .")
        else:
            print(f"GPU:\n{res}")

    check_resource()

    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['target'] = iris.target
    df.to_csv(dataset_output.path, index=False)

# Step 2: Train the model and emit a Model artifact
@dsl.component(
    base_image="ghcr.io/kubeflow/kubeflow/notebook-servers/jupyter-tensorflow-full:v1.10.0",
    packages_to_install=["scikit-learn", "pandas", "joblib"]
)
def train_model(
        input_dataset: Input[Dataset],
        model_output: Output[Model]  # model artifact produced by this step
) -> NamedTuple("Output", [("accuracy", float)]):
    """Train a random forest on the input CSV and save it as a Model artifact.

    The container's CPU quota, memory limit and visible GPUs are printed first
    so the resource settings applied to this task can be checked in the step
    logs.

    Args:
        input_dataset: Dataset artifact whose ``path`` is a CSV file with a
            ``target`` column; every other column is used as a feature.
        model_output: Output artifact. ``model_output.path`` is created as a
            directory and the fitted classifier is stored inside it as
            ``model.joblib``. ``framework`` and ``accuracy`` are recorded in
            its metadata.

    Returns:
        A one-element tuple ``(accuracy,)`` with the accuracy on the 20%
        hold-out split.
    """
    import pandas as pd
    import joblib
    import os
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    # The helper is defined inside the component so that the function body
    # stays self-contained (all imports and helpers live inside the function).
    def check_resource():
        """Print the container's CPU/memory limits and GPUs (best effort).

        This is purely diagnostic: a missing cgroup file or a missing
        ``nvidia-smi`` binary is reported in the log instead of failing
        the step.
        """
        import subprocess

        def print_limit(label, candidate_paths):
            # Try the cgroup v2 file first, then the cgroup v1 equivalent;
            # nodes still on cgroup v1 do not have cpu.max / memory.max.
            for path in candidate_paths:
                try:
                    with open(path, 'r') as f:
                        value = f.read().strip()
                except OSError:
                    continue
                print(f"{label}: {value}")
                return
            print(f"{label}: unavailable (no readable cgroup file)")

        # CPU / memory limits
        print_limit(
            "CPU Quota Info",
            ['/sys/fs/cgroup/cpu.max', '/sys/fs/cgroup/cpu/cpu.cfs_quota_us'],
        )
        print_limit(
            "Memory Limit Info",
            ['/sys/fs/cgroup/memory.max',
             '/sys/fs/cgroup/memory/memory.limit_in_bytes'],
        )

        # GPUs visible to the container
        try:
            res = subprocess.check_output(["nvidia-smi", "-L"]).decode("utf-8")
        except (OSError, subprocess.SubprocessError):
            # nvidia-smi is missing or exited non-zero: no GPU is usable here.
            print("GPU limit doesn't set .")
        else:
            print(f"GPU:\n{res}")

    check_resource()

    # Read the data
    df = pd.read_csv(input_dataset.path)
    X = df.drop(columns=['target'])
    y = df['target']
    # NOTE: no random_state is passed, so the split (and therefore the
    # reported accuracy) differs from run to run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Train the model
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)

    # Treat the artifact path as a directory and store the model inside it
    # under a file name with the .joblib suffix.
    os.makedirs(model_output.path, exist_ok=True)
    joblib.dump(clf, os.path.join(model_output.path, "model.joblib"))

    # Compute accuracy on the hold-out split
    y_pred = clf.predict(X_test)
    acc = float(accuracy_score(y_test, y_pred))

    # Optional metadata attached to the Model artifact
    model_output.metadata["framework"] = "scikit-learn"
    model_output.metadata["accuracy"] = acc

    print(f"Model accuracy: {acc}")
    return (acc,)

# Define the pipeline and configure per-task resources
@dsl.pipeline(
    name="cgpu_setting_node_selector_pipeline",
    description="Full ML pipeline with Dataset and Model artifacts."
)
def cgpu_setting_node_selector_pipeline():
    """Run ``load_data`` then ``train_model`` with explicit resource settings.

    ``load_data`` is limited to 1 CPU and 2G of memory. ``train_model`` is
    limited to 1 CPU and 4G of memory, requests 2 accelerators of type
    ``nvidia.com/gpu`` and is pinned to nodes labelled
    ``nvidia.com/gpu.product=NVIDIA-GeForce-RTX-3090``.
    """
    # Data loading
    data_task = load_data()
    data_task.set_cpu_limit('1').set_memory_limit('2G')

    # Model training, fed by the dataset artifact of the previous step
    train_task = train_model(
        input_dataset=data_task.outputs["dataset_output"]
    )
    # CPU / memory limits for the training step
    train_task.set_cpu_limit('1').set_memory_limit('4G')
    # GPU resources. set_accelerator_type replaces the deprecated
    # add_node_selector_constraint, which only set the accelerator type.
    train_task.set_accelerator_type('nvidia.com/gpu').set_accelerator_limit(2)
    # The actual Kubernetes node selector: schedule only on RTX 3090 nodes.
    kubernetes.add_node_selector(
        train_task,
        label_key='nvidia.com/gpu.product',
        label_value='NVIDIA-GeForce-RTX-3090',
    )


# Step 5: Compile the pipeline function into a YAML package
from kfp import compiler

pipeline_package = 'cgpu_setting_node_selector_pipeline.yaml'

compiler.Compiler().compile(
    pipeline_func=cgpu_setting_node_selector_pipeline,
    package_path=pipeline_package,
)

# Step 6: Submit the compiled package as a run
from kfp.client import Client

client = Client(
    host='http://ml-pipeline.kubeflow:8888',
    namespace="kubeflow-user-example-com",
    existing_token="ACCESS_TOKEN",
    verify_ssl=False,
)
run = client.create_run_from_pipeline_package(
    pipeline_file=pipeline_package,
    experiment_name='cgpu_setting_node_selector_pipeline-test-experiment',
    enable_caching=False,  # caching is disabled for this run
)
