In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)

from wedpr_ml_toolkit.config.wedpr_ml_config import WeDPRMlConfigBuilder
from wedpr_ml_toolkit.wedpr_ml_toolkit import WeDPRMlToolkit
from wedpr_ml_toolkit.toolkit.dataset_toolkit import DatasetToolkit
from wedpr_ml_toolkit.context.data_context import DataContext
from wedpr_ml_toolkit.context.job_context import JobType

In [2]:
# Read the configuration file and build the toolkit from it.
# `wedpr_config` and `wedpr_ml_toolkit` are used by the cells below.
wedpr_config = WeDPRMlConfigBuilder.build_from_properties_file('config.properties')
wedpr_ml_toolkit = WeDPRMlToolkit(wedpr_config)

In [None]:
# Register dataset1; it is flagged as the label holder and uses the agency
# name taken from the loaded user configuration.
dataset1_settings = {
    'storage_entrypoint': wedpr_ml_toolkit.get_storage_entry_point(),
    'storage_workspace': wedpr_config.user_config.get_workspace_path(),
    'dataset_owner': 'flyhuang1',
    'agency': wedpr_config.user_config.agency_name,
    'dataset_id': 'd-9606704699156485',
    'dataset_path': "/user/ppc/milestone2/sgd/flyhuang1/d-9606704699156485",
    'is_label_holder': True,
}
dataset1 = DatasetToolkit(**dataset1_settings)

# Show which storage endpoint, workspace and agency the dataset is bound to.
dataset1_endpoint = dataset1.storage_client.storage_client.endpoint
print(dataset1_endpoint, dataset1.storage_workspace, dataset1.agency)

# Load the dataset content, then preview it.
# NOTE(review): header=0 presumably marks the first row as column names -
# confirm against DatasetToolkit.load_values.
dataset1.load_values(header=0)
print(dataset1.dataset_path)
print(dataset1.values.head())

In [None]:
# Register dataset2; it is owned by a different user and its agency is given
# explicitly. No is_label_holder flag is passed for this dataset.
dataset2_settings = {
    'storage_entrypoint': wedpr_ml_toolkit.get_storage_entry_point(),
    'storage_workspace': wedpr_config.user_config.get_workspace_path(),
    'dataset_owner': 'flyhuang',
    'dataset_id': 'd-9606695119693829',
    'dataset_path': "/user/ppc/milestone2/webank/flyhuang/d-9606695119693829",
    'agency': "WeBank",
}
dataset2 = DatasetToolkit(**dataset2_settings)

# Show which storage endpoint, workspace and agency the dataset is bound to.
dataset2_endpoint = dataset2.storage_client.storage_client.endpoint
print(dataset2_endpoint, dataset2.storage_workspace, dataset2.agency)

# Load the dataset content, then preview it.
# NOTE(review): header=0 presumably marks the first row as column names -
# confirm against DatasetToolkit.load_values.
dataset2.load_values(header=0)
print(dataset2.dataset_path)
print(dataset2.values.head())

In [None]:
# Build the dataset context from the two registered datasets.
dataset = DataContext(dataset1, dataset2)
print(dataset.datasets)

# Project that the job context is created under.
project_id = "9606702107011078"

# XGB training job settings, one key per line so individual values are easy
# to review and change. Group headings are inferred from the key names only.
model_setting = {
    # preprocessing / feature selection
    'use_psi': 0,
    'fillna': 0,
    'na_select': 1,
    'filloutlier': 0,
    'normalized': 0,
    'standardized': 0,
    'categorical': '',
    'psi_select_col': '',
    'psi_select_base': '',
    'psi_select_thresh': 0.3,
    'psi_select_bins': 4,
    'corr_select': 0,
    'use_iv': 0,
    'group_num': 4,
    'iv_thresh': 0.1,
    # training
    'use_goss': 0,
    'test_dataset_percentage': 0.3,
    'learning_rate': 0.1,
    'num_trees': 6,
    'max_depth': 3,
    'max_bin': 4,
    'silent': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'gamma': 0,
    'min_child_weight': 0,
    'min_child_samples': 10,
    'seed': 2024,
    # evaluation
    'early_stopping_rounds': 0,
    'eval_metric': 'auc',
    'verbose_eval': 1,
    'eval_set_column': '',
    'train_set_value': '',
    'eval_set_value': '',
    'train_features': '',
}

# NOTE(review): the trailing "id" argument is presumably the name of the
# sample-id column - confirm against build_job_context.
xgb_job_context = wedpr_ml_toolkit.build_job_context(
    JobType.XGB_TRAINING,
    project_id,
    dataset,
    model_setting,
    "id",
)
print(xgb_job_context.participant_id_list, xgb_job_context.result_receiver_id_list)
print(xgb_job_context.project_id)

# Build the job parameters and show their main parts.
xgb_job_param = xgb_job_context.build()
print(xgb_job_param.taskParties)
print(xgb_job_param.datasetList)
print(xgb_job_param.job)
# Debug aid: uncomment to dump all job parameters as JSON.
# import json
# print(json.dumps(xgb_job_param.__dict__))

In [None]:
# Submit the XGB training job and keep the id it returns.
# xgb_job_id = '9707983191943174'  # when testing, use an existing job id to skip creating a new job
xgb_job_id = xgb_job_context.submit()
print(xgb_job_id)

In [None]:
# Fetch the result of the XGB training job.
# xgb_job_id = '9707983191943174'  # when testing, use an existing job id to skip creating a new job
print(xgb_job_id)
# NOTE(review): the meaning of the second positional argument (True) is not
# visible in this notebook - confirm against parse_result.
xgb_result = xgb_job_context.parse_result(xgb_job_id, True)

# Load both result tables first, then preview them in the same order.
result_tables = (xgb_result.train_result, xgb_result.test_result)
for result_table in result_tables:
    result_table.load_values(header=0)
for result_table in result_tables:
    print(result_table.values.head())

In [None]:
# Evaluate the training job's test_result in plaintext.
# np, plt and the sklearn.metrics helpers come from the notebook's import cell.
data = xgb_result.test_result.values

# True labels and predicted probabilities; both columns must be present.
y_true = data['class_label']
y_pred_proba = data['class_pred']

# Cut-off that turns a predicted probability into a 0/1 class.
decision_threshold = 0.5
y_pred = np.where(y_pred_proba >= decision_threshold, 1, 0)

# Threshold-based metrics use the hard predictions; AUC uses the probabilities.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_pred_proba)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"AUC: {auc:.2f}")

# Curve points. The thresholds are bound to real names rather than `_`,
# because IPython uses `_` for the last cell output.
fpr, tpr, roc_thresholds = roc_curve(y_true, y_pred_proba)
precision_vals, recall_vals, pr_thresholds = precision_recall_curve(y_true, y_pred_proba)

# One figure, two panels, drawn through explicit Axes objects.
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: ROC curve with the chance diagonal for reference.
ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend()

# Right panel: precision-recall curve.
ax_pr.plot(recall_vals, precision_vals)
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')

fig.tight_layout()
plt.show()

In [None]:
# Settings for the XGB prediction job.
model_setting = {
    'use_psi': 0,
    'use_iv': 0,
}

# The model comes from the current xgb_result.
# NOTE(review): xgb_result is rebound by the prediction-result cell further
# down, so re-running this cell after that one reads `.model` from the
# prediction result instead of the training result.
xgb_job_context = wedpr_ml_toolkit.build_job_context(
    JobType.XGB_PREDICTING,
    project_id,
    dataset,
    model_setting,
    "id",
    xgb_result.model,
)
print(xgb_job_context.participant_id_list, xgb_job_context.result_receiver_id_list)
print(xgb_job_context.project_id)

# Build the job parameters and show their main parts.
xgb_job_param = xgb_job_context.build()
print(xgb_job_param.taskParties)
print(xgb_job_param.datasetList)
# Debug aid: uncomment to dump all job parameters as JSON.
# import json
# print(json.dumps(xgb_job_param.__dict__))

In [None]:
# Submit the XGB prediction job and keep the id it returns.
# Note: this rebinds xgb_job_id, which previously held the training job id.
# xgb_job_id = '9708824062994438'  # when testing, use an existing job id to skip creating a new job
xgb_job_id = xgb_job_context.submit()
print(xgb_job_id)

In [None]:
# Fetch the result of the XGB prediction job.
# Note: this rebinds xgb_result, which previously held the training result.
# xgb_job_id = '9708824062994438'  # when testing, use an existing job id to skip creating a new job
print(xgb_job_id)
# NOTE(review): the meaning of the second positional argument (True) is not
# visible in this notebook - confirm against parse_result.
xgb_result = xgb_job_context.parse_result(xgb_job_id, True)

# Load the prediction output, then preview it.
prediction_table = xgb_result.test_result
prediction_table.load_values(header=0)
print(prediction_table.values.head())

In [None]:
# Evaluate the prediction job's test_result in plaintext.
# np, plt and the sklearn.metrics helpers come from the notebook's import cell.
data = xgb_result.test_result.values

# True labels and predicted probabilities; both columns must be present.
# NOTE(review): this needs the prediction output to carry class_label -
# confirm that holds for the predicted dataset.
y_true = data['class_label']
y_pred_proba = data['class_pred']

# Cut-off that turns a predicted probability into a 0/1 class.
decision_threshold = 0.5
y_pred = np.where(y_pred_proba >= decision_threshold, 1, 0)

# Threshold-based metrics use the hard predictions; AUC uses the probabilities.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_pred_proba)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"AUC: {auc:.2f}")

# Curve points. The thresholds are bound to real names rather than `_`,
# because IPython uses `_` for the last cell output.
fpr, tpr, roc_thresholds = roc_curve(y_true, y_pred_proba)
precision_vals, recall_vals, pr_thresholds = precision_recall_curve(y_true, y_pred_proba)

# One figure, two panels, drawn through explicit Axes objects.
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: ROC curve with the chance diagonal for reference.
ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend()

# Right panel: precision-recall curve.
ax_pr.plot(recall_vals, precision_vals)
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')

fig.tight_layout()
plt.show()