In [None]:
import xgboost as xgb
import pandas as pd
from secretflow.utils.simulation.datasets import dataset
from secretflow.stats.score_card import ScoreCard
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    classification_report,
    roc_auc_score,
)
import seaborn as sns
from secretflow.ml.boost.homo_boost import SFXgboost
import tempfile
import secretflow as sf
from secretflow.data.horizontal import read_csv as h_read_csv
from secretflow.security.aggregation import SecureAggregator
from secretflow.security.compare import SPUComparator
import time


# Load the credit-card fraud CSV located through SecretFlow's simulation
# dataset helper.
data_df = pd.read_csv(dataset("creditcard"), sep=",")

# Report the dimensions of the loaded table.
n_rows, n_cols = data_df.shape
print("Credit Card Fraud Detection data -  rows:", n_rows, " columns:", n_cols)

# 数据探索

In [None]:
# Preview the first five rows of the raw table.
data_df.head(n=5)

In [None]:
# Summary statistics per column, with the default quartiles spelled out.
data_df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Missing-value audit: absolute count and percentage of nulls per column,
# each sorted from most to least affected, shown as a two-row table.
null_mask = data_df.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count() * 100).sort_values(ascending=False)
pd.concat([total, percent], axis=1, keys=["Total", "Percent"]).T

In [None]:
# Number of rows per label value, reshaped into a two-column table
# ("Class", "values") so the class balance can be read at a glance.
temp = data_df["Class"].value_counts()
new_df = temp.rename_axis("Class").reset_index(name="values")
new_df

In [None]:
# Split the transaction amounts by label so the two groups can be compared.
tmp = data_df.loc[:, ["Amount", "Class"]].copy()
is_class_0 = tmp["Class"] == 0
is_class_1 = tmp["Class"] == 1
class_0 = tmp.loc[is_class_0, "Amount"]
class_1 = tmp.loc[is_class_1, "Amount"]

# Amount statistics for the rows labelled 0.
class_0.describe()

In [None]:
# Amount statistics for the rows labelled 1, with the default quartiles spelled out.
class_1.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Independent (deep) copy of the raw table for the federated workflow, so
# later steps never touch `data_df` itself.
sf_df = data_df.copy(deep=True)
sf_df

# 准备数据

In [None]:
# Horizontal partition: the first half of the rows goes to Alice, the rest to Bob.
half = len(sf_df) // 2
h_alice = sf_df.iloc[:half]
h_bob = sf_df.iloc[half:]


def _reserve_temp_path():
    """Create an empty temporary file and return its path.

    `tempfile.mkstemp()` hands back an *open* OS-level file descriptor that
    the caller must close; discarding it leaks one descriptor per call.
    Using `NamedTemporaryFile(delete=False)` as a context manager closes the
    handle on exit while leaving the file on disk for later use.
    """
    with tempfile.NamedTemporaryFile(delete=False) as handle:
        return handle.name


# Reserve one temporary file per party; the training CSVs are written later.
h_alice_path = _reserve_temp_path()
h_bob_path = _reserve_temp_path()
h_alice.head(), h_bob.head()

In [None]:
from sklearn.model_selection import train_test_split

# Alice's rows: 80/20 train/test split, then a further 80/20 train/validation
# split of the training part. The fixed seed keeps the partitions reproducible.
alice_split_kwargs = dict(test_size=0.2, random_state=1234, shuffle=True)
atrain_df, atest_df = train_test_split(h_alice, **alice_split_kwargs)
atrain_df, avalid_df = train_test_split(atrain_df, **alice_split_kwargs)
avalid_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

label_encoder = LabelEncoder()
scaler = StandardScaler()


def _plot_abs_cov_heatmap(frame, title):
    """Draw the absolute covariance matrix of the standardized `frame`.

    Object-typed columns are label-encoded first, every column is then
    standardized with the shared `scaler`, and the absolute value of the
    resulting covariance matrix is rendered as an annotated heatmap.

    Returns the encoded frame, the standardized frame and the matrix that
    was plotted.
    """
    encoded = frame.apply(
        lambda col: label_encoder.fit_transform(col) if col.dtype == "O" else col
    )
    standardized = pd.DataFrame(
        scaler.fit_transform(encoded), columns=encoded.columns
    )
    abs_cov = abs(standardized.cov())
    plt.figure(figsize=(20, 20))
    sns.heatmap(abs_cov, annot=True, cmap="Blues", fmt=".2f", linewidths=0.5)
    plt.title(title)
    plt.show()
    return encoded, standardized, abs_cov


# Alice's training partition only.
df_a, trainup, cov_matrix = _plot_abs_cov_heatmap(
    atrain_df, "Covariance Matrix Heatmap a"
)

# The complete, unpartitioned dataset.
df_g, trainup, cov_matrix = _plot_abs_cov_heatmap(
    data_df, "Covariance Matrix Heatmap global"
)

# Pearson correlation between all columns of the complete dataset.
corr = data_df.corr()
axis_labels = corr.columns
plt.figure(figsize=(20, 20))
plt.title("Credit Card Transactions features correlation plot (Pearson)")
sns.heatmap(
    corr,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    annot=True,
    cmap="Greens",
    fmt=".2f",
    linewidths=0.5,
)
plt.show()

In [None]:
# Bob's rows: 80/20 train/test split, then a further 80/20 train/validation
# split of the training part, with the same fixed seed as used for Alice.
bob_split_kwargs = dict(test_size=0.2, random_state=1234, shuffle=True)
btrain_df, btest_df = train_test_split(h_bob, **bob_split_kwargs)
btrain_df, bvalid_df = train_test_split(btrain_df, **bob_split_kwargs)
bvalid_df.head()

In [None]:
# Persist each party's training partition (without the index column) to the
# temporary file reserved for it.
for train_part, csv_path in ((atrain_df, h_alice_path), (btrain_df, h_bob_path)):
    train_part.to_csv(csv_path, index=False)

# 单方模型训练

In [None]:
# Name of the label column.
target = "Class"

# Feature columns, in order: "Time", the 28 columns "V1".."V28", and "Amount".
predictors = ["Time", *(f"V{idx}" for idx in range(1, 29)), "Amount"]

In [None]:
# Wrap Alice's train / validation / test partitions as XGBoost DMatrix
# objects (features plus label array).
dtrain, dvalid, dtest = (
    xgb.DMatrix(part[predictors], part[target].values)
    for part in (atrain_df, avalid_df, atest_df)
)

# Datasets evaluated after every boosting round, with their display names.
watchlist = [(dtrain, "train"), (dvalid, "valid")]

In [None]:
# XGBoost parameter tutorial:
# https://xgboost.readthedocs.io/en/latest/parameter.html
params = dict(
    max_depth=6,  # maximum tree depth
    eta=0.3,  # learning rate
    # objective function; the original note lists "binary:logistic",
    # "reg:logistic", "multi:softmax", "multi:softprob" and
    # "reg:squarederror" as supported values
    objective="binary:logistic",
    max_bin=64,  # maximum number of bins
    subsample=0.8,  # row subsampling rate
    colsample_bytree=0.9,  # per-tree feature sampling rate
    eval_metric="auc",
)
num_round = 20

# Single-party baseline: train on Alice's data, reporting the metric on the
# watchlist each round, with early stopping configured at 5 rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=5,
)

In [None]:
# Step 1: predicted probabilities on Alice's test set, thresholded at 0.5
# to obtain hard 0/1 labels.
preds = model.predict(dtest)
y_pred_prob = preds
y_pred = (y_pred_prob > 0.5).astype(int)

# Step 2: build the confusion matrix (rows = actual, columns = predicted).
cm = confusion_matrix(atest_df[target].values, y_pred)

# Step 3: visualize the confusion matrix.
class_names = ["Class 0", "Class 1"]
plt.figure(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.ylabel("actual")
plt.xlabel("predict")
plt.title("confuse matrix")
plt.show()

In [None]:
# Feature-importance bar chart for the single-party model.
fig, ax = plt.subplots(ncols=1, figsize=(8, 5))
importance_style = dict(
    height=0.8, title="Features importance (XGBoost)", color="green"
)
xgb.plot_importance(model, ax=ax, **importance_style)
plt.show()

# 准备多方

In [None]:
# Show which SecretFlow version is in use.
print(f"The version of SecretFlow: {sf.__version__}")

# Tear down any SecretFlow runtime that may already be running, then start a
# fresh local one with three parties.
sf.shutdown()
party_names = ["alice", "bob", "charlie"]
sf.init(party_names, address="local")

# One PYU device handle per party.
alice, bob, charlie = (sf.PYU(party) for party in party_names)

In [None]:
# Secure aggregation is hosted on charlie with alice and bob as participants.
aggr = SecureAggregator(device=charlie, participants=[alice, bob])

# SPU device spanning alice and bob, used as the backend of the comparator.
spu_cluster = sf.utils.testing.cluster_def(parties=["alice", "bob"])
spu = sf.SPU(spu_cluster)
comp = SPUComparator(spu)

# Build the horizontally partitioned frame from each party's training CSV.
party_csv_paths = {alice: h_alice_path, bob: h_bob_path}
hdf = h_read_csv(party_csv_paths, aggregator=aggr, comparator=comp)
hdf.columns

In [None]:
# Count over the horizontally partitioned frame (presumably per-column
# non-null counts across both parties, mirroring pandas — TODO confirm).
hdf.count()

# 数据预处理

In [None]:
# Print the minimum, maximum and mean of the horizontal frame, in that order.
for summarize in (hdf.min, hdf.max, hdf.mean):
    print('Horizontal df:\n', summarize())

In [None]:
# Federated feature scaling of the 'Time' and 'Amount' columns of `hdf`.
# The columns are overwritten in place, so the model trained on `hdf`
# afterwards sees the scaled values; data scored against that model needs
# the same scaling.
#
# Note: this import rebinds the name `StandardScaler`, which an earlier cell
# imported from sklearn.preprocessing.
from secretflow.preprocessing import StandardScaler,MinMaxScaler
scaler_min = MinMaxScaler()
scaler_std = StandardScaler()

# 'Time' is min-max scaled first and then standardized.
# NOTE(review): if these scalers behave like their sklearn counterparts,
# standardizing after min-max scaling gives the same values as standardizing
# directly — confirm whether the min-max step is needed.
scaled_time = scaler_min.fit_transform(hdf['Time'])
hdf['Time']=scaled_time
scaled_time = scaler_std.fit_transform(hdf['Time'])
hdf['Time']=scaled_time

# Range of 'Time' after scaling.
print('Min: ', hdf['Time'].min())
print('Max: ', hdf['Time'].max())

# 'Amount' is standardized only; `scaler_std` is re-fitted on this column.
scaled_amount = scaler_std.fit_transform(hdf['Amount'])
hdf['Amount']=scaled_amount

# Range of 'Amount' after scaling.
print('Min: ', hdf['Amount'].min())
print('Max: ', hdf['Amount'].max())


In [None]:
# Federated XGBoost: charlie is the server, alice and bob are the clients.
bst = SFXgboost(server=charlie, clients=[alice, bob])
params = dict(
    max_depth=6,  # maximum tree depth
    eta=0.3,  # learning rate
    # objective function; the original note lists "binary:logistic",
    # "reg:logistic", "multi:softmax", "multi:softprob" and
    # "reg:squarederror" as supported values
    objective="binary:logistic",
    max_bin=64,  # maximum number of bins
    subsample=0.8,  # row subsampling rate
    colsample_bytree=0.9,  # per-tree feature sampling rate
    eval_metric="auc",  # evaluation metric
    hess_key="hess",  # required: name marking the hessian column; may be a name not present in the dataset
    grad_key="grad",  # required: name marking the gradient column; may be a name not present in the dataset
    label_key="Class",  # required: name marking the label column
)

# Time the federated training run. The same frame is passed as the first two
# positional arguments (presumably training and validation data — confirm).
start = time.time()
modelsf = bst.train(hdf, hdf, params=params, num_boost_round=20)
elapsed = time.time() - start
print(f"train time: {elapsed}")

# 评估模型

In [None]:
# Each client saves its copy of the model to its own JSON file.
model_files = ("./test_alice.json", "./test_bob.json")
model_path = dict(zip((alice, bob), model_files))
bst.save_model(model_path)

# Evaluate the saved models on the horizontal frame with the training params.
result = bst.eval(model_path=model_path, hdata=hdf, params=params)
result

In [None]:
# Load Alice's copy of the federated model into a plain XGBoost Booster.
model_sf = xgb.Booster()  # empty Booster to load into
model_sf.load_model("test_alice.json")  # load the model from its JSON file

# Features and labels of Alice's held-out test partition.
temp_data = atest_df.copy()
x = temp_data.drop(columns="Class")
y = temp_data["Class"]

# The federated model was trained on `hdf` after 'Time' and 'Amount' had been
# standardized, so the test features must be scaled the same way; scoring raw
# values against split thresholds learned in standardized space is a
# train/inference mismatch.
# The statistics come from the pooled training rows of both parties, the same
# rows that back `hdf`. 'Time' was min-max scaled before standardization, but
# standardization is invariant to that affine step, so a single
# (value - mean) / std reproduces it.
# Assumes the federated StandardScaler uses the population standard deviation
# like sklearn — TODO confirm. With a sample std the difference would be
# negligible at this row count.
train_pool = pd.concat([atrain_df, btrain_df])
x = x.assign(
    **{
        scaled_col: (x[scaled_col] - train_pool[scaled_col].mean())
        / train_pool[scaled_col].std(ddof=0)
        for scaled_col in ("Time", "Amount")
    }
)

print(x.head())
print(y.head())
matrix = xgb.DMatrix(x)

In [None]:
# Predicted probabilities from the federated model, thresholded at 0.5 to
# obtain hard 0/1 labels.
predictions = model_sf.predict(matrix)
print(predictions)
sf_prediction = predictions
sf_proc_pred = (sf_prediction > 0.5).astype(int)

# Confusion matrix (rows = actual, columns = predicted), printed and plotted.
sf_cm = confusion_matrix(y.values, sf_proc_pred)
print(sf_cm)

# Visualize the confusion matrix.
class_names = ["Class 0", "Class 1"]
plt.figure(figsize=(10, 7))
sns.heatmap(
    sf_cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.ylabel("actual")
plt.xlabel("predict")
plt.title("confuse matrix")
plt.show()

# Per-class precision / recall / F1 summary.
print(classification_report(y.values, sf_proc_pred))

# Feature-importance bar chart for the federated model.
fig, ax = plt.subplots(ncols=1, figsize=(8, 5))
importance_style = dict(
    height=0.8, title="Features importance (XGBoost)", color="green"
)
xgb.plot_importance(model_sf, ax=ax, **importance_style)
plt.show()