In [1]:
import pandas as pd

In [2]:
# For each target (D, P, H): read its feature-importance table and keep the
# "Feature" entries of the first 50 rows. The rows are taken in file order,
# so this is a "top 50" only if the spreadsheet is already sorted by
# importance -- TODO confirm against the notebook that writes these files.
feature_D = pd.read_excel("../../Features/Results/feature_importance_D.xlsx", index_col=False)
top_50_features_D = feature_D["Feature"].iloc[:50]

feature_P = pd.read_excel("../../Features/Results/feature_importance_P.xlsx", index_col=False)
top_50_features_P = feature_P["Feature"].iloc[:50]

feature_H = pd.read_excel("../../Features/Results/feature_importance_H.xlsx", index_col=False)
top_50_features_H = feature_H["Feature"].iloc[:50]

In [3]:
import os
import json

# Dictionary holding the parsed content of every JSON file, keyed by the file
# name without its ".json" suffix (e.g. "best_params_D.json" -> "best_params_D").
json_data = {}
# Directory that contains the JSON files.
json_dir = "../../Features/Results/"
json_suffix = ".json"
# Walk over every entry in the directory and load the JSON files.
for file_name in os.listdir(json_dir):
    if not file_name.endswith(json_suffix):  # skip anything that is not a JSON file
        continue
    file_path = os.path.join(json_dir, file_name)
    # Strip only the trailing suffix; str.replace would also remove ".json"
    # occurring in the middle of a file name.
    key = file_name[: -len(json_suffix)]
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # locale-dependent default encoding.
    with open(file_path, "r", encoding="utf-8") as fh:
        json_data[key] = json.load(fh)

# A missing file surfaces here as a KeyError naming the absent entry.
best_params_D = json_data["best_params_D"]
best_params_P = json_data["best_params_P"]
best_params_H = json_data["best_params_H"]

In [4]:
# Read the three spreadsheets: the training split, the test split and the
# additional "out" set.
train = pd.read_excel("../../Dataset/train_data.xlsx", index_col=False)
test = pd.read_excel("../../Dataset/test_data.xlsx", index_col=False)
out_test = pd.read_excel("../../Dataset/data_out_feats.xlsx", index_col=False)

# For every target, the inputs are the columns named by that target's
# top-50 feature list and the label is the column named after the target.

# Target "D"
X_train_D, y_train_D = train[top_50_features_D], train["D"]
X_test_D, y_test_D = test[top_50_features_D], test["D"]
X_out_D, y_out_D = out_test[top_50_features_D], out_test["D"]

# Target "P"
X_train_P, y_train_P = train[top_50_features_P], train["P"]
X_test_P, y_test_P = test[top_50_features_P], test["P"]
X_out_P, y_out_P = out_test[top_50_features_P], out_test["P"]

# Target "H"
X_train_H, y_train_H = train[top_50_features_H], train["H"]
X_test_H, y_test_H = test[top_50_features_H], test["H"]
X_out_H, y_out_H = out_test[top_50_features_H], out_test["H"]

In [5]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np


# Fit an ExtraTreesRegressor and score it on the train, test and "out" splits.
def train_and_evaluate(X_train, y_train, X_test, y_test, X_out, y_out, best_params):
    """Fit an ``ExtraTreesRegressor`` and evaluate it on three data splits.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_test, y_test
        Features and target of the test split.
    X_out, y_out
        Features and target of the additional "out" split.
    best_params : dict
        Keyword arguments forwarded to ``ExtraTreesRegressor``. When it has no
        ``"random_state"`` entry the seed defaults to 42; an explicit entry is
        honoured instead of raising a duplicate-keyword ``TypeError``.
        The mapping itself is not modified.

    Returns
    -------
    tuple
        ``(model, metrics, train_results, test_results, out_results)`` where
        ``model`` is the fitted regressor, ``metrics`` is a dict with the keys
        ``"<split>_r2"``, ``"<split>_rmse"`` and ``"<split>_mae"`` for
        ``split`` in ``train``/``test``/``out``, and each ``*_results`` object
        is a DataFrame with the columns ``y_<split>`` and ``y_<split>_pred``.
    """
    # Merge into a fresh dict so the seed is only a default and the caller's
    # best_params is left untouched.
    params = {"random_state": 42, **best_params}
    model = ExtraTreesRegressor(**params)
    model.fit(X_train, y_train)

    # True values and predictions per split; dict order drives the output order.
    splits = {
        "train": (y_train, model.predict(X_train)),
        "test": (y_test, model.predict(X_test)),
        "out": (y_out, model.predict(X_out)),
    }

    # RMSE is derived as the square root of the mean squared error.
    scorers = {
        "r2": r2_score,
        "rmse": lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
        "mae": mean_absolute_error,
    }

    # Metric-major iteration keeps the key order r2 -> rmse -> mae, each over
    # train/test/out, so tables built from this dict keep their column order.
    metrics = {
        f"{split}_{name}": scorer(y_true, y_pred)
        for name, scorer in scorers.items()
        for split, (y_true, y_pred) in splits.items()
    }

    # One frame per split pairing the observed target with its prediction.
    train_results, test_results, out_results = (
        pd.DataFrame({f"y_{split}": y_true, f"y_{split}_pred": y_pred})
        for split, (y_true, y_pred) in splits.items()
    )

    return model, metrics, train_results, test_results, out_results

In [6]:
# Fit the model on the "D" data using its tuned hyperparameters, then display
# the resulting metrics dict as the cell output.
(
    model_D,
    metrics_D,
    train_results_D,
    test_results_D,
    out_results_D,
) = train_and_evaluate(
    X_train=X_train_D,
    y_train=y_train_D,
    X_test=X_test_D,
    y_test=y_test_D,
    X_out=X_out_D,
    y_out=y_out_D,
    best_params=best_params_D,
)
metrics_D

{'train_r2': 0.9695894270296452,
 'test_r2': 0.8088155281005586,
 'out_r2': 0.6290969094002017,
 'train_rmse': 0.3242819299127081,
 'test_rmse': 0.781787629271844,
 'out_rmse': 0.859555238936426,
 'train_mae': 0.23179972700489873,
 'test_mae': 0.44754989662538874,
 'out_mae': 0.6674641423237976}

In [7]:
# Fit the model on the "P" data using its tuned hyperparameters, then display
# the resulting metrics dict as the cell output.
(
    model_P,
    metrics_P,
    train_results_P,
    test_results_P,
    out_results_P,
) = train_and_evaluate(
    X_train=X_train_P,
    y_train=y_train_P,
    X_test=X_test_P,
    y_test=y_test_P,
    X_out=X_out_P,
    y_out=y_out_P,
    best_params=best_params_P,
)
metrics_P

{'train_r2': 0.8866547293061107,
 'test_r2': 0.6693586418815856,
 'out_r2': 0.5552447048872282,
 'train_rmse': 1.4597949924697378,
 'test_rmse': 2.3881723516851165,
 'out_rmse': 3.507164494971362,
 'train_mae': 1.0684293908216256,
 'test_mae': 1.7155853948486224,
 'out_mae': 2.4534204693891897}

In [8]:
# Fit the model on the "H" data using its tuned hyperparameters, then display
# the resulting metrics dict as the cell output.
(
    model_H,
    metrics_H,
    train_results_H,
    test_results_H,
    out_results_H,
) = train_and_evaluate(
    X_train=X_train_H,
    y_train=y_train_H,
    X_test=X_test_H,
    y_test=y_test_H,
    X_out=X_out_H,
    y_out=y_out_H,
    best_params=best_params_H,
)
metrics_H

{'train_r2': 0.9738663780975396,
 'test_r2': 0.8145000788859359,
 'out_r2': 0.9062383946399588,
 'train_rmse': 0.941290495287304,
 'test_rmse': 1.9169238337525158,
 'out_rmse': 2.2560126600260193,
 'train_mae': 0.5749317148737327,
 'test_mae': 1.2006718187687702,
 'out_mae': 1.7840750583104836}

In [9]:
from joblib import dump

# Persist the three fitted models under ./SHAP/.
# The folder is created up front so the cell does not fail on a fresh checkout
# after the models have already been trained.
os.makedirs("./SHAP", exist_ok=True)

dump(model_D, "./SHAP/model_D_50.joblib")
dump(model_P, "./SHAP/model_P_50.joblib")
# Kept as the last expression so the cell still displays dump()'s return value.
dump(model_H, "./SHAP/model_H_50.joblib")

['./SHAP/model_H_50.joblib']

In [None]:
# Save the metrics of all three models into one Excel workbook,
# one sheet per model.
metrics_dict = {"Metrics_D": metrics_D, "Metrics_P": metrics_P, "Metrics_H": metrics_H}

output_path = "./results/metrics_summary_50.xlsx"
# Create the output folder if needed so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with pd.ExcelWriter(output_path) as writer:
    for sheet_name, metrics in metrics_dict.items():
        # Wrapping the dict in a list yields a single-row table with one
        # column per metric.
        pd.DataFrame([metrics]).to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Write the observed vs. predicted training values of the three models into
# one workbook, one sheet per model.
output_path = "./results/train_results_summary_50.xlsx"
# Create the output folder if needed so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

train_sheets = {
    "Train_Results_D": train_results_D,
    "Train_Results_P": train_results_P,
    "Train_Results_H": train_results_H,
}

with pd.ExcelWriter(output_path) as writer:
    for sheet_name, frame in train_sheets.items():
        # index=False: the row index is not written to the sheet.
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Write the observed vs. predicted test values of the three models into
# one workbook, one sheet per model.
output_path = "./results/test_results_summary_50.xlsx"
# Create the output folder if needed so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

test_sheets = {
    "Test_Results_D": test_results_D,
    "Test_Results_P": test_results_P,
    "Test_Results_H": test_results_H,
}

with pd.ExcelWriter(output_path) as writer:
    for sheet_name, frame in test_sheets.items():
        # index=False: the row index is not written to the sheet.
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Write the observed vs. predicted values of the "out" set for the three
# models into one workbook, one sheet per model.
output_path = "./results/out_results_summary_50.xlsx"
# Create the output folder if needed so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

out_sheets = {
    "out_Results_D": out_results_D,
    "out_Results_P": out_results_P,
    "out_Results_H": out_results_H,
}

with pd.ExcelWriter(output_path) as writer:
    for sheet_name, frame in out_sheets.items():
        # index=False: the row index is not written to the sheet.
        frame.to_excel(writer, sheet_name=sheet_name, index=False)