In [1]:
import pandas as pd

In [2]:
# Load the feature-importance table of each target (D, P, H) from Excel.
feature_D, feature_P, feature_H = (
    pd.read_excel(importance_path, index_col=False)
    for importance_path in (
        "../../Features/Results/feature_importance_D.xlsx",
        "../../Features/Results/feature_importance_P.xlsx",
        "../../Features/Results/feature_importance_H.xlsx",
    )
)

# Keep the "Feature" names of the first 20 rows of each table.
# NOTE(review): this presumes the tables are already sorted by importance;
# nothing in this notebook sorts them -- confirm against the producer.
top_20_features_D = feature_D.head(20)["Feature"]
top_20_features_H = feature_H.head(20)["Feature"]
top_20_features_P = feature_P.head(20)["Feature"]

In [3]:
import os
import json

# Dictionary holding the parsed content of every JSON file, keyed by the file
# name without its ".json" extension.
json_data = {}
# Directory that contains the JSON files
json_dir = "../../Features/Results/"
# Walk over every entry in the directory
for file_name in os.listdir(json_dir):
    if file_name.endswith(".json"):  # only JSON files are loaded
        file_path = os.path.join(json_dir, file_name)
        # JSON text is UTF-8; pass the encoding explicitly so parsing does not
        # depend on the platform's locale default.
        with open(file_path, "r", encoding="utf-8") as file:
            # splitext drops only the trailing extension, whereas
            # str.replace(".json", "") would also delete ".json" occurring
            # inside the file stem.
            json_data[os.path.splitext(file_name)[0]] = json.load(file)

# Tuned hyper-parameters for each target's model; a KeyError here means the
# corresponding best_params_<target>.json file was not found in json_dir.
best_params_D = json_data["best_params_D"]
best_params_P = json_data["best_params_P"]
best_params_H = json_data["best_params_H"]

In [4]:
# Read the three datasets: training, test and the "out" set
# (data_out_feats.xlsx).
train, test, out_test = (
    pd.read_excel(dataset_path, index_col=False)
    for dataset_path in (
        "../../Dataset/train_data.xlsx",
        "../../Dataset/test_data.xlsx",
        "../../Dataset/data_out_feats.xlsx",
    )
)

# Feature matrices: every dataset restricted to the target's top-20 columns.
X_train_D, X_test_D, X_out_D = (
    frame[top_20_features_D] for frame in (train, test, out_test)
)
X_train_P, X_test_P, X_out_P = (
    frame[top_20_features_P] for frame in (train, test, out_test)
)
X_train_H, X_test_H, X_out_H = (
    frame[top_20_features_H] for frame in (train, test, out_test)
)

# Target vectors: the "D", "P" and "H" columns of every dataset.
y_train_D, y_test_D, y_out_D = (frame["D"] for frame in (train, test, out_test))
y_train_P, y_test_P, y_out_P = (frame["P"] for frame in (train, test, out_test))
y_train_H, y_test_H, y_out_H = (frame["H"] for frame in (train, test, out_test))

In [5]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np


# Fit an ExtraTrees model and score it on the train, test and "out" datasets
def train_and_evaluate(X_train, y_train, X_test, y_test, X_out, y_out, best_params):
    """Train an ExtraTreesRegressor and evaluate it on three datasets.

    Args:
        X_train, y_train: Features and target the model is fitted on.
        X_test, y_test: Features and target of the test set (scoring only).
        X_out, y_out: Features and target of the "out" set (scoring only).
        best_params: Mapping of keyword arguments forwarded to
            ExtraTreesRegressor. A "random_state" entry, if present, is
            overridden with 42.

    Returns:
        A 5-tuple ``(model, metrics, train_results, test_results,
        out_results)`` where ``model`` is the fitted regressor, ``metrics``
        is a dict with the keys ``{train,test,out}_{r2,rmse,mae}`` and each
        ``*_results`` DataFrame has the columns ``y_<split>`` (true values)
        and ``y_<split>_pred`` (predictions).
    """
    # Merge the seed into the parameter dict instead of passing it next to
    # **best_params: a "random_state" key inside best_params would otherwise
    # raise "TypeError: got multiple values for keyword argument".
    model = ExtraTreesRegressor(**{**best_params, "random_state": 42})
    model.fit(X_train, y_train)

    # True values and predictions per dataset, in reporting order
    predictions = {
        "train": (y_train, model.predict(X_train)),
        "test": (y_test, model.predict(X_test)),
        "out": (y_out, model.predict(X_out)),
    }

    def _rmse(y_true, y_pred):
        # Root of the MSE, so the error is in the units of the target
        return np.sqrt(mean_squared_error(y_true, y_pred))

    scorers = (("r2", r2_score), ("rmse", _rmse), ("mae", mean_absolute_error))

    # Metric-major iteration keeps the key order train_r2, test_r2, out_r2,
    # train_rmse, ... which later becomes the column order of the Excel export.
    metrics = {
        f"{split}_{metric_name}": scorer(y_true, y_pred)
        for metric_name, scorer in scorers
        for split, (y_true, y_pred) in predictions.items()
    }

    # Pair true and predicted values so they can be saved side by side
    train_results, test_results, out_results = (
        pd.DataFrame({f"y_{split}": y_true, f"y_{split}_pred": y_pred})
        for split, (y_true, y_pred) in predictions.items()
    )

    return model, metrics, train_results, test_results, out_results

In [6]:
# Fit and score the model for target D, then display its metrics
(
    model_D,
    metrics_D,
    train_results_D,
    test_results_D,
    out_results_D,
) = train_and_evaluate(
    X_train=X_train_D,
    y_train=y_train_D,
    X_test=X_test_D,
    y_test=y_test_D,
    X_out=X_out_D,
    y_out=y_out_D,
    best_params=best_params_D,
)
metrics_D

{'train_r2': 0.9666952905570808,
 'test_r2': 0.8048433107769872,
 'out_r2': 0.6614037045118641,
 'train_rmse': 0.33936204804414194,
 'test_rmse': 0.7898674316399734,
 'out_rmse': 0.8212675564979134,
 'train_mae': 0.2468813561005935,
 'test_mae': 0.47024710739057923,
 'out_mae': 0.6603315246555376}

In [7]:
# Fit and score the model for target P, then display its metrics
(
    model_P,
    metrics_P,
    train_results_P,
    test_results_P,
    out_results_P,
) = train_and_evaluate(
    X_train=X_train_P,
    y_train=y_train_P,
    X_test=X_test_P,
    y_test=y_test_P,
    X_out=X_out_P,
    y_out=y_out_P,
    best_params=best_params_P,
)
metrics_P

{'train_r2': 0.8552645124738837,
 'test_r2': 0.6209766095461295,
 'out_r2': 0.5959479461857095,
 'train_rmse': 1.6495963299906915,
 'test_rmse': 2.5569373326033733,
 'out_rmse': 3.3428295417509943,
 'train_mae': 1.174159126831605,
 'test_mae': 1.8276016100813985,
 'out_mae': 2.2621316685258273}

In [8]:
# Fit and score the model for target H, then display its metrics
(
    model_H,
    metrics_H,
    train_results_H,
    test_results_H,
    out_results_H,
) = train_and_evaluate(
    X_train=X_train_H,
    y_train=y_train_H,
    X_test=X_test_H,
    y_test=y_test_H,
    X_out=X_out_H,
    y_out=y_out_H,
    best_params=best_params_H,
)
metrics_H

{'train_r2': 0.970942638611972,
 'test_r2': 0.8320622125558479,
 'out_r2': 0.8628980452198001,
 'train_rmse': 0.9925490066238606,
 'test_rmse': 1.8239259657458495,
 'out_rmse': 2.7280406577830396,
 'train_mae': 0.6104034745866798,
 'test_mae': 1.1821497384619999,
 'out_mae': 1.9881258005170956}

In [9]:
from joblib import dump

# Persist the three fitted models under ./SHAP.
# The dump target's parent directory must already exist, so create it first;
# exist_ok makes re-running the cell harmless.
os.makedirs("./SHAP", exist_ok=True)

dump(model_D, "./SHAP/model_D.joblib")
dump(model_P, "./SHAP/model_P.joblib")
dump(model_H, "./SHAP/model_H.joblib")

['./SHAP/model_H.joblib']

In [None]:
# Save all metrics to one Excel workbook, one sheet per target
metrics_dict = {"Metrics_D": metrics_D, "Metrics_P": metrics_P, "Metrics_H": metrics_H}

output_path = "./results/metrics_summary.xlsx"
# The workbook cannot be written into a missing directory; create it up front
# so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with pd.ExcelWriter(output_path) as writer:
    for sheet_name, metrics in metrics_dict.items():
        # Wrapping the dict in a list yields a single-row table whose columns
        # are the metric names
        pd.DataFrame([metrics]).to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Save true vs. predicted training values, one sheet per target
output_path = "./results/train_results_summary.xlsx"
# The workbook cannot be written into a missing directory; create it up front
# so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with pd.ExcelWriter(output_path) as writer:
    train_results_D.to_excel(writer, sheet_name="Train_Results_D", index=False)
    train_results_P.to_excel(writer, sheet_name="Train_Results_P", index=False)
    train_results_H.to_excel(writer, sheet_name="Train_Results_H", index=False)

In [None]:
# Save true vs. predicted test values, one sheet per target
output_path = "./results/test_results_summary.xlsx"
# The workbook cannot be written into a missing directory; create it up front
# so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with pd.ExcelWriter(output_path) as writer:
    test_results_D.to_excel(writer, sheet_name="Test_Results_D", index=False)
    test_results_P.to_excel(writer, sheet_name="Test_Results_P", index=False)
    test_results_H.to_excel(writer, sheet_name="Test_Results_H", index=False)

In [None]:
# Save true vs. predicted values of the "out" dataset, one sheet per target
output_path = "./results/out_results_summary.xlsx"
# The workbook cannot be written into a missing directory; create it up front
# so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with pd.ExcelWriter(output_path) as writer:
    out_results_D.to_excel(writer, sheet_name="out_Results_D", index=False)
    out_results_P.to_excel(writer, sheet_name="out_Results_P", index=False)
    out_results_H.to_excel(writer, sheet_name="out_Results_H", index=False)