In [1]:
import os
import sys

# Resolve the package root as the directory three levels above the current
# working directory, climbing one level per step.
cur_dir = os.getcwd()
parent_dir = os.path.dirname(cur_dir)
grandparent_dir = os.path.dirname(parent_dir)
pkg_rootdir = os.path.dirname(grandparent_dir)

# Register the package root on the import path once; re-running the cell
# does not add a duplicate entry.
already_registered = pkg_rootdir in sys.path
if not already_registered:
    sys.path.append(pkg_rootdir)

In [2]:
from Utils.mol2fp import GetPubChemFPs

# Fingerprint function used by the data-loading cell, which calls it once per
# SMILES string. Presumably aliased so the featuriser can be swapped in one
# place -- confirm before relying on that.
fp_generator = GetPubChemFPs

In [3]:
import pandas as pd

def _fingerprint_frame(smiles_column):
    """Return a DataFrame holding one fingerprint row per SMILES entry.

    Columns are named ``feature_1`` .. ``feature_N``, where N is the length of
    the first fingerprint returned by ``fp_generator``.
    """
    fingerprints = [fp_generator(smi) for smi in smiles_column]
    feature_names = [f"feature_{pos}" for pos in range(1, len(fingerprints[0]) + 1)]
    return pd.DataFrame(fingerprints, columns=feature_names)


# Read the training, test and external ("out") sets from the Dataset folder.
train = pd.read_excel("../../../Dataset/train_data.xlsx", index_col=False)
test = pd.read_excel("../../../Dataset/test_data.xlsx", index_col=False)
out_test = pd.read_excel("../../../Dataset/data_out_feats.xlsx", index_col=False)

# Featurise the "smiles" column of each set into a fingerprint DataFrame.
X_train = _fingerprint_frame(train["smiles"])
X_test = _fingerprint_frame(test["smiles"])
X_out = _fingerprint_frame(out_test["smiles"])

# Pull out the three regression targets (columns "D", "P", "H") for each set.
y_train_D, y_train_P, y_train_H = (train[target] for target in ("D", "P", "H"))
y_test_D, y_test_P, y_test_H = (test[target] for target in ("D", "P", "H"))
y_out_D, y_out_P, y_out_H = (out_test[target] for target in ("D", "P", "H"))



In [None]:
import json

import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score


# Define the objective function for hyperparameter optimization
def objective(params, X_train, y_train):
    """Hyperopt objective: mean 5-fold cross-validated MSE of an ExtraTreesRegressor.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``ExtraTreesRegressor``.
    X_train, y_train
        Features and target handed to ``cross_val_score``.

    Returns
    -------
    dict
        ``{"loss": <mean MSE>, "status": STATUS_OK}``.
    """
    regressor = ExtraTreesRegressor(random_state=42, **params)
    fold_scores = cross_val_score(
        regressor, X_train, y_train, scoring="neg_mean_squared_error", cv=5
    )
    # The scorer yields negated MSE per fold; negate the mean so that a lower
    # loss means a better model.
    mean_mse = -np.mean(fold_scores)
    return {"loss": mean_mse, "status": STATUS_OK}


# Define the search space for hyperparameters
# Discrete candidate values for every tuned hyperparameter.
_choice_grids = {
    "n_estimators": [50, 100, 200],
    # Alternative grid noted for polar forces: [2, 5, 10]
    "max_depth": [5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Hyperopt search space: one hp.choice node per tuned hyperparameter, each
# labelled with the hyperparameter's own name.
space = {label: hp.choice(label, options) for label, options in _choice_grids.items()}

# Settings that are held fixed for every trial.
space.update({"criterion": "friedman_mse", "bootstrap": True, "oob_score": True})


# Function to optimize and save best parameters
def optimize_and_save(X_train, y_train, output_path, max_evals=50, search_space=None):
    """Tune ExtraTrees hyperparameters with TPE and save the best set as JSON.

    Parameters
    ----------
    X_train, y_train
        Training features and target, forwarded to ``objective``.
    output_path : str
        Destination of the JSON file with the best hyperparameters. The parent
        directory is created if it does not exist.
    max_evals : int, default 50
        Number of hyperopt trials to run.
    search_space : dict, optional
        Hyperopt search space. Defaults to the module-level ``space``.

    Returns
    -------
    dict
        Best hyperparameters as actual values (not ``hp.choice`` indices),
        including the fixed entries of the search space, ready to be passed to
        ``ExtraTreesRegressor(**best_params)``.
    """
    if search_space is None:
        search_space = space

    trials = Trials()
    best_assignment = fmin(
        fn=lambda params: objective(params, X_train, y_train),
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials,
    )

    # fmin reports hp.choice parameters as indices into their option lists.
    # Resolve them against the same space that was searched instead of
    # re-typing the option lists here, so the decoding cannot drift out of
    # sync when the grid is edited.
    best_params = dict(space_eval(search_space, best_assignment))

    # Make sure the destination folder exists before writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w") as f:
        json.dump(best_params, f)

    return best_params

In [None]:
# Optimize and save parameters for D, P, and H
# Pair each target with the JSON file that receives its tuned hyperparameters.
_tuning_jobs = (
    (y_train_D, "./results/pubchem/best_params_D.json"),
    (y_train_P, "./results/pubchem/best_params_P.json"),
    (y_train_H, "./results/pubchem/best_params_H.json"),
)

# Run the searches in order (D, P, H) on the shared training features.
best_params_D, best_params_P, best_params_H = [
    optimize_and_save(X_train, target, json_path) for target, json_path in _tuning_jobs
]

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [03:43<00:00,  4.47s/trial, best loss: 7.2039471472287415]


In [11]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np


# Function to train model, get feature importance, and evaluate performance
def train_and_evaluate(X_train, y_train, X_test, y_test, X_out, y_out, best_params):
    """Fit an ExtraTreesRegressor and score it on the train, test and out sets.

    Parameters
    ----------
    X_train, y_train
        Data the model is fitted on.
    X_test, y_test
        Test data.
    X_out, y_out
        The additional "out" data set.
    best_params : dict
        Keyword arguments for ``ExtraTreesRegressor``.

    Returns
    -------
    tuple
        ``(model, metrics, train_results, test_results, out_results)`` where
        ``metrics`` maps ``"<split>_<r2|rmse|mae>"`` to a score and each
        ``*_results`` frame holds the true and predicted values of one split.
    """
    model = ExtraTreesRegressor(**best_params, random_state=42)
    model.fit(X_train, y_train)

    # split name -> (true values, predictions), in the order train, test, out
    splits = {
        "train": (y_train, model.predict(X_train)),
        "test": (y_test, model.predict(X_test)),
        "out": (y_out, model.predict(X_out)),
    }

    def _rmse(y_true, y_pred):
        # Root of the mean squared error.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    scorers = (("r2", r2_score), ("rmse", _rmse), ("mae", mean_absolute_error))

    # Metric-major ordering: every r2 entry first, then rmse, then mae.
    metrics = {
        f"{split}_{tag}": scorer(y_true, y_pred)
        for tag, scorer in scorers
        for split, (y_true, y_pred) in splits.items()
    }

    # One frame per split, pairing observed values with predictions.
    train_results, test_results, out_results = (
        pd.DataFrame({f"y_{split}": y_true, f"y_{split}_pred": y_pred})
        for split, (y_true, y_pred) in splits.items()
    )

    return model, metrics, train_results, test_results, out_results

In [7]:
# Train and evaluate model
# Fit the D-target model with its tuned hyperparameters, then display its metrics.
model_D, metrics_D, train_results_D, test_results_D, out_results_D = train_and_evaluate(
    X_train=X_train,
    y_train=y_train_D,
    X_test=X_test,
    y_test=y_test_D,
    X_out=X_out,
    y_out=y_out_D,
    best_params=best_params_D,
)
metrics_D

{'train_r2': 0.9467874124115552,
 'test_r2': 0.8222192423963888,
 'out_r2': 0.708915731142649,
 'train_rmse': 0.42896082963645205,
 'test_rmse': 0.7538845836997027,
 'out_rmse': 0.7614702373567731,
 'train_mae': 0.3122890685538545,
 'test_mae': 0.5105952032617074,
 'out_mae': 0.6083745044896239}

In [12]:
# Train and evaluate model
# Fit the P-target model with its tuned hyperparameters, then display its metrics.
model_P, metrics_P, train_results_P, test_results_P, out_results_P = train_and_evaluate(
    X_train=X_train,
    y_train=y_train_P,
    X_test=X_test,
    y_test=y_test_P,
    X_out=X_out,
    y_out=y_out_P,
    best_params=best_params_P,
)
metrics_P

{'train_r2': 0.8651362294910319,
 'test_r2': 0.6098725043244676,
 'out_r2': 0.6886733764731174,
 'train_rmse': 1.592347375042693,
 'test_rmse': 2.5941217680159094,
 'out_rmse': 2.934294522918221,
 'train_mae': 1.1652350320048768,
 'test_mae': 1.8102843309016374,
 'out_mae': 1.8231058106891647}

In [9]:
# Train and evaluate model
# Fit the H-target model with its tuned hyperparameters, then display its metrics.
model_H, metrics_H, train_results_H, test_results_H, out_results_H = train_and_evaluate(
    X_train=X_train,
    y_train=y_train_H,
    X_test=X_test,
    y_test=y_test_H,
    X_out=X_out,
    y_out=y_out_H,
    best_params=best_params_H,
)
metrics_H

{'train_r2': 0.967374687669358,
 'test_r2': 0.8279945537432964,
 'out_r2': 0.8325889942426994,
 'train_rmse': 1.0517226103339663,
 'test_rmse': 1.8458826666086563,
 'out_rmse': 3.0145397709157855,
 'train_mae': 0.6779456707668011,
 'test_mae': 1.186951662224443,
 'out_mae': 1.9305252618428022}

In [None]:
from joblib import dump

# Persist the three fitted models with joblib into the sibling ../SHAP folder
# (presumably for a later SHAP analysis -- confirm against that notebook).
# The final call is the cell's last expression, so its return value is what
# the cell displays.
dump(model_D, "../SHAP/model_D.joblib")
dump(model_P, "../SHAP/model_P.joblib")
dump(model_H, "../SHAP/model_H.joblib")

['../SHAP/model_P.joblib']

In [12]:
# Batch-save the metrics to one Excel workbook, one sheet per target.
output_path = "./results/pubchem/metrics_summary.xlsx"
metrics_dict = {
    "Metrics_D": metrics_D,
    "Metrics_P": metrics_P,
    "Metrics_H": metrics_H,
}

with pd.ExcelWriter(output_path) as writer:
    for sheet_name, metrics in metrics_dict.items():
        # Wrap the metrics dict in a list so it becomes a single-row table.
        single_row = pd.DataFrame([metrics])
        single_row.to_excel(writer, sheet_name=sheet_name, index=False)

In [13]:
# Save the training-set observations and predictions, one sheet per target.
output_path = "./results/pubchem/train_results_summary.xlsx"
train_sheets = (
    ("Train_Results_D", train_results_D),
    ("Train_Results_P", train_results_P),
    ("Train_Results_H", train_results_H),
)

with pd.ExcelWriter(output_path) as writer:
    for sheet_label, results_frame in train_sheets:
        results_frame.to_excel(writer, sheet_name=sheet_label, index=False)

In [14]:
# Save the test-set observations and predictions, one sheet per target.
output_path = "./results/pubchem/test_results_summary.xlsx"
test_sheets = (
    ("Test_Results_D", test_results_D),
    ("Test_Results_P", test_results_P),
    ("Test_Results_H", test_results_H),
)

with pd.ExcelWriter(output_path) as writer:
    for sheet_label, results_frame in test_sheets:
        results_frame.to_excel(writer, sheet_name=sheet_label, index=False)

In [15]:
# Save the "out" set observations and predictions, one sheet per target.
output_path = "./results/pubchem/out_results_summary.xlsx"
out_sheets = (
    ("out_Results_D", out_results_D),
    ("out_Results_P", out_results_P),
    ("out_Results_H", out_results_H),
)

with pd.ExcelWriter(output_path) as writer:
    for sheet_label, results_frame in out_sheets:
        results_frame.to_excel(writer, sheet_name=sheet_label, index=False)