In [1]:
import sys
sys.path.append("..")

import matplotlib.pyplot as plt
import seaborn as sns

from chemgnn.data import load_csv, split_train_test
from chemgnn.datasets.tabular import build_rf_dataset
from chemgnn.models.rf import build_rf
from chemgnn.train.rf_train import (
    cross_validate_rf,
    grid_search_rf,
    train_and_evaluate_rf
)

In [4]:
# 1. raw CSV load
df = load_csv("data_HoC_cyclic_augmented.csv")

# 2. descriptor 계산 + feature/target 분리 (⭐ 핵심 ⭐)
X, y = build_rf_dataset(
    df,
    smiles_col="SMILES",
    target_col="HoC",
)

# 3. split
X_train, X_test, y_train, y_test = split_train_test(
    X, y, test_size=0.2, random_state=41
)

In [5]:
rf = build_rf()
cv_scores = cross_validate_rf(rf, X_train, y_train)
print("CV R2:", cv_scores)

CV R2: [0.93367462 0.87763935 0.93504709 0.43032598 0.87795658]


In [6]:
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}

gs = grid_search_rf(build_rf(), param_grid, X_train, y_train)
print("Best params:", gs.best_params_)

Best params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [7]:
best_rf = build_rf(**gs.best_params_)

results = train_and_evaluate_rf(
    best_rf, X_train, y_train, X_test, y_test
)

print(results)

{'r2_train': 0.9829499776332257, 'r2_test': 0.9601429060610547, 'mae_train': 0.3356011090909067, 'mae_test': 1.0200480909090874, 'y_train_pred': array([-10.437972 ,  -6.6012268,  -5.1583804,  -4.6501458,  -9.183932 ,
       -12.5952682, -11.768516 , -10.214187 , -11.927263 ,  -6.8966144,
        -2.7903   , -11.992079 ,  -6.303328 , -12.476165 ,  -8.690887 ,
       -12.968652 ,  -6.2837638,  -9.393904 , -19.75067  ,  -3.023328 ,
        -9.718219 ,  -5.0352352, -10.401469 ,  -2.327711 ,  -3.7122176,
       -18.261908 ,  -7.44441  ,  -7.76517  ,  -4.136765 ,  -3.36304  ,
       -11.4423212,  -4.6203644,  -7.409067 ,  -6.4627336,  -5.4310102,
        -3.700536 ,  -5.511107 ,  -7.230198 ,  -5.0443066,  -2.1182226,
       -15.699749 , -19.331116 , -16.690147 ,  -3.9921274]), 'y_test_pred': array([ -5.2023634, -14.8603224, -16.818023 ,  -7.2205956, -17.449323 ,
        -4.9966348,  -2.4986984,  -2.468847 , -18.400213 , -12.8655684,
       -13.581064 ])}


In [8]:
from chemgnn.evaluate import regression_metrics

train_pred = results["y_train_pred"]
test_pred  = results["y_test_pred"]

metrics_train = regression_metrics(y_train, train_pred)
metrics_test  = regression_metrics(y_test, test_pred)

print(metrics_train)
print(metrics_test)


{'MAE': 0.3356011090909067, 'RMSE': 0.6327041653901451, 'R2': 0.9829499776332257}
{'MAE': 1.0200480909090874, 'RMSE': 1.3823311131735818, 'R2': 0.9601429060610547}
