In [1]:
import pandas as pd
import numpy as np
import os

# input and output paths
input_dir = "E:/K-MELLODDY-Project/data/ADMET_PK_Public_Dataset/08_02_TDC_morgan_avalon_erg_selfies_rdkit_maccs_tfidf_embed64_variance/DILI/"

# input_dir = r"E:\K-MELLODDY-Project\data\ADMET_PK_Public_Dataset\06_10_TDC_morgan_avalon_erg_rdkit_maccs_tfidf_variance\DILI"


def _read_split(split):
    """Read the two CSV files of one DILI split from ``input_dir``.

    Parameters
    ----------
    split : str
        Split tag used in the file names ("train", "valid" or "test").

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The reduced/scaled feature table and the raw table, in that order.
    """
    reduced = pd.read_csv(os.path.join(input_dir, f"DILI_{split}_fp_features_reduced_scaled.csv"))
    raw = pd.read_csv(os.path.join(input_dir, f"DILI_{split}_fp_features_raw.csv"))
    return reduced, raw


# Features come from the reduced/scaled file; the target is the "Label" column
# of the matching raw file (rows are assumed to be in the same order — TODO confirm).
X_train_reduced_scaled, for_y_train = _read_split("train")
y_train_fp = for_y_train["Label"]

X_val_reduced_scaled, for_y_val = _read_split("valid")
y_valid_fp = for_y_val["Label"]

X_test_reduced_scaled, for_y_test = _read_split("test")
y_test_fp = for_y_test["Label"]

In [2]:
# One line per split (train, validation, test):
# raw table shape, reduced feature matrix shape, label vector shape.
for _raw, _feats, _labels in (
    (for_y_train, X_train_reduced_scaled, y_train_fp),
    (for_y_val, X_val_reduced_scaled, y_valid_fp),
    (for_y_test, X_test_reduced_scaled, y_test_fp),
):
    print(_raw.shape, _feats.shape, _labels.shape)

(332, 2972) (332, 64) (332,)
(48, 2972) (48, 64) (48,)
(95, 2972) (95, 64) (95,)


In [3]:
# No additional cleaning happens here: the "*_clean" names are plain aliases
# (same objects, no copy) of the loaded features and labels.
X_train_fp_clean, X_valid_fp_clean, X_test_fp_clean = (
    X_train_reduced_scaled,
    X_val_reduced_scaled,
    X_test_reduced_scaled,
)

y_train_fp_clean, y_valid_fp_clean, y_test_fp_clean = (
    y_train_fp,
    y_valid_fp,
    y_test_fp,
)

### Stacking Model

In [4]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, StackingClassifier
from sklearn.svm import NuSVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_classification # For a sample dataset
import os
import joblib

# Initialize Base Models
# Four tree-based level-0 learners; every one is seeded with random_state=42.
base_models = [
    ('et', ExtraTreesClassifier(random_state=42, n_jobs=-1)),   # CPU
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1)), # CPU
    ('lgbm', LGBMClassifier(
        device='cpu',
        # The two GPU ids below are kept for an easy switch to device='gpu';
        # per the LightGBM docs they are only consulted for GPU training.
        gpu_platform_id=0,      # Replace with actual platform ID
        gpu_device_id=0,        # Must point at the intended GPU when device='gpu'
        random_state=42,
        force_col_wise=True,
        verbose=-1
    )),
    ('xgb', XGBClassifier(
        tree_method='hist',
        device='cuda',          # XGBoost >= 2.0 device selector (supersedes gpu_id / predictor)
        # `predictor='auto'` was dropped: the parameter no longer exists in the
        # XGBoost releases that accept `device`, so it was an unused argument
        # that only produced a "Parameters ... are not used" warning.
        # gpu_id=0,
        eval_metric='logloss',
        random_state=42
    ))
]

# Initialize Meta-Learner (Final Estimator)
meta_learner = LogisticRegression(solver='liblinear', random_state=42)

# Create and Train the StackingClassifier
stacking_ensemble = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=10, # Number of cross-validation folds for training the meta-learner
    stack_method='auto', # 'auto' chooses 'predict_proba' if available, else 'predict'
    n_jobs=-1 # Use all available cores for parallel processing
)

# Fit the model on the whole training set
stacking_ensemble.fit(X_train_fp_clean, y_train_fp_clean)

0,1,2
,estimators,"[('et', ...), ('rf', ...), ...]"
,final_estimator,LogisticRegre...r='liblinear')
,cv,10
,stack_method,'auto'
,n_jobs,-1
,passthrough,False
,verbose,0

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,'cuda'
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,100


In [5]:
import os

# Result folders are created inside the input directory.
# input_dir already ends with a separator, so strip it before appending;
# otherwise the joined paths contain a doubled "//".
# NOTE: the folder name "DILI_Outpus_reduced_scaled" (sic) is kept unchanged so
# that files written by earlier runs stay at the same location.
_base_dir = input_dir.rstrip("/\\")
save_path_output = _base_dir + "/DILI_Outpus_reduced_scaled/"
save_path_plot = _base_dir + "/DILI_Plots_reduced_scaled/"
os.makedirs(save_path_output, exist_ok=True)
os.makedirs(save_path_plot, exist_ok=True)
save_path_output, save_path_plot

('E:/K-MELLODDY-Project/data/ADMET_PK_Public_Dataset/08_02_TDC_morgan_avalon_erg_selfies_rdkit_maccs_tfidf_embed64_variance/DILI//DILI_Outpus_reduced_scaled/',
 'E:/K-MELLODDY-Project/data/ADMET_PK_Public_Dataset/08_02_TDC_morgan_avalon_erg_selfies_rdkit_maccs_tfidf_embed64_variance/DILI//DILI_Plots_reduced_scaled/')

In [6]:
import joblib

# Write the fitted ensemble to disk, then read it straight back so the
# evaluation cells below work on the persisted artefact.
_stacking_model_file = save_path_output + 'DILI_stacking_model.pkl'
joblib.dump(stacking_ensemble, _stacking_model_file)
loaded_model = joblib.load(_stacking_model_file)

### Cross-Validation on Training Set

In [7]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import cross_validate
# from sklearn.metrics import (
#     make_scorer, accuracy_score, precision_score, recall_score,
#     f1_score, roc_auc_score, average_precision_score
# )

# # Define scoring metrics
# scoring = {
#     'accuracy': 'accuracy',
#     'precision': make_scorer(precision_score, average='weighted', zero_division=0),
#     'recall': make_scorer(recall_score, average='weighted', zero_division=0),
#     'f1': make_scorer(f1_score, average='weighted', zero_division=0),
#     'AUROC': make_scorer(roc_auc_score, needs_proba=True),
#     'AUPRC': make_scorer(average_precision_score, needs_proba=True)
# }

# # Perform cross-validation
# cv_results = cross_validate(
#     estimator=loaded_model,
#     X=X_train_fp_clean,
#     y=y_train_fp_clean,
#     scoring=scoring,
#     cv=5,
#     return_train_score=False,
#     n_jobs=-1
# )

# # Prepare summary with mean ± std
# metrics_summary = {
#     'Metric': [],
#     'Mean ± Std': []
# }

# for metric in scoring.keys():
#     values = cv_results[f'test_{metric}']
#     mean_val = np.mean(values)
#     std_val = np.std(values)
#     metrics_summary['Metric'].append(metric.upper())
#     metrics_summary['Mean ± Std'].append(f"{mean_val:.4f} ± {std_val:.4f}")

# # Create and save DataFrame
# metrics_df_sf = pd.DataFrame(metrics_summary)
# # print(metrics_df_sf.to_string(index=False))

# import joblib

# # Save the model to a file

# model_path = save_path_output + "DILI_stacking_ensemble_model.pkl"
# joblib.dump(stacking_ensemble, model_path)
# print(f"Model saved to: {model_path}")

# # Prediction Probabilities and Classes
# proba_train = stacking_ensemble.predict_proba(X_train_fp_clean)[:, 1]  # probability of class 1
# pred_class = stacking_ensemble.predict(X_train_fp_clean)

# # Build DataFrame
# ids = pd.Series(np.arange(len(proba_train)), name="SampleID")  # or SMILES if available
# proba_df = pd.DataFrame({
#     "ID": ids,
#     "True_Label": y_train_fp_clean.reset_index(drop=True),
#     "Predicted_Class": pred_class,
#     "Predicted_Prob": proba_train
# })

# # Save probabilities
# proba_file = os.path.join(save_path_output, "DILI_train_predicted_probabilities.csv")
# proba_df.to_csv(proba_file, index=False)
# print(f"Predictions saved to: {proba_file}")

# # Save metrics
# metrics_df_sf.to_csv(save_path_output + "DILI_train_cv_metrics_summary.csv", index=False)
# metrics_df_sf


### Evaluate on Validation and Test Sets

In [8]:
import numpy as np
import pandas as pd
import os
import joblib
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)

# Define Evaluation Function
def evaluate_metrics(model, X, y, name="Set"):
    """Score a fitted classifier on one data set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X : feature matrix passed unchanged to the model.
    y : true labels.
    name : str
        Value stored under the "Set" key of the returned metrics.

    Returns
    -------
    tuple
        ``(metrics, predictions, probabilities)``. ``metrics`` is a dict with
        accuracy and weighted precision / recall / F1; "AUROC" and "AUPRC" are
        added only when ``y`` holds exactly two distinct values.
        ``probabilities`` is column 1 of ``predict_proba`` in that case and
        ``None`` otherwise.
    """
    predicted = model.predict(X)

    # Probability-based metrics are only computed for a two-class target.
    has_two_classes = len(np.unique(y)) == 2
    positive_scores = model.predict_proba(X)[:, 1] if has_two_classes else None

    weighted = dict(average='weighted', zero_division=0)
    summary = {
        "Set": name,
        "Accuracy": accuracy_score(y, predicted),
        "Precision": precision_score(y, predicted, **weighted),
        "Recall": recall_score(y, predicted, **weighted),
        "F1 Score": f1_score(y, predicted, **weighted),
    }

    if positive_scores is not None:
        summary["AUROC"] = roc_auc_score(y, positive_scores)
        summary["AUPRC"] = average_precision_score(y, positive_scores)

    # Predictions are handed back as well so callers can save them
    return summary, predicted, positive_scores

# Score the reloaded ensemble on the validation and test splits
val_results, y_val_pred, y_val_proba = evaluate_metrics(
    loaded_model, X_valid_fp_clean, y_valid_fp_clean, name="Validation"
)
test_results, y_test_pred, y_test_proba = evaluate_metrics(
    loaded_model, X_test_fp_clean, y_test_fp_clean, name="Test"
)

# Metrics table: one row per split, echoed and written to disk
metrics_eval_df = pd.DataFrame([val_results, test_results])
print(metrics_eval_df.to_string(index=False))
metrics_eval_df.to_csv(save_path_output + "DILI_validation_test_metrics_sf_reduced_scaled.csv", index=False)


def _actual_vs_predicted(split_name, actual, predicted, predicted_prob):
    """Build the per-sample table (Set, Actual, Predicted[, Predicted_Prob]) for one split.

    The probability column is added only when ``predicted_prob`` is not None.
    """
    frame = pd.DataFrame({
        'Set': split_name,
        'Actual': actual,
        'Predicted': predicted
    })
    if predicted_prob is not None:
        frame['Predicted_Prob'] = predicted_prob
    return frame


# Per-sample actual vs predicted values for both splits
val_df = _actual_vs_predicted('Validation', y_valid_fp_clean, y_val_pred, y_val_proba)
test_df = _actual_vs_predicted('Test', y_test_fp_clean, y_test_pred, y_test_proba)

# Stack validation on top of test and save
results_df = pd.concat([val_df, test_df], ignore_index=True)
results_df.to_csv(save_path_output + "DILI_actual_vs_predicted_sf_reduced_scaled.csv", index=False)

results_df

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


       Set  Accuracy  Precision   Recall  F1 Score    AUROC    AUPRC
Validation  0.875000   0.875000 0.875000  0.875000 0.924162 0.934341
      Test  0.757895   0.757955 0.757895  0.757841 0.855496 0.872648


Unnamed: 0,Set,Actual,Predicted,Predicted_Prob
0,Validation,0.0,0.0,0.090919
1,Validation,0.0,0.0,0.396511
2,Validation,1.0,1.0,0.859162
3,Validation,0.0,0.0,0.194967
4,Validation,0.0,0.0,0.108578
...,...,...,...,...
138,Test,1.0,1.0,0.898738
139,Test,0.0,0.0,0.096763
140,Test,1.0,1.0,0.914967
141,Test,1.0,1.0,0.867844


### Fold-wise Test Evaluation with Standard Deviation

In [9]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd

def evaluate_with_std(model, X, y, n_splits=5, random_state=42, preds_path=None):
    """Estimate the spread of the metrics by scoring ``model`` on stratified subsets.

    ``(X, y)`` is partitioned with a shuffled ``StratifiedKFold`` and the
    already-fitted ``model`` is scored on the held-out part of every split.
    The model is never refit here; the training indices of each split are ignored.

    Parameters
    ----------
    model : fitted estimator accepted by ``evaluate_metrics``.
    X : feature matrix (anything ``pd.DataFrame`` accepts).
    y : labels (anything ``pd.Series`` accepts).
    n_splits : int, default 5
        Number of stratified subsets.
    random_state : int, default 42
        Seed used for shuffling the splitter.
    preds_path : str or None, default None
        CSV file receiving the per-sample predictions of every fold. ``None``
        keeps the original location,
        ``save_path_output + "DILI_test_fold_predictions.csv"``.

    Returns
    -------
    pandas.DataFrame
        Columns "Metric", "Mean" and "Std" (pandas' default ``std``) computed
        over the per-fold metric values.
    """
    if preds_path is None:
        # Resolved at call time so the notebook-level output folder is honoured.
        preds_path = save_path_output + "DILI_test_fold_predictions.csv"

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    metrics_list = []
    all_preds = []  # To store actual and predicted values

    X = pd.DataFrame(X)
    y = pd.Series(y).reset_index(drop=True)

    # Only the held-out indices are needed because nothing is trained here.
    for fold, (_, test_idx) in enumerate(skf.split(X, y), start=1):
        X_split, y_split = X.iloc[test_idx], y.iloc[test_idx]

        # Evaluate metrics and get predictions
        metrics, y_pred, y_proba = evaluate_metrics(model, X_split, y_split, name=f"Fold-{fold}")
        metrics_list.append(metrics)

        fold_df = pd.DataFrame({
            'Fold': fold,
            'Actual': y_split.values,
            'Predicted': y_pred
        })

        # evaluate_metrics returns no probabilities when the subset is not two-class
        if y_proba is not None:
            fold_df['Predicted_Prob'] = y_proba

        all_preds.append(fold_df)

    # Combine prediction results
    preds_df = pd.concat(all_preds, ignore_index=True)
    preds_df.to_csv(preds_path, index=False)

    # Convert to DataFrame for metrics
    df_metrics = pd.DataFrame(metrics_list)

    # Calculate mean and std (the non-numeric "Set" column is skipped)
    mean_metrics = df_metrics.mean(numeric_only=True)
    std_metrics = df_metrics.std(numeric_only=True)

    summary_df = pd.DataFrame({
        "Metric": mean_metrics.index,
        "Mean": mean_metrics.values,
        "Std": std_metrics.values
    })

    return summary_df

# Score the reloaded model on stratified subsets of the test split and
# store the resulting mean/std table next to the other outputs.
test_metrics_summary = evaluate_with_std(
    model=loaded_model,
    X=X_test_fp_clean,
    y=y_test_fp_clean,
)
test_metrics_summary.to_csv(
    save_path_output + "DILI_test_metrics_with_std.csv",
    index=False,
)

test_metrics_summary

Unnamed: 0,Metric,Mean,Std
0,Accuracy,0.757895,0.142203
1,Precision,0.772158,0.142718
2,Recall,0.757895,0.142203
3,F1 Score,0.753879,0.145315
4,AUROC,0.857778,0.15484
5,AUPRC,0.895161,0.104674


In [10]:
import pandas as pd

# Reload the per-fold predictions written by evaluate_with_std
preds_df = pd.read_csv(save_path_output + "DILI_test_fold_predictions.csv")

# One entry per fold: "Fold_<k>" -> that fold's predicted probabilities,
# re-indexed from 0 so the folds line up row by row as columns.
fold_columns = {
    f"Fold_{fold_id}": preds_df.loc[preds_df['Fold'] == fold_id, 'Predicted_Prob'].reset_index(drop=True)
    for fold_id in sorted(preds_df['Fold'].unique())
}

# Wide table with one column per fold
folds_df = pd.DataFrame(fold_columns)

folds_df.to_csv(save_path_output + "DILI_fold_predicted_probs_by_column.csv", index=False)

folds_df

Unnamed: 0,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5
0,0.082657,0.874904,0.530984,0.336162,0.894217
1,0.100612,0.282728,0.869623,0.257759,0.195925
2,0.78923,0.579179,0.63923,0.10418,0.779671
3,0.364254,0.89595,0.088927,0.892296,0.091067
4,0.890119,0.613469,0.783775,0.254927,0.63417
5,0.261641,0.142825,0.892113,0.895675,0.648458
6,0.174755,0.560276,0.094004,0.901882,0.892878
7,0.312019,0.755201,0.593266,0.086271,0.848215
8,0.907468,0.127303,0.839176,0.24257,0.163366
9,0.890251,0.280179,0.884362,0.499961,0.347105


### Retrieve the Default Parameters

In [12]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, StackingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# Instantiate the models with the same explicit arguments used for the stacking
# ensemble; every parameter not listed here keeps its library default, so
# get_params() below shows the full effective configuration.
et_default = ExtraTreesClassifier(random_state=42, n_jobs=-1)
rf_default = RandomForestClassifier(random_state=42, n_jobs=-1)
lgbm_default = LGBMClassifier(
    device='cpu',
    # Per the LightGBM docs the GPU ids are only consulted for GPU training.
    gpu_platform_id=0,      # Replace with actual platform ID
    gpu_device_id=0,        # Must point at the intended GPU when device='gpu'
    random_state=42,
    force_col_wise=True,
    verbose=-1
)
xgb_default = XGBClassifier(
    tree_method='hist',
    device='cuda',
    # `predictor='auto'` was dropped: the parameter no longer exists in the
    # XGBoost releases that accept `device`, so it was an unused argument.
    # gpu_id=0,
    eval_metric='logloss',
    random_state=42
)
logreg_default = LogisticRegression(solver='liblinear', random_state=42)

# # Instantiate models with defaults
# et_default = ExtraTreesClassifier()
# rf_default = RandomForestClassifier()
# lgbm_default = LGBMClassifier()
# xgb_default = XGBClassifier()
# logreg_default = LogisticRegression()

# Print the full parameter set of every model
print("ExtraTreesClassifier defaults:\n", et_default.get_params(), "\n")
print("RandomForestClassifier defaults:\n", rf_default.get_params(), "\n")
print("LightGBMClassifier defaults:\n", lgbm_default.get_params(), "\n")
print("XGBClassifier defaults:\n", xgb_default.get_params(), "\n")
print("LogisticRegression defaults:\n", logreg_default.get_params(), "\n")


ExtraTreesClassifier defaults:
 {'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False} 

RandomForestClassifier defaults:
 {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False} 

LightGBMClassifier defaults:
 {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'i

In [13]:
import pprint
# Pretty-print the meta-learner's parameter dict, one key per line.
pprint.pprint(logreg_default.get_params())


{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


In [None]:
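# Recorded result of an ExtraTrees hyper-parameter search (kept as a note;
# written as comments so the cell stays valid Python when the notebook is run):
# Best ExtraTrees params: {'n_estimators': 359, 'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 3, 'bootstrap': True}
# Best ExtraTrees ROC-AUC: 0.703451233718098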