# Model training tests

In [1]:
## Import packages
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from flaml import AutoML
from sklearn.metrics import auc, roc_curve, classification_report


# Locations used throughout the notebook (relative to the notebook's working directory)
DATAPATH = "../data"  # input CSVs, descriptor tables and the train/test split files
FIGUREPATH = "../figures"  # ROC figures are written here
MODELPATH = "../../checkpoints/"  # destination of the saved final model

  from .autonotebook import tqdm as notebook_tqdm
2024-02-06 12:53:05,738	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-02-06 12:53:05,803	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## PADEL Descriptors
The authors of the publication originally tried PaDEL descriptors to train the model. Because PaDEL descriptors take a long time to calculate, we compute them only once for the whole file and save the results.

In [None]:
from padelpy import from_smiles

# Compute PaDEL descriptors molecule by molecule so that a single failing
# SMILES does not abort the whole (slow) calculation.
df = pd.read_csv(os.path.join(DATAPATH, "training_set.csv"))
unprocessed_idx = []  # positions of the rows whose descriptors could not be computed
descs = []            # descriptor records of the rows that succeeded
for i, smi in enumerate(df["smiles"].tolist()):
    try:
        descriptors = from_smiles([smi])
    except Exception:
        # `Exception` (not a bare except) so that Ctrl-C / kernel interrupts
        # still stop this long-running loop.
        print(f"Error processing SMILES: {smi}")
        unprocessed_idx.append(i)
    else:
        descs.extend(descriptors)

descs_df = pd.DataFrame(descs)

# Drop the failed rows, then put molecules and descriptors side by side.
df_ = df.drop(unprocessed_idx)
# The concat below is purely positional, so both tables must have the same length.
assert len(df_) == len(descs_df), "descriptor rows do not line up with the remaining molecules"
df_padel = pd.concat([df_.reset_index(drop=True), descs_df.reset_index(drop=True)], axis=1)


# Save the updated DataFrame to a CSV file
df_padel.to_csv(os.path.join(DATAPATH, "training_set_padel.csv"), index=False)

In [None]:
# Number of rows in the molecule + descriptor table built in the previous cell
len(df_padel)

In [None]:
# Keep only the structure, identifier and label columns of the descriptor
# table and store them as the cleaned training set.
padel_csv = os.path.join(DATAPATH, "training_set_padel.csv")
clean_csv = os.path.join(DATAPATH, "training_set_clean.csv")
df = pd.read_csv(padel_csv).loc[:, ["smiles", "inchikey", "outcome"]]
df.to_csv(clean_csv, index=False)

In [None]:
# TRAIN TEST SPLITS

# we will always use the same train test split, saving the files to reuse them
# ONLY RUN ONCE
from sklearn.model_selection import train_test_split

def random_split(df, size, random_state=None):
    """Split `df` into a train and a test frame, stratified on the "outcome" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least an "outcome" column.
    size : float or int
        Passed to `train_test_split` as `test_size`.
    random_state : int or None, optional
        Seed for the shuffling. The default (None) keeps the previous,
        non-reproducible behaviour.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    train, test = train_test_split(
        df, test_size=size, stratify=df["outcome"], random_state=random_state
    )
    return train, test

df = pd.read_csv(os.path.join(DATAPATH, "training_set_clean.csv"))
splits_dir = os.path.join(DATAPATH, "train_test_splits")
os.makedirs(splits_dir, exist_ok=True)  # to_csv does not create missing folders
for i in range(5):
    # The fold number doubles as the seed, so each split can be regenerated exactly.
    train, test = random_split(df, 0.2, random_state=i)
    train.to_csv(os.path.join(splits_dir, "train_{}.csv".format(i)), index=False)
    test.to_csv(os.path.join(splits_dir, "test_{}.csv".format(i)), index=False)

## AutoML
We use FLAML's AutoML directly for the Padel descriptors, and LazyQSAR for the Ersilia Embeddings and the Morgan fingerprints

### Padel descriptors

In [None]:
# Model training with PADEL Descriptors
%%capture

import logging
logging.getLogger('flaml.automl').setLevel(logging.WARNING) # Suppress FLAML INFO logging

from sklearn.metrics import roc_curve, auc, classification_report

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
all_true_labels = []
all_predictions = []

df = pd.read_csv(os.path.join(DATAPATH, "training_set_padel.csv"))
#impute any missing values
for column in df.columns[3:]:
    column_mean = df[column].mean()
    df[column].fillna(column_mean, inplace=True)

#change infinity values
df.replace([np.inf, -np.inf], 1e6, inplace=True)

for i in range(5):
    train_file = f"train_{i}.csv"
    test_file = f"test_{i}.csv"
    train = pd.read_csv(os.path.join(DATAPATH, "train_test_splits", train_file))
    test = pd.read_csv(os.path.join(DATAPATH, "train_test_splits",test_file))
    train_descs = pd.merge(train, df, on=['inchikey', "smiles", "outcome"], how='left')
    test_descs = pd.merge(test, df, on =['inchikey', "smiles", "outcome"], how = "left")
    y_train = train["outcome"]
    y_test = test["outcome"]
    X_train = train_descs.iloc[:, 4:]
    X_test = test_descs.iloc[:, 4:]

    mdl = AutoML(task="classification", time_budget=600, logistic_max_iter=40000)
    mdl.fit(X_train, y_train)
    y_pred = mdl.predict_proba(X_test)[:,1]
    y_pred_bin = [1 if y > 0.5 else 0 for y in y_pred]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

    # Accumulate true labels and predictions
    all_true_labels.extend(y_test)
    all_predictions.extend(y_pred_bin)

# Calculate and print the average classification report
print("\nAverage Classification Report Across All Folds:\n")
print(classification_report(all_true_labels, all_predictions))

# Calculate mean ROC curve and AUC
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

# Plot mean ROC curve with boundaries
plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Cross-Validation ROC of AutoML', fontsize=18)
plt.legend(loc="lower right", prop={'size': 15})
plt.savefig(os.path.join(FIGUREPATH, "Padel_AutoML_600s.png"))

### Ersilia Embeddings

In [None]:
%%capture
import lazyqsar as lq

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
all_true_labels = []
all_predictions = []

for i in range(5):
    train_file = f"train_{i}.csv"
    test_file = f"test_{i}.csv"
    train_set = pd.read_csv(os.path.join(DATAPATH, "train_test_splits", train_file))
    test_set = pd.read_csv(os.path.join(DATAPATH, "train_test_splits",test_file))
    X_train = train_set["smiles"]
    y_train = train_set["outcome"]
    X_test = test_set["smiles"]
    y_test = test_set["outcome"]


    # Fit the model on the training set for the current fold
    model = lq.ErsiliaBinaryClassifier(time_budget_sec=600, estimator_list=["rf", "lgbm", "xgboost"])
    model.fit(X_train, y_train)

    # Obtain predictions and true labels for the current fold
    y_hat_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_hat_proba)
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

    # Accumulate true labels and predictions
    all_true_labels.extend(y_test)
    all_predictions.extend(y_pred)

# Calculate and print the average classification report
print("\nAverage Classification Report Across All Folds:\n")
print(classification_report(all_true_labels, all_predictions))

# Calculate mean ROC curve and AUC
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

# Plot mean ROC curve with boundaries
plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Cross-Validation ROC of AutoML Morgan', fontsize=18)
plt.legend(loc="lower right", prop={'size': 15})
plt.savefig(os.path.join(FIGUREPATH, "Ersilia_AutoML_600s.png"), dpi=300)

### Morgan Fingerprints

In [None]:
%%capture

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
all_true_labels = []
all_predictions = []

for i in range(5):
    train_file = f"train_{i}.csv"
    test_file = f"test_{i}.csv"
    train_set = pd.read_csv(os.path.join(DATAPATH, "train_test_splits", train_file))
    test_set = pd.read_csv(os.path.join(DATAPATH, "train_test_splits",test_file))
    X_train = train_set["smiles"]
    y_train = train_set["outcome"]
    X_test = test_set["smiles"]
    y_test = test_set["outcome"]


    # Fit the model on the training set for the current fold
    model = lq.MorganBinaryClassifier(time_budget_sec=600, estimator_list=["rf", "lgbm", "xgboost"])
    model.fit(X_train, y_train)

    # Obtain predictions and true labels for the current fold
    y_hat_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_hat_proba)
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

    # Accumulate true labels and predictions
    all_true_labels.extend(y_test)
    all_predictions.extend(y_pred)

# Calculate and print the average classification report
print("\nAverage Classification Report Across All Folds:\n")
print(classification_report(all_true_labels, all_predictions))

# Calculate mean ROC curve and AUC
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

# Plot mean ROC curve with boundaries
plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Cross-Validation ROC of AutoML Morgan', fontsize=18)
plt.legend(loc="lower right", prop={'size': 15})
plt.savefig(os.path.join(FIGUREPATH, "Morgan_AutoML_600s.png"), dpi=300)

Of the AutoML models, the one trained on Padel descriptors performs best, followed by the one trained on Ersilia Embeddings. Padel descriptors are slower to generate, so we might want to use Ersilia Embeddings instead.

## AutoGluon
We will try the AutoGluon method with the three descriptors used before

In [None]:
from autogluon.tabular import TabularPredictor

In [None]:
%%capture

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
all_true_labels = []
all_predictions = []

df = pd.read_csv(os.path.join(DATAPATH, "training_set_padel.csv"))
#impute any missing values
for column in df.columns[3:]:
    column_mean = df[column].mean()
    df[column].fillna(column_mean, inplace=True)

#change infinity values
df.replace([np.inf, -np.inf], 1e6, inplace=True)

for i in range(5):
    train_file = f"train_{i}.csv"
    test_file = f"test_{i}.csv"
    train = pd.read_csv(os.path.join(DATAPATH, "train_test_splits", train_file))
    test = pd.read_csv(os.path.join(DATAPATH, "train_test_splits",test_file))
    train_descs = pd.merge(train, df, on=['inchikey', "smiles", "outcome"], how='left')
    test_descs = pd.merge(test, df, on =['inchikey', "smiles", "outcome"], how = "left")
    y_train = train["outcome"]
    y_test = test["outcome"]
    X_train = train_descs.iloc[:, 4:]
    X_test = test_descs.iloc[:, 4:]
    X_train["outcome"] = y_train #add outcome again for the Tabular Predictor requirement

    fit_args = {}
    fit_args['time_limit'] =  600
    predictor = TabularPredictor(label="outcome").fit(X_train,presets="best_quality", **fit_args)
    y_pred = predictor.predict_proba(X_test, as_pandas=False)[:,1]
    y_pred_bin = [1 if y > 0.5 else 0 for y in y_pred]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

    # Accumulate true labels and predictions
    all_true_labels.extend(y_test)
    all_predictions.extend(y_pred_bin)

# Calculate and print the average classification report
print("\nAverage Classification Report Across All Folds:\n")
print(classification_report(all_true_labels, all_predictions))

# Calculate mean ROC curve and AUC
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

# Plot mean ROC curve with boundaries
plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Cross-Validation ROC of AutoGluon', fontsize=18)
plt.legend(loc="lower right", prop={'size': 15})
plt.savefig(os.path.join(FIGUREPATH, "Padel_AutoGluon_600s.png"))

In [None]:
%%capture
#Ersilia embeddings
from eosce.models import ErsiliaCompoundEmbeddings

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
all_true_labels = []
all_predictions = []


for i in range(5):
    train_file = f"train_{i}.csv"
    test_file = f"test_{i}.csv"
    train = pd.read_csv(os.path.join(DATAPATH, "train_test_splits", train_file))
    test = pd.read_csv(os.path.join(DATAPATH, "train_test_splits",test_file))
    y_train = train["outcome"]
    y_test = test["outcome"]
    model = ErsiliaCompoundEmbeddings()
    X_train = model.transform(train["smiles"].tolist())
    X_train = pd.DataFrame(X_train, columns=["eosce_{}".format(i) for i in range(len(X_train[0]))])
    X_test = model.transform(test["smiles"].tolist())
    X_test = pd.DataFrame(X_test, columns=["eosce_{}".format(i) for i in range(len(X_test[0]))])
    X_train["outcome"] = y_train #add outcome again for the Tabular Predictor requirement

    fit_args = {}
    fit_args['time_limit'] =  600
    predictor = TabularPredictor(label="outcome").fit(X_train,presets="best_quality", **fit_args)
    y_pred = predictor.predict_proba(X_test, as_pandas=False)[:,1]
    y_pred_bin = [1 if y > 0.5 else 0 for y in y_pred]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

    # Accumulate true labels and predictions
    all_true_labels.extend(y_test)
    all_predictions.extend(y_pred_bin)

# Calculate and print the average classification report
print("\nAverage Classification Report Across All Folds:\n")
print(classification_report(all_true_labels, all_predictions))

# Calculate mean ROC curve and AUC
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

# Plot mean ROC curve with boundaries
plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Cross-Validation ROC of AutoGluon', fontsize=18)
plt.legend(loc="lower right", prop={'size': 15})
plt.savefig(os.path.join(FIGUREPATH, "Ersilia_AutoGluon_600s.png"))

In [None]:
%%capture
# Morgan embeddings

from lazyqsar.descriptors.descriptors import MorganDescriptor

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
all_true_labels = []
all_predictions = []


for i in range(5):
    train_file = f"train_{i}.csv"
    test_file = f"test_{i}.csv"
    train = pd.read_csv(os.path.join(DATAPATH, "train_test_splits", train_file))
    test = pd.read_csv(os.path.join(DATAPATH, "train_test_splits",test_file))
    y_train = train["outcome"]
    y_test = test["outcome"]
    model = MorganDescriptor()
    model.fit(train["smiles"].tolist())
    X_train = model.transform(train["smiles"].tolist())
    X_test = model.transform(test["smiles"].tolist())
    X_train["outcome"] = y_train #add outcome again for the Tabular Predictor requirement

    fit_args = {}
    fit_args['time_limit'] =  600
    predictor = TabularPredictor(label="outcome").fit(X_train,presets="best_quality", **fit_args)
    y_pred = predictor.predict_proba(X_test, as_pandas=False)[:,1]
    y_pred_bin = [1 if y > 0.5 else 0 for y in y_pred]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

    # Accumulate true labels and predictions
    all_true_labels.extend(y_test)
    all_predictions.extend(y_pred_bin)

# Calculate and print the average classification report
print("\nAverage Classification Report Across All Folds:\n")
print(classification_report(all_true_labels, all_predictions))

# Calculate mean ROC curve and AUC
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

# Plot mean ROC curve with boundaries
plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Cross-Validation ROC of AutoGluon', fontsize=18)
plt.legend(loc="lower right", prop={'size': 15})
plt.savefig(os.path.join(FIGUREPATH, "Morgan_AutoGluon_600s.png"))

# Final Models

In [2]:
# Tentatively train and save the final LazyQSAR model on the whole cleaned set
import lazyqsar as lq

train_set = pd.read_csv(os.path.join(DATAPATH, "training_set_clean.csv"))
X_train, y_train = train_set["smiles"], train_set["outcome"]

# Same classifier settings as in the cross-validation runs
model = lq.ErsiliaBinaryClassifier(
    time_budget_sec=600,
    estimator_list=["rf", "lgbm", "xgboost"],
)
model.fit(X_train, y_train)

# Store the fitted model in the checkpoints folder
model_file = os.path.join(MODELPATH, "ersilia_lq.joblib")
model.save(model_file)

[flaml.automl.logger: 02-06 12:53:14] {1679} INFO - task = classification
[flaml.automl.logger: 02-06 12:53:14] {1690} INFO - Evaluation method: cv
[flaml.automl.logger: 02-06 12:53:14] {1788} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 02-06 12:53:14] {1900} INFO - List of ML learners in AutoML Run: ['rf', 'lgbm', 'xgboost']
[flaml.automl.logger: 02-06 12:53:14] {2218} INFO - iteration 0, current learner rf
[flaml.automl.logger: 02-06 12:53:14] {2344} INFO - Estimated sufficient time budget=879s. Estimated necessary time budget=1s.
[flaml.automl.logger: 02-06 12:53:14] {2391} INFO -  at 0.1s,	estimator rf's best error=0.4151,	best estimator rf's best error=0.4151
[flaml.automl.logger: 02-06 12:53:14] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 02-06 12:53:15] {2391} INFO -  at 0.6s,	estimator lgbm's best error=0.3677,	best estimator lgbm's best error=0.3677
[flaml.automl.logger: 02-06 12:53:15] {2218} INFO - iteration 2, current learner xg

In [4]:
from eosce.models import ErsiliaCompoundEmbeddings
from autogluon.tabular import TabularPredictor

# Final AutoGluon model on Ersilia embeddings of the whole cleaned training set
train_set = pd.read_csv(os.path.join(DATAPATH, "training_set_clean.csv"))
y_train = train_set["outcome"]

# Embed every SMILES and wrap the result in named columns
model = ErsiliaCompoundEmbeddings()
embeddings = model.transform(train_set["smiles"].tolist())
X_train = pd.DataFrame(embeddings, columns=[f"eosce_{j}" for j in range(len(embeddings[0]))])
# TabularPredictor reads the label from a column of the training table
X_train["outcome"] = y_train

predictor = TabularPredictor(label="outcome").fit(X_train, presets="best_quality", time_limit=600)

No path specified. Models will be saved in: "AutogluonModels/ag-20240206_123841"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20240206_123841/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 150s
AutoGluon will save models to "AutogluonModels/ag-20240206_123841/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.11.5
Operating System:   Li

In [10]:
from autogluon.tabular import TabularPredictor

# Final AutoGluon model on the PaDEL descriptors of the whole training set
df = pd.read_csv(os.path.join(DATAPATH, "training_set_padel.csv"))
#impute any missing values with the column mean
# (assign the result instead of a chained in-place fillna, which newer pandas
# versions warn about and may not apply to `df`)
descriptor_columns = df.columns[3:]
df = df.fillna(df[descriptor_columns].mean())

#change infinity values
df = df.replace([np.inf, -np.inf], 1e6)

# Everything from the third column on is handed to AutoGluon; this must contain
# the "outcome" label (presumably the first two columns are smiles/inchikey -
# TODO confirm).
X_train = df.iloc[:, 2:]

fit_args = {}
fit_args['time_limit'] = 600
predictor = TabularPredictor(label="outcome").fit(X_train,presets="best_quality", **fit_args)

No path specified. Models will be saved in: "AutogluonModels/ag-20240206_133823"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20240206_133823/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 150s
AutoGluon will save models to "AutogluonModels/ag-20240206_133823/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.11.5
Operating System:   Li