In [None]:
import os
import glob
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
from lightning import pytorch as pl
from chemprop import data, featurizers, models, nn

def find_dataset_path(dataset_name):
    """Locate ``<dataset_name>.csv`` under the parent of the working directory.

    The search is recursive and starts one level above ``os.getcwd()``.

    Args:
        dataset_name: File name of the dataset without the ``.csv`` extension.
            It is matched literally; glob metacharacters are not expanded.

    Returns:
        The path of the matching file, or ``None`` when no file matches.
        If several files share the name, the lexicographically smallest path
        is returned so that repeated runs pick the same file.
    """
    base_directory = os.path.abspath(os.path.join(os.getcwd(), ".."))

    # Escape both components: a "[", "*" or "?" in the directory path or in
    # the dataset name would otherwise be interpreted as a glob pattern and
    # the real file would be silently missed.
    dataset_pattern = os.path.join(
        glob.escape(base_directory),
        "**",
        glob.escape(f"{dataset_name}.csv"),
    )
    matches = glob.glob(dataset_pattern, recursive=True)

    # glob gives no ordering guarantee, so pick a deterministic winner.
    return min(matches) if matches else None

# Collect the run parameters interactively. Answers are stripped so that an
# accidental leading/trailing space does not turn into a "dataset not found"
# or a missing-column error further down.
dataset_name = input("Please enter the dataset name (e.g., 'CYP1A2-Substrate'): ").strip()
smiles_column = input("Please enter column with structures: ").strip()
target_column = input("Please enter target column: ").strip()
# Kept as a one-element list: it is used below to select the target from the
# DataFrame with a list of column names.
targets_column = [target_column]

# None when no "<dataset_name>.csv" could be located.
input_path = find_dataset_path(dataset_name)

# Run k-fold cross-validation of a chemprop binary classifier on the located
# dataset and print the confusion-matrix metrics averaged over the folds.
if input_path is None:
    print(f"Dataset '{dataset_name}' not found.")
else:
    print(f"Dataset found at: {input_path}")

    num_folds = 10
    num_workers = 0

    df_input = pd.read_csv(input_path)

    # Fail early with a readable message instead of a bare KeyError from
    # deep inside pandas when a column name was mistyped.
    missing_columns = [
        column
        for column in (smiles_column, target_column)
        if column not in df_input.columns
    ]
    if missing_columns:
        raise KeyError(
            f"Column(s) {missing_columns} not found in '{input_path}'. "
            f"Available columns: {list(df_input.columns)}"
        )

    smis = df_input[smiles_column].values
    # Selected with a list of names, so ys is 2-D: one row per molecule,
    # one column for the single target.
    ys = df_input[targets_column].values

    all_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(smis, ys)]

    # Fixed seed so the fold assignment is reproducible between runs.
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    metrics = []

    for train_index, test_index in kf.split(all_data):
        train_data = [all_data[i] for i in train_index]
        test_data = [all_data[i] for i in test_index]

        featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
        train_dset = data.MoleculeDataset(train_data, featurizer)
        test_dset = data.MoleculeDataset(test_data, featurizer)

        train_loader = data.build_dataloader(train_dset, num_workers=num_workers)
        # The test loader must keep the dataset order so that predictions
        # line up with ys[test_index].
        test_loader = data.build_dataloader(test_dset, num_workers=num_workers, shuffle=False)

        # A fresh model and trainer per fold, so nothing learned on one
        # fold carries over to the next.
        mp = nn.BondMessagePassing()
        agg = nn.MeanAggregation()
        ffn = nn.BinaryClassificationFFN()

        mpnn = models.MPNN(mp, agg, ffn, batch_norm=True)

        trainer = pl.Trainer(
            logger=True,
            enable_checkpointing=True,
            enable_progress_bar=True,
            accelerator="cpu",
            devices=1,
            max_epochs=50,
        )

        trainer.fit(mpnn, train_loader)

        # trainer.predict returns one batch of outputs per loader step;
        # join them and flatten to 1-D, which is what the sklearn metric
        # functions expect for a single-target problem.
        predictions = trainer.predict(mpnn, test_loader)
        predictions = np.concatenate(predictions).ravel()
        true_labels = ys[test_index].ravel()

        # The decision threshold is the mean of the target (as before), but
        # taken from the training fold only, so the held-out labels do not
        # influence how their own predictions are binarised. nanmean mirrors
        # pandas' default of skipping missing values.
        threshold = np.nanmean(ys[train_index])
        preds_binary = (predictions > threshold).astype(int)

        # labels=[0, 1] forces a 2x2 matrix even when a fold happens to
        # contain (and predict) a single class; without it the 4-way unpack
        # below raises ValueError.
        tn, fp, fn, tp = confusion_matrix(true_labels, preds_binary, labels=[0, 1]).ravel()

        metrics.append({
            'TP': tp,
            'TN': tn,
            'FP': fp,
            'FN': fn,
            'Sensitivity': tp / (tp + fn) if (tp + fn) > 0 else 0,
            'Specificity': tn / (tn + fp) if (tn + fp) > 0 else 0,
            'Accuracy': accuracy_score(true_labels, preds_binary),
            'Balanced Accuracy': balanced_accuracy_score(true_labels, preds_binary),
        })

    metrics_df = pd.DataFrame(metrics)

    # One row named 'Average' holding the per-metric mean across folds.
    average_metrics = metrics_df.mean().to_frame(name='Average').T

    print(average_metrics)

Please enter the dataset name (e.g., 'CYP1A2-Substrate'):  cyp1a2-inhibitor
Please enter column with structures:  smiles
Please enter target column:  Activity


Dataset found at: /Users/aleksashka/admetica/metabolism/cyp1a2-inhibitor/cyp1a2-inhibitor.csv


GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
Loading `train_dataloader` to estimate number of stepping batches.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.

  | Name            | Type                    | Params | Mode 
--------------------------------------------------------------------
0 | message_passing | BondMess

Training: |                                                                                                   …

`Trainer.fit` stopped: `max_epochs=50` reached.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Predicting: |                                                                                                 …

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
Loading `train_dataloader` to estimate number of stepping batches.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.

  | Name            | Type                    | Params | Mode 
--------------------------------------------------------------------
0 | message_passing | BondMess

Training: |                                                                                                   …

`Trainer.fit` stopped: `max_epochs=50` reached.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Predicting: |                                                                                                 …

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
Loading `train_dataloader` to estimate number of stepping batches.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.

  | Name            | Type                    | Params | Mode 
--------------------------------------------------------------------
0 | message_passing | BondMess

Training: |                                                                                                   …