# Predicting

In [31]:
import pandas as pd
import numpy as np
import torch
from lightning import pytorch as pl
from pathlib import Path

from chemprop import data, featurizers, models
import chempropstereo

In [32]:
# Number of independent runs per model variant. The run index selects both the
# checkpoint directory ("run{run}") and the dataset split column ("split_{run}").
num_runs = 5

# Model variants to evaluate. Each name doubles as the checkpoint sub-directory
# name and as the key into the per-model featurizer mapping.
tested_models = ["chemprop", "chempropstereo_diverge", "chempropstereo_converge"]

In [33]:
# Load one trained MPNN per (model variant, run) pair.
# Checkpoint layout: <cwd>/checkpoints/<model>/V2/run<run>/last.ckpt
# NOTE: an earlier variant of this cell picked the checkpoint matching
# "best-epoch=*.ckpt" in the same directory instead of "last.ckpt".
checkpoint_root = Path.cwd() / "checkpoints"
mpnns = {
    name: [
        models.MPNN.load_from_checkpoint(
            checkpoint_root / name / "V2" / f"run{idx}" / "last.ckpt"
        )
        for idx in range(num_runs)
    ]
    for name in tested_models
}

In [34]:
# Map each model variant name to the molecule featurizer used to build its
# input graphs. The two chempropstereo variants differ only in the
# `divergent_bonds` flag.
#
# The assignment below rebinds the name `featurizers` (imported above as the
# chemprop module) to this dict. The chemprop featurizer class is therefore
# imported by name here, so that re-running this cell does not fail with
# AttributeError when `featurizers` already refers to the dict.
from chemprop.featurizers import SimpleMoleculeMolGraphFeaturizer

featurizers = {
    "chemprop": SimpleMoleculeMolGraphFeaturizer(),
    "chempropstereo_diverge": chempropstereo.featurizers.MoleculeStereoFeaturizer(
        mode="V2", divergent_bonds=True
    ),
    "chempropstereo_converge": chempropstereo.featurizers.MoleculeStereoFeaturizer(
        mode="V2", divergent_bonds=False
    ),
}

## Load dataset

In [None]:
# Read the gzip-compressed CSV dataset from the current working directory.
input_path = Path.cwd().joinpath("ld_classification_dataset.csv.gz")
df_input = pd.read_csv(
    input_path,
    compression="gzip",
)
# Bare expression: displays the frame when run as a notebook cell.
df_input

## Load test smiles

In [None]:
# Build one test-set frame per run: the rows of `df_input` whose
# "split_<run>" column equals "test".
df_test = []
for run in range(num_runs):
    is_test_row = df_input[f"split_{run}"] == "test"
    df_test.append(df_input[is_test_row])
# Bare expression: displays the first run's test set in a notebook cell.
df_test[0]

## Build molecule datasets and run predictions

In [None]:
# Run inference for every (model variant, run) pair.
# Result layout: predictions[model][run] holds whatever `trainer.predict`
# returns for that run's test set (presumably one output per batch — the
# accuracy cell concatenates them).
predictions = {}
for model in tested_models:
    featurizer = featurizers[model]
    predictions[model] = {}
    with torch.inference_mode():
        # One trainer per model variant, reused across its runs.
        # accelerator="auto" uses a GPU when one is available and falls back
        # to CPU otherwise, so the cell also runs on machines without a GPU.
        trainer = pl.Trainer(
            logger=None,
            enable_progress_bar=True,
            accelerator="auto",
            devices=1,
        )
        for run in range(num_runs):
            print(f"Testing {model} run {run}")
            smis = df_test[run]["smiles"]
            test_data = [data.MoleculeDatapoint.from_smi(smi) for smi in smis]
            test_dset = data.MoleculeDataset(test_data, featurizer=featurizer)
            # shuffle=False keeps predictions in the same order as the rows of
            # df_test[run], which the accuracy computation relies on.
            test_loader = data.build_dataloader(test_dset, shuffle=False)
            predictions[model][run] = trainer.predict(mpnns[model][run], test_loader)

In [29]:
# Per-model accuracy statistics over all runs.
# For each run, the batched prediction outputs are concatenated, flattened and
# rounded to integer labels, then compared element-wise with the
# "sign_rotation" column of that run's test set.
accuracy = {}
for model in tested_models:
    accuracies = []
    for run in range(num_runs):
        true_values = df_test[run]["sign_rotation"].values.astype(int)
        raw_outputs = np.concatenate(predictions[model][run]).ravel()
        preds = np.round(raw_outputs).astype(int)
        num_correct = np.count_nonzero(true_values == preds)
        accuracies.append(num_correct / len(true_values))
    # np.std uses its default (population) normalisation here.
    accuracy[model] = {
        "mean": np.mean(accuracies),
        "std": np.std(accuracies),
    }

In [None]:
# Tabulate the accuracy statistics: one row per model variant, with the
# "mean" and "std" entries as columns.
accuracy_df = pd.DataFrame(accuracy).transpose()
# Bare expression: displays the table when run as a notebook cell.
accuracy_df