# Imports

In [None]:
import numpy as np
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from asapdiscovery.docking.analysis import calculate_rmsd_openeye
from asapdiscovery.data.schema.ligand import Ligand
from asapdiscovery.data.readers.molfile import MolFileFactory

# Load the data

In [None]:
# SDF file that the reference ligands are loaded from below.
ogpath = Path("/data/chodera/asap-datasets/mpro_fragalysis-04-01-24_curated_cache/combined_3d.sdf")

In [None]:
# Root directory that the docking output directories are globbed from.
posed_path = Path("/lila/data/chodera/asap-datasets/retro_docking/sars_fragalysis_retrospective/20240424_multi_pose_docking_cross_docking")

In [None]:
# One docking_results.sdf per run directory matching the prefix.
posed_paths = list(posed_path.glob("20240424_multi_pose_docking_cross_docking*/docking_results.sdf"))

In [None]:
# Load the reference ligands from the SDF file.
mff = MolFileFactory(filename=ogpath)
ligs = mff.load()

In [None]:
len(ligs)

In [None]:
# Index the reference ligands by compound name; if two ligands share a name,
# the later one overwrites the earlier one.
lig_dict = {lig.compound_name:lig for lig in ligs}

In [None]:
from typing import Union
from asapdiscovery.data.backend.openeye import oechem
def load_openeye_sdfs(sdf_fn: Union[str, Path]) -> "list[oechem.OEMol]":
    """
    Load every molecule in an SDF file as a list of OpenEye OEMol objects.

    The input stream is configured with ``oechem.OEOmegaConfTest`` so that
    OpenEye decides which consecutive records are conformers of the same
    molecule; each resulting molecule is copied into the returned list.

    Parameters
    ----------
    sdf_fn : Union[str, Path]
        Path to the SDF file to load.

    Returns
    -------
    list[oechem.OEMol]
        One OEMol per molecule read from the file, in file order. The list is
        empty if the file contains no readable molecules.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist.

    Notes
    -----
    If the file exists but cannot be opened, ``oechem.OEThrow.Fatal`` is
    called with an explanatory message. NOTE(review): this is presumed not to
    return (the usual OpenEye idiom) -- confirm against the toolkit docs.
    """
    if not Path(sdf_fn).exists():
        raise FileNotFoundError(f"{sdf_fn} does not exist!")

    ifs = oechem.oemolistream()
    ifs.SetFlavor(
        oechem.OEFormat_SDF,
        oechem.OEIFlavor_SDF_Default,
    )
    ifs.SetConfTest(oechem.OEOmegaConfTest())

    if not ifs.open(str(sdf_fn)):
        oechem.OEThrow.Fatal(f"Unable to open {sdf_fn}")

    try:
        # Copy each molecule so the list does not hold references to an
        # object owned by the stream iterator.
        return [mol.CreateCopy() for mol in ifs.GetOEMols()]
    finally:
        # Close the stream even if reading raises part-way through.
        ifs.close()

In [None]:
# Convert every molecule read from the first docking_results.sdf into a Ligand.
posed_ligs1 = [Ligand.from_oemol(mol) for mol in load_openeye_sdfs(posed_paths[0])]

In [None]:
posed_ligs1[0].compound_name

In [None]:
posed_ligs1[0].conf_tags.keys()

In [None]:
# Use the first posed ligand as a test case and get its OpenEye molecule.
posed_lig = posed_ligs1[0]
posed_mol = posed_lig.to_oemol()

In [None]:
posed_lig.provenance

In [None]:
# Look up the reference ligand that has the same compound name.
og_lig = lig_dict[posed_lig.compound_name]
og_mol = og_lig.to_oemol()

In [None]:
# RMSD between the reference molecule and the posed molecule.
calculate_rmsd_openeye(og_mol, posed_mol)

In [None]:
posed_lig.num_poses

# Right, this isn't going to work because there's nothing to distinguish which structure was used for docking; we'll have to load from the results objects instead

In [None]:
from asapdiscovery.docking.openeye import POSITDockingResults

In [None]:
# NOTE: only the results of the single run directory ending in "95" are
# collected here (the directory name is hard-coded in the glob pattern).
json_paths = list(posed_path.glob("20240424_multi_pose_docking_cross_docking95/docking_results/*/docking_result.json"))

In [None]:
# Build one POSITDockingResults object per JSON file.
results = [POSITDockingResults.from_json_file(json_file) for json_file in json_paths]

In [None]:
# Posed ligand of the first result, used for inspection below.
posed_lig1 = results[0].posed_ligand

In [None]:
posed_lig1

In [None]:
# Reference ligand with the same compound name as the posed ligand.
og_lig = lig_dict[posed_lig1.compound_name]

In [None]:
def calculate_ligand_rmsd(ref: Ligand, fit: Ligand) -> Ligand:
    """
    Compute the RMSD of every pose of ``fit`` against ``ref`` and store it on ``fit``.

    Parameters
    ----------
    ref : Ligand
        Reference ligand the poses are compared to.
    fit : Ligand
        Posed ligand; ``fit.num_poses`` RMSD values are computed for it.

    Returns
    -------
    Ligand
        The same ``fit`` object, after its ``"RMSD"`` SD data has been set to
        the list of computed values (one entry per pose).

    Notes
    -----
    If ``oechem.OERMSD`` reports failure, a message is printed and the
    contents of the result array are still written to ``fit``.
    """
    ref_mol = ref.to_oemol()
    fit_mol = fit.to_oemol()

    # One slot per pose; OERMSD fills the array in place.
    rmsds = oechem.OEDoubleArray(fit.num_poses)

    # OERMSD is provided by oechem; the bare name is not defined in this file.
    if not oechem.OERMSD(ref_mol, fit_mol, rmsds):
        print("RMSD calculation failed")

    fit.set_SD_data({"RMSD": list(rmsds)})
    return fit

In [None]:
# For every docking result, compute RMSDs of the posed ligand against the
# reference ligand with the same compound name. The return value is ignored:
# this relies on calculate_ligand_rmsd storing the values on the posed ligand.
for result in tqdm(results):
    posed_lig = result.posed_ligand
    ref = lig_dict[posed_lig.compound_name]
    calculate_ligand_rmsd(ref, posed_lig)

In [None]:
# `result` is the loop variable left over from the loop above, i.e. the last
# docking result processed.
result.posed_ligand

In [None]:
result

In [None]:
result.to_df()

In [None]:
result.input_pair.ligand

# Make a DataFrame from the docking results

In [None]:
def make_df_from_docking_results(results: "list[POSITDockingResults]") -> "pd.DataFrame":
    """
    Build a single DataFrame from a list of docking results.

    For each result a DataFrame is built from the query ligand, the reference
    complex and the posed ligand's ``conf_tags``, and the per-result frames
    are concatenated with ``pd.concat`` (per-result indices are kept).

    Parameters
    ----------
    results : list[POSITDockingResults]
        Docking results to tabulate. The posed ligand of each result must
        already carry ``"RMSD"``, ``"Pose_ID"``, ``"_POSIT_method"`` and
        ``"docking-confidence-POSIT"`` entries in ``conf_tags``.

    Returns
    -------
    pd.DataFrame
        The concatenated frames, or an empty DataFrame if ``results`` is empty.
    """
    import pandas as pd

    # pd.concat raises ValueError on an empty list; return an empty frame instead.
    if not results:
        return pd.DataFrame()

    from asapdiscovery.docking.docking_data_validation import DockingResultCols

    frames = []
    for result in results:
        query = result.input_pair.ligand
        reference = result.input_pair.complex
        tags = result.posed_ligand.conf_tags

        # Scalar entries are broadcast by pandas against the list-valued
        # conf_tags entries.
        # NOTE(review): assumes each conf_tags value is a per-pose list -- confirm.
        frames.append(
            pd.DataFrame(
                {
                    "Query_Ligand": query.compound_name,
                    "Reference_Structure": reference.target.target_name,
                    "Reference_Ligand_SMILES": reference.ligand.smiles,
                    DockingResultCols.SMILES.value: query.smiles,
                    DockingResultCols.DOCKING_CONFIDENCE_POSIT.value: tags[
                        "docking-confidence-POSIT"
                    ],
                    "RMSD": tags["RMSD"],
                    "Pose_ID": tags["Pose_ID"],
                    "POSIT_Method": tags["_POSIT_method"],
                    "Reference_Ligand": reference.ligand.compound_name,
                }
            )
        )

    return pd.concat(frames)

In [None]:
# Tabulate all loaded docking results (RMSDs were attached to the posed ligands above).
make_df_from_docking_results(results)

# OK, this is about good enough; now I'll turn this into a script and parallelize across ligands

In [None]:
# Display the last docking result (loop variable left over from the RMSD loop).
result