# Imports

In [None]:
from tqdm import tqdm
import pandas as pd, numpy as np
import plotly.express as px
from plotly.graph_objs import Figure
from pathlib import Path
from importlib import reload
import software.analysis as a
from asapdiscovery.data.schema.ligand import Ligand
reload(a)

# Load Paths

In [None]:
# Directory of docking output; its top-level *.csv files are loaded below.
datadir = Path(
    "/Users/alexpayne/Scientific_Projects/mers-drug-discovery/"
    "sars2-retrospective-analysis/"
    "20231030_cross_docked_hybrid_p_only_2d_50poses/"
)

In [None]:
# Read each CSV found directly inside datadir into its own DataFrame;
# tqdm reports progress as the files are consumed.
csvs = [
    pd.read_csv(csv_path)
    for csv_path in tqdm(datadir.glob("*.csv"))
]

In [None]:
# Stack the per-file frames row-wise into a single table. ignore_index is left
# at its default, so each file's original row labels are kept.
df = pd.concat(objs=csvs)

In [None]:
# Total number of rows across all loaded CSVs.
df.shape[0]

In [None]:
# Number of non-null docked_file entries for every (ligand_id, du_structure) pair.
results_count = df.groupby(by=["ligand_id", "du_structure"])["docked_file"].count()

# Get unique structures

In [None]:
# Distinct values of the du_structure column, in order of first appearance.
structs = df["du_structure"].unique()

In [None]:
# How many distinct du_structure values there are.
structs.shape[0]

In [None]:
# Directory whose *.sdf files are loaded as Ligand objects below.
sdf_dir = Path(
    "/Users/alexpayne/Scientific_Projects/mers-drug-discovery/"
    "sars2-retrospective-analysis/"
    "full_frag_prepped_mpro_20230603/"
    "sdf_lsf_array_p_only_by_name/"
)

In [None]:
# Build one Ligand per SDF file found directly inside sdf_dir.
ligs = list(map(Ligand.from_sdf, sdf_dir.glob("*.sdf")))

In [None]:
# Index the loaded ligands by their "Dataset" tag. If two ligands share a tag,
# the one encountered last wins (ordinary dict semantics).
lig_dict = {
    ref.tags["Dataset"]: ref
    for ref in ligs
}

## Load docked molecules

In [None]:
# Collect every SDF file located exactly one directory level below datadir.
# datadir is reused here instead of repeating the same hard-coded path, so the
# CSV and SDF inputs always come from the same results directory.
docked_sdfs = list(datadir.glob("*/*.sdf"))

## Since these are multi-pose SDF files, I'll use a MolFileFactory

In [None]:
from asapdiscovery.data.readers.molfile import MolFileFactory

In [None]:
# First ten SDF paths, handy for quick trial runs.
shortened_list = docked_sdfs[:10]

In [None]:
# Flatten all SDF files into one list: each file is loaded through
# MolFileFactory and every ligand it yields is appended in file order.
docked_ligs = [
    pose
    for sdf_path in tqdm(docked_sdfs)
    for pose in MolFileFactory(filename=sdf_path).load()
]

In [None]:
# Number of SDF files found (this counts files, not the ligands loaded from them).
len(docked_sdfs)

In [None]:
# Preview the dataset label derived from the first ligand: "Mpro" followed by
# whatever comes after the first "_Mpro" in its compound name.
"Mpro" + docked_ligs[0].compound_name.split("_Mpro")[1]

In [None]:
from asapdiscovery.docking.analysis import calculate_rmsd_openeye

## calculate RMSDs

In [None]:
# Parallel lists, one entry per docked ligand, filled in the loop below.
compound_ids, datasets, rmsds = [], [], []

for docked in tqdm(docked_ligs):
    # Dataset label: "Mpro" plus the text following "_Mpro" in the compound name.
    name_suffix = docked.compound_name.split("_Mpro")[1]
    # Comparison ligand: the one loaded from sdf_dir that carries the same "Dataset" tag.
    reference = lig_dict[docked.tags["Dataset"]]

    compound_ids.append(docked.tags["Compound_ID"])
    datasets.append("Mpro" + name_suffix)
    rmsds.append(
        calculate_rmsd_openeye(reference.to_oemol(), docked.to_oemol())
    )

In [None]:
# One row per docked ligand, assembled from the three parallel lists.
rmsd_df = pd.DataFrame(
    data={
        "Compound_ID": compound_ids,
        "Dataset": datasets,
        "RMSD": rmsds,
    }
)

In [None]:
# Spot-check: show all rows for one specific compound/dataset combination.
rmsd_df.loc[
    (rmsd_df["Compound_ID"] == "ADA-UCB-6c2cb422-1")
    & (rmsd_df["Dataset"] == "Mpro-P0008_0A")
]

In [None]:
# Non-null counts of every remaining column per (Compound_ID, Dataset) pair.
rmsd_df.groupby(by=["Compound_ID", "Dataset"]).count()

In [None]:
# Number the rows within each (Compound_ID, Dataset) group 0, 1, 2, ... in
# their current row order and store that running index as Pose_ID.
pose_groups = rmsd_df.groupby(by=["Compound_ID", "Dataset"])
rmsd_df["Pose_ID"] = pose_groups.cumcount()

In [None]:
# Rename the RMSD columns to the names used in the docking-results table (df).
# An explicit old -> new mapping is used instead of assigning a positional list
# to .columns: it does not depend on column order or count, and re-running the
# cell is harmless because rename() ignores keys that are no longer present.
rmsd_df = rmsd_df.rename(
    columns={
        "Compound_ID": "ligand_id",
        "Dataset": "du_structure",
        "RMSD": "rmsd",
        "Pose_ID": "pose_id",
    }
)

In [None]:
# Display the per-(ligand_id, du_structure) docked_file counts computed from df.
results_count

In [None]:
# Number of non-null rmsd values for every (ligand_id, du_structure) pair.
rmsd_count = rmsd_df.groupby(by=["ligand_id", "du_structure"])["rmsd"].count()

In [None]:
# Display the per-(ligand_id, du_structure) rmsd counts for comparison with results_count.
rmsd_count

In [None]:
# Number of distinct values in each column of the RMSD table.
rmsd_df.nunique()

In [None]:
# Number of distinct values in each column of the docking-results table.
df.nunique()

## save the RMSD data

In [None]:
# Write the RMSD table to the current working directory. The row index is
# written as the first column because index is left at its default.
rmsd_df.to_csv(path_or_buf="calculated_rmsds.csv")

# Make a dataframe of the data stored in the SD tags

In [None]:
# First ten docked ligands; not referenced by the loop below.
shortened_list = docked_ligs[0:10]

# One output list per SD tag of interest, each filled in ligand order.
smiles, clash, chemgauss4, posit_method, posit_score = [], [], [], [], []

# (tag name, destination list) pairs, listed in the order the tags are read.
tag_sinks = (
    ("SMILES", smiles),
    ("Docking_posit_hybrid_clash_clash", clash),
    ("Docking_posit_hybrid_clash_Chemgauss4", chemgauss4),
    ("Docking_posit_hybrid_clash_POSIT_method", posit_method),
    ("Docking_posit_hybrid_clash_POSIT", posit_score),
)

for lig in tqdm(docked_ligs):
    for tag_name, sink in tag_sinks:
        sink.append(lig.tags[tag_name])

In [None]:
# Label every row as "<compound id>_<dataset>" so each docked complex has a single key.
complex_ids = [
    f"{cmpd}_{dataset}"
    for cmpd, dataset in zip(compound_ids, datasets)
]

# Column name -> values; insertion order fixes the column order of the frame.
pose_columns = {
    "SMILES": smiles,
    "Chemgauss4": chemgauss4,
    "Clash": clash,
    "POSIT_Method": posit_method,
    "POSIT": posit_score,
    "Compound_ID": compound_ids,
    "Dataset": datasets,
    "Complex_ID": complex_ids,
    "RMSD": rmsds,
}
data_df = pd.DataFrame(pose_columns)

In [None]:
# Running index (0, 1, 2, ...) of each row within its Complex_ID group, in current row order.
data_df["Pose_ID"] = data_df.groupby("Complex_ID").cumcount()

### and save it

In [None]:
# Write the combined table to the current working directory. The row index is
# written as the first column because index is left at its default.
data_df.to_csv(path_or_buf="rmsd_data_with_all_info.csv")