# Imports

In [None]:
import numpy as np
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from asapdiscovery.docking.analysis import calculate_rmsd_openeye
from asapdiscovery.data.schema.ligand import Ligand
from asapdiscovery.data.readers.molfile import MolFileFactory
from asapdiscovery.modeling.protein_prep import ProteinPrepper
import plotly.express as px

# Load Data

In [None]:
# NOTE: ProteinPrepper is already imported in the Imports cell; this repeat is redundant but harmless.
from asapdiscovery.modeling.protein_prep import ProteinPrepper
# Load the cached complexes. Each entry is later read for
# `.ligand.compound_name` and `.target.target_name`.
complexes = ProteinPrepper.load_cache("/data/chodera/asap-datasets/mpro_fragalysis-04-01-24_curated_cache")

In [None]:
# Lookup table: ligand compound name -> name of the target (structure) it was loaded with.
cmpd_to_frag_dict = {
    cmplx.ligand.compound_name: cmplx.target.target_name
    for cmplx in complexes
}

In [None]:
# SDF file located inside the same curated cache directory the complexes are loaded from.
# Presumably holds the original 3D ligand poses -- TODO confirm.
original_ligand_path = Path("/data/chodera/asap-datasets/mpro_fragalysis-04-01-24_curated_cache/combined_3d.sdf")

In [None]:
# Read every ligand record from the SDF file; each loaded object exposes
# `.compound_name`, which is used as the lookup key in the next cell.
mff = MolFileFactory(filename=original_ligand_path)
ligs = mff.load()

In [None]:
# Index the loaded ligands by compound name (a later duplicate name overwrites an earlier one).
lig_dict = {
    ligand.compound_name: ligand
    for ligand in ligs
}

In [None]:
# Collect every results CSV in the cross-docking output directory.
# Path.glob yields entries in an arbitrary, filesystem-dependent order, so the
# paths are sorted to keep the row order of the concatenated table (and of the
# CSV saved at the end of the notebook) reproducible from run to run.
results_paths = sorted(Path("/lila/data/chodera/asap-datasets/retro_docking/sars_fragalysis_retrospective/20240424_multi_pose_docking_cross_docking").glob("*.csv"))

In [None]:
# Parse each results file into its own DataFrame, in the same order as `results_paths`.
dfs = list(map(pd.read_csv, results_paths))

In [None]:
# Stack all per-file tables row-wise. The per-file index labels are kept
# (ignore_index=False), so index values repeat across files.
df = pd.concat(dfs, axis=0, ignore_index=False)

In [None]:
# Independent (deep) snapshot of the raw concatenated results, kept as an
# untouched reference while `df` is modified further down.
DO_NOT_EDIT = df.copy(deep=True)

# Basic Analysis

## What is the distribution of targets included for each ligand?

In [None]:
# One row per query ligand; every other column holds the number of distinct
# values seen for that ligand (e.g. how many different reference ligands).
target_distribution = df.groupby(by=["Query_Ligand"]).nunique()

In [None]:
# Histogram of the number of distinct reference ligands per query ligand,
# written as a PNG into the current working directory.
fig = px.histogram(target_distribution, x="Reference_Ligand")
fig.write_image("target_distribution.png")

# What is the distribution of ligands successfully docked per target?

In [None]:
# One row per reference ligand; every other column holds the number of distinct
# values seen for that reference (e.g. how many different query ligands).
ligand_distribution = df.groupby(by=["Reference_Ligand"]).nunique()

In [None]:
# Histogram of the number of distinct query ligands per reference ligand,
# written as a PNG into the current working directory.
fig = px.histogram(ligand_distribution, x="Query_Ligand")
fig.write_image("ligand_distribution.png")

## are all targets and ligands represented?

In [None]:
# Names of every ligand read from the SDF file (iterating a dict yields its keys).
original_lig_ids = set(lig_dict)

In [None]:
# Ligands from the SDF file that never appear as a query in the results.
original_lig_ids.difference(set(df["Query_Ligand"]))

In [None]:
# Ligands from the SDF file that never appear as a reference in the results.
original_lig_ids.difference(set(df["Reference_Ligand"]))

In [None]:
# Ligands that occur as a query but never as a reference.
set(df["Query_Ligand"]).difference(set(df["Reference_Ligand"]))

In [None]:
# Ligands that occur as a reference but never as a query.
set(df["Reference_Ligand"]).difference(set(df["Query_Ligand"]))

### so 3 structures are never docked to:
'BRU-THA-92256091-17', 'PET-UNK-8df914d1-2', 'RAL-THA-8416115c-13'

In [None]:
# Ligand objects for every SDF ligand that is never used as a reference in the results.
missing_refs = [
    lig_dict[name]
    for name in original_lig_ids.difference(set(df["Reference_Ligand"]))
]

## what pairs are present?

In [None]:
# Every (reference, query) combination that actually occurs in the results.
# Tuple order is (reference, query).
refs = df["Reference_Ligand"]
queries = df["Query_Ligand"]
pairs = set(zip(refs, queries))

In [None]:
# Number of distinct (reference, query) pairs present in the results.
len(pairs)

In [None]:

# NOTE: pairing the key sequence with itself only produces the self pairs
# (name, name), one per ligand; the full set of ordered pairs is built with
# itertools.permutations in a later cell, which overwrites this value.
possible_pairs = {(name, name) for name in lig_dict}

In [None]:
# Size of the pair set built in the previous cell.
len(possible_pairs)

In [None]:
from itertools import permutations
# All ordered pairs of two distinct ligand names: both (a, b) and (b, a) are
# included, self pairs (a, a) are not.
possible_pairs = set(permutations(lig_dict, 2))

In [None]:
# Number of ordered pairs of distinct ligands, i.e. n * (n - 1) for n ligand names.
len(possible_pairs)

# Adding null results

In [None]:
# Ordered pairs that could exist but have no row in the results.
missing_pairs = possible_pairs.difference(pairs)

In [None]:
# Number of ordered pairs with no docking result.
len(missing_pairs)

In [None]:
# Build one placeholder "Failed" row for every pair that has no docking result.
#
# The tuples in `missing_pairs` are ordered (reference, query): `pairs` is built
# from zip(df.Reference_Ligand, df.Query_Ligand) and `missing_pairs` is the set
# difference against it. The first element therefore belongs in
# Reference_Ligand and the second in Query_Ligand. Unpacking them the other way
# round marks the reverse direction of each missing pair as failed and leaves
# the pair that is actually missing unpadded.
#
# `missing_pairs` is not modified between the two comprehensions, so both
# columns are built in the same iteration order. The scalar values (RMSD,
# Pose_ID, POSIT_Method) are broadcast to every row.
null_df = pd.DataFrame({"Query_Ligand": [query for _, query in missing_pairs],
                        "Reference_Ligand": [ref for ref, _ in missing_pairs],
                        "RMSD": np.nan,
                        "Pose_ID": 0,
                        "POSIT_Method": "Failed"})

In [None]:
# Display the placeholder rows for inspection.
null_df

## add structures

In [None]:
# For every placeholder row, look up the structure name recorded for its
# reference ligand (raises KeyError for a ligand with no entry in the table).
null_df["Reference_Structure"] = null_df["Reference_Ligand"].apply(
    lambda ligand_name: cmpd_to_frag_dict[ligand_name]
)

## concat null results

In [None]:
# Append the placeholder rows below the real results. Columns are matched by
# name, and columns absent from `null_df` are filled with NaN for those rows.
padded = pd.concat([df, null_df], axis=0, ignore_index=False)

In [None]:
# Continue working on an independent (deep) copy so `padded` stays untouched.
df = padded.copy(deep=True)

# Add Query_Structure 

## Make sure the reference structures match

In [None]:
# Sanity check: the Reference_Structure stored in every row must equal the
# structure looked up from that row's reference ligand.
bool(
    (
        df["Reference_Structure"]
        == df["Reference_Ligand"].apply(lambda ligand_name: cmpd_to_frag_dict[ligand_name])
    ).all()
)

## they do, good

In [None]:
# Annotate every row with the structure name recorded for its query ligand
# (raises KeyError for a ligand with no entry in the table).
df["Query_Structure"] = df["Query_Ligand"].apply(
    lambda ligand_name: cmpd_to_frag_dict[ligand_name]
)

# Let's go ahead and save this

In [None]:
# Write the padded table to the current working directory. With the default
# settings the (non-unique) row index is written as the first, unnamed column.
df.to_csv("20240502_combined_results_with_rmsd_and_null_results.csv")