# Imports

In [1]:
import numpy as np
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from asapdiscovery.docking.analysis import calculate_rmsd_openeye
from asapdiscovery.data.schema.ligand import Ligand
from asapdiscovery.data.readers.molfile import MolFileFactory
from asapdiscovery.modeling.protein_prep import ProteinPrepper
import plotly.express as px

# Load Data

In [31]:
from asapdiscovery.modeling.protein_prep import ProteinPrepper
complexes = ProteinPrepper.load_cache("/data/chodera/asap-datasets/mpro_fragalysis-04-01-24_curated_cache")

In [35]:
cmpd_to_frag_dict = {c.ligand.compound_name: c.target.target_name for c in complexes}

In [15]:
original_ligand_path = Path("/data/chodera/asap-datasets/mpro_fragalysis-04-01-24_curated_cache/combined_3d.sdf")

In [16]:
mff = MolFileFactory(filename=original_ligand_path)
ligs = mff.load()

In [17]:
lig_dict = {lig.compound_name: lig for lig in ligs}

In [2]:
results_paths = list(Path("/lila/data/chodera/asap-datasets/retro_docking/sars_fragalysis_retrospective/20240424_multi_pose_docking_cross_docking").glob("*.csv"))

In [3]:
dfs = [pd.read_csv(_csv) for _csv in results_paths]

In [4]:
df = pd.concat(dfs)

In [29]:
DO_NOT_EDIT = df.copy()

# Basic Analysis

## What is the distribution of targets included in each ligand

In [6]:
target_distribution = df.groupby(["Query_Ligand"]).nunique()

In [9]:
fig = px.histogram(target_distribution, x="Reference_Ligand")
fig.write_image("target_distribution.png")

# What is the distribution of ligands successfully docked per target?

In [10]:
ligand_distribution = df.groupby(["Reference_Ligand"]).nunique()

In [11]:
fig = px.histogram(ligand_distribution, x="Query_Ligand")
fig.write_image("ligand_distribution.png")

## are all targets and ligands represented?

In [18]:
original_lig_ids = set(lig_dict.keys())

In [19]:
original_lig_ids - set(df.Query_Ligand)

set()

In [20]:
original_lig_ids - set(df.Reference_Ligand)

{'BRU-THA-92256091-17', 'PET-UNK-8df914d1-2', 'RAL-THA-8416115c-13'}

In [13]:
set(df.Query_Ligand) - set(df.Reference_Ligand)

{'BRU-THA-92256091-17', 'PET-UNK-8df914d1-2', 'RAL-THA-8416115c-13'}

In [14]:
set(df.Reference_Ligand) - set(df.Query_Ligand)

set()

### so 3 structures are never docked to:
'BRU-THA-92256091-17', 'PET-UNK-8df914d1-2', 'RAL-THA-8416115c-13'

In [22]:
missing_refs = [lig_dict[missing] for missing in original_lig_ids - set(df.Reference_Ligand)]

## what pairs are present?

In [43]:
refs = df.Reference_Ligand
queries = df.Query_Ligand
pairs = {(ref, query) for ref, query in zip(refs,queries)}

In [44]:
len(pairs)

33429

In [45]:

possible_pairs = {(ref, query) for ref, query in zip(lig_dict.keys(), lig_dict.keys())}

In [47]:
len(possible_pairs)

205

In [49]:
from itertools import permutations
possible_pairs = permutations(lig_dict.keys())

In [50]:
list(possible_pairs

<itertools.permutations at 0x2b7a48fd9f30>

# Adding null results

# Add Query_Structure 

In [38]:
df["Query_Structure"] = df.Query_Ligand.apply(lambda x: cmpd_to_frag_dict[x])

# Dataset Split