# Imports

In [None]:
from pathlib import Path
from asapdiscovery.data.openeye import load_openeye_sdfs, oechem, save_openeye_sdfs
from asapdiscovery.data.fragalysis import parse_fragalysis
import numpy as np
import pandas as pd, numpy as np
import plotly.express as px
from datetime import datetime
from tqdm.notebook import tqdm
from asapdiscovery.docking.analysis import DockingResults
import asapdiscovery.data.openeye as oe
import asapdiscovery.modeling.cheminformatics as ci
from importlib import reload

# Load Paths

In [None]:
import sys
sys.path.append(str(Path("../../../").resolve()))
from software.paths import paths

In [None]:
local_analysis = Path("/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/")

In [None]:
full_posit = local_analysis / "results_with_structure_dates.csv"
hybrid = local_analysis / "results_cleaned_tc_scores.csv"

In [None]:
posit_df = pd.read_csv(full_posit, index_col=0)

In [None]:
len(posit_df)

In [None]:
hybrid_df = pd.read_csv(hybrid, index_col=0)

In [None]:
len(hybrid_df)

## Why does posit_df have more rows than hybrid_df?

In [None]:
np.shape(posit_df)

In [None]:
posit_df.nunique()

In [None]:
# Count the rows per Complex_ID and keep only the IDs that occur more than once.
# A descriptive name is used instead of `_`: assigning to `_` in IPython/Jupyter
# stops the shell from updating its "last result" variable.
posit_id_counts = posit_df.groupby("Complex_ID")["Complex_ID"].count()
dups = posit_id_counts[posit_id_counts > 1]

In [None]:
dups.index

In [None]:
posit_df[posit_df.Complex_ID == 'ADA-UCB-6c2cb422-1_Mpro-P0008_0A'].nunique()

In [None]:
dups_removed = posit_df.drop_duplicates()

In [None]:
dups_removed[dups_removed.Complex_ID == 'ADA-UCB-6c2cb422-1_Mpro-P0008_0A']

# Remove duplicates just to be sure

In [None]:
np.shape(dups_removed)

In [None]:
hybrid_dups_removed = hybrid_df.drop_duplicates()

In [None]:
np.shape(hybrid_dups_removed)

# now get the compound info

In [None]:
tidy_columns = ["Docked_File", "RMSD", "POSIT", "POSIT_Method", "Chemgauss4", "Clash", "POSIT_R"]

In [None]:
# For each de-duplicated result set, keep every column that is NOT listed in
# tidy_columns and tag the rows with the result set they came from.
# `.assign` returns a new frame, so the tag is never written onto an object
# that pandas tracks as a slice of another frame (SettingWithCopyWarning).
complex_info_cols = [col for col in hybrid_dups_removed.columns if col not in tidy_columns]
hybrid_complex_info = hybrid_dups_removed.loc[:, complex_info_cols].assign(Version="Hybrid-Only")

complex_info_cols = [col for col in dups_removed.columns if col not in tidy_columns]
posit_complex_info = dups_removed.loc[:, complex_info_cols].assign(Version="All")

In [None]:
# Keep the tidy_columns plus the Complex_ID key from each de-duplicated result
# set and tag the rows with the result set they came from.
# `.assign` returns a new frame, so the tag is never written onto an object
# that pandas tracks as a slice of another frame (SettingWithCopyWarning).
hybrid_to_concat = hybrid_dups_removed.loc[:, tidy_columns + ["Complex_ID"]].assign(Version="Hybrid-Only")

posit_to_concat = dups_removed.loc[:, tidy_columns + ["Complex_ID"]].assign(Version="All")

In [None]:
# Join keys: every column the two metadata frames have in common, except the
# "Version" tag, which is appended explicitly for the outer merge below.
on_cols = [
    name
    for name in posit_complex_info.columns
    if name != "Version" and name in hybrid_complex_info.columns
]
outer_info = posit_complex_info.merge(
    hybrid_complex_info,
    how="outer",
    on=[*on_cols, "Version"],
)

In [None]:
inner_info = pd.merge(posit_complex_info, hybrid_complex_info, on=on_cols, how="inner")

In [None]:
np.shape(outer_info)

In [None]:
np.shape(inner_info)

In [None]:
outer_info.nunique()

In [None]:
# Count the rows per Complex_ID in the merged metadata and keep only the IDs
# that occur more than once.
# A descriptive name is used instead of `_`: assigning to `_` in IPython/Jupyter
# stops the shell from updating its "last result" variable.
outer_id_counts = outer_info.groupby("Complex_ID")["Complex_ID"].count()
dups = outer_id_counts[outer_id_counts > 1]

In [None]:
dups

In [None]:
outer_info[outer_info.Complex_ID == "ALP-POS-133e7cd9-2_Mpro-P0010_0A"].nunique()

### THE SMILES STRING IS THE CULPRIT

In [None]:
# Print every SMILES value stored for this duplicated Complex_ID.
print(list(outer_info.loc[outer_info.Complex_ID == "ALP-POS-133e7cd9-2_Mpro-P0010_0A", "SMILES"]))

In [None]:
'c1ccc2c(c1)cncc2N3CCC[C@@]4(C3=O)C[N@@](Cc5c4cc(cc5)Cl)S(=O)(=O)CC6(CC6)C#N'
'c1ccc2c(c1)cncc2N3CCC[C@@]4(C3=O)CN(Cc5c4cc(cc5)Cl)S(=O)(=O)CC6(CC6)C#N'

In [None]:
outer_info[outer_info.Complex_ID == "ALP-POS-133e7cd9-2_Mpro-P0010_0A"]

## For some reason the SMILES string from the full POSIT results is better (it includes stereochemistry)

## In that case, let's drop SMILES from the hybrid df and then merge

In [None]:
# Rebuild both metadata frames from the de-duplicated results. Unlike the first
# attempt, no "Version" tag is added to either frame here.
complex_info_cols = [name for name in hybrid_dups_removed.columns if name not in tidy_columns]
hybrid_complex_info = hybrid_dups_removed[complex_info_cols]

complex_info_cols = [name for name in dups_removed.columns if name not in tidy_columns]
posit_complex_info = dups_removed[complex_info_cols]

In [None]:
hybrid_complex_info_no_smiles = hybrid_complex_info.drop(columns=["SMILES"])

In [None]:
# Join keys: every column shared by the full metadata frame and the hybrid
# metadata frame that had its SMILES column dropped ("Version" is excluded).
on_cols = [
    name
    for name in posit_complex_info.columns
    if name != "Version" and name in hybrid_complex_info_no_smiles.columns
]
outer_info = posit_complex_info.merge(
    hybrid_complex_info_no_smiles,
    how="outer",
    on=on_cols,
)

In [None]:
inner_info = pd.merge(posit_complex_info, hybrid_complex_info_no_smiles, on=on_cols, how="inner")

In [None]:
np.shape(outer_info)

In [None]:
np.shape(inner_info)

In [None]:
outer_info.nunique()

In [None]:
214*219

## Good, now the number of unique complex_IDs == len(df) == len(Compound_ID) * len(Structure_Source)

# Is there anything missing?

In [None]:
outer_info.isna().sum()

## yes, there are Reference Ligands and Tanimoto scores missing

## Add Reference Compound

## load from yaml

In [None]:
from asapdiscovery.data.utils import get_compound_id_xtal_dicts
import yaml

In [None]:
# Load the mapping stored in cmpd_to_frag.yaml (presumably compound ID ->
# fragment/structure ID, judging by the file and variable names — confirm).
# The encoding is given explicitly so the file is not decoded with whatever
# the platform's default locale encoding happens to be.
with open(
    "/Users/alexpayne/Scientific_Projects/covid-moonshot-ml/metadata/cmpd_to_frag.yaml",
    encoding="utf-8",
) as f:
    cmpd_to_frag_dict = yaml.safe_load(f)

In [None]:
# Invert the mapping (value -> key). If two keys share the same value, the key
# that comes later in cmpd_to_frag_dict wins.
frag_to_cmpd_dict = dict(zip(cmpd_to_frag_dict.values(), cmpd_to_frag_dict.keys()))

In [None]:
# Two-column table of the YAML mapping: keys as Compound_ID, values as
# Structure_Source.
reference_df = pd.DataFrame(
    {
        "Compound_ID": cmpd_to_frag_dict.keys(),
        "Structure_Source": list(cmpd_to_frag_dict.values()),
    }
)

In [None]:
# Keep only the part of each Structure_Source value before its first "_".
Structure_Source = outer_info.Structure_Source.map(lambda source: source.partition("_")[0])

In [None]:
ref_lig = Structure_Source.apply(lambda x: frag_to_cmpd_dict[x])

## add to compound info df

In [None]:
outer_info["Reference_Ligand"] = ref_lig

In [None]:
outer_info.isna().sum()

# Add TC score

In [None]:
from asapdiscovery.modeling.cheminformatics import get_n_to_n_tanimoto

## load mols

In [None]:
sdfs = load_openeye_sdfs(str(paths.combined_p_only_sdf))

## Calculate Tanimoto

In [None]:
tc_df = ci.get_n_to_n_tanimoto(sdfs, sdfs)

In [None]:
outer_info.drop(columns=["TanimotoCombo"], inplace=True)

In [None]:
complex_info_complex_tc = pd.merge(outer_info, tc_df, on=["Compound_ID", "Reference_Ligand"])

In [None]:
np.shape(outer_info)

In [None]:
np.shape(complex_info_complex_tc)

## why are we losing information?

In [None]:
set(outer_info.Compound_ID) - set(tc_df.Compound_ID)

In [None]:
set(tc_df.Compound_ID) - set(outer_info.Compound_ID)

In [None]:
# Strip leading/trailing spaces from the compound IDs (presumably so they
# compare equal to the IDs in tc_df — see the set differences around this cell).
# The vectorised .str accessor replaces the per-row lambda and passes missing
# values through instead of raising AttributeError on them.
outer_info["Compound_ID"] = outer_info["Compound_ID"].str.strip(" ")

In [None]:
# NOTE(review): Series.replace(" ", "") only swaps cells whose ENTIRE value is
# " "; it does not delete spaces inside longer strings (that would be
# .str.replace(" ", "")). This line therefore presumably leaves Complex_ID
# unchanged. Before changing it, note that tidy_df.Complex_ID comes straight
# from the frames read from CSV and is later merged against this column on
# "Complex_ID", so both sides would need the same normalisation —
# TODO confirm the intended behaviour.
outer_info.Complex_ID = outer_info.Complex_ID.replace(" ", "")

In [None]:
set(outer_info.Compound_ID) - set(tc_df.Compound_ID)

In [None]:
set(tc_df.Compound_ID) - set(outer_info.Compound_ID)

In [None]:
set(tc_df.Reference_Ligand) - set(outer_info.Reference_Ligand)

In [None]:
set(outer_info.Reference_Ligand) - set(tc_df.Reference_Ligand)

In [None]:
compound_info_with_merged_tc = pd.merge(tc_df, outer_info, on=["Reference_Ligand", "Compound_ID"])

In [None]:
set(tc_df.Reference_Ligand) - set(tc_df.Compound_ID)

In [None]:
set(tc_df.Compound_ID) - set(tc_df.Reference_Ligand)

In [None]:
tc_df[tc_df.Compound_ID.isin(set(outer_info.Reference_Ligand) - set(tc_df.Reference_Ligand))]

In [None]:
cmpd_ids = [mol.GetTitle() for mol in sdfs]

In [None]:
set(outer_info.Reference_Ligand) - set(cmpd_ids)

In [None]:
outer_info[outer_info.Reference_Ligand.isin(set(outer_info.Reference_Ligand) - set(cmpd_ids))]

In [None]:
# Strip leading/trailing spaces from the reference-ligand IDs before they are
# compared with the molecule titles in cmpd_ids.
# The vectorised .str accessor replaces the per-row lambda and passes missing
# values through instead of raising AttributeError on them.
outer_info["Reference_Ligand"] = outer_info["Reference_Ligand"].str.strip(" ")

In [None]:
missing = set(outer_info.Reference_Ligand) - set(cmpd_ids)

In [None]:
missing

In [None]:
lig1 = "EDG-MED-971238d3-4"
smi1 = "N[C@@]1(C(=O)Nc2cncc3ccccc23)CCOc2ccc(Cl)cc21"
lig2 = "MAT-POS-3b97339c-2" 

In [None]:
outer_info[outer_info.Reference_Ligand == lig2]

In [None]:
"EDG-MED-971238d3-4" in cmpd_ids

In [None]:
mol = [mol for mol in sdfs if mol.GetTitle() == "EDG-MED-971238d3-4"][0]

In [None]:
mol2 = [mol for mol in sdfs if mol.GetTitle() == "MAT-POS-3b97339c-2"][0]

In [None]:
from asapdiscovery.data.openeye import oechem

In [None]:
oe.oedepict.OEPrepareDepiction(mol)
disp = oe.oedepict.OE2DMolDisplay(mol)

In [None]:
out_fn = "EDG-MED-971238d3-4.png"

In [None]:
oe.oedepict.OERenderMolecule(out_fn, disp)

# Ok so the problem here is that the Mpro_combined sdf has potentially incorrect names for the molecules

https://asapdiscovery.slack.com/archives/C03GZL9D8UD/p1686512398903609

| compound tracker | structure | metadata | Suspected_ID pointer|
|-|-|-|-|
| BEN-DND-4f474d93-1 | Mpro-P0012 | ALP-POS-ce760d3f-2 | no |
| MAT-POS-3b97339c-2 | Mpro-P0208 | EDG-MED-971238d3-4 | no |
|EDG-MED-5d232de5-4 | Mpro-P0148 | EDG-MED-5d232de5-3| yes |
|EDG-MED-5d232de5-5| Mpro-P0171 | EDG-MED-5d232de5-6 | yes |
| VLA-UNK-cf7facf1-1 | Mpro-P0143 | VLA-UCB-34f3ed0c-11 | yes |

In [None]:
missing_mols = list(missing)

In [None]:
missing_mols

In [None]:
correct_mols = ['ALP-POS-ce760d3f-2',
                'EDG-MED-5d232de5-3',
                'VLA-UCB-34f3ed0c-11',
                'EDG-MED-5d232de5-6',
                'EDG-MED-971238d3-4',
               ]

In [None]:
# Map each reference-ligand ID that has no molecule of that title in the SDF to
# the ID that should be used instead.
# The pairs are written out explicitly, one per row of the ID table in this
# notebook ("compound tracker" -> "metadata"; the metadata IDs are exactly the
# entries of correct_mols). The previous zip(missing, correct_mols) paired the
# two positionally, but `missing` is a set: its iteration order is arbitrary,
# so an ID could silently be paired with the wrong correction.
# NOTE(review): pairings are taken from the table rows — confirm against the
# linked discussion.
known_id_corrections = {
    "BEN-DND-4f474d93-1": "ALP-POS-ce760d3f-2",
    "MAT-POS-3b97339c-2": "EDG-MED-971238d3-4",
    "EDG-MED-5d232de5-4": "EDG-MED-5d232de5-3",
    "EDG-MED-5d232de5-5": "EDG-MED-5d232de5-6",
    "VLA-UNK-cf7facf1-1": "VLA-UCB-34f3ed0c-11",
}
# Only IDs that are actually missing get an entry; any missing ID without a
# known correction shows up in the set-difference checks that follow.
id_correction_dict = {
    wrong: right
    for wrong, right in known_id_corrections.items()
    if wrong in missing
}

In [None]:
id_correction_dict

In [None]:
new_ref_lig = outer_info.Reference_Ligand.apply(lambda x: id_correction_dict.get(x, x))

In [None]:
set(new_ref_lig) - set(tc_df.Reference_Ligand)

In [None]:
set(tc_df.Reference_Ligand) - set(new_ref_lig)

## ok now it looks like we can get the correct ref ligs

In [None]:
outer_info.Reference_Ligand = new_ref_lig

In [None]:
outer_info.nunique()

In [None]:
set(outer_info.Compound_ID) - set(outer_info.Reference_Ligand)

In [None]:
set(outer_info.Reference_Ligand) - set(outer_info.Compound_ID)

# Ok add the tc score correctly this time

In [None]:
compound_info_with_tc = pd.merge(outer_info, tc_df, on=["Reference_Ligand", "Compound_ID"])

# Now let's make a new combined df!!

In [None]:
tidy_df = pd.concat([hybrid_to_concat, posit_to_concat])

In [None]:
set(tidy_df.Complex_ID) - set(compound_info_with_tc.Complex_ID)

In [None]:
set(compound_info_with_tc.Complex_ID) - set(tidy_df.Complex_ID)

In [None]:
tidy_df.nunique()

In [None]:
np.shape(tidy_df)

In [None]:
len(tidy_df.Complex_ID.unique()) * 2

In [None]:
tidy_df.groupby("Version").nunique()

In [None]:
tidy_df.groupby("Version").apply(lambda x: x.isna().sum())

In [None]:
set(tidy_df.Complex_ID) - set(outer_info.Complex_ID)

In [None]:
set(outer_info.Complex_ID) - set(tidy_df.Complex_ID)

In [None]:
merged = pd.merge(tidy_df, compound_info_with_tc, on="Complex_ID")

In [None]:
merged.nunique()

In [None]:
np.shape(merged)

In [None]:
merged.to_csv(local_analysis / "20230611-combined.csv")