# Goal

For about 10% of the molecules, we never find the low-RMSD pose even though it is there. I'd like to identify which molecules are problematic and why.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px

# Load the data

In [None]:
# Directory that receives the figures written by this notebook.
figure_path = Path(
    "/Users/alexpayne/Scientific_Projects/asapdiscovery-sars-retrospective"
    "/science/20240403_multi_pose_docking_v2"
    "/20240430_analyze_cross_docking_results/figures"
)

In [None]:
# Combined cross-docking results table that is loaded into `df`.
results_csv = Path(
    "/Users/alexpayne/Scientific_Projects/mers-drug-discovery"
    "/sars2-retrospective-analysis"
    "/20240424_multi_pose_docking_cross_docking"
    "/results_csvs/20240503_combined_results_with_data.csv"
)

In [None]:
# Load the results table; the first CSV column becomes the row index.
df = pd.read_csv(
    results_csv,
    index_col=0,
)

In [None]:
# Show which columns the loaded results table provides.
df.columns

In [None]:
# Lowest-RMSD row for every query ligand: sort ascending by RMSD, then keep
# the first row of each Query_Ligand group.
poses_by_rmsd = df.sort_values(by="RMSD")
true_best = poses_by_rmsd.groupby("Query_Ligand").head(1)

In [None]:
# Fraction of ligands whose lowest-RMSD row is under the 2.0 cutoff.
# The mean of the boolean mask is the same ratio as sum(mask) / len(frame),
# but is computed vectorised and yields NaN instead of raising
# ZeroDivisionError when the frame is empty.
(true_best.RMSD < 2.0).mean()

In [None]:
# Row with the highest POSIT docking confidence for every query ligand: sort
# descending by confidence, then keep the first row of each Query_Ligand group.
poses_by_confidence = df.sort_values(by="docking-confidence-POSIT", ascending=False)
posit_best = poses_by_confidence.groupby("Query_Ligand").head(1)

In [None]:
# Fraction of ligands whose top-confidence row is under the 2.0 RMSD cutoff.
# The mean of the boolean mask is the same ratio as sum(mask) / len(frame),
# but is computed vectorised and yields NaN instead of raising
# ZeroDivisionError when the frame is empty.
(posit_best.RMSD < 2.0).mean()

In [None]:
# Top-confidence rows that miss the 2.0 RMSD cutoff.
failed = posit_best.loc[posit_best["RMSD"] >= 2.0]

In [None]:
# Inspect the rows selected into `failed` (RMSD >= 2.0).
failed

In [None]:
# Unique ligand names among the failed top-confidence rows.
failed_ligs = set(failed["Query_Ligand"])

In [None]:
# Ligands whose lowest-RMSD row is itself at or above the 2.0 cutoff.
no_good_pose = true_best.loc[true_best["RMSD"] >= 2.0]
rmsd_failed_ligs = set(no_good_pose["Query_Ligand"])

In [None]:
# Show the ligands whose lowest-RMSD row is still >= 2.0.
rmsd_failed_ligs

In [None]:
# Ligands that fail on the top-confidence row but are not in rmsd_failed_ligs.
posit_failed_ligs = failed_ligs.difference(rmsd_failed_ligs)

As expected, there aren't any that failed in RMSD but not POSIT

In [None]:
# Ligands in rmsd_failed_ligs that are absent from failed_ligs.
rmsd_failed_ligs.difference(failed_ligs)

## Here are the ligands that failed in POSIT but not RMSD

In [None]:
# Show the ligands in failed_ligs that are not in rmsd_failed_ligs.
posit_failed_ligs

In [None]:
# All rows of the results table that belong to the POSIT-failed ligands.
is_posit_failed = df["Query_Ligand"].isin(posit_failed_ligs)
failed_lig_df = df.loc[is_posit_failed]

In [None]:
# RMSD histogram of every row in failed_lig_df, coloured by ligand, with a
# dashed marker at the 2.0 cutoff; shown inline and saved as a PNG.
histogram_layout = dict(
    range_x=[0, 10],
    height=400,
    width=600,
    template="plotly_white",
)
fig = px.histogram(failed_lig_df, x="RMSD", color="Query_Ligand", **histogram_layout)
fig.add_vline(x=2.0, line_dash="dash", line_color="black")
fig.show()
fig.write_image(figure_path / "20240606_failed_ligand_RMSD_histogram.png")

In [None]:
# Every ligand in the results table that is not in failed_ligs.
all_query_ligs = set(df["Query_Ligand"])
successful_ligs = list(all_query_ligs.difference(failed_ligs))

In [None]:
import random

In [None]:
# Save ten example RMSD histograms, each built from a random batch of
# successful ligands.
# A dedicated seeded generator over a sorted population makes the exported
# figures reproducible between notebook runs; the order of a list built from
# a set of strings can change from one interpreter session to the next.
# NOTE(review): sorting assumes the ligand names are mutually comparable
# (e.g. all strings) - confirm.
sample_rng = random.Random(0)
sample_pool = sorted(successful_ligs)
# 13 ligands per batch, clamped so that random.sample cannot raise ValueError
# when fewer successful ligands are available.
sample_size = min(13, len(sample_pool))
for i in range(10):
    successful_sample = sample_rng.sample(sample_pool, sample_size)
    success_lig_df = df[df.Query_Ligand.isin(successful_sample)]
    fig = px.histogram(success_lig_df, x="RMSD", color="Query_Ligand",range_x=[0, 10],
                   height=400,
                   width=600,
                   template="plotly_white",)
    # Dashed marker at the 2.0 RMSD cutoff.
    fig.add_vline(x=2.0, line_dash="dash", line_color="black")
    fig.write_image(figure_path / f"20240606_successful_ligand_RMSD_histogram_example_{i}.png")

# Plot the ligands

In [None]:
# SMILES column of the failed-ligand rows (one entry per row of failed_lig_df).
smiles = failed_lig_df["SMILES"]

In [None]:
from asapdiscovery.data.schema.ligand import Ligand

In [None]:
# One Ligand per failed query ligand, built from the first row of each
# Query_Ligand group and named after that ligand.
first_row_per_ligand = failed_lig_df.groupby("Query_Ligand").head(1)
ligs = [
    Ligand.from_smiles(record.SMILES, compound_name=record.Query_Ligand)
    for _, record in first_row_per_ligand.iterrows()
]

In [None]:
from harbor.plotting.ligands import plot_ligands_with_mcs

In [None]:
import mols2grid

In [None]:
# Show the installed mols2grid version.
mols2grid.__version__

In [None]:
# Molecule grid with one entry per failed ligand (first row of each group).
failed_lig_first_rows = failed_lig_df.groupby("Query_Ligand").head(1)
grid = mols2grid.MolGrid(failed_lig_first_rows)

In [None]:
# Save the grid as a standalone HTML file, then show it inline.
# display() is kept as the last expression of the cell: it hands back the
# rendered view rather than drawing it, so the notebook only shows the grid
# when that return value is the cell result. With save() after it, the view
# was discarded.
# NOTE(review): relies on MolGrid.display() returning the view object, as
# described in the mols2grid API docs - confirm for the installed version.
grid.save(figure_path / "failed_ligs.html")
grid.display(size=(800,800))

In [None]:
# Export every row of the failed ligands for later analysis.
# to_csv does not create missing parent directories, so make sure the
# relative output directory exists before writing.
failed_ligs_csv = Path("analyzed_data/failed_ligs.csv")
failed_ligs_csv.parent.mkdir(parents=True, exist_ok=True)
failed_lig_df.to_csv(failed_ligs_csv)

# Are these ligands from particularly early or late in the discovery process?

In [None]:
# One row per query ligand: the first row of each Query_Ligand group.
rows_by_ligand = df.groupby("Query_Ligand")
all_lig_df = rows_by_ligand.head(1)

In [None]:
# Histogram of Query_Structure_Date with one entry per ligand in all_lig_df;
# shown inline and saved as a PNG.
# The bold axis titles now use a properly closed <b>...</b> pair; the
# original strings ended with a second opening <b> instead of </b>.
fig = px.histogram(all_lig_df, 
                   x="Query_Structure_Date",
                   height=400,
                   width=600,
                   labels={"Query_Structure_Date": "<b> Structure Deposition Date </b>"},
                   template="plotly_white",
                   range_x=["2021-02-01", "2022-03-01"],
                   nbins=14)
fig.update_yaxes(title_text="<b> Number of Ligands </b>")
fig.show()
fig.write_image(figure_path / "ligand_structure_dates.png")

In [None]:
# Histogram of Query_Structure_Date with one entry per failed ligand (first
# row of each Query_Ligand group); shown inline and saved as a PNG.
# The bold axis titles now use a properly closed <b>...</b> pair; the
# original strings ended with a second opening <b> instead of </b>.
fig = px.histogram(failed_lig_df.groupby("Query_Ligand").head(1), 
                   x="Query_Structure_Date",
                   height=400,
                   width=600,
                   labels={"Query_Structure_Date": "<b> Structure Deposition Date </b>"},
                   template="plotly_white",
                   range_x=["2021-02-01", "2022-03-01"],
                   nbins=14)
fig.update_yaxes(title_text="<b> Number of Ligands </b>")
fig.show()
fig.write_image(figure_path / "failed_ligand_structure_dates.png")

In [None]:
# Query_Structure value from the first row of each failed ligand's group.
failed_first_rows = failed_lig_df.groupby("Query_Ligand").head(1)
structures = failed_first_rows["Query_Structure"]

In [None]:
# Write one "loadall" line per structure and file type (sdf first, then pdb)
# to a text file in the working directory.
file_types = ("sdf", "pdb")
with open("load_failed_lig_structures.txt", "w") as f:
    f.writelines(
        f"loadall {name}*/*.{ext}\n"
        for name in structures
        for ext in file_types
    )