# Question

Plotting the multi-pose results shows that the randomly split, sorted-by-RMSD results get worse as we increase the number of poses included. This shouldn't be possible (picking the lowest-RMSD pose from a larger set of poses can only keep or lower the best RMSD), so I think something must be wrong.

# Imports

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from pydantic import BaseModel, Field
import abc
from tqdm import tqdm

## Paths

In [None]:
# Root directory of the multi-pose cross-docking run (absolute, machine-specific path).
data_path = Path(
    "/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/20240424_multi_pose_docking_cross_docking/"
)
# Folder holding the exported result tables, and the combined table analysed in this notebook.
csvs_path = data_path / "results_csvs"
result_csv = data_path / "results_csvs" / "20240503_combined_results_with_data.csv"

In [None]:
# Relative path, so it resolves against the notebook's working directory.
# Presumably where derived outputs are written; it is not used in the cells shown here.
output_data_path = Path("analyzed_data")

In [None]:
# Sanity check: evaluates to True only if the combined results CSV exists at the configured path.
result_csv.exists()

## Load Data

In [None]:
# Read the combined results (first CSV column becomes the index) and tag every row with a
# complex identifier of the form "<Query_Ligand>_<Reference_Structure>".
df = pd.read_csv(result_csv, index_col=0).assign(
    Complex_ID=lambda frame: frame["Query_Ligand"] + "_" + frame["Reference_Structure"]
)

# only keep first dataset split

In [None]:
# Distinct reference-structure dates in ascending order; the first 45 define the split.
# NOTE(review): assumes the date values sort chronologically as stored (e.g. ISO strings) — confirm.
dates = np.sort(df["Reference_Structure_Date"].unique())
earliest = dates[:45]

In [None]:
# Restrict the results to rows whose reference structure date is one of the earliest dates.
df_split = df.loc[df["Reference_Structure_Date"].isin(earliest)]

In [None]:
# Number of distinct reference-structure dates kept in the split (at most 45, the size of `earliest`).
df_split.Reference_Structure_Date.nunique()

In [None]:
# Keep the lowest-Pose_ID row for every (query ligand, reference ligand) pair.
# NOTE(review): the full-dataset cells further down group on "Reference_Structure" instead —
# confirm "Reference_Ligand" is the intended key here.
poses1 = (
    df_split.sort_values(by="Pose_ID")
    .groupby(["Query_Ligand", "Reference_Ligand"])
    .head(1)
)

In [None]:
# Keep up to the 20 lowest-Pose_ID rows for every (query ligand, reference ligand) pair.
# NOTE(review): the full-dataset cells further down group on "Reference_Structure" instead —
# confirm "Reference_Ligand" is the intended key here.
poses20 = (
    df_split.sort_values(by="Pose_ID")
    .groupby(["Query_Ligand", "Reference_Ligand"])
    .head(20)
)

In [None]:
# Keep up to the 50 lowest-Pose_ID rows for every (query ligand, reference ligand) pair.
# NOTE(review): the full-dataset cells further down group on "Reference_Structure" instead —
# confirm "Reference_Ligand" is the intended key here.
poses50 = (
    df_split.sort_values(by="Pose_ID")
    .groupby(["Query_Ligand", "Reference_Ligand"])
    .head(50)
)

In [None]:
# Best-case selection: for each query ligand keep the candidate row with the smallest RMSD.
poses1_scored = (
    poses1.sort_values(by="RMSD")
    .groupby("Query_Ligand")
    .head(1)
)

In [None]:
# Best-case selection: for each query ligand keep the candidate row with the smallest RMSD.
poses20_scored = (
    poses20.sort_values(by="RMSD")
    .groupby("Query_Ligand")
    .head(1)
)

In [None]:
# Best-case selection: for each query ligand keep the candidate row with the smallest RMSD.
poses50_scored = (
    poses50.sort_values(by="RMSD")
    .groupby("Query_Ligand")
    .head(1)
)

In [None]:
# Share of selected poses with RMSD <= 2.0, using a fixed denominator of 205.
# NOTE(review): 205 is hard-coded — presumably the number of query ligands; confirm it matches the data.
(poses1_scored["RMSD"] <= 2.0).sum() / 205

In [None]:
# Share of selected poses with RMSD <= 2.0, using a fixed denominator of 205.
# NOTE(review): 205 is hard-coded — presumably the number of query ligands; confirm it matches the data.
(poses20_scored["RMSD"] <= 2.0).sum() / 205

In [None]:
# Share of selected poses with RMSD <= 2.0, using a fixed denominator of 205.
# NOTE(review): 205 is hard-coded — presumably the number of query ligands; confirm it matches the data.
(poses50_scored["RMSD"] <= 2.0).sum() / 205

# what about if we use the posit score?

In [None]:
# For each query ligand keep the candidate row with the highest POSIT confidence.
# Equal scores are broken by the lowest Pose_ID: a single-key sort uses pandas' default
# (unstable) quicksort, so the row that head(1) keeps among tied scores would otherwise be arbitrary.
poses1_scored_posit = (
    poses1.sort_values(
        ["docking-confidence-POSIT", "Pose_ID"], ascending=[False, True]
    )
    .groupby(["Query_Ligand"])
    .head(1)
)

In [None]:
# For each query ligand keep the candidate row with the highest POSIT confidence.
# Equal scores are broken by the lowest Pose_ID: a single-key sort uses pandas' default
# (unstable) quicksort, so the row that head(1) keeps among tied scores would otherwise be arbitrary.
poses20_scored_posit = (
    poses20.sort_values(
        ["docking-confidence-POSIT", "Pose_ID"], ascending=[False, True]
    )
    .groupby(["Query_Ligand"])
    .head(1)
)

In [None]:
# For each query ligand keep the candidate row with the highest POSIT confidence.
# Equal scores are broken by the lowest Pose_ID: a single-key sort uses pandas' default
# (unstable) quicksort, so the row that head(1) keeps among tied scores would otherwise be arbitrary.
poses50_scored_posit = (
    poses50.sort_values(
        ["docking-confidence-POSIT", "Pose_ID"], ascending=[False, True]
    )
    .groupby(["Query_Ligand"])
    .head(1)
)

In [None]:
# Share of POSIT-selected poses with RMSD <= 2.0, using a fixed denominator of 205.
# NOTE(review): 205 is hard-coded — presumably the number of query ligands; confirm it matches the data.
(poses1_scored_posit["RMSD"] <= 2.0).sum() / 205

In [None]:
# Share of POSIT-selected poses with RMSD <= 2.0, using a fixed denominator of 205.
# NOTE(review): 205 is hard-coded — presumably the number of query ligands; confirm it matches the data.
(poses20_scored_posit["RMSD"] <= 2.0).sum() / 205

In [None]:
# Share of POSIT-selected poses with RMSD <= 2.0, using a fixed denominator of 205.
# NOTE(review): 205 is hard-coded — presumably the number of query ligands; confirm it matches the data.
(poses50_scored_posit["RMSD"] <= 2.0).sum() / 205

# why is it getting worse?

In [None]:
# Inspect the POSIT-selected rows from the 50-pose set, ordered from lowest to highest RMSD.
poses50_scored_posit.sort_values(by="RMSD")

In [None]:
# Inspect the POSIT-selected rows from the 1-pose set, ordered from lowest to highest RMSD.
poses1_scored_posit.sort_values(by="RMSD")

In [None]:
# Query ligands whose POSIT-selected pose (1-pose set) has RMSD above 2.0.
failed1 = poses1_scored_posit.loc[poses1_scored_posit["RMSD"] > 2.0, "Query_Ligand"]

In [None]:
# Query ligands whose POSIT-selected pose (50-pose set) has RMSD above 2.0.
failed50 = poses50_scored_posit.loc[poses50_scored_posit["RMSD"] > 2.0, "Query_Ligand"]

In [None]:
# Ligands that fail with one pose but no longer fail when 50 poses are available.
set(failed1).difference(failed50)

In [None]:
# Ligands that fail with 50 poses although they did not fail with a single pose.
set(failed50).difference(failed1)

In [None]:
# All rows of the 1-pose set for one example query ligand.
example1 = poses1.loc[poses1["Query_Ligand"].eq("EDG-MED-b1ef7fe3-1")]

In [None]:
# All rows of the 50-pose set for the same example query ligand.
example50 = poses50.loc[poses50["Query_Ligand"].eq("EDG-MED-b1ef7fe3-1")]

In [None]:
# Display the 1-pose rows for the example ligand.
example1

In [None]:
# Display the 50-pose rows for the example ligand, for comparison with the 1-pose rows.
example50

# Why are these so different?

In [None]:
# Plotting imports are done here rather than in the imports cell; `px` is reused by later cells.
import plotly.express as px
from software.plotting import plot_scatter_with_regression_line_plotly
# RMSD (first argument) against POSIT confidence (second argument) for the RMSD-selected
# single-pose rows. NOTE(review): the helper's signature is not visible here — presumably (x, y).
plot_scatter_with_regression_line_plotly(poses1_scored["RMSD"], poses1_scored["docking-confidence-POSIT"])

In [None]:
# RMSD against POSIT confidence for the POSIT-selected single-pose rows.
# The helper presumably returns a plotly figure, since update_layout is called on its result.
fig = plot_scatter_with_regression_line_plotly(poses1_scored_posit["RMSD"], poses1_scored_posit["docking-confidence-POSIT"])
# Resize for inline display; as the cell's last expression, the result is what gets rendered.
fig.update_layout(height=400, width=600)

In [None]:
# Joint density of RMSD vs POSIT confidence for the RMSD-selected single-pose rows,
# with marginal histograms on both axes.
px.density_contour(
    poses1_scored,
    x="RMSD",
    y="docking-confidence-POSIT",
    range_x=[0, 5],
    range_y=[0, 1],
    marginal_x="histogram",
    marginal_y="histogram",
    height=600,
    width=800,
    template="plotly_white",
)

In [None]:
# Joint density of RMSD vs POSIT confidence for the POSIT-selected rows of the 1-pose set,
# with marginal histograms on both axes.
px.density_contour(
    poses1_scored_posit,
    x="RMSD",
    y="docking-confidence-POSIT",
    range_x=[0, 5],
    range_y=[0, 1],
    marginal_x="histogram",
    marginal_y="histogram",
    height=600,
    width=800,
    template="plotly_white",
)

In [None]:
# Joint density of RMSD vs POSIT confidence for the POSIT-selected rows of the 20-pose set,
# with marginal histograms on both axes.
px.density_contour(
    poses20_scored_posit,
    x="RMSD",
    y="docking-confidence-POSIT",
    range_x=[0, 5],
    range_y=[0, 1],
    marginal_x="histogram",
    marginal_y="histogram",
    height=600,
    width=800,
    template="plotly_white",
)

In [None]:
# Joint density of RMSD vs POSIT confidence for the POSIT-selected rows of the 50-pose set,
# with marginal histograms on both axes.
px.density_contour(
    poses50_scored_posit,
    x="RMSD",
    y="docking-confidence-POSIT",
    range_x=[0, 5],
    range_y=[0, 1],
    marginal_x="histogram",
    marginal_y="histogram",
    height=600,
    width=800,
    template="plotly_white",
)

In [None]:
# Pose_ID of the lowest-RMSD pose per query ligand (50-pose set) against that pose's RMSD.
# The helper presumably returns a plotly figure, since plotly-style update_* methods are called on it.
fig = plot_scatter_with_regression_line_plotly(poses50_scored["Pose_ID"], poses50_scored["RMSD"])
fig.update_layout(height=400, width=600, template="plotly_white")
fig.update_xaxes(title_text="Pose ID")
# Last expression of the cell, so its result is what the notebook renders.
fig.update_yaxes(title_text="RMSD")

# Maybe one way to show this is to count the number of complexes where a pose after the first one has a lower RMSD than the first pose?

## I'd like to do this for the full dataset

In [None]:
# Lowest-Pose_ID row for every (query ligand, reference structure) pair in the FULL dataset.
# This rebinds `poses1`, which earlier held the split-only selection grouped by reference ligand;
# re-running earlier cells after this one will therefore see different data.
poses1 = (
    df.sort_values(by="Pose_ID")
    .groupby(["Query_Ligand", "Reference_Structure"])
    .head(1)
)

In [None]:
# Every row whose Pose_ID is not 0.
# NOTE(review): treats Pose_ID 0 as "the first pose", while `poses1` takes the lowest Pose_ID per
# complex — the two agree only if every complex has a pose numbered 0; confirm.
poses_not_1 = df.loc[df["Pose_ID"].ne(0)]

In [None]:
# Among the non-first poses, keep the lowest-RMSD row for every (query ligand, reference structure) pair.
poses_not_1_best = (
    poses_not_1.sort_values(by="RMSD")
    .groupby(["Query_Ligand", "Reference_Structure"])
    .head(1)
)

In [None]:
# Count of distinct values in each column of the best non-first poses.
poses_not_1_best.nunique()

In [None]:
# Pair each complex's first pose with its best later pose. The inner join keeps only complexes
# present in both tables; shared columns get the "_1" / "_not_1" suffixes.
merged = pd.merge(
    poses1,
    poses_not_1_best,
    how="inner",
    on="Complex_ID",
    suffixes=("_1", "_not_1"),
)

In [None]:
# Number of complexes that have both a first pose and at least one later pose.
merged.shape[0]

In [None]:
# RMSD change from the first pose to the best later pose; negative means the later pose is better.
merged["dRMSD"] = merged["RMSD_not_1"].sub(merged["RMSD_1"])

In [None]:
# Number of complexes where some later pose has a lower RMSD than the first pose.
int((merged["dRMSD"] < 0).sum())

In [None]:
# Number of complexes where even the best later pose has a higher RMSD than the first pose.
int((merged["dRMSD"] > 0).sum())

In [None]:
# Root-mean-square of the per-complex RMSD change. The squared differences are averaged before
# taking the square root; the previous sqrt-of-sum form was an L2 norm that grows with the number
# of complexes rather than an RMS value. Series.mean() skips missing dRMSD values.
rmsd = np.sqrt((merged["dRMSD"] ** 2).mean())

In [None]:
# Display the aggregate dRMSD magnitude computed in the previous cell.
rmsd

In [None]:
# Distribution of dRMSD (best later pose minus first pose), one value per complex.
px.histogram(
    merged["dRMSD"],
    template="plotly_white",
    width=600,
    height=400,
)

In [None]:
# Pair each complex's first pose with EVERY later pose (one output row per later pose),
# then compute the RMSD change of each later pose relative to the first.
merged_big = pd.merge(
    poses1,
    poses_not_1,
    how="inner",
    on="Complex_ID",
    suffixes=("_1", "_not_1"),
)
merged_big["dRMSD"] = merged_big["RMSD_not_1"].sub(merged_big["RMSD_1"])

In [None]:
# Distribution of dRMSD over all later poses, not only the best one per complex.
px.histogram(
    merged_big["dRMSD"],
    template="plotly_white",
    width=600,
    height=400,
)

# Reference Structure Mpro-P0097 is causing problems again

In [None]:
# Rows of the all-pose comparison whose first-pose reference structure is Mpro-P0097_0A.
mpro_p0097 = merged_big.loc[merged_big["Reference_Structure_1"].eq("Mpro-P0097_0A")]

In [None]:
# Display the comparison rows for reference structure Mpro-P0097_0A.
mpro_p0097