# Imports

In [None]:
from pathlib import Path
from asapdiscovery.data.openeye import load_openeye_sdfs, oechem, save_openeye_sdfs
from asapdiscovery.data.fragalysis import parse_fragalysis
import numpy as np
import pandas as pd, numpy as np
import plotly.express as px
from datetime import datetime
from tqdm.notebook import tqdm
from asapdiscovery.docking.analysis import DockingResults
import asapdiscovery.data.openeye as oe
import asapdiscovery.modeling.cheminformatics as ci
from importlib import reload

# Load Paths

In [None]:
import sys
# Add the directory three levels above the current working directory to the
# module search path (presumably so the `software` package below resolves).
sys.path.append(str(Path("../../../").resolve()))
from software.paths import paths

In [None]:
# NOTE(review): absolute path on one user's machine; the combined CSV is read
# from this directory, so it has to be edited to run anywhere else.
local_analysis = Path("/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/")

In [None]:
# Load the combined results table; the first CSV column becomes the index.
combined_csv = local_analysis / "20230610-combined.csv"
df = pd.read_csv(combined_csv, index_col=0)

In [None]:
# (rows, columns) of the loaded table.
df.shape

In [None]:
# Number of distinct Complex_ID values within each Version.
rows_by_version = df.groupby(["Version"])
rows_by_version.apply(lambda group: len(group["Complex_ID"].unique()))

In [None]:
# Keep only the rows whose TanimotoCombo is at most 2.
test = df.loc[df["TanimotoCombo"].le(2)]

In [None]:
# For every (Compound_ID, Version) pair keep the single row with the lowest
# POSIT_R value.
ranked_by_posit_r = test.sort_values("POSIT_R")
selected = ranked_by_posit_r.groupby(["Compound_ID", "Version"]).head(1)

In [None]:
# Fraction of the selected rows in each Version whose RMSD is <= 2.
# Each version is normalised by its own row count: dividing by
# len(selected) / 2 is only right for exactly two equally sized versions.
# Rows with a missing RMSD count as "not within the cutoff".
selected["RMSD"].le(2).groupby(selected["Version"]).mean()

### A problem here: this analysis is flawed because missing or failed values have been thrown out

In [None]:
def calculate_true_positives(df, max_n: int = 220, rmsd_cutoff: float = 2.0):
    """Per-version fraction of low-RMSD rows among the n best rows, n = 0..max_n.

    For each ``n`` the ``n`` lowest-RMSD rows of every
    (Compound_ID, Version) group are kept, and for each Version the fraction
    of kept rows with ``RMSD <= rmsd_cutoff`` is computed.  Rows with a
    missing RMSD sort last and count as not within the cutoff.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "RMSD", "Compound_ID" and "Version".
    max_n : int, default 220
        Largest number of rows kept per group (inclusive).
    rmsd_cutoff : float, default 2.0
        RMSD threshold (inclusive) for a row to count as good.

    Returns
    -------
    list of pandas.Series
        ``max_n + 1`` Series indexed by Version; element ``n`` holds the
        fractions for the ``n`` best rows per group.  Element 0 is always
        empty because no rows are kept for ``n = 0``.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    required = ("RMSD", "Compound_ID", "Version")
    missing = [col for col in required if col not in df.columns]
    if missing:
        # Fail once with a clear message instead of reporting the same
        # error on every iteration and returning a shortened list.
        raise KeyError(f"missing required column(s): {missing}")

    # Sorting and grouping do not depend on n, so do them once.
    grouped = df.sort_values("RMSD").groupby(["Compound_ID", "Version"])

    perc_good_list = []
    for n in range(max_n + 1):
        top_n = grouped.head(n)
        is_good = top_n["RMSD"].le(rmsd_cutoff)
        # Normalise each version by its own number of kept rows; dividing
        # by half of the total is only valid for two equally sized versions.
        perc_good_list.append(is_good.groupby(top_n["Version"]).mean())
    return perc_good_list

In [None]:
# Run the top-n analysis on the full (unfiltered) table.
perc_good = calculate_true_positives(df)

In [None]:
# Number of per-n results that were collected.
len(perc_good)

## So I *could* do it this way, but I'd rather not.

# TanimotoCombo (TC) analysis

In [None]:
def calculate_perc_good(df, tc_scores, cutoffs: "list | None" = None, *, show_progress: bool = True):
    """Fraction of top-POSIT poses within an RMSD cutoff, per TanimotoCombo threshold.

    For every combination of RMSD cutoff and TanimotoCombo threshold, rows
    with ``TanimotoCombo <= threshold`` are kept, the row with the highest
    POSIT value is taken for each (Compound_ID, Version) pair, and the
    fraction of those rows with ``RMSD <= cutoff`` is recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "POSIT", "TanimotoCombo", "Compound_ID",
        "Version", "RMSD" and "Structure_Source".
    tc_scores : iterable of float
        TanimotoCombo thresholds (inclusive upper bounds).  Iterated once per
        cutoff, so pass a re-iterable sequence.
    cutoffs : list of float, optional
        RMSD cutoffs (inclusive).  Defaults to ``[2.0]``.
    show_progress : bool, default True
        Wrap the cutoff loop in the ``tqdm`` progress bar imported by this
        file.  Pass ``False`` to run without it.

    Returns
    -------
    pandas.DataFrame
        One row per (cutoff, threshold) pair, ordered by cutoff and then by
        threshold, with the columns "TanimotoCombo", "Cutoff (Å)",
        "Percentage" (a fraction between 0 and 1, NaN when no row passes the
        threshold), "Number of Reference Structures" (distinct
        Structure_Source values among the kept rows) and
        "Number of Structures Used in Best Pose" (distinct Structure_Source
        values among the top-POSIT rows).
    """
    if cutoffs is None:
        cutoffs = [2.0]

    # Highest POSIT first, so head(1) per group picks the top-scoring row.
    sorted_df = df.sort_values(["POSIT"], ascending=[False])

    columns = [
        "TanimotoCombo",
        "Cutoff (Å)",
        "Percentage",
        "Number of Reference Structures",
        "Number of Structures Used in Best Pose",
    ]
    rows = []
    for cutoff in (tqdm(cutoffs) if show_progress else cutoffs):
        for tc_score in tc_scores:
            selected = sorted_df[sorted_df.TanimotoCombo <= tc_score]
            top_posit_score = selected.groupby(["Compound_ID", "Version"]).head(1)
            n_top = len(top_posit_score)
            if n_top:
                fraction = top_posit_score["RMSD"].le(cutoff).sum() / n_top
            else:
                # Nothing passed the threshold: report NaN rather than
                # dividing by zero.
                fraction = float("nan")
            rows.append(
                {
                    "TanimotoCombo": tc_score,
                    "Cutoff (Å)": cutoff,
                    "Percentage": fraction,
                    "Number of Reference Structures": len(selected.Structure_Source.unique()),
                    "Number of Structures Used in Best Pose": len(top_posit_score.Structure_Source.unique()),
                }
            )
    # Passing the column list keeps the schema stable even with zero rows.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# 50 evenly spaced TanimotoCombo thresholds from 0 to 2, evaluated at a single
# RMSD cutoff of 2.
tc_thresholds = np.linspace(0, 2.0, 50)
plotdata = calculate_perc_good(df, tc_scores=tc_thresholds, cutoffs=[2])

In [None]:
# Store the cutoff as text (presumably so the plot treats it as a discrete
# category rather than a continuous value).
cutoff_column = "Cutoff (Å)"
plotdata[cutoff_column] = plotdata[cutoff_column].astype(str)

In [None]:
# Scatter of the fraction of molecules posed within the RMSD cutoff against
# the TanimotoCombo threshold.  `plotdata` has no "Version" column, so the
# points are coloured by "Cutoff (Å)", the column category_orders refers to.
fig = px.scatter(
    plotdata,
    x="TanimotoCombo",
    y="Percentage",
    color="Cutoff (Å)",
    category_orders={"Cutoff (Å)": ["2.0", "1.5", "1.0"]},
    height=600,
    width=600,
)
fig.update_xaxes(title="TanimotoCombo Score of Query to Reference Molecule", range=[0,2.1])
fig.update_yaxes(title="Fraction of Posed Molecules with RMSD to Crystal Structure < Cutoff", range=[0,1])
fig.show()