# Imports

In [None]:
from pathlib import Path
from asapdiscovery.data.openeye import load_openeye_sdfs, oechem, save_openeye_sdfs
from asapdiscovery.data.fragalysis import parse_fragalysis
import numpy as np
import pandas as pd, numpy as np
import plotly.express as px
from datetime import datetime
from tqdm.notebook import tqdm
from asapdiscovery.docking.analysis import DockingResults
import asapdiscovery.data.openeye as oe
import asapdiscovery.modeling.cheminformatics as ci
from importlib import reload

# Load Paths

In [None]:
import sys
sys.path.append(str(Path("../../../").resolve()))
from software.paths import paths

In [None]:
# Directory that holds the analysis inputs read in the following cells.
# NOTE(review): this is a hard-coded, machine-specific absolute path; the
# `paths` object imported from software.paths is not used here — consider
# deriving the location from it so the notebook runs on other machines.
local_analysis = Path("/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/")

In [None]:
# Load the combined CSV; its first column is used as the DataFrame index.
df = pd.read_csv(local_analysis / "20230611-combined.csv", index_col=0)

In [None]:
# Display (n_rows, n_columns) of the loaded table.
np.shape(df)

In [None]:
# Count the distinct Complex_ID values found under each Version.
df.groupby(["Version"]).apply(lambda x: len(x.Complex_ID.unique()))

In [None]:
# Display the number of distinct values in every column.
df.nunique()

In [None]:
# TanimotoCombo threshold used to build the `test` subset in the next cells.
cutoff = 1

In [None]:
# Add the complement of TanimotoCombo (2 - value); it is used further down as
# a sort key. Presumably this flips the score so that smaller means more
# similar, assuming TanimotoCombo lies in [0, 2] — TODO confirm.
df["TanimotoCombo_R"] = 2-df.TanimotoCombo

In [None]:
# Keep only the rows whose TanimotoCombo is at or below the cutoff.
test = df[df.TanimotoCombo <= cutoff]

In [None]:
# For every (Compound_ID, Version) pair keep the single row with the lowest
# RMSD: sort ascending, then take the first row of each group.
selected = test.sort_values("RMSD").groupby(["Compound_ID", "Version"]).head(1)

In [None]:
# Group the selected rows by Version. group_keys=True is requested so that the
# result of a later `apply` carries a "Version" index level.
group = selected.groupby("Version", group_keys=True)

In [None]:
# Per-Version fraction of selected rows with RMSD <= 2: the number of passing
# rows divided by the number of non-null RMSD values in that Version.
# NOTE(review): the second groupby only works because `apply` puts a "Version"
# level on its result (see group_keys=True where `group` is built).
perc_good = group["RMSD"].apply(lambda x: x <= 2).groupby(["Version"]).sum() / group["RMSD"].count()

In [None]:
# Accumulator for the index labels of `perc_good` (filled by a loop below).
versions = []

In [None]:
# Accumulator for the values of `perc_good` (filled by a loop below).
perc_good_list = []

In [None]:
# Mirror the index labels of `perc_good` and the matching values into the two
# parallel lists, preserving index order.
versions.extend(perc_good.index)
perc_good_list.extend(perc_good[label] for label in perc_good.index)

In [None]:
# Display the collected per-version fractions.
perc_good_list

In [None]:
def get_selected(df, cutoff_column, cutoff, sort_column, n, selection_cols=("Compound_ID", "Version")):
    """
    Return the first ``n`` rows of every group after thresholding and sorting.

    Parameters
    ----------
    df
        Table to select from.
    cutoff_column
        Column compared against ``cutoff``; rows with a value ``<= cutoff``
        are kept.
    cutoff
        Inclusive upper bound applied to ``cutoff_column``.
    sort_column
        Column the surviving rows are sorted by (ascending) before grouping.
    n
        Number of leading rows to keep from each group.
    selection_cols
        Columns that define a group.

    Returns
    -------
    The kept rows, in their sorted order and with their original index labels.
    """
    passes_cutoff = df[cutoff_column] <= cutoff
    ordered = df[passes_cutoff].sort_values(sort_column)
    per_group = ordered.groupby(list(selection_cols))
    return per_group.head(n)

In [None]:
# Lowest-RMSD row for every (Compound_ID, Version, POSIT_Method) combination,
# considering all rows with TanimotoCombo <= 2.
selected = get_selected(df, "TanimotoCombo", 2, "RMSD", 1, selection_cols=["Compound_ID", "Version", "POSIT_Method"])

In [None]:
def calc_perc_good(df, score_column, good_score, total_mol, split_cols="Version"):
    """
    Compute, per group, the number of "good" rows divided by ``total_mol``.

    A row is "good" when its ``score_column`` value is ``<= good_score``.

    Parameters
    ----------
    df
        Table containing ``score_column`` and every column in ``split_cols``.
    score_column
        Column holding the score that is compared against ``good_score``.
    good_score
        Inclusive upper bound for a row to count as good.
    total_mol
        Denominator: either a scalar, or a Series indexed like the groups
        (it is aligned on the group index by the division).
    split_cols
        Column name, or sequence of column names, defining the groups.

    Returns
    -------
    Series indexed by the group key(s) holding ``good_count / total_mol``.
    """
    # Accept a bare column name as well as any sequence of names. (The former
    # default ``("Version")`` was just the string "Version", not a tuple.)
    keys = [split_cols] if isinstance(split_cols, str) else list(split_cols)

    # Count the passing rows with a single groupby on a boolean mask. This
    # avoids grouping, applying and re-grouping on index levels, which only
    # works when ``apply`` prepends the group keys to its result.
    is_good = df[score_column] <= good_score
    good_counts = is_good.groupby([df[key] for key in keys]).sum()
    return good_counts / total_mol

In [None]:
# Number of distinct compounds per (Version, POSIT_Method) among the selected
# rows; passed as the denominator in the next cell.
total_mol = selected.groupby(["Version", "POSIT_Method"])["Compound_ID"].nunique()

In [None]:
# Per (Version, POSIT_Method): rows with RMSD <= 2 divided by the number of
# distinct compounds in that group.
calc_perc_good(selected, "RMSD", 2, split_cols=["Version", "POSIT_Method"], total_mol=total_mol)

In [None]:
# Subset of rows whose POSIT_Method is "FRED".
fred_df = df[df.POSIT_Method == "FRED"]

In [None]:
# Display the number of distinct values in every column of the FRED subset.
fred_df.nunique()

In [None]:
def calculate_perc_good(df, cutoff_column, cutoffs: list, sort_column, n, selection_cols, score_column:str, good_score, split_cols, use_per_split_mol=False):
    """
    Sweep a list of cutoffs and report, per group, the fraction of good results.

    For each cutoff the rows with ``cutoff_column <= cutoff`` are sorted by
    ``sort_column`` and the first ``n`` rows of every ``selection_cols`` group
    are kept (see ``get_selected``). On that selection two ratios are computed
    for every ``split_cols`` group, both relative to the same denominator:

    * ``Percentage``: rows with ``score_column <= good_score``.
    * ``Percentage Docked``: distinct ``Compound_ID`` values present.

    Parameters
    ----------
    df
        Full results table; must contain a ``Compound_ID`` column.
    cutoff_column
        Column thresholded by each cutoff; also the name of the output column
        that records the cutoff.
    cutoffs
        Iterable of cutoff values to evaluate.
    sort_column
        Column used to rank rows before taking the first ``n`` per group.
    n
        Number of rows kept per ``selection_cols`` group.
    selection_cols
        Columns defining the groups from which rows are picked.
    score_column
        Column compared against ``good_score``.
    good_score
        Inclusive upper bound for a row to count as good.
    split_cols
        Columns defining the groups that the ratios are reported for.
    use_per_split_mol
        If True the denominator is the number of distinct compounds within
        each ``split_cols`` group of ``df``; otherwise it is the number of
        distinct compounds in the whole of ``df``.

    Returns
    -------
    DataFrame with one row per (cutoff, group) and the columns ``Percentage``,
    ``<cutoff_column>``, ``Version`` and ``Percentage Docked``. ``Version``
    holds the group key, which is a tuple when several split columns are used.
    """
    # The denominator depends only on ``df``, so compute it once rather than
    # on every cutoff. dropna=False matches ``len(x.unique())``, which counts
    # a missing value as one distinct entry.
    if use_per_split_mol:
        total_mols = df.groupby(split_cols)["Compound_ID"].nunique(dropna=False)
    else:
        total_mols = df["Compound_ID"].nunique(dropna=False)

    version_list = []
    score_list = []
    cutoff_list = []
    perc_mols_list = []
    for cutoff in cutoffs:
        selected = get_selected(df, cutoff_column, cutoff, sort_column, n, selection_cols)

        # Only Compound_ID is needed, so count distinct values on that column
        # alone instead of on every column of the selection.
        perc_mols = selected.groupby(split_cols)["Compound_ID"].nunique() / total_mols
        score_array = calc_perc_good(selected, score_column, good_score, total_mols, split_cols)

        # Both ratios are divided by the same denominator, so they share an
        # index and can be read with the same key.
        for version in score_array.index:
            version_list.append(version)
            score_list.append(score_array[version])
            cutoff_list.append(cutoff)
            perc_mols_list.append(perc_mols[version])

    return pd.DataFrame(
        {
            "Percentage": score_list,
            cutoff_column: cutoff_list,
            "Version": version_list,
            "Percentage Docked": perc_mols_list,
        }
    )

In [None]:
# Sweep 50 evenly spaced TanimotoCombo cutoffs in [0, 2]. At each cutoff the
# lowest-RMSD row per (Compound_ID, Version) is kept and the per-Version
# fraction with RMSD <= 2 is computed. Ranking by RMSD itself picks the best
# pose available, which is presumably why this is called `true_positive`.
true_positive = calculate_perc_good(df,
                    cutoff_column="TanimotoCombo",
                    cutoffs=np.linspace(0,2,50),
                    sort_column="RMSD",
                    n=1,
                    selection_cols=["Compound_ID", "Version"],
                    score_column="RMSD",
                    good_score=2,
                    split_cols=["Version"])

In [None]:
# Scatter of the good-RMSD fraction against the TanimotoCombo cutoff, one
# colour per Version.
fig = px.scatter(true_positive, x="TanimotoCombo", 
                 y="Percentage", 
                 color="Version",
                 height=600, 
                 width=600,)
fig.show()

In [None]:
# Same sweep as for `true_positive`, but the row kept per (Compound_ID,
# Version) is the one ranked first by POSIT_R instead of by RMSD.
# NOTE(review): POSIT_R is not created anywhere in the visible cells;
# presumably it is already a column of the CSV — confirm.
sorted_by_posit = calculate_perc_good(df,
                    cutoff_column="TanimotoCombo",
                    cutoffs=np.linspace(0,2,50),
                    sort_column="POSIT_R",
                    n=1,
                    selection_cols=["Compound_ID", "Version"],
                    score_column="RMSD",
                    good_score=2,
                    split_cols=["Version"])

In [None]:
# Scatter of the good-RMSD fraction against the TanimotoCombo cutoff for the
# POSIT_R-ranked selection, one colour per Version.
fig = px.scatter(sorted_by_posit, x="TanimotoCombo", 
                 y="Percentage", 
                 color="Version",
                 height=600, 
                 width=600,)
fig.show()

In [None]:
# Repeat the TanimotoCombo cutoff sweep once per ranking column and stack the
# results, tagging every row with the column that was used for ranking.
dfs = []
for sort_col in ("RMSD", "POSIT_R", "Chemgauss4", "TanimotoCombo_R"):
    dfs.append(
        calculate_perc_good(
            df,
            cutoff_column="TanimotoCombo",
            cutoffs=np.linspace(0, 2, 50),
            sort_column=sort_col,
            n=1,
            selection_cols=["Compound_ID", "Version"],
            score_column="RMSD",
            good_score=2,
            split_cols=["Version"],
        ).assign(Sorted_By=sort_col)
    )
combined = pd.concat(dfs)

In [None]:
# One facet per ranking column: good-RMSD fraction against the TanimotoCombo
# cutoff, coloured by Version, with every column available on hover.
fig = px.scatter(combined, x="TanimotoCombo", 
                 y="Percentage", 
                 color="Version",
                 facet_col="Sorted_By",
                 hover_data=combined.columns,
                 height=600, 
                 width=1200,)
fig.show()

In [None]:
# Same sweep over ranking columns as before, but rows are picked and the
# fractions are reported separately for every (Version, POSIT_Method) pair.
dfs = []
for sort_col in ("RMSD", "POSIT_R", "Chemgauss4", "TanimotoCombo_R"):
    dfs.append(
        calculate_perc_good(
            df,
            cutoff_column="TanimotoCombo",
            cutoffs=np.linspace(0, 2, 50),
            sort_column=sort_col,
            n=1,
            selection_cols=["Compound_ID", "Version", "POSIT_Method"],
            score_column="RMSD",
            good_score=2,
            split_cols=["Version", "POSIT_Method"],
        ).assign(Sorted_By=sort_col)
    )
combined = pd.concat(dfs)

In [None]:
# One facet per ranking column: good-RMSD fraction against the TanimotoCombo
# cutoff. Here the "Version" column of `combined` holds the
# (Version, POSIT_Method) group key, which is what the colour is mapped to.
fig = px.scatter(combined, x="TanimotoCombo", 
                 y="Percentage", 
                 color="Version",
                 facet_col="Sorted_By",
                 hover_data=combined.columns,
                 height=600, 
                 width=1200,)
fig.show()

In [None]:
# Use every distinct Structure_Date, in ascending order, as a cumulative
# cutoff: at each date only rows with Structure_Date <= that date are
# eligible. Assumes the date values sort chronologically (e.g. ISO-formatted
# strings) — TODO confirm.
dates = df.Structure_Date.unique()
dates.sort()
dfs = []
for sort_col in ("RMSD", "POSIT_R", "Chemgauss4", "TanimotoCombo_R"):
    dfs.append(
        calculate_perc_good(
            df,
            cutoff_column="Structure_Date",
            cutoffs=dates,
            sort_column=sort_col,
            n=1,
            selection_cols=["Compound_ID", "Version"],
            score_column="RMSD",
            good_score=2,
            split_cols=["Version"],
        ).assign(Sorted_By=sort_col)
    )
combined = pd.concat(dfs)

In [None]:
# One facet per ranking column: good-RMSD fraction against the Structure_Date
# cutoff, coloured by Version.
fig = px.scatter(combined, x="Structure_Date", 
                 y="Percentage", 
                 color="Version",
                 facet_col="Sorted_By",
                 hover_data=combined.columns,
                 height=600, 
                 width=1200,)
fig.show()

In [None]:
# Structure_Date sweep again, this time picking rows and reporting fractions
# per (Version, POSIT_Method). The denominator stays the number of distinct
# compounds in the whole table (use_per_split_mol=False).
dates = df.Structure_Date.unique()
dates.sort()
dfs = []
for sort_col in ("RMSD", "POSIT_R", "Chemgauss4", "TanimotoCombo_R"):
    dfs.append(
        calculate_perc_good(
            df,
            cutoff_column="Structure_Date",
            cutoffs=dates,
            sort_column=sort_col,
            n=1,
            selection_cols=["Compound_ID", "Version", "POSIT_Method"],
            score_column="RMSD",
            good_score=2,
            split_cols=["Version", "POSIT_Method"],
            use_per_split_mol=False,
        ).assign(Sorted_By=sort_col)
    )
combined = pd.concat(dfs)

In [None]:
# One facet per ranking column: good-RMSD fraction against the Structure_Date
# cutoff. The "Version" column of `combined` holds the
# (Version, POSIT_Method) group key here.
fig = px.scatter(combined, x="Structure_Date", 
                 y="Percentage", 
                 color="Version",
                 facet_col="Sorted_By",
                 hover_data=combined.columns,
                 height=600, 
                 width=1200,)
fig.show()

In [None]:
# "Percentage Docked" (distinct compounds present in the selection divided by
# the total) against the Structure_Date cutoff, one facet per group key.
# The y axis is pinned to [0, 1.1] so the facets are directly comparable.
fig = px.scatter(combined, x="Structure_Date", 
                 y="Percentage Docked", 
                 facet_col="Version",
                 hover_data=combined.columns,
                 height=600, 
                 width=1200,)
fig.update_yaxes(range=[0,1.1])
fig.show()

In [None]:
# Draw one RMSD-vs-POSIT density heatmap, with marginal histograms on both
# axes, for every distinct POSIT_Method value.
for method in df.POSIT_Method.unique():
    method_rows = df[df.POSIT_Method == method]
    fig = px.density_heatmap(
        method_rows,
        x="RMSD",
        y="POSIT",
        marginal_x="histogram",
        marginal_y="histogram",
        height=800,
        width=800,
        title=method,
        range_x=[0, 11],
        range_y=[0, 1.1],
    )
    fig.show()
    