In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

import harbor.analysis.cross_docking as cd

# Load Data

In [None]:
# Combined cross-docking results table. The default is the original local copy;
# set the CROSS_DOCKING_RESULTS_CSV environment variable to point the notebook at
# a different location without editing this cell.
results_csv = Path(
    os.environ.get(
        "CROSS_DOCKING_RESULTS_CSV",
        "/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/20240424_multi_pose_docking_cross_docking/results_csvs/20240503_combined_results_with_data.csv",
    )
)

In [None]:
# "analyzed_data" sits two directory levels above the results CSV.
data_path = results_csv.parents[1] / "analyzed_data"
# Relative "figures" directory (not referenced by the cells shown here).
figure_path = Path("figures")

In [None]:
# Materialise the matches into a sorted list: Path.glob returns a one-shot
# generator in filesystem order, so re-running the loading cell would otherwise
# see no files, and the load order would vary between machines.
df_paths = sorted(data_path.glob("*/*.csv"))

In [None]:
# Read each matched CSV into its own DataFrame.
dfs = list(map(pd.read_csv, df_paths))

In [None]:
# Stack the per-file tables, cast N_Per_Split to int, and order the rows by the
# split / scoring / selection columns.
ogdf = (
    pd.concat(dfs)
    .astype({"N_Per_Split": int})
    .sort_values(
        [
            "Split",
            "Score",
            "PoseSelection",
            "StructureChoice",
            "StructureChoice_Choose_N",
            "N_Per_Split",
        ]
    )
)

In [None]:
# Distances from the point estimate to each confidence-interval bound.
ogdf["Error_Lower"] = ogdf["Fraction"].sub(ogdf["CI_Lower"])
ogdf["Error_Upper"] = ogdf["CI_Upper"].sub(ogdf["Fraction"])

In [None]:
# Keep only the rows whose PoseSelection is "Default".
raw_df = ogdf.loc[ogdf["PoseSelection"].eq("Default")]

# Make plot

In [None]:
# Load the combined results table, using the first CSV column as the index.
results_df = pd.read_csv(results_csv, index_col=0)

In [None]:
# Show the distinct values present in the Fingerprint column.
results_df.Fingerprint.unique()

In [None]:
# Display the loaded results table for inspection.
results_df

In [None]:
# 20 evenly spaced edges on [0, 1], i.e. 19 Tanimoto bins of width 1/19.
# NOTE(review): 0.05-wide bins would need 21 edges — confirm which was intended.
tc_cutoffs = np.linspace(0, 1, num=20)
# Settings object from harbor's cross_docking module, built with no arguments
# (not referenced by the cells shown here).
settings = cd.Settings()

In [None]:
divided_datasets = []
# Restrict the results to rows whose Pose_ID is 0.
single_pose = results_df.loc[results_df["Pose_ID"].eq(0)]

def calculate_sectioned_performance(df, tc_cutoffs, bootstraps=1000, random_state=None):
    """Bootstrap the docking success rate within consecutive Tanimoto bins.

    For each scorer ("docking-confidence-POSIT", highest value first; "RMSD",
    lowest value first) and each bin ``(lower, upper]`` formed from consecutive
    entries of ``tc_cutoffs``, the best-scoring row per ``Query_Ligand`` is
    selected and passed to ``cd.BinaryEvaluation`` (variable "RMSD", cutoff 2.0,
    below-cutoff-is-good).

    Parameters
    ----------
    df : pd.DataFrame
        Must provide the columns ``Tanimoto``, ``Query_Ligand``, ``RMSD``,
        ``docking-confidence-POSIT`` and ``POSIT_Method``.
    tc_cutoffs : sequence of float
        Bin edges; rows with ``Tanimoto`` equal to the first edge fall in no bin
        because the lower bound is exclusive.
    bootstraps : int, default 1000
        Number of records produced per (scorer, bin). Iteration 0 uses the bin's
        rows unchanged; every later iteration uses an independent resample (with
        replacement, within each ``Query_Ligand``) of those same rows.
    random_state : None, int, or numpy random state object, default None
        ``None`` keeps the previous behaviour (numpy's global random state). An
        integer seeds a single ``np.random.RandomState`` shared by all
        iterations; any other object is passed through to pandas unchanged.

    Returns
    -------
    pd.DataFrame
        One row per (scorer, bin, iteration) with ``Tanimoto_Lower``,
        ``Tanimoto_Upper``, ``Total``, ``Fraction``, ``Dataset_Size``,
        ``Scorer`` and one count column per ``POSIT_Method`` value seen among
        the chosen poses.
    """
    # An int must be turned into ONE stateful generator: handing the same int to
    # every sample() call would reproduce the identical resample each iteration.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    results_dicts = []
    scorers = (("docking-confidence-POSIT", False), ("RMSD", True))
    for scorer_fxn, ascending in scorers:
        for lower, upper in zip(tc_cutoffs[:-1], tc_cutoffs[1:]):
            bin_df = df[(df.Tanimoto > lower) & (df.Tanimoto <= upper)]

            for i in range(bootstraps):
                if i == 0 or bin_df.empty:
                    # First pass is the un-resampled estimate; an empty bin has
                    # nothing to resample.
                    sample_df = bin_df
                else:
                    # Always draw from the original bin. Resampling the previous
                    # resample would compound and collapse each ligand onto a
                    # single repeated pose.
                    sample_df = bin_df.groupby('Query_Ligand').sample(
                        frac=1, replace=True, random_state=random_state
                    )
                sample_df = sample_df.sort_values(scorer_fxn, ascending=ascending)
                chosen_poses = sample_df.groupby(['Query_Ligand']).head(1)
                out_dict = {
                    'Tanimoto_Lower': lower,
                    'Tanimoto_Upper': upper,
                    'Total': 0,
                    'Fraction': 0,
                    'Dataset_Size': 0,
                    'Scorer': scorer_fxn,
                }
                if len(chosen_poses) != 0:
                    evaluator = cd.BinaryEvaluation(variable='RMSD', cutoff=2.0, below_cutoff_is_good=True)
                    fg = evaluator.run(chosen_poses, groupby='Query_Ligand')
                    out_dict.update({'Total': fg.total, 'Fraction': fg.fraction, 'Dataset_Size': len(sample_df)})
                    out_dict.update(chosen_poses.POSIT_Method.value_counts().to_dict())
                results_dicts.append(out_dict)
    return pd.DataFrame.from_records(results_dicts)


In [None]:
# Bootstrapped per-bin performance for the single-pose subset, 1000 records per scorer and bin.
df = calculate_sectioned_performance(single_pose, tc_cutoffs, bootstraps=1000)

In [None]:
# Columns that identify one (Tanimoto bin, scorer) group in the bootstrap table.
groupby_cols = ["Tanimoto_Lower", "Scorer"]

In [None]:
# Mean Fraction across bootstrap records for every group.
means = df.groupby(groupby_cols)["Fraction"].mean().reset_index()

In [None]:
# 97.5th / 2.5th percentiles of the bootstrap Fraction values per group.
# rename(columns=...) relabels the value column; rename_axis(columns=...) only
# renames the columns *axis* and left "Fraction" untouched.
upper = df.groupby(groupby_cols)["Fraction"].quantile(0.975).reset_index().rename(columns={"Fraction": "CI_Upper"})
lower = df.groupby(groupby_cols)["Fraction"].quantile(0.025).reset_index().rename(columns={"Fraction": "CI_Lower"})

In [None]:
# Positional relabel: relies on groupby_cols being a list and on the quantile
# column being the last column of each frame.
upper.columns = groupby_cols + ["CI_Upper"]
lower.columns = groupby_cols + ["CI_Lower"]

In [None]:
# Join mean and both CI bounds per group, then derive the asymmetric error-bar
# lengths expected by plotly's error_y / error_y_minus.
merged = (
    means.merge(upper, on=groupby_cols)
    .merge(lower, on=groupby_cols)
    .assign(
        Error_Y=lambda t: t['CI_Upper'] - t['Fraction'],
        Error_Y_Minus=lambda t: t['Fraction'] - t['CI_Lower'],
    )
)

In [None]:
def calculate_ci_from_bootstrap(df, groupby_cols, main_col, lower_q=0.025, upper_q=0.975) -> pd.DataFrame:
    """Summarise bootstrap records into a mean and a percentile interval per group.

    Parameters
    ----------
    df : pd.DataFrame
        Bootstrap records containing ``groupby_cols`` and ``main_col``.
    groupby_cols : str or sequence of str
        Column name(s) that identify a group.
    main_col : str
        Column whose bootstrap distribution is summarised.
    lower_q, upper_q : float, default 0.025 and 0.975
        Quantiles used for the interval bounds (defaults give a 95% interval).

    Returns
    -------
    pd.DataFrame
        Columns, in order: the group columns, ``main_col`` (group mean),
        ``CI_Upper``, ``CI_Lower``, ``Error_Y`` (upper bound minus mean) and
        ``Error_Y_Minus`` (mean minus lower bound).
    """
    # Accept a bare column name as well as a list of names.
    keys = [groupby_cols] if isinstance(groupby_cols, str) else list(groupby_cols)
    grouped = df.groupby(keys)[main_col]
    # Label each statistic explicitly instead of relabelling columns by position.
    merged = pd.concat(
        [
            grouped.mean().rename(main_col),
            grouped.quantile(upper_q).rename("CI_Upper"),
            grouped.quantile(lower_q).rename("CI_Lower"),
        ],
        axis=1,
    ).reset_index()
    merged['Error_Y'] = merged['CI_Upper'] - merged[main_col]
    merged['Error_Y_Minus'] = merged[main_col] - merged['CI_Lower']
    return merged

In [None]:
# Display the bootstrap results table for inspection.
df

In [None]:
# Mean and percentile interval per (bin, scorer) for each bootstrap metric.
fraction, dataset_size, total = (
    calculate_ci_from_bootstrap(df, groupby_cols, metric)
    for metric in ("Fraction", "Dataset_Size", "Total")
)

In [None]:
# bootstraps=1 runs only iteration 0, which skips resampling, so this is the
# plain per-bin estimate. (`no_boostraps` is misspelled, but the plotting cell
# below uses the same name, so it is kept.)
no_boostraps = calculate_sectioned_performance(single_pose, tc_cutoffs, bootstraps=1)

In [None]:
# Un-resampled Fraction per Tanimoto bin, one line per scorer.
px.line(
    no_boostraps,
    x='Tanimoto_Lower',
    y='Fraction',
    color='Scorer',
    template="simple_white",
    width=600,
    height=400,
)

In [None]:
# Bootstrap-mean Fraction per Tanimoto bin with asymmetric percentile error bars.
px.line(
    merged,
    x='Tanimoto_Lower',
    y='Fraction',
    color='Scorer',
    error_y="Error_Y",
    error_y_minus="Error_Y_Minus",
    template="simple_white",
    width=600,
    height=400,
)

In [None]:
# Distribution of bootstrap Fraction values per Tanimoto bin, split by scorer.
px.violin(
    df,
    x='Tanimoto_Lower',
    y='Fraction',
    color='Scorer',
    template="simple_white",
)

In [None]:
# Mean Dataset_Size per Tanimoto bin on a log axis, using only the RMSD scorer rows.
px.bar(
    dataset_size[dataset_size['Scorer'] == 'RMSD'],
    x='Tanimoto_Lower',
    y='Dataset_Size',
    barmode='group',
    log_y=True,
    template="simple_white",
    width=600,
    height=400,
)

In [None]:
def calc_dataset_size_per_ligand(df, tc_cutoffs, bootstraps=100):
    """Count rows per query ligand inside each Tanimoto bin.

    Bins are ``(lower, upper]`` intervals built from consecutive entries of
    ``tc_cutoffs``. The result has one row per (bin, ``Query_Ligand``) with the
    columns ``Query_Ligand``, ``Dataset_Size``, ``Tanimoto_Lower`` and
    ``Tanimoto_Upper``. ``bootstraps`` is accepted but not used.
    """
    def _count_in_bin(lower, upper):
        # Lower bound exclusive, upper bound inclusive.
        in_bin = df.Tanimoto.gt(lower) & df.Tanimoto.le(upper)
        return (
            df[in_bin]
            .groupby('Query_Ligand')
            .size()
            .reset_index(name='Dataset_Size')
            .assign(Tanimoto_Lower=lower, Tanimoto_Upper=upper)
        )

    bin_edges = zip(tc_cutoffs[:-1], tc_cutoffs[1:])
    return pd.concat([_count_in_bin(lower, upper) for lower, upper in bin_edges])

In [None]:
# Per-ligand row counts in each Tanimoto bin for the single-pose subset.
dataset_size_per_ligand = calc_dataset_size_per_ligand(single_pose, tc_cutoffs)

In [None]:
# 100 bootstrap replicates: resample ligands with replacement inside each bin
# and record the mean per-ligand Dataset_Size of every bin.
means = [
    dataset_size_per_ligand.groupby('Tanimoto_Lower')
    .sample(frac=1, replace=True)
    .groupby('Tanimoto_Lower')['Dataset_Size']
    .mean()
    for _ in range(100)
]
means_df = pd.concat(means).reset_index()

In [None]:
# Display the bootstrap means table for inspection.
means_df

In [None]:
# Mean and 2.5th / 97.5th percentile of the bootstrapped bin means.
size_means = means_df.groupby('Tanimoto_Lower')['Dataset_Size'].mean().reset_index()
size_upper = (
    means_df.groupby('Tanimoto_Lower')['Dataset_Size']
    .quantile(0.975)
    .reset_index()
    .rename(columns={'Dataset_Size': "CI_Upper"})
)
size_lower = (
    means_df.groupby('Tanimoto_Lower')['Dataset_Size']
    .quantile(0.025)
    .reset_index()
    .rename(columns={'Dataset_Size': "CI_Lower"})
)

In [None]:
# One row per Tanimoto bin holding the mean and both interval bounds.
size_merged = size_means.merge(size_upper, on='Tanimoto_Lower').merge(size_lower, on='Tanimoto_Lower')

In [None]:
# Asymmetric error-bar lengths measured from the mean to each interval bound.
size_merged['Error_Y'] = size_merged['CI_Upper'].sub(size_merged['Dataset_Size'])
size_merged['Error_Y_Minus'] = size_merged['Dataset_Size'].sub(size_merged['CI_Lower'])


In [None]:
# Mean per-ligand dataset size per Tanimoto bin with bootstrap error bars.
px.line(
    size_merged,
    x='Tanimoto_Lower',
    y='Dataset_Size',
    error_y="Error_Y",
    error_y_minus="Error_Y_Minus",
    template="simple_white",
    width=600,
    height=400,
)

# Good start. Now need to do the same with the rest.