# Imports

In [None]:
import pandas as pd
import json
from datetime import datetime
import plotly.express as px
from pathlib import Path
import sys
from matplotlib.pyplot import ScalarFormatter
from asapdiscovery.data.readers.molfile import MolFileFactory
from harbor.analysis.cross_docking import DockingDataModel
import harbor.analysis.cross_docking as cd
from importlib import reload
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
reload(cd)

# Load Data

In [None]:
# Combined reverse-similarity-split POSIT results.
posit_results = Path("/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/full_cross_dock_v2_analyzed_results/posit_reverse_similarity_split_combined_results.csv")
pdf = pd.read_csv(posit_results)

# One-sided error-bar lengths: distance from the point estimate to each
# confidence bound, floored at zero so a bound lying on the "wrong" side of
# the estimate never produces a negative length.
pdf["Error_Lower"] = (pdf["Fraction"] - pdf["CI_Lower"]).clip(lower=0)
pdf["Error_Upper"] = (pdf["CI_Upper"] - pdf["Fraction"]).clip(lower=0)

In [None]:
# Show the column names available in the loaded results table.
pdf.columns

In [None]:
# Show the table ordered by the "Fraction" column (ascending, the
# sort_values default).
pdf.sort_values("Fraction")

In [None]:
# Show the table ordered by reference-structure count, then score, then
# reference split.
pdf.sort_values(["N_Reference_Structures", "Score", "Reference_Split"])

In [None]:
# Load the combined date-split results ("ALL_1_poses" file) into ``pdsr``;
# the plotting function below reads this frame for its optional dashed
# black line.
pdsr = pd.read_csv("/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/full_cross_dock_v2_analyzed_results/ALL_1_poses_datesplit_combined_results.csv")

In [None]:
# One-sided error-bar lengths for the date-split table: distance from the
# point estimate to each confidence bound, never negative.
pdsr["Error_Lower"] = (pdsr["Fraction"] - pdsr["CI_Lower"]).clip(lower=0)
pdsr["Error_Upper"] = (pdsr["CI_Upper"] - pdsr["Fraction"]).clip(lower=0)

# Plotting Params

In [None]:
# Global configuration
# Every figure produced by this notebook is written below this directory,
# which is created on first run.
fig_path = Path("./20250804_reverse_similarity_split")
fig_path.mkdir(parents=True, exist_ok=True)

# Not read by save_fig below; kept in case other cells rely on it.
FIGNUM_GLOBAL = 0


def save_fig(fig, filename, dpi=200, suffix=".pdf"):
    """Write *fig* to ``fig_path / (filename + suffix)``.

    Parameters
    ----------
    fig
        Any object exposing a matplotlib-style ``savefig(path, **kwargs)``
        (a ``Figure`` or the ``matplotlib.pyplot`` module).
    filename
        Base name of the output file, without extension. Dots inside the
        name are kept verbatim.
    dpi
        Resolution forwarded to ``savefig``.
    suffix
        File extension; a missing leading dot is added automatically.
    """
    if suffix and not suffix.startswith("."):
        suffix = f".{suffix}"
    # Append the extension instead of using Path.with_suffix(): with_suffix()
    # replaces everything after the last dot, so a name such as
    # "..._top_0.5_similar" would be truncated to "..._top_0.pdf" and
    # distinct figures could overwrite each other.
    fig.savefig(
        fig_path / f"{filename}{suffix}",
        bbox_inches="tight",
        dpi=dpi,
    )
    

sns.set_style("white")

# Raw identifier -> human-readable label. The same mapping is applied to
# column headers and to cell values further down in the notebook.
label_map = {
    "Reference_Split": "Dataset Split Type",
    "Score": "Scoring Method",
    "RandomSplit": "Randomly Ordered",
    "DateSplit": "Ordered by Date",
    "RMSD": "RMSD (Positive Control)",
    "POSIT_Probability": "POSIT Probability",
    "Similarity_Threshold": "Similarity Threshold",
    "ECFP4_2048": "ECFP4 2048",
    "MCSData_Tanimoto": "MCS Tanimoto",
    "ECFPData_Tanimoto": "Fingerprint Tanimoto",
    "TanimotoComboData_Tanimoto": "Tanimoto Combo (Aligned)",
    "TanimotoCombo_True": "Tanimoto Combo (Aligned)",
    # "N_Reference_Structures": "Number of Randomly Chosen Reference Structures",
    "N_Reference_Structures": "Number of Reference Structures Available to Use \n(Log Scale)",
    "Fraction": "Fraction of Ligands Posed \n<2Å from Reference",
    "CI_Lower": "Confidence Interval Lower Bound",
    "CI_Upper": "Confidence Interval Upper Bound",
    "N_Similar": "Number of Similar Structures To Select",
}

# Columns of ``pdf`` without an explicit label map to themselves, so that
# ``label_map[<column>]`` is a valid lookup for every column.
for column in pdf.columns:
    label_map.setdefault(column, column)

# Relabelled column names used as plotting defaults; the axis titles default
# to the same strings as the columns they describe.
X_VAR = X_LABEL = label_map["N_Reference_Structures"]
Y_VAR = Y_LABEL = label_map["Fraction"]
# QUERY_SCAFFOLD_ID = label_map["Query_Scaffold_ID_Subset_1"]
# REF_SCAFFOLD_ID = label_map["Reference_Scaffold_ID_Subset_1"]
COLOR_VAR = label_map["Reference_Split"]
STYLE_VAR = label_map["Score"]
CI_LOWER = label_map["CI_Lower"]
CI_UPPER = label_map["CI_Upper"]

# Figure geometry, in inches as (width, height).
LARGE_FIG_SIZE = (12, 8)
SMALL_FIG_SIZE = (8, 6)

# Font sizes for the individual text elements of a figure.
FONT_SIZES = dict(
    xlabel=24,
    ylabel=24,
    ticks=18,
    legend_title=24,
    legend_text=18,
)

# Opacity of the shaded confidence bands.
ALPHA = 0.1

## Update Labels

In [None]:
# Work on a copy so that ``pdf`` keeps its original column names and values.
df = pdf.copy()

In [None]:
# Rename the column headers to their display labels.
df = df.rename(columns=label_map)

In [None]:
# Replace every cell value that is a key of label_map with its display
# label; values that are not keys are left unchanged.
for column in df.columns:
    df[column] = df[column].apply(lambda x: label_map.get(x,x))

In [None]:
# Show the relabelled column names.
df.columns

In [None]:
def _to_display_label(value):
    """Return the display label for *value*, or *value* itself when unmapped."""
    return label_map.get(value, value)


# Relabel the date-split table: headers first, then the cell values of
# every column.
pdsr = pdsr.rename(columns=label_map)
for col_name in pdsr.columns:
    pdsr[col_name] = pdsr[col_name].apply(_to_display_label)

In [None]:
# Keep only the rows scored by POSIT probability under the date-ordered
# split (both compared by their display labels).
_is_posit_score = pdsr[label_map["Score"]] == label_map["POSIT_Probability"]
_is_date_split = pdsr[label_map["Reference_Split"]] == label_map["DateSplit"]
pdsr = pdsr[_is_posit_score & _is_date_split]

# Make Figure

In [None]:
def plot_filled_in_error_bars(
    raw_df,
    x_var=X_VAR,
    y_var=Y_VAR,
    color_var=COLOR_VAR,
    style_var=STYLE_VAR,
    ci_lower=CI_LOWER,
    ci_upper=CI_UPPER,
    x_label=X_LABEL,
    y_label=Y_LABEL,
    reverse_hue_order=False,
    reverse_style_order=False,
    x_ticks=None,
    y_ticks=None,
    log_scale=True,
    fill_between=True,
    add_default_line=False,
):
    """Draw ``y_var`` against ``x_var`` as a seaborn line plot on a new figure.

    Lines are coloured by ``color_var`` and dashed by ``style_var``.

    Parameters
    ----------
    raw_df
        Data frame holding the columns named by the ``*_var`` / ``ci_*``
        arguments. It is sorted into a new frame; the caller's frame is not
        modified.
    x_var, y_var
        Column names plotted on the x and y axes.
    color_var, style_var
        Column names mapped to line colour and line style.
    ci_lower, ci_upper
        Column names holding the lower / upper edge of the shaded band.
    x_label, y_label
        Axis titles.
    reverse_hue_order, reverse_style_order
        Sort the colour / style levels in descending instead of ascending
        order.
    x_ticks, y_ticks
        Explicit tick positions, also used as tick labels. When omitted,
        0.0, 0.1, ..., 1.0 is used for that axis.
        NOTE(review): that default contains 0, which has no position on a
        log axis, so pass ``x_ticks`` explicitly when ``log_scale`` is true.
    log_scale
        Put the x axis on a logarithmic scale.
    fill_between
        Shade the region between ``ci_lower`` and ``ci_upper`` for every
        (colour, style) group.
    add_default_line
        Overlay the module-level ``pdsr`` frame as a dashed black line
        without a legend entry.

    Returns
    -------
    module
        ``matplotlib.pyplot`` itself (not a ``Figure``); calling ``savefig``
        on it saves the current figure.
    """
    raw_df = raw_df.sort_values(by=[x_var, style_var, color_var])
    plt.figure(figsize=LARGE_FIG_SIZE)

    # Level orders are fixed explicitly so that colours and dash styles are
    # assigned deterministically.
    hue_order = sorted(raw_df[color_var].unique(), reverse=reverse_hue_order)
    style_order = sorted(raw_df[style_var].unique(), reverse=reverse_style_order)

    # One palette colour per hue level; reused for the shaded bands so that
    # each band matches its line.
    color_map = dict(zip(hue_order, sns.color_palette(n_colors=len(hue_order))))

    ax = sns.lineplot(
        data=raw_df,
        x=x_var,
        y=y_var,
        hue=color_var,
        style=style_var,
        palette=color_map,
        hue_order=hue_order,
        style_order=style_order,
    )

    if add_default_line:
        # Reference curve from the module-level ``pdsr`` frame. It is drawn
        # against the same columns as the main plot (x_var / y_var rather
        # than the global defaults) so both share one axis meaning, and the
        # frame is passed by keyword so the call does not depend on the
        # positional parameter order of sns.lineplot.
        sns.lineplot(
            data=pdsr,
            x=x_var,
            y=y_var,
            color="black",
            linestyle="--",
            label=None,
            ax=ax,
        )

    if fill_between:
        # The group key is (color_var value, style_var value); only the
        # colour component is needed to pick the band colour.
        for (color_name, _style_name), group in raw_df.groupby([color_var, style_var]):
            ax.fill_between(
                group[x_var],
                group[ci_lower],
                group[ci_upper],
                color=color_map[color_name],
                alpha=ALPHA,
            )

    if log_scale:
        ax.set_xscale("log")
        ax.xaxis.set_major_formatter(ScalarFormatter())

    default_ticks = np.round(np.linspace(0, 1, 11), 1)

    x_tick_values = x_ticks if x_ticks is not None else default_ticks
    ax.set_xticks(x_tick_values)
    ax.set_xticklabels(x_tick_values, fontsize=FONT_SIZES["ticks"])

    y_tick_values = y_ticks if y_ticks is not None else default_ticks
    ax.set_yticks(y_tick_values)
    ax.set_yticklabels(y_tick_values, fontsize=FONT_SIZES["ticks"])

    ax.set_xlabel(x_label, fontsize=FONT_SIZES["xlabel"], fontweight="bold")
    ax.set_ylabel(y_label, fontsize=FONT_SIZES["ylabel"], fontweight="bold")

    # Rebuild the legend so that title and entries use the shared font sizes.
    legend = ax.legend()
    plt.setp(legend.get_title(), fontsize=FONT_SIZES["legend_title"], fontweight="bold")
    plt.setp(legend.get_texts(), fontsize=FONT_SIZES["legend_text"])
    return plt

In [None]:
# Show the relabelled frame that the plotting cell below draws from.
df

In [None]:
# Tick positions shared by every figure in this cell (the x axis is
# log-scaled).
reference_count_ticks = [1, 2, 5, 10, 20, 30, 40, 50, 100, 137, 200, 300, 403]

similarity_col = label_map["Similarity_Column"]
n_similar_col = label_map["N_Similar"]

for sim_type in df[similarity_col].unique():
    sim_df = df[df[similarity_col] == sim_type]

    # Overview figure: POSIT-probability scoring on the date-ordered split,
    # one coloured line per N_Similar value.
    plot_df = sim_df[
        (sim_df[label_map["Score"]] == label_map["POSIT_Probability"])
        & (sim_df[label_map["Reference_Split"]] == label_map["DateSplit"])
    ]
    # Skip instead of saving a figure that would contain only the reference
    # line.
    if not plot_df.empty:
        fig = plot_filled_in_error_bars(
            plot_df,
            x_var=X_VAR,
            y_var=Y_VAR,
            color_var=n_similar_col,
            ci_lower=CI_LOWER,
            ci_upper=CI_UPPER,
            x_label=f"{sim_type} Similarity",
            y_label=Y_LABEL,
            x_ticks=reference_count_ticks,
            reverse_hue_order=True,
            reverse_style_order=True,
            log_scale=True,
            fill_between=False,
            add_default_line=True,
        )
        save_fig(fig, filename=f"reverse_similarity_split_{sim_type}_posit_probability")

    # Detail figures: one per N_Similar value present for THIS similarity
    # type. Taking the values from the subset rather than from the whole
    # frame avoids combinations that do not occur in the data.
    for i in sim_df[n_similar_col].unique():
        plot_df = sim_df[sim_df[n_similar_col] == i]
        if plot_df.empty:
            # Reached for a NaN level: NaN never compares equal to itself.
            continue
        fig = plot_filled_in_error_bars(
            plot_df,
            x_var=X_VAR,
            y_var=Y_VAR,
            color_var=COLOR_VAR,
            style_var=STYLE_VAR,
            ci_lower=CI_LOWER,
            ci_upper=CI_UPPER,
            x_label=f"{sim_type} Similarity (Top {i} Similar)",
            y_label=Y_LABEL,
            x_ticks=reference_count_ticks,
            reverse_hue_order=True,
            reverse_style_order=True,
            log_scale=True,
            add_default_line=True,
        )
        save_fig(
            fig,
            filename=f"reverse_similarity_split_{sim_type}_top_{i}_similar",
        )