# Imports

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import seaborn as sns
import pandas as pd
from pathlib import Path
from harbor.analysis import cross_docking as cd

# Load Data

In [None]:
# Directory holding the analysed cross-docking results (absolute path on the author's machine).
results_path = Path(
    "/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/full_cross_dock_v2_analyzed_results/"
)
# CSV file inside that directory, loaded into a DataFrame.
posit_results = results_path.joinpath("ALL_1_poses_ecfp4_combined_results.csv")
df = pd.read_csv(filepath_or_buffer=posit_results)

In [None]:
# Parquet file with the combined pose data (absolute path on the author's machine).
pose_data_path = Path("/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/full_cross_dock_v2_combined_results/ALL_1_poses.parquet")
# Rebuild the docking data model from that file; `pose_data` is the input to every evaluator cell below.
pose_data = cd.DockingDataModel.deserialize(pose_data_path) 

In [None]:
# Show the dataframe attribute of the loaded pose data as the cell output.
pose_data.dataframe

In [None]:
ref_structure_column = "Reference_Structure"

# Create the evaluator factory; the name passed here is replaced further down.
sim_split = cd.EvaluatorFactory(name="ecfp_split")

# Success-rate evaluation, measured on the pose RMSD column.
success_cfg = sim_split.success_rate_evaluator_settings
success_cfg.use = True
success_cfg.success_rate_column = "PoseData_RMSD"

# Enable two scorers: RMSD and POSIT docking confidence.
scorer_cfg = sim_split.scorer_settings
scorer_cfg.rmsd_scorer_settings.use = True
scorer_cfg.rmsd_scorer_settings.rmsd_column_name = "PoseData_RMSD"

scorer_cfg.posit_scorer_settings.use = True
scorer_cfg.posit_scorer_settings.posit_score_column_name = "PoseData_docking-confidence-POSIT"

sim_split.name = "increasing_similarity_ecfp4"

# Pairwise similarity split on ECFP4 Tanimoto values, with include_similar switched off.
pairwise_cfg = sim_split.pairwise_split_settings
pairwise_cfg.use = True

similarity_cfg = pairwise_cfg.similarity_split_settings
similarity_cfg.use = True
similarity_cfg.include_similar = False
similarity_cfg.similarity_column_name = "ECFPData_Tanimoto"
similarity_cfg.similarity_groupby_dict = {"ECFPData_fingerprint": "ECFP4_2048"}

# Reference updating, with logarithmic scaling turned on.
reference_cfg = similarity_cfg.update_reference_settings
reference_cfg.use = True
reference_cfg.use_logarithmic_scaling = True

sim_split.n_bootstraps = 10

In [None]:
# Ask the configured factory to create its evaluators for the loaded pose data.
# Later cells index into the result (evs[50]), so it must hold more than 50 entries.
evs = sim_split.create_evaluators(pose_data)

In [None]:
# Number of evaluators that were created.
len(evs)

In [None]:
# Inspect the evaluator at index 50; the cells below work with this same evaluator.
evs[50]

In [None]:
# Calculate results for the evaluator at index 50 only; the 50:51 slice passes it
# as a one-element sequence rather than a bare evaluator.
results = cd.Results.calculate_results(pose_data, evs[50:51])

In [None]:
# Presumably tabulates the calculated results as a DataFrame (going by the method name) — confirm.
results_df = cd.Results.df_from_results(results)

In [None]:
# Keep a handle on the evaluator at index 50 so its steps can be run one at a time.
ev = evs[50]

In [None]:
# Step 1: run the evaluator's pose selector; it is given the pose data wrapped in a list.
d1 = ev.run_pose_selector([pose_data])

In [None]:
# Step 2: feed the pose-selector output into the evaluator's similarity split.
d2 = ev.run_similarity_split(d1)

In [None]:
# Pull the similarity-split object off the evaluator and dump its settings to a dict,
# so an equivalent object can be rebuilt from the dict later.
ss = ev.similarity_split
ss_dict = ss.model_dump()

In [None]:
# Show the dumped similarity-split settings.
ss_dict

In [None]:
# Reload the cross_docking module so edits to its source take effect, then rebuild
# the similarity split from the dumped settings and run it on the pose data.
# NOTE(review): pose_data was created before the reload, so it is an instance of the
# pre-reload classes — confirm ss.run() does not type-check it against the reloaded ones.
from importlib import reload
reload(cd)
ss = cd.SimilaritySplit(**ss_dict)
ss.run(pose_data)

In [None]:
# Set up module-level stand-ins for the names used by the step-by-step cells below
# (`self`, `data`, `bootstraps`, `df`), so the split logic can be run outside a method.
# NOTE(review): presumably pydantic's model_copy(), which copies shallowly by default —
# if so, data.dataframe is the same object as pose_data.dataframe; confirm.
data = pose_data.model_copy()
self = ss
bootstraps = 1
# Rebinds `df`, replacing the DataFrame read from the results CSV earlier in the notebook.
df = data.dataframe

In [None]:
# Narrow the dataframe to the rows matching every column/value pair in the split's groupby settings.
for column, wanted in self.groupby.items():
    matches = df[column] == wanted
    df = df.loc[matches]

In [None]:
# Show the column/value pairs the similarity split filters on.
ss.groupby

In [None]:
# List the available columns, e.g. to check the names used by the split settings.
data.dataframe.columns

In [None]:
# Show the dataframe of the copied data model.
data.dataframe

In [ ]:
def _cap_rows_per_group(frame, group_column, max_rows):
    """Return ``frame`` with at most ``max_rows`` rows per value of ``group_column``.

    Groups that already fit are kept whole; larger groups are randomly
    down-sampled without replacement. Groups appear in sorted key order and
    the index of the result is reset.

    The groups are iterated explicitly rather than via
    ``groupby(...).apply(...)``: letting ``apply`` operate on the grouping
    column is deprecated since pandas 2.2, and once that column is excluded
    it would be lost from the output after ``reset_index(drop=True)``.
    """
    pieces = [
        group if len(group) <= max_rows else group.sample(n=max_rows)
        for _, group in frame.groupby(group_column)
    ]
    if not pieces:
        # pd.concat raises on an empty list; return a zero-row frame with the same columns.
        return frame.iloc[0:0].reset_index(drop=True)
    return pd.concat(pieces).reset_index(drop=True)


# Keep the rows on the requested side of the similarity threshold.
# When the two flags agree (include similar + higher is MORE similar, or
# exclude similar + higher is LESS similar), the high values are kept;
# when they disagree, the low values are kept.
if self.include_similar == self.higher_is_more_similar:
    df = df[df[self.similarity_column] >= self.threshold]
else:
    df = df[df[self.similarity_column] <= self.threshold]

# NOTE(review): `dataframe=` is passed alongside `**data.model_dump()`; this only works
# if the dump does not itself contain a "dataframe" key — confirm against DockingDataModel.
if self.n_reference_structures is None:
    # No cap on references: a single data model holding every remaining row.
    filtered = [cd.DockingDataModel(dataframe=df, **data.model_dump())]
else:
    # One independently subsampled data model per bootstrap, each limited to
    # n_reference_structures rows per query ligand.
    filtered = [
        cd.DockingDataModel(
            dataframe=_cap_rows_per_group(
                df, self.query_ligand_column, self.n_reference_structures
            ),
            **data.model_dump(),
        )
        for _ in range(bootstraps)
    ]