# Imports

In [None]:
import pandas as pd
import json
from datetime import datetime
import plotly.express as px
from pathlib import Path
import sys
from matplotlib.pyplot import ScalarFormatter
from asapdiscovery.data.readers.molfile import MolFileFactory
from harbor.analysis.cross_docking import DockingDataModel
import harbor.analysis.cross_docking as cd
from importlib import reload
import seaborn as sns
from matplotlib import pyplot as plt
reload(cd)

In [None]:
# Serialized cross-docking results that the rest of the notebook analyses.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
poses_parquet = "/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/full_cross_dock_v2_combined_results/ALL_1_poses.parquet"
data = cd.DockingDataModel.deserialize(poses_parquet)

# ASAP protocol
For each `n_ref` (number of reference structures) chosen by either a date split or a random split:
- for each `n_most_similar` in [1, 2, 5, 10]:
  - pick the `n_most_similar` references that have the highest similarity to the Query_Ligand
  - rank with RMSD or POSIT probability


In [None]:
# Reference-set sizes to sweep; each value is used as `n_reference_structures`
# for one DateSplit and one RandomSplit.
# NOTE(review): 403 is presumably the total number of available reference structures — confirm.
n_refs = cd.generate_logarithmic_scale(403)

In [None]:
# Values of `n_similar` to sweep; one SimilaritySplit is built per value.
n_most_similars = [1, 2, 5, 10]
# Column name passed as `reference_structure_column` to the dataset splits.
ref_structure_column = "Reference_Structure"

In [None]:
# Build the reference-set splits: for every reference-set size, first a
# date-based split and then a random split of the same size.
dataset_splits = []
for n_ref in n_refs:
    dataset_splits.extend(
        (
            cd.DateSplit(
                reference_structure_column=ref_structure_column,
                n_reference_structures=n_ref,
                date_column="RefData_Date",
                randomize_by_n_days=1,
            ),
            cd.RandomSplit(
                n_reference_structures=n_ref,
                reference_structure_column=ref_structure_column,
            ),
        )
    )

In [None]:
# One similarity split per `n_similar` value. All of them use the MCS Tanimoto
# column (rows restricted to MCSData_Type == "MCS"), sort rather than threshold,
# and treat higher values as more similar.
similarity_splits = [
    cd.SimilaritySplit(
        n_similar=n_keep,
        split_level=1,
        sort_instead_of_threshold=True,
        higher_is_more_similar=True,
        include_similar=True,
        reference_ligand_column="RefData_Ligand",
        query_ligand_column="Query_Ligand",
        similarity_column="MCSData_Tanimoto",
        groupby={"MCSData_Type": "MCS"},
    )
    for n_keep in n_most_similars
]

In [None]:
# The two ways of ranking poses that are compared below: POSIT docking
# confidence, and RMSD with a cutoff of 2.
posit_scorer = cd.POSITScorer(variable="PoseData_docking-confidence-POSIT")
rmsd_scorer = cd.RMSDScorer(variable="PoseData_RMSD", cutoff=2)
scorers = [posit_scorer, rmsd_scorer]

In [None]:
def _make_evaluator(scorer, dataset_split, similarity_split):
    """Build one Evaluator for a (scorer, dataset split, similarity split) combination.

    The similarity split is copied before its ``n_reference_structures`` is set
    to match the dataset split. N_Reference_Structures is otherwise overwritten
    downstream, and assigning on the shared similarity split object would leave
    every evaluator holding the same (last assigned) value.
    """
    own_similarity_split = similarity_split.model_copy()
    own_similarity_split.n_reference_structures = dataset_split.n_reference_structures
    return cd.Evaluator(
        pose_selector=cd.PoseSelector(
            name="PoseSelector",
            variable="Pose_ID",
            ascending=True,
            number_to_return=1,
        ),
        dataset_split=dataset_split,
        similarity_split=own_similarity_split,
        dataset_before_similarity=True,
        scorer=scorer,
        evaluator=cd.BinaryEvaluation(variable="PoseData_RMSD", cutoff=2),
        n_bootstraps=3,
    )


# Full grid of evaluators, ordered scorer -> dataset split -> similarity split.
evs = [
    _make_evaluator(scorer, dataset_split, similarity_split)
    for scorer in scorers
    for dataset_split in dataset_splits
    for similarity_split in similarity_splits
]

In [None]:
# Sanity check: expect len(scorers) * len(dataset_splits) * len(similarity_splits) evaluators.
len(evs)

In [None]:
# Spot-check one evaluator's configuration. With 4 similarity splits per dataset
# split, index 16 is the first scorer, the fifth dataset split and the first
# similarity split (assuming n_refs has at least 3 entries).
evs[16]

In [None]:
# Run every evaluator against the loaded docking data.
# NOTE(review): n_cpus=1 presumably means no parallelism — confirm, and raise it if the run is slow.
results = cd.Results.calculate_results(data, evs, n_cpus=1)

In [None]:
# Collect the evaluator results into a single table for filtering and plotting.
results_df = cd.Results.df_from_results(results)

In [None]:
# Display the full results table.
results_df

In [None]:
# Work on a copy of the results and keep only the rows produced by a DateSplit.
plot_df = results_df.copy().loc[
    lambda frame: frame["Reference_Split"] == "DateSplit"
]

In [None]:
# Display the DateSplit-only subset that is plotted below.
plot_df

In [None]:
# Fraction versus number of reference structures: one colour per N_Similar
# value and one line style per Score.
sns.lineplot(
    data=plot_df,
    x="N_Reference_Structures",
    y="Fraction",
    hue="N_Similar",
    style="Score",
)