# Imports

In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import numpy as np
from pathlib import Path
from datetime import datetime

In [None]:
# Today's date as a YYYYMMDD stamp; used as a prefix for saved figure file names.
today = f"{datetime.today():%Y%m%d}"

In [None]:
# Load the analysed results table from ./analyzed_data relative to the working directory.
fred_results = pd.read_csv(Path.cwd().joinpath('analyzed_data', 'results.csv'))

In [None]:
# Express the confidence-interval bounds as distances below / above the point
# estimate (the shape plotly's error_y_minus / error_y arguments take).
fred_results["Error_Lower"] = fred_results["Fraction"].sub(fred_results["CI_Lower"])
fred_results["Error_Upper"] = fred_results["CI_Upper"].sub(fred_results["Fraction"])

In [None]:
# Work on a copy so later filtering / edits do not modify the loaded table.
df = fred_results.copy()

In [None]:
# Directory that rendered figures are written to; created on first run if absent.
figure_path = Path.cwd().joinpath("figures")
figure_path.mkdir(exist_ok=True)

# Plotting Variables

In [None]:
# Shared styling used by the figures below.
large_font = 24  # axis-title and legend-title size
small_font = 18  # base font size

# Bold axis titles, keyed by the results column they replace.
labels = {
    "Fraction": "<b> Fraction of Poses Docked < 2Å from Reference </b>",
    "N_Per_Split": "<b> Total Number of Reference Structures Available to Use </b>",
}

# Axis settings splatted into fig.update_layout(); the y axis is pinned to [0, 1].
update_layout_dict = {
    "xaxis": {
        "title_font": {"size": large_font},
        "color": "black",
    },
    "yaxis": {
        "range": (0, 1),
        "title_font": {"size": large_font},
        "color": "black",
    },
}

In [None]:
def update_traces(fig):
    """Tidy the legend name of every trace in ``fig`` and return ``fig``.

    The substitutions are applied to each ``trace.name`` in order:
    underscores become spaces, the word "Split" is removed, ", " becomes
    " - ", and "RMSD" is expanded to "RMSD (Positive Control)".
    The traces are modified in place.
    """
    # Order matters: later patterns are matched against the already-edited name.
    substitutions = (
        ("_", " "),
        ("Split", ""),
        (", ", " - "),
        ("RMSD", "RMSD (Positive Control)"),
    )
    for trace in fig.data:
        pretty_name = trace.name
        for old, new in substitutions:
            pretty_name = pretty_name.replace(old, new)
        trace.name = pretty_name
    return fig

In [None]:
def hex_to_rgb(hex_color: str) -> tuple:
    """Convert a hex colour string to an ``(r, g, b)`` tuple of ints in 0-255.

    Accepts the six-digit form (``"#RRGGBB"`` or ``"RRGGBB"``) and the
    three-digit shorthand (``"#RGB"`` or ``"RGB"``). A leading ``#`` is optional.

    Raises:
        ValueError: if a channel cannot be parsed as hexadecimal.
    """
    hex_color = hex_color.lstrip("#")
    if len(hex_color) == 3:
        # Shorthand expands digit-by-digit ("abc" -> "aabbcc"); repeating the
        # whole string ("abcabc") would yield the wrong colour.
        hex_color = "".join(digit * 2 for digit in hex_color)
    return int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)

In [None]:
def rgb_to_rgba(rgb_str, alpha):
    """Turn an ``"rgb(r, g, b)"`` string into ``"rgba(r, g, b, alpha)"``.

    ``alpha`` is inserted verbatim as the fourth component. Raises
    ``ValueError`` if the string does not hold exactly three integer channels.
    """
    # Peel off the surrounding "rgb(" / ")" characters, then parse the channels.
    channels = [int(part) for part in rgb_str.strip('rgb()').split(',')]
    red, green, blue = channels
    return f"rgba({red}, {green}, {blue}, {alpha})"

# Figure 1: Datesplit vs Random for POSIT_Probability vs RMSD

## dataset mangling

In [None]:
# Rows for figure 1: StructureChoice_Choose_N of "10" or "All", restricted to
# PoseSelection_Choose_N == 1.
# (An extra StructureChoice == "Dock_to_All" condition is currently disabled.)
dataset_split_df = df.loc[
    df["StructureChoice_Choose_N"].isin(["10", "All"])
    & (df["PoseSelection_Choose_N"] == 1)
]

In [None]:
# Display the filtered table to eyeball it before plotting.
dataset_split_df

In [None]:
# Figure 1: success fraction vs. number of available reference structures
# (log-scaled x), coloured by score function, dashed by dataset split, with one
# facet row per StructureChoice. Error bars and per-score symbols are not drawn.
fig = px.line(
    dataset_split_df,
    x="N_Per_Split",
    y="Fraction",
    color="Score",
    line_dash="Split",
    facet_row="StructureChoice",
    log_x=True,
    labels=labels,
    color_discrete_sequence=px.colors.qualitative.Dark2,
    template="simple_white",
    width=800,
    height=1200,
)

# NOTE(review): update_layout_dict only addresses `xaxis` / `yaxis`; with facet
# rows the additional axes may not pick up the range / title font - confirm.
fig.update_layout(
    font={"size": small_font, "family": 'Arial'},
    legend={
        "title": "<b> Score Function, Dataset Split </b>",
        "x": 0.4,
        "y": 0.1,
        "title_font_size": large_font,
        "font_color": 'black',
    },
    **update_layout_dict,
)

# Tick every 0.1 on all y axes, then clean up the legend entries.
fig.update_yaxes(tickvals=np.arange(0, 1.1, 0.1))
fig = update_traces(fig)

fig.show()
fig.write_image(figure_path / f"{today}_dataset_split_comparison_v3.png")

# I don't believe this...

In [None]:
# Load the combined per-pose results from ./results_processing relative to the working directory.
raw_df = pd.read_csv(
    Path.cwd().joinpath('results_processing', '20241205_combined_results_with_data.csv')
)

In [None]:
# Take one row per reference structure, sort ascending by Reference_Structure_Date,
# and keep the names of the first 10.
structures = (
    raw_df.groupby("Reference_Structure")
    .head(1)
    .sort_values("Reference_Structure_Date")
    .head(10)["Reference_Structure"]
    .tolist()
)

In [None]:
# Restrict to the selected reference structures, then to rows with
# Pose_ID == 0 and the ECFP4 fingerprint.
subset_df = raw_df.loc[raw_df["Reference_Structure"].isin(structures)]
single_pose = subset_df.loc[
    subset_df["Pose_ID"].eq(0) & subset_df["Fingerprint"].eq("ECFP4")
]

In [None]:
# For each query ligand keep the single row with the highest POSIT docking confidence.
results = (
    single_pose.sort_values("docking-confidence-POSIT", ascending=False)
    .groupby("Query_Ligand")
    .head(1)
)

In [None]:
# Fraction of selected rows with RMSD below 2.0 (raises ZeroDivisionError if `results` is empty).
sum(results.RMSD < 2.0) / len(results)

In [None]:
# Resampling check: 100 times, draw 10 reference structures at random (not the
# 10 earliest ones) and record the fraction of query ligands whose highest
# POSIT-confidence row has RMSD < 2.0.
# NOTE: the draws are unseeded, so the numbers change from run to run.
results_list = []
structures = raw_df.groupby("Reference_Structure").head(1)["Reference_Structure"]

# The Pose_ID / Fingerprint filter does not depend on which structures were
# drawn, so it is applied once here rather than on every iteration.
first_pose_ecfp4 = raw_df[(raw_df["Pose_ID"] == 0) & (raw_df["Fingerprint"] == "ECFP4")]

for i in range(100):
    subset_structures = structures.sample(10).tolist()
    single_pose = first_pose_ecfp4[first_pose_ecfp4["Reference_Structure"].isin(subset_structures)]
    results = single_pose.sort_values(["docking-confidence-POSIT"], ascending=False).groupby("Query_Ligand").head(1)
    results_list.append(sum(results.RMSD < 2.0) / len(results))

In [None]:
# Show the per-draw success fractions.
results_list

In [None]:
# Mean success fraction across the random draws.
np.mean(results_list)

In [None]:
# 2.5th percentile of the draws (lower end of the central 95% range).
np.quantile(results_list, 0.025)

In [None]:
# 97.5th percentile of the draws (upper end of the central 95% range).
np.quantile(results_list, 0.975)

In [None]:
# get average RMSD of ligands for each structure ordered by date

In [None]:
# Force RMSD to float so the min / plotting steps below treat it numerically.
raw_df["RMSD"] = raw_df["RMSD"].astype(float)

In [None]:
# Display the raw table.
raw_df

In [None]:
# Per (reference structure, query ligand) pair, take the minimum of every
# remaining column. The min is column-wise, so values within one output row need
# not come from the same original record.
# NOTE(review): only the RMSD column appears to be used below - confirm whether
# the other columns' minima are wanted.
min_rmsds = raw_df.groupby(["Reference_Structure", "Query_Ligand"]).min()

In [None]:
# Move the group keys (Reference_Structure, Query_Ligand) back into ordinary columns.
min_rmsds = min_rmsds.reset_index()

In [None]:
# One row per reference structure, holding its name and date.
structure_and_date = (
    raw_df.groupby("Reference_Structure")
    .head(1)
    .loc[:, ["Reference_Structure", "Reference_Structure_Date"]]
)

In [None]:
# Attach each structure's date to its per-ligand minimum-RMSD rows and order by date.
# `min_rmsds` is a min over every column of raw_df, so it can carry its own
# "Reference_Structure_Date"; merging that as-is yields "_x"/"_y" suffixed columns
# and the sort below fails with a KeyError. Drop the duplicate first
# (errors="ignore" keeps this working if the column is absent).
# NOTE(review): despite the name, no averaging happens here.
avg_min_rmsd = structure_and_date.merge(
    min_rmsds.drop(columns="Reference_Structure_Date", errors="ignore"),
    on="Reference_Structure",
)
avg_min_rmsd.sort_values("Reference_Structure_Date", inplace=True)

In [None]:
# Box plot of the per-ligand minimum RMSD for each reference structure.
# NOTE(review): this plots `min_rmsds`, not the date-sorted `avg_min_rmsd`, so the
# x axis is not ordered by structure date - confirm which frame was intended.
px.box(min_rmsds, x="Reference_Structure", y="RMSD")

In [None]:
# RMSD distribution of all rows docked against the Mpro-P0772_0A reference
# structure; hovering shows the query ligand.
px.histogram(
    raw_df[raw_df["Reference_Structure"] == "Mpro-P0772_0A"],
    x="RMSD",
    hover_data=["Query_Ligand"],
)

In [None]:
# Inspect every row whose reference ligand is EDG-MED-5d232de5-6.
raw_df[raw_df["Reference_Ligand"] == "EDG-MED-5d232de5-6"]

In [None]:
# Inspect the per-ligand minima for the Mpro-P0772_0A reference structure.
min_rmsds[min_rmsds['Reference_Structure']== "Mpro-P0772_0A"]

# Conclusions
In case it isn't obvious from the plots above: about half of the prepped structures weren't aligned correctly, so I'm going to redo everything, starting from structure prep.