# Imports

In [None]:
from pathlib import Path
from asapdiscovery.data.openeye import load_openeye_sdfs, oechem, save_openeye_sdfs
from asapdiscovery.data.fragalysis import parse_fragalysis
import numpy as np
import pandas as pd, numpy as np
import plotly.express as px
from datetime import datetime
from tqdm.notebook import tqdm
from asapdiscovery.docking.analysis import DockingResults
from importlib import reload

# Load Paths

In [None]:
import sys
sys.path.append(str(Path("../../../").resolve()))
import software.paths as p

In [None]:
# Re-execute the software.paths module so changes made to it since the first
# import are picked up without restarting the kernel.
reload(p)

In [None]:
# Shorthand for the `paths` attribute exposed by the software.paths module.
paths = p.paths

In [None]:
# Sanity check: display whether the sars_hybrid location exists.
paths.sars_hybrid.exists()

## Load all CSVs

In [None]:
# Collect the per-run result CSVs. The glob is materialized and sorted so that
# (a) the list can be consumed again if the loading cell is re-run (a bare glob
# generator is exhausted after one pass) and (b) the concatenation order does
# not depend on filesystem listing order.
csvs = sorted(paths.sars_hybrid.glob("run_docking_oe.*-results.csv"))

In [None]:
# Read every result CSV and stack the frames into a single table with a fresh
# 0..n-1 index. Timestamps are printed before and after so the load time can
# be read off the cell output.
print("start", datetime.now())
frames = [pd.read_csv(csv_file) for csv_file in csvs]
df = pd.concat(frames, ignore_index=True)
print("end", datetime.now())

In [None]:
# Display the number of rows in the combined frame.
len(df)

## Save initial combined CSV

In [None]:
# Destination for the combined results. The name has no "." after
# "run_docking_oe", so it is not matched by the "run_docking_oe.*-results.csv"
# glob used to collect the per-run files.
combined_csv_path = paths.sars_hybrid / "run_docking_oe-results.csv"

In [None]:
# Write the combined frame to disk without the positional index column.
df.to_csv(combined_csv_path, index=False)

## Load csv

In [None]:
# Load the combined CSV back through DockingResults (path passed as a string).
# NOTE(review): column_names is the string "None", not the None object —
# confirm this is what DockingResults expects.
dr = DockingResults(csv_path=str(combined_csv_path), column_names="None")

In [None]:
# Display the number of rows held by the DockingResults frame.
len(dr.df)

## only 17435 results were collected

## Clean Up CSV

In [None]:
# Display the column labels as loaded, before they are renamed.
dr.df.columns

In [None]:
# Replace the loaded column labels with the names used by the rest of the
# notebook. The assignment is positional, so the frame must contain exactly
# these twelve columns in this order.
clean_column_names = [
    "Compound_ID",
    "Structure_Source",
    "Docked_File",
    "Pose_ID",
    "RMSD",
    "POSIT",
    "POSIT_Method",
    "Chemgauss4",
    "Clash",
    "SMILES",
    "GAT_Score",
    "SCHNET_score",
]
dr.df.columns = clean_column_names

In [None]:
# POSIT_R is the complement of the POSIT column (1 - POSIT).
dr.df["POSIT_R"] = 1 - dr.df["POSIT"]

# Complex_ID is "<Compound_ID>_<Structure_Source>"; the compound id is
# converted to text first so the concatenation works for non-string ids.
compound_id_text = dr.df["Compound_ID"].apply(str)
dr.df["Complex_ID"] = compound_id_text + "_" + dr.df["Structure_Source"]

In [None]:
# Display the frame after renaming and adding the derived columns.
dr.df

In [None]:
# Display the distinct values present in the POSIT_Method column.
dr.df.POSIT_Method.unique()

# Add Compound Info

In [None]:
from asapdiscovery.data.utils import get_compound_id_xtal_dicts

In [None]:
# yaml.safe_load is used below, but PyYAML is not imported in any cell shown
# above, so import it here to avoid a NameError.
import yaml

# Load the mapping stored in cmpd_to_frag.yaml.
# NOTE(review): this is an absolute, user-specific path and will only resolve
# on a machine with the same directory layout.
with open("/Users/alexpayne/Scientific_Projects/covid-moonshot-ml/metadata/cmpd_to_frag.yaml") as f:
    cmpd_to_frag_dict = yaml.safe_load(f)

In [None]:
# Two-column table built from the mapping: each key becomes a Compound_ID and
# its value becomes the matching Compound_Source.
dataset_columns = {
    "Compound_ID": cmpd_to_frag_dict.keys(),
    "Compound_Source": list(cmpd_to_frag_dict.values()),
}
dataset_df = pd.DataFrame(dataset_columns)

In [None]:
# Display the compound-to-source table.
dataset_df

In [None]:
# Attach Compound_Source to every docking row. This is an inner join (the
# pandas default), so rows whose Compound_ID is missing from dataset_df are
# dropped from dr.df.
dr.df = dr.df.merge(dataset_df, how="inner", on="Compound_ID")

In [None]:
# Display the first rows of the merged frame.
dr.df.head()

## Add Reference Compound

In [None]:
# Invert the mapping so each value points back to its key. If several keys
# share the same value, the one iterated last wins.
frag_to_cmpd_dict = dict(zip(cmpd_to_frag_dict.values(), cmpd_to_frag_dict.keys()))

In [None]:
# Same key/value pairs as the compound mapping, with the values exposed under
# the Structure_Source column name.
reference_columns = {
    "Compound_ID": cmpd_to_frag_dict.keys(),
    "Structure_Source": list(cmpd_to_frag_dict.values()),
}
reference_df = pd.DataFrame(reference_columns)

In [None]:
def _structure_prefix(source_name):
    """Return the part of *source_name* that precedes its first underscore."""
    prefix, _, _ = source_name.partition("_")
    return prefix


# Strip everything from the first "_" onward in each Structure_Source value.
Structure_Source = dr.df.Structure_Source.apply(_structure_prefix)

In [None]:
def _reference_compound(structure_name):
    """Look up *structure_name* in frag_to_cmpd_dict; raises KeyError if absent."""
    return frag_to_cmpd_dict[structure_name]


# Map each trimmed structure name to the key it came from in the compound mapping.
ref_lig = Structure_Source.apply(_reference_compound)

In [None]:
# Store the looked-up reference compound and the trimmed structure name as new
# columns. Structure_Name is the key used later to join on collection dates.
dr.df["Reference_Ligand"] = ref_lig
dr.df["Structure_Name"] = Structure_Source

# Add Dates

In [None]:
# Location of the Mpro_soaks.csv file under the fragalysis directory.
mpro_soaks = paths.fragalysis / "extra_files/Mpro_soaks.csv"

In [None]:
# Sanity check: display whether the soaks CSV is present.
mpro_soaks.exists()

In [None]:
# Load the soaks table; it supplies the "Sample Name" and
# "Data Collection Date" columns used below.
date_df = pd.read_csv(mpro_soaks)

In [None]:
# Keep only the two columns needed to date each structure.
date_columns = ["Sample Name", "Data Collection Date"]
ddf = date_df.loc[:, date_columns]

In [None]:
# Display the first rows of the trimmed date table.
ddf.head()

In [None]:
def date_processor(date_string):
    """Convert a collection-date string into a ``datetime.date``.

    Two layouts are accepted, tried in order:
    ``"%Y-%m-%d %H:%M:%S"`` and ``"%d/%m/%Y %H:%M"``.

    Parameters
    ----------
    date_string
        The raw cell value. Anything that is not a string (e.g. ``None`` or a
        float NaN from a missing CSV cell) and the literal string ``"None"``
        are treated as "no date".

    Returns
    -------
    datetime.date or None
        The parsed date, or ``None`` when no date is available.

    Raises
    ------
    ValueError
        If the value is a string that matches neither supported layout.
    """
    # isinstance (rather than an exact type comparison) so that str subclasses
    # are parsed instead of being silently mapped to None.
    if not isinstance(date_string, str) or date_string == "None":
        return None

    supported_formats = ("%Y-%m-%d %H:%M:%S", "%d/%m/%Y %H:%M")
    for fmt in supported_formats:
        try:
            return datetime.strptime(date_string, fmt).date()
        except ValueError:
            continue

    raise ValueError(
        f"Unrecognized date {date_string!r}; expected one of {supported_formats}"
    )

In [None]:
# Build the lookup table joined onto the docking results: the sample name
# (renamed Structure_Name to match the join key) and its parsed collection date.
structure_dates = ddf["Data Collection Date"].apply(date_processor)
to_merge = pd.DataFrame(
    {
        "Structure_Name": ddf["Sample Name"],
        "Structure_Date": structure_dates,
    }
)

In [None]:
# Attach Structure_Date to each docking row via an inner join (the pandas
# default) on Structure_Name; rows without a matching sample name are dropped.
merged = dr.df.merge(to_merge, how="inner", on="Structure_Name")

In [None]:
# Write the cleaned, date-annotated table next to the combined CSV.
# NOTE(review): unlike the combined-CSV save, this call does not pass
# index=False, so the row index is written as an extra unnamed first column —
# confirm downstream readers expect that.
merged.to_csv(combined_csv_path.parent / "results_cleaned.csv")