# Imports

In [None]:
from tqdm import tqdm
import pandas as pd
from pathlib import Path
from importlib import reload
import software.analysis as a
reload(a)


In [None]:
# Root of the local retrospective-analysis directory; csv_path is built from it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
local_analysis = Path("/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/")

# load full posit results

In [None]:
# Directory with the per-run docking result CSVs. Despite the name this is a
# directory, not a file: it is globbed for the inputs, and the CSVs produced by
# this notebook are written into it as well.
csv_path = local_analysis / "20230411_full_posit_results/analysis"

In [None]:
# Collect the per-run docking result CSVs. Sorted so the files are always read
# (and later concatenated) in the same order; glob order is otherwise
# filesystem-dependent, which would make the recombined CSV non-reproducible.
csvs = sorted(csv_path.glob("run_docking_oe.*-results.csv"))

In [None]:
# Read every results file into its own DataFrame; tqdm shows a progress bar.
# The loop variable has its own name so it does not shadow the csv_path directory.
dfs = [pd.read_csv(results_file) for results_file in tqdm(csvs)]

In [None]:
# Stack the per-file tables into one DataFrame.
# NOTE(review): ignore_index is not set, so each file's row labels are kept
# as-is and may repeat in the combined index.
df = pd.concat(dfs)

In [None]:
# Checkpoint 01: the recombined raw results, written next to the input CSVs.
# to_csv also writes the index as an unnamed first column (pandas default).
df.to_csv(csv_path / "20240117_01_recombined.csv")

## rename columns

In [None]:
# Show the column names as read from the CSVs.
df.columns

In [None]:
# Replace the raw CSV headers with the column names used in this notebook.
# The assignment is positional: the list needs one name per existing column,
# given in the existing column order.
df.columns = [
    "Compound_ID",
    "Structure_Source",
    "Docked_File",
    "Pose_ID",
    "RMSD",
    "POSIT",
    "POSIT_Method",
    "Chemgauss4",
    "Clash",
    "SMILES",
    "GAT_score",
]

## add complex id

In [None]:
# A complex is identified by "<compound id>_<structure source>".
df["Complex_ID"] = df["Compound_ID"] + "_" + df["Structure_Source"]

# Add Compound Info

In [None]:
from asapdiscovery.data.utils import get_compound_id_xtal_dicts
import yaml

In [None]:
# Load the mapping stored in cmpd_to_frag.yaml (its keys are treated as compound IDs below).
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale default.
with open(
    "/Users/alexpayne/Scientific_Projects/asapdiscovery/asapdiscovery-data/asapdiscovery/data/metadata/cmpd_to_frag.yaml",
    encoding="utf-8",
) as f:
    cmpd_to_frag_dict = yaml.safe_load(f)

In [None]:
# Invert cmpd_to_frag_dict so that its values become the keys.
# If several compounds share the same value, the one that comes last in
# cmpd_to_frag_dict wins.
frag_to_cmpd_dict = {
    frag_name: cmpd_id for cmpd_id, frag_name in cmpd_to_frag_dict.items()
}

In [None]:
# Compounds whose entry name in frag_to_cmpd_dict contains "Mpro-x";
# rows docking these compounds are filtered out of the P-series table later.
cmpds_to_remove = [
    compound_id
    for entry_name, compound_id in frag_to_cmpd_dict.items()
    if "Mpro-x" in entry_name
]

In [None]:
# Pairs of compound IDs recorded as duplicates of one another
# (presumably different IDs for the same molecule — TODO confirm).
# NOTE(review): this list is not referenced anywhere in the visible part of
# the notebook.
duplicates = [
    ('ALP-POS-4483ae88-4', 'MIK-UNK-78dbf1b8-1'),
    ('BEN-BAS-c2bc0d80-6', 'VLA-UCB-50c39ae8-2'),
    ('BEN-DND-f2e727cd-5', 'MAT-POS-3ccb8ef6-1'),
    ('EDG-MED-0e5afe9d-3', 'PET-UNK-29afea89-2'),
    ('EDG-MED-5d232de5-5', 'MIC-UNK-91acba05-6'),
    ('EDG-MED-5d232de5-7', 'EDG-MED-5d232de5-8'),
    ('EDG-MED-5d232de5-7', 'PET-UNK-c9c1e0d8-4'),
    ('EDG-MED-5d232de5-8', 'PET-UNK-c9c1e0d8-4'),
    ('EDJ-MED-015fb6b4-2', 'MAT-POS-a54ce14d-2'),
    ('EDJ-MED-37aac4bd-4', 'MAT-POS-932d1078-3'),
    ('EDJ-MED-8bb691af-4', 'MAT-POS-c7726e07-5'),
    ('EDJ-MED-976a33d5-1', 'MAT-POS-e48723dc-2'),
    ('MAT-POS-090737b9-1', 'VLA-UCB-50c39ae8-7'),
    ('MAT-POS-7174c657-5', 'MAT-POS-7174c657-6'),
    ('MAT-POS-7174c657-5', 'MAT-POS-a13804f0-4'),
    ('MAT-POS-7174c657-6', 'MAT-POS-a13804f0-4'),
    ('MAT-POS-a13804f0-3', 'RAL-THA-05e671eb-10'),
    ('MIK-ENA-5d9157e9-5', 'MIK-ENA-5d9157e9-6'),
    ('VLA-UCB-29506327-1', 'VLA-UNK-cf7facf1-1')
]

## remove x series

In [None]:
# Keep only the rows docked against a P-series structure, i.e. rows whose
# Structure_Source contains "Mpro-P".
is_p_series = df["Structure_Source"].str.contains("Mpro-P")
p_series = df[is_p_series]

In [None]:
# Number of rows whose Structure_Source contains "Mpro-P".
len(p_series)

In [None]:
# Distinct complexes, compounds and structures before the x-series compounds
# are removed. Only the three columns of interest are counted.
p_series[["Complex_ID", "Compound_ID", "Structure_Source"]].nunique()

## remove the rows containing a ligand from an x series

In [None]:
# Drop the rows whose docked compound is listed in cmpds_to_remove.
# `~` is the operator for inverting a boolean mask; unary `-` is arithmetic
# negation and is not the documented way to negate a boolean Series.
p_series = p_series[~p_series.Compound_ID.isin(cmpds_to_remove)]

In [None]:
# Distinct complexes, compounds and structures that remain after the filter.
p_series[["Complex_ID", "Compound_ID", "Structure_Source"]].nunique()

In [None]:
# Checkpoint 02: the P-series rows as they stand at this point in the notebook
# (after the cmpds_to_remove filter when cells are run in order).
p_series.to_csv(csv_path / "20240117_02_recombined_p_series.csv")

## Add Reference Compound

In [None]:
# Structure name = everything before the first underscore of Structure_Source.
Structure_Source = p_series["Structure_Source"].apply(
    lambda source: source.partition("_")[0]
)

In [None]:
# Look up the compound recorded for each structure name.
# A structure name that is not a key of frag_to_cmpd_dict raises KeyError.
ref_lig = Structure_Source.apply(
    lambda structure_name: frag_to_cmpd_dict[structure_name]
)

In [None]:
# Attach the reference ligand and the bare structure name to every row.
# p_series is the result of boolean filtering, so adding columns to it in place
# triggers pandas' SettingWithCopyWarning; assign() returns a new DataFrame
# with the extra columns (same order: Reference_Ligand, then Structure_Name).
p_series = p_series.assign(
    Reference_Ligand=ref_lig,
    Structure_Name=Structure_Source,
)

# Add Dates

In [None]:
from datetime import datetime

In [None]:
# CSV read below for its "Sample Name" and "Data Collection Date" columns.
# NOTE(review): absolute, machine-specific path.
mpro_soaks = Path("/Users/alexpayne/Scientific_Projects/mers-drug-discovery/fragalysis_downloads/20230611-mpro/extra_files/Mpro_soaks.csv")

In [None]:
# Sanity check: is the soaks CSV present at that path?
mpro_soaks.exists()

In [None]:
# Load the full soaks table; only two of its columns are used afterwards.
date_df = pd.read_csv(mpro_soaks)

In [None]:
# Narrow the soaks table down to the sample name and its collection date.
ddf = date_df[["Sample Name", "Data Collection Date"]]

In [None]:
# Preview the first rows of the name/date table.
ddf.head()

In [None]:
def date_processor(date_string):
    """Convert a data-collection timestamp string into a ``datetime.date``.

    Two layouts are accepted and tried in this order:
    ``"%Y-%m-%d %H:%M:%S"`` and ``"%d/%m/%Y %H:%M"``.

    Parameters
    ----------
    date_string : object
        Raw cell value. Anything that is not a string (for example ``None``
        or a float NaN) and the literal string ``'None'`` mean "no date".

    Returns
    -------
    datetime.date or None
        The calendar date of the timestamp, or ``None`` when there is no date.

    Raises
    ------
    ValueError
        If the string matches neither of the two layouts.
    """
    # isinstance rather than an exact type comparison, so that str subclasses
    # are parsed instead of being silently mapped to None.
    if not isinstance(date_string, str) or date_string == 'None':
        return None
    try:
        return datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S").date()
    except ValueError:
        # Fall back to the day-first layout; a failure here propagates.
        return datetime.strptime(date_string, "%d/%m/%Y %H:%M").date()

In [None]:
# Two-column lookup table: structure name -> collection date (as a date
# object, or None where date_processor found no date).
structure_dates = ddf["Data Collection Date"].apply(date_processor)
to_merge = pd.DataFrame(
    {
        "Structure_Name": ddf["Sample Name"],
        "Structure_Date": structure_dates,
    }
)

In [None]:
# Attach the structure date to each docking row via Structure_Name.
# This is pandas' default inner join, so rows whose structure has no entry in
# to_merge are dropped; the nunique checks that follow show what survives.
merged = p_series.merge(to_merge, on="Structure_Name")

In [None]:
# Distinct complexes, compounds and structures left after the merge.
merged[["Complex_ID", "Compound_ID", "Structure_Source"]].nunique()

In [None]:
# Checkpoint 03: the merged table, i.e. the P-series rows joined with the
# Structure_Date column from to_merge.
merged.to_csv(csv_path / "20240117_03_with_info.csv")

In [None]:
# Unique-value count for every column of the merged table.
merged.nunique()

# Which compound IDs are not included in the structures?

In [None]:
# Compound IDs that were docked but never occur as a reference ligand.
docked_cmpd_ids = set(merged["Compound_ID"].unique())
reference_cmpd_ids = set(merged["Reference_Ligand"].unique())
missing_cmpd_ids = docked_cmpd_ids - reference_cmpd_ids

In [None]:
# The cmpd_to_frag_dict entry for each missing compound ID; None when the
# compound has no entry in that mapping.
missing_datasets = [cmpd_to_frag_dict.get(cmpd_id) for cmpd_id in missing_cmpd_ids]

In [None]:
# Show the cmpd_to_frag_dict entry (None if absent) for each missing compound ID.
missing_datasets

In [None]:
# Show the compound IDs that were docked but never appear as a reference ligand.
missing_cmpd_ids

## these cmpd ids are missing
{'ALP-POS-ce760d3f-2',
 'EDG-MED-5d232de5-3',
 'EDG-MED-5d232de5-6',
 'EDG-MED-971238d3-4',
 'EDJ-MED-eff36d94-1',
 'MAT-POS-b3e365b9-4',
 'VLA-UCB-34f3ed0c-11'}

In [None]:
# Inspect every merged row that belongs to the structure Mpro-P0012.
merged.loc[merged["Structure_Name"] == "Mpro-P0012"]