# Imports

In [None]:
from pathlib import Path
import pandas as pd
from asapdiscovery.data.schema.complex import Complex, PreppedComplex
import json
from tqdm import tqdm

## Set your data path here:

In [None]:
# Root directory of the Fragalysis cache; edit this path for your machine.
data_dir = Path(
    '/Users/alexpayne/Scientific_Projects/mers-drug-discovery/sars2-retrospective-analysis/mpro_fragalysis-04-01-24_cache'
)
# Every JSON file sitting directly inside a sub-directory whose name starts
# with "Mpro-P".
schema_paths = [*data_dir.glob("Mpro-P*/*.json")]

In [None]:
# Deserialize every cached JSON schema into a PreppedComplex.
# Path.read_text() opens and closes the file itself, so no file handle is
# left dangling for each schema that is loaded.
complexes = [
    PreppedComplex(**json.loads(schema_path.read_text()))
    for schema_path in tqdm(schema_paths)
]

In [None]:
# Ligand object of every loaded complex, in the same order as `complexes`.
ligs = [prepped.ligand for prepped in complexes]

# Get just a single structure for each molecule

In [None]:
from collections import defaultdict
# Keep exactly one complex per unique ligand SMILES string.
#
# When several complexes share a SMILES, the winner is chosen from the pieces
# of `target.target_name` split on "_":
#   * the first piece (dataset id) - the larger one wins;
#   * on a tie, the last piece (chain/letter suffix) - the smaller one wins.
#
# NOTE(review): both pieces are compared as *strings* (lexicographically).
# This matches numeric ordering only if the ids are zero-padded to the same
# width - confirm against the actual target names.
#
# A plain dict is used: a defaultdict without a default factory behaves
# exactly like a dict, so it only obscured the intent.
new_structures = {}
for candidate in complexes:
    smiles = candidate.ligand.smiles
    current = new_structures.get(smiles)
    if current is None:
        # First complex seen for this ligand.
        new_structures[smiles] = candidate
        continue

    current_parts = current.target.target_name.split("_")
    candidate_parts = candidate.target.target_name.split("_")
    current_number, current_letter = current_parts[0], current_parts[-1]
    candidate_number, candidate_letter = candidate_parts[0], candidate_parts[-1]

    is_later_dataset = candidate_number > current_number
    is_lower_letter = (
        candidate_number == current_number and candidate_letter < current_letter
    )
    if is_later_dataset or is_lower_letter:
        new_structures[smiles] = candidate

In [None]:
# Set of target names across every loaded complex.
all_names = {prepped.target.target_name for prepped in complexes}

In [None]:
# Display how many distinct target names were loaded.
len(all_names)

In [None]:
# Target names of the complexes that were kept (one per ligand SMILES).
selected_names = {kept.target.target_name for kept in new_structures.values()}

In [None]:
# Display how many target names were kept after de-duplication.
len(selected_names)

In [None]:
# Display the target names that were loaded but not kept.
all_names.difference(selected_names)

In [None]:
# Lookup table from target name to its complex; if a name occurs more than
# once, the last complex with that name wins.
complex_dict = {prepped.target.target_name: prepped for prepped in complexes}

In [None]:
# Target names that were loaded but dropped by the de-duplication.
missing = all_names.difference(selected_names)

## Import Fragalysis duplicates

In [None]:
# Table of duplicate ligand pairs; the first CSV column is used as the index.
dups = pd.read_csv(
    filepath_or_buffer="../20231117-process-fragalysis-duplicates/20231212_fragalysis_duplicates.csv",
    index_col=0,
)

In [None]:
# Display the column labels of the duplicates table.
dups.columns

In [None]:
# "Lig1" column of the duplicates table as a plain Python list.
lig1 = dups["Lig1"].tolist()

In [None]:
# "Lig2" column of the duplicates table as a plain Python list.
lig2 = dups["Lig2"].tolist()

In [None]:
# "Lig1 Stereospecific" column of the duplicates table as a plain Python list.
lig1_stereo = dups["Lig1 Stereospecific"].tolist()

In [None]:
# "Lig2 Stereospecific" column of the duplicates table as a plain Python list.
lig2_stereo = dups["Lig2 Stereospecific"].tolist()

In [None]:
# Dropped target names whose final "_"-separated piece is exactly "0A".
missing_0a = {
    target_name
    for target_name in missing
    if target_name.rsplit("_", 1)[-1] == "0A"
}

In [None]:
# Resolve each dropped "0A" target name back to its complex.
missing_0a_complexes = [complex_dict[target_name] for target_name in missing_0a]

In [None]:
# Compound names of the ligands in the dropped "0A" complexes.
missing_0a_ligs = [dropped.ligand.compound_name for dropped in missing_0a_complexes]

In [None]:
# Build a table that pairs every dropped "0A" complex with the complex that
# was kept for the same ligand SMILES, plus a "Stereospecific"/"Racemic"
# label for each side taken from the duplicates table.
#
# Raises ValueError if a dropped ligand's compound name appears in neither
# the "Lig1" nor the "Lig2" column of `dups`.
preferred_dataset = []
preferred_compound = []
preferred_stereo = []
excluded_dataset = []
excluded_compound = []
excluded_stereo = []
for excluded in missing_0a_complexes:
    excluded_name = excluded.ligand.compound_name

    # Work out on which side of the duplicates table this ligand is listed.
    if excluded_name in lig1:
        col, other_col = "Lig1", "Lig2"
    elif excluded_name in lig2:
        col, other_col = "Lig2", "Lig1"
    else:
        # Name the offending ligand so the failure can be traced.
        raise ValueError(f"Ligand not found: {excluded_name}")
    stereo_bool = f"{col} Stereospecific"
    other_stereo_bool = f"{other_col} Stereospecific"

    # The complex that was kept for the same SMILES string.
    preferred = new_structures[excluded.ligand.smiles]

    # Only the FIRST duplicates row that lists the excluded ligand is used.
    # NOTE(review): the preferred stereo label is read from the opposite
    # column of that row, which presumes the row's other ligand is the
    # preferred compound - confirm when a ligand has several duplicate rows.
    stereo_dict = dups[dups[col] == excluded_name].to_dict(orient='records')[0]

    preferred_dataset.append(preferred.target.target_name)
    preferred_compound.append(preferred.ligand.compound_name)
    preferred_stereo.append("Stereospecific" if stereo_dict[other_stereo_bool] else "Racemic")
    excluded_dataset.append(excluded.target.target_name)
    excluded_compound.append(excluded_name)
    excluded_stereo.append("Stereospecific" if stereo_dict[stereo_bool] else "Racemic")

df = pd.DataFrame({
    "Preferred Dataset": preferred_dataset,
    "Preferred Compound": preferred_compound,
    "Preferred Stereo": preferred_stereo,
    "Excluded Dataset": excluded_dataset,
    "Excluded Compound": excluded_compound,
    "Excluded Stereo": excluded_stereo,
})

In [None]:
# Display the preferred/excluded explanation table.
df

In [None]:
# Save the explanation table as a CSV file in the current working directory.
df.to_csv(path_or_buf="20240203_duplicate_explanation.csv")

# Save the new structures

In [None]:
from asapdiscovery.modeling.protein_prep import ProteinPrepper

In [None]:
# ProteinPrepper instance built with its default arguments; used below only
# for its `cache` method.
protein_prepper = ProteinPrepper()

In [None]:
# Hand the kept complexes (one per ligand SMILES) to ProteinPrepper.cache,
# targeting a cache directory that sits next to `data_dir`.
protein_prepper.cache(
    [*new_structures.values()],
    cache_dir=data_dir.parent / "20240403_fragalysis_p_series_curated_cache",
)

In [None]:
# Leave a README.md inside the curated cache directory recording where the
# data came from and how the structures were selected.
with (data_dir.parent / "20240403_fragalysis_p_series_curated_cache" / "README.md").open('w') as f:
    f.write(
        f"This cache was created by selecting a single structure for each ligand (identified by the smiles string, not the compound ID) in the P series from the fragalysis cache: '{data_dir.absolute()}'. The selection was based on the dataset number and letter, with the highest dataset number and lowest letter being selected (i.e. datasets that were collected later and chain A if possible). This was performed in the notebook at '{Path.cwd()}'"
    )