# Intro

## 1) First we can download the covid moonshot data using asapdiscovery

```
$ download-fragalysis-data -t mpro ...
```

For structure-based work, I've decided to use the `Mpro_soaks.csv` file

## 2) The next step is processing the ligand info into the asapdiscovery schema

# Import `Mpro_soaks` into a dataframe

## YOUR LOCAL PATH HERE

In [None]:
from pathlib import Path
# Root directory of the local Fragalysis download; change this to your own path.
frag_path = Path("/Users/alexpayne/Scientific_Projects/mers-drug-discovery/fragalysis_downloads/20240129_fragalysis_download/")

## load into pandas dataframe

In [None]:
import pandas as pd
# Read the soak metadata table from the download's extra_files directory.
soaks_csv_path = frag_path / "extra_files/Mpro_soaks.csv"
soaks = pd.read_csv(soaks_csv_path)

In [None]:
# Keep only soaks that have a Compound ID, a Sample Name and a Fragalysis
# link, are not Apo, were collected successfully, and whose refinement is
# either deposition-ready or already deposited.
# `.notna()` is used for the null checks instead of negating `.isna()` with a
# unary minus, which is not a boolean operator.
filtered = soaks[
    soaks["Compound ID"].notna()
    & soaks["Sample Name"].notna()
    & (soaks["Data Collection Outcome"] == "success")
    & (soaks["Compound ID"] != "Apo")
    & soaks["Fragalysis Link"].notna()
    & soaks["Refinement Outcome"].isin(["5 - Deposition ready", "6 - Deposited"])
]

In [None]:
# Restrict to the P-series: rows whose sample name contains "Mpro-P".
is_p_series = filtered["Sample Name"].str.contains("Mpro-P")
p_only = filtered[is_p_series]

In [None]:
# The entries for these two datasets aren't quite right for some reason, so
# their rows are taken from the unfiltered `soaks` table and kept manually.
manually_kept = ["Mpro-P0047", "Mpro-P2607"]
final = pd.concat(
    [p_only] + [soaks[soaks["Sample Name"] == name] for name in manually_kept]
)

In [None]:
# Number of rows in the combined table (P-series rows plus the manual additions).
len(final)

In [None]:
# Save the P-series table to a CSV in the data directory.
csv_path = Path("../data/covid_moonshot_p_series.csv")
# NOTE(review): with pandas' default `index=True` the DataFrame index is
# written as the first, unnamed column.
final.to_csv(csv_path)

# Use the asapdiscovery factories to read moonshot data

In [None]:
from asapdiscovery.data.schema_v2.fragalysis import FragalysisFactory

In [None]:
# Build the factory with the Fragalysis download root as its parent directory.
ff = FragalysisFactory(parent_dir=frag_path)

In [None]:
# Load everything the factory finds; the cells below read `.target` and
# `.ligand` from each returned item.
targets = ff.load()

In [None]:
# Keep only the items whose target name contains "Mpro-P" and whose target
# data is non-empty.
targets = [
    candidate
    for candidate in targets
    if "Mpro-P" in candidate.target.target_name and len(candidate.target.data) > 0
]

In [None]:
# Keep one structure per compound name: the first one encountered wins.
# `setdefault` inserts only when the key is absent, so the decision is based
# on key membership rather than on the truthiness of the stored value.
single_targets = {}
for target in targets:
    single_targets.setdefault(target.ligand.compound_name, target)

In [None]:
# Number of distinct compound names that have a structure.
len(single_targets)

## now both datasets have the same number of structures : 220

## but do they have the same compound IDs?

In [None]:
# Compound IDs according to the soaks CSV
soaksset = set(final["Compound ID"].unique())

# Compound names according to the FragalysisFactory structures
ffset = {cmplx.ligand.compound_name for cmplx in single_targets.values()}

In [None]:
# Compound IDs present in the soaks CSV but absent from the FragalysisFactory set.
soaksset - ffset

In [None]:
# Compound names present in the FragalysisFactory set but absent from the soaks CSV.
ffset - soaksset

## no! Of course they don't.

In [None]:
from collections import namedtuple

In [None]:
# Two-field record pairing compound IDs.  The first argument to `namedtuple`
# is the type name and the second lists the fields, so both field names must
# go in the second argument for `id_pair(a, b)` to accept two values.
id_pair = namedtuple("id_pair", ["Correct", "Incorrect"])

In [None]:
correct_compound_ids = [id_pair("MAT-POS-7174c657-5", "MAT-POS-7174c657-6"),
                        id_pair("MAT-POS-7174c657-6", "

# So now we update the list of Complexes with the correct compound names

In [None]:
from asapdiscovery.data.schema_v2.ligand import Ligand

In [None]:
# Cross-check each structure against the soaks table: find the rows for the
# crystal's sample name (the part of the target name before the first "_"),
# build a Ligand from the first matching row, and print both versions
# whenever the compound names disagree.
for target in single_targets.values():
    sample_name = target.target.target_name.split("_")[0]
    matches = soaks.loc[soaks["Sample Name"] == sample_name]
    csv_smiles = matches["SMILES"].iloc[0]
    csv_cmpd_id = matches["Compound ID"].iloc[0]
    csv_ligand = Ligand.from_smiles(smiles=csv_smiles, compound_name=csv_cmpd_id)
    if csv_ligand.compound_name == target.ligand.compound_name:
        continue
    print(target.target.target_name)
    print(csv_ligand.compound_name, csv_ligand.smiles)
    print(target.ligand.compound_name, target.ligand.smiles)

## Mpro-P2141_0A and Mpro-P2176_0A are duplicates of each other

In [None]:
# Map each ligand's compound name to its crystal name (the part of the target
# name before the first "_").
cmpd_to_frag = {
    cmplx.ligand.compound_name: cmplx.target.target_name.split("_")[0]
    for cmplx in single_targets.values()
}

In [None]:
import yaml

# Write the compound-name -> crystal-name mapping to a YAML file.
with open("../data/20240201_cmpd_to_frag_from_metadata.yaml", "w") as yaml_file:
    yaml.safe_dump(cmpd_to_frag, stream=yaml_file)

In [None]:
# Compound IDs to look up in the compound-name -> crystal-name mapping.
to_check = [
    "EDG-MED-5d232de5-3",
    "LON-WEI-adc59df6-47",
    "EDJ-MED-705e09b8-1",
    "EDG-MED-5d232de5-6",
    "VLA-UCB-34f3ed0c-11",
    "MAT-POS-7174c657-6",
    "MAT-POS-7174c657-5",
    "BEN-DND-4f474d93-1",
]
            

In [None]:
# For each ID in `to_check`, report whether it is a key of `cmpd_to_frag`.
# Membership is tested on the dict itself instead of rebuilding a list of its
# keys for every ID.
[cmpd_id in cmpd_to_frag for cmpd_id in to_check]

# Manual Curation
- So we only keep P2176, removing the "MAT-POS-7174c657-6" entry
- use "BEN-DND-4f474d93-1" instead of "ALP-POS-ce760d3f-2"

In [None]:
# Remove the "MAT-POS-7174c657-6" entry; the popped value is bound to `_`
# and not used.
_ = single_targets.pop("MAT-POS-7174c657-6")

In [None]:
# Take the "ALP-POS-ce760d3f-2" entry out of the dict and hold on to it so it
# can be renamed.
to_change = single_targets.pop("ALP-POS-ce760d3f-2")

In [None]:
# Sanity check: evaluates to False now that the old key has been popped.
single_targets.get("ALP-POS-ce760d3f-2", False)

In [None]:
# Overwrite the ligand's compound name on the popped entry.
to_change.ligand.compound_name = "BEN-DND-4f474d93-1"

In [None]:
# Put the renamed entry back into the dict under its new compound name.
single_targets["BEN-DND-4f474d93-1"] = to_change

In [None]:
# Sanity check again: the old key is still absent, so this evaluates to False.
single_targets.get("ALP-POS-ce760d3f-2", False)

In [None]:
# Fetch the re-keyed entry; `.get` would return False if the key were missing.
cmplx = single_targets.get("BEN-DND-4f474d93-1", False)

In [None]:
# Show the unique name of the re-keyed entry.
cmplx.unique_name

In [None]:
from pathlib import Path

# Write every curated entry to its own JSON file, named after the entry's
# unique name.  The output directory is created first so the export does not
# depend on it already existing.
schema_dir = "../data/20240202_fragalysis_p_series_schema"
Path(schema_dir).mkdir(parents=True, exist_ok=True)
for data in single_targets.values():
    data.to_json_file(f"{schema_dir}/{data.unique_name}.json")