# Imports

In [1]:
import pandas as pd
from asapdiscovery.data.schema.ligand import Ligand
from asapdiscovery.data.readers.molfile import MolFileFactory
from rdkit.Chem import Draw, rdMolAlign, rdDepictor
from rdkit import Chem

from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import defaultdict

In [2]:
from pathlib import Path
# Directory (relative to the current working directory) that holds the input
# SDF and receives the exported .smi / .csv files written further down.
data = Path("data")

# Load the Ligands

In [3]:
# Load the ligands stored in the combined 3D SDF file.
sdf_path = data / "combined_3d.sdf"
mff = MolFileFactory(filename=sdf_path)
ligs = mff.load()



In [4]:
# Distinct (SMILES, compound name) pairs across all loaded ligands.
unique_smiles = {(lig.smiles, lig.compound_name) for lig in ligs}



In [23]:
def _ligand_record(lig):
    """Build one table row (a dict) for ``lig`` from its ``xtal_name`` tag.

    The tag is sliced by fixed positions, so it is expected to look like
    ``Mpro-P3074_0A``: the character at index 5 is the series letter, the rest
    of the part before the first underscore is the crystal number, the part
    after that underscore is the xtal id, and the structure name is the tag
    with its last three characters removed.
    """
    xtal_name = lig.tags['xtal_name']
    name_parts = xtal_name.split("_")
    return {
        "smiles": lig.smiles,
        "compound_name": lig.compound_name,
        "series": xtal_name[5],
        "number": name_parts[0][6:],
        "xtal_id": name_parts[1],
        "xtal_name": xtal_name,
        "lig": lig,
        "structure_name": xtal_name[:-3],
    }


# One record per loaded ligand, in the same order as `ligs`.
dict_data = [_ligand_record(lig) for lig in ligs]



In [24]:
# One row per ligand record; columns follow the key order of the dicts in `dict_data`.
df = pd.DataFrame.from_records(dict_data)

In [25]:
# Keep only the rows whose series letter is "x" or "P"; all other rows are dropped.
kept_series = ["x", "P"]
in_kept_series = df["series"].isin(kept_series)
df = df.loc[in_kept_series]

In [26]:
# Pick one representative structure per compound.
#
# Rows are ordered by series (ascending), crystal number (descending) and
# xtal_id (ascending); the first row per compound_name is kept, and then the
# first row per SMILES among those.
#
# The crystal number is stored as a string, so it is converted to a numeric
# helper column for the sort: a plain string sort is lexicographic and would
# rank e.g. "3074" above "10237" whenever the numbers differ in width.
# Values that cannot be parsed as numbers become NaN and sort last.
unique_compounds = (
    df.assign(_number_key=pd.to_numeric(df["number"], errors="coerce"))
    .sort_values(["series", "_number_key", "xtal_id"], ascending=[True, False, True])
    .drop(columns="_number_key")  # helper column must not leak into the exports
    .groupby("compound_name")
    .head(1)
    .groupby("smiles")
    .head(1)
)

In [27]:
# Per-series count of non-null entries in every remaining column.
per_series = unique_compounds.groupby("series")
per_series.count()

Unnamed: 0_level_0,smiles,compound_name,number,xtal_id,xtal_name,lig,structure_name
series,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P,205,205,205,205,205,205,205
x,332,332,332,332,332,332,332


In [28]:
# Display the deduplicated table (pandas abbreviates long frames in the rendered output).
unique_compounds

Unnamed: 0,smiles,compound_name,series,number,xtal_id,xtal_name,lig,structure_name
618,c1ccc2c(c1)cncc2N3CCC[C@@]4(C3=O)C[N@](Cc5c4cc...,ALP-POS-133e7cd9-2,P,3074,0A,Mpro-P3074_0A,compound_name='ALP-POS-133e7cd9-2' ids=None pr...,Mpro-P3074
387,CC[C@H]1CN(C(=O)[C@@]12C[N@](Cc3c2cc(cc3)Cl)S(...,MAT-POS-50a80394-2,P,3054,0A,Mpro-P3054_0A,compound_name='MAT-POS-50a80394-2' ids=None pr...,Mpro-P3054
200,C[C@H]1CN(C(=O)[C@@]12C[N@](Cc3c2cc(cc3)Cl)S(=...,MAT-POS-50a80394-1,P,3050,0A,Mpro-P3050_0A,compound_name='MAT-POS-50a80394-1' ids=None pr...,Mpro-P3050
406,CNC(=O)C1(CC1)N2C[C@]3(CCN(C3=O)c4cncc5c4cccc5...,MAT-POS-e48723dc-2,P,3038,0A,Mpro-P3038_0A,compound_name='MAT-POS-e48723dc-2' ids=None pr...,Mpro-P3038
724,c1ccc2c(c1)cncc2N3CC[C@@]4(C3=O)C[N@H+](Cc5c4c...,ALP-POS-ecbed2ba-12,P,2730,0A,Mpro-P2730_0A,compound_name='ALP-POS-ecbed2ba-12' ids=None p...,Mpro-P2730
...,...,...,...,...,...,...,...,...
357,C[NH+]1CCN(CC1)C(=O)Nc2ccccc2,AAR-POS-0daf6b7e-44,x,0165,0A,Mpro-x0165_0A,compound_name='AAR-POS-0daf6b7e-44' ids=None p...,Mpro-x0165
732,COC(=O)c1ccc(cc1)S(=O)(=O)N,MAT-POS-7dfc56d9-1,x,0161,0A,Mpro-x0161_0A,compound_name='MAT-POS-7dfc56d9-1' ids=None pr...,Mpro-x0161
510,Cc1ccncc1NC(=O)C,MAK-UNK-6435e6c2-8,x,0107,0A,Mpro-x0107_0A,compound_name='MAK-UNK-6435e6c2-8' ids=None pr...,Mpro-x0107
782,CC(=O)NCCc1c[nH]c2c1cc(cc2)F,AAR-POS-d2a4d1df-2,x,0104,0A,Mpro-x0104_0A,compound_name='AAR-POS-d2a4d1df-2' ids=None pr...,Mpro-x0104


# Save as SMILES

In [29]:
# Export one "<smiles> <name>" line per compound; spaces inside the compound
# name are replaced with underscores so the name stays a single token.
smi_path = data / "unique_compounds.smi"
with open(smi_path, "w") as smi_file:
    smi_file.writelines(
        f"{smiles} {compound_name.replace(' ', '_')}\n"
        for smiles, compound_name in zip(
            unique_compounds["smiles"], unique_compounds["compound_name"]
        )
    )

# Save as CSV

In [30]:
# Persist the same table as CSV, omitting the DataFrame index.
# NOTE(review): the "lig" column holds the ligand objects themselves, so it is
# written using their string representation — confirm that is intended.
csv_path = data / "unique_compounds.csv"
unique_compounds.to_csv(csv_path, index=False)