# Imports

In [None]:
import pandas as pd
from asapdiscovery.data.schema.ligand import Ligand
from asapdiscovery.data.readers.molfile import MolFileFactory
from rdkit.Chem import Draw, rdMolAlign, rdDepictor
from rdkit import Chem

from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import defaultdict

In [None]:
from pathlib import Path
# Directory, relative to the current working directory, that holds the input
# SDF file and receives the generated .smi / .csv outputs.
data = Path("data")

# Load the Ligands

In [None]:
# Build a reader for the combined 3D SDF file inside the data directory and
# load its ligands; each loaded item is used below via `.smiles`,
# `.compound_name` and `.tags`.
mff = MolFileFactory(filename=data.joinpath("combined_3d.sdf"))
ligs = mff.load()

In [None]:
# Distinct (SMILES, compound name) pairs across all loaded ligands.
# A set comprehension avoids building an intermediate list first.
unique_smiles = {(lig.smiles, lig.compound_name) for lig in ligs}

In [None]:
def _ligand_record(lig):
    """Flatten one ligand into the row dict used to build the DataFrame.

    All crystal-derived fields are sliced out of ``lig.tags['xtal_name']``:

    * ``series``: the sixth character of the name.
    * ``number``: the first underscore-separated field with its first six
      characters removed (kept as a string).
    * ``xtal_id``: the second underscore-separated field.
    * ``structure_name``: the name without its last three characters.

    NOTE(review): these fixed offsets presumably match one specific crystal
    naming scheme -- confirm against the data before reusing elsewhere.

    Raises ``KeyError`` if the ligand has no ``xtal_name`` tag and
    ``IndexError`` if the name is too short or contains no underscore.
    """
    xtal_name = lig.tags['xtal_name']
    name_fields = xtal_name.split("_")
    return {
        "smiles": lig.smiles,
        "compound_name": lig.compound_name,
        "series": xtal_name[5],
        "number": name_fields[0][6:],
        "xtal_id": name_fields[1],
        "xtal_name": xtal_name,
        # Keep the ligand object itself so rows can be mapped back to it.
        "lig": lig,
        "structure_name": xtal_name[:-3],
    }


# One record per loaded ligand, in the order the ligands were read.
dict_data = [_ligand_record(lig) for lig in ligs]

In [None]:
# Tabulate the per-ligand records: one row per ligand, one column per key.
df = pd.DataFrame.from_records(data=dict_data)

In [None]:
# Keep only the rows whose series character is "x" or "P"; every other
# series is dropped from the analysis.
df = df.loc[df["series"].isin(["x", "P"])]

In [None]:
# Pick one representative row per compound:
#   1. order rows by series (ascending), number (descending), xtal_id (ascending);
#   2. keep the first row for each compound_name;
#   3. of those, keep the first row for each SMILES string.
# NOTE(review): "number" is a string column (sliced from the xtal name), so the
# descending sort is lexicographic rather than numeric -- confirm that is the
# intended ordering (it only matches numeric order for equal-width numbers).
unique_compounds = (
    df.sort_values(
        by=["series", "number", "xtal_id"],
        ascending=[True, False, True],
    )
    .groupby("compound_name")
    .head(1)
    .groupby("smiles")
    .head(1)
)

In [None]:
# Per-series count of non-null values in each remaining column
# (shown as the cell output).
unique_compounds.groupby(by="series").count()

In [None]:
# Display the de-duplicated table for visual inspection.
unique_compounds

# Save as SMILES

In [None]:
# Write one "<smiles> <name>" line per unique compound. Spaces in the compound
# name are replaced by underscores so that the name stays a single
# whitespace-delimited token after the SMILES string.
# The encoding is given explicitly so the output does not depend on the
# platform's default locale encoding.
with open(data / "unique_compounds.smi", "w", encoding="utf-8") as f:
    for smiles, compound_name in zip(
        unique_compounds["smiles"], unique_compounds["compound_name"]
    ):
        f.write(f"{smiles} {compound_name.replace(' ', '_')}\n")

# Save as CSV

In [None]:
# Persist the de-duplicated table without the DataFrame index.
# NOTE(review): every column is written, including "lig", which holds the
# ligand objects themselves and will be serialised via their string form --
# confirm that this column is wanted in the CSV.
unique_compounds.to_csv(path_or_buf=data / "unique_compounds.csv", index=False)