Retrieve all approved drugs from ChEMBL, together with each drug's first-approval year and its SMILES string.


In [1]:
import pandas as pd
import datamol as dm

from chembl_webresource_client.new_client import new_client as client


In [2]:
# Step 1: list the ChEMBL ID of every approved drug (`max_phase=4` selects approved drugs).
# Only the ID field is requested here; the remaining fields are fetched per molecule later.
approved_ids_query = client.molecule.filter(max_phase=4).only(["molecule_chembl_id"])
mol_ids = pd.DataFrame(approved_ids_query)

len(mol_ids)


4192

In [3]:
# Step 2: fields requested from ChEMBL for each molecule.
# `molecule_structures` is a nested record; it is requested because it holds
# the canonical SMILES that `_get_mol` extracts.

columns = [
    "molecule_chembl_id",
    "pref_name",
    "first_approval",
    "molecule_structures",
    "molecule_type",
]


def _get_mol(molecule_chembl_id):
    """Fetch one molecule record from ChEMBL and flatten it into a Series.

    Args:
        molecule_chembl_id: ChEMBL identifier of the molecule to look up.

    Returns:
        A `pd.Series` holding the fields listed in `columns`, except that the
        nested `molecule_structures` entry is removed and, when it contains a
        `canonical_smiles` key, that value is exposed as a flat `smiles` entry.

    Raises:
        AssertionError: if the lookup does not return exactly one record.
    """
    matches = client.molecule.filter(molecule_chembl_id=molecule_chembl_id).only(columns)
    assert len(matches) == 1
    record = matches[0]

    # Take the nested structures out of the record in one step; it may be
    # absent or None for molecules without a structure.
    structures = record.pop("molecule_structures", None)
    if structures is not None and "canonical_smiles" in structures:
        record["smiles"] = structures["canonical_smiles"]

    return pd.Series(record)


# Fetch one record per ChEMBL ID concurrently. The work is network-bound, so a
# thread pool is appropriate, but it is kept modest: running this with 256
# threads produced repeated "Database is locked in thread ...; retrying (1/3)"
# messages (presumably contention on the client's local response cache), and
# a request that runs out of retries would abort the whole download.
mols = dm.parallelized(
    _get_mol,
    mol_ids["molecule_chembl_id"],
    n_jobs=16,
    scheduler="threads",
    progress=True,
)
# Each call returns a pd.Series; stack them into one row per molecule.
mols = pd.DataFrame(mols)

mols.head()


  0%|          | 0/4192 [00:00<?, ?it/s]

Database is locked in thread 140349682980544; retrying (1/3)
Database is locked in thread 140342728836800; retrying (1/3)
Database is locked in thread 140352400905920; retrying (1/3)
Database is locked in thread 140357819938496; retrying (1/3)
Database is locked in thread 140356704265920; retrying (1/3)
Database is locked in thread 140341143402176; retrying (1/3)


Unnamed: 0,first_approval,molecule_chembl_id,molecule_type,pref_name,smiles
0,1976.0,CHEMBL2,Small molecule,PRAZOSIN,COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC
1,1984.0,CHEMBL3,Small molecule,NICOTINE,CN1CCC[C@H]1c1cccnc1
2,1990.0,CHEMBL4,Small molecule,OFLOXACIN,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23
3,1964.0,CHEMBL5,Small molecule,NALIDIXIC ACID,CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21
4,1965.0,CHEMBL6,Small molecule,INDOMETHACIN,COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1


In [5]:
# Keep only small molecules that have both a SMILES string and a first-approval date
is_small_molecule = mols["molecule_type"] == "Small molecule"
has_smiles = mols["smiles"].notna()
has_first_approval = mols["first_approval"].notna()

# Renumber the surviving rows from 0 so the index is contiguous
mols = mols[is_small_molecule & has_smiles & has_first_approval].reset_index(drop=True)

mols

Unnamed: 0,first_approval,molecule_chembl_id,molecule_type,pref_name,smiles
0,1976.0,CHEMBL2,Small molecule,PRAZOSIN,COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC
1,1984.0,CHEMBL3,Small molecule,NICOTINE,CN1CCC[C@H]1c1cccnc1
2,1990.0,CHEMBL4,Small molecule,OFLOXACIN,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23
3,1964.0,CHEMBL5,Small molecule,NALIDIXIC ACID,CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21
4,1965.0,CHEMBL6,Small molecule,INDOMETHACIN,COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1
...,...,...,...,...,...
2623,2015.0,CHEMBL5095048,Small molecule,AMPHETAMINE ASPARTATE/DEXTROAMPHETAMINE SULFATE,CC(N)Cc1ccccc1.C[C@H](N)Cc1ccccc1.C[C@H](N)Cc1...
2624,2022.0,CHEMBL5095049,Small molecule,PACRITINIB CITRATE,C1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2cccc...
2625,2021.0,CHEMBL5095050,Small molecule,FINGOLIMOD LAURYL SULFATE,CCCCCCCCCCCCOS(=O)(=O)O.CCCCCCCCc1ccc(CCC(N)(C...
2626,2022.0,CHEMBL5095051,Small molecule,VENLAFAXINE BESYLATE,COc1ccc(C(CN(C)C)C2(O)CCCCC2)cc1.O=S(=O)(O)c1c...


In [7]:
# Write the filtered table to Parquet; the row index is not stored in the file
output_path = "../docs/tutorials/data/chembl_approved_drugs.parquet"
mols.to_parquet(output_path, index=False)