In [15]:
import chembl_downloader
import pandas as pd
import re

# SQL that pulls one row per activity measurement together with assay, source,
# target, drug-mechanism, molecule, structure/property and molecule-hierarchy
# columns (including the parent and active-ingredient ChEMBL IDs).
#
# Filters applied in the WHERE clause:
#   * only activities that have a pchembl_value,
#   * only targets with target_type 'SINGLE PROTEIN' and organism 'Homo sapiens'.
# Because the WHERE clause tests columns of `td`, activities without a matching
# target row are excluded even though `target_dictionary` is LEFT JOINed.
#
# Note that `molregno`, `parent_molregno` and `active_molregno` are selected
# from `molecule_hierarchy` (mh), not from `activities`, so all three are NULL
# for an activity whose molecule has no hierarchy row.
#
# NOTE(review): `drug_mechanism` is joined on molregno only. If that table holds
# more than one row per molregno, each activity of that molecule is repeated
# once per mechanism row — confirm whether this duplication is intended.
sql_chembl = """
SELECT
    a.pchembl_value,
    a.potential_duplicate,
    a.data_validity_comment,
    ass.chembl_id AS "Assay ChEMBL ID",
    ass.confidence_score,
    dm.mechanism_of_action,
    dm.direct_interaction,
    dm.disease_efficacy,
    md.chembl_id AS "Molecule ChEMBL ID",
    md.molecule_type,
    md.max_phase,
    md.availability_type,
    md.therapeutic_flag,
    md.first_in_class,
    md.chemical_probe,
    md.orphan,
    cs.canonical_smiles,
    cp.full_mwt,
    ass.src_id,
    src.src_short_name,
    md_parent.chembl_id AS "Parent compound ChEMBL ID",
    md_parent.molecule_type AS parent_molecule_type,
    md_parent.max_phase AS parent_max_phase,
    md_parent.availability_type AS parent_availability_type,
    md_active.chembl_id AS "Active ingredient ChEMBL ID",
    mh.molregno,
    mh.parent_molregno,
    mh.active_molregno,
    td.chembl_id AS "Target ChEMBL ID"
FROM activities a
LEFT JOIN assays ass ON a.assay_id = ass.assay_id
LEFT JOIN source src ON ass.src_id = src.src_id
LEFT JOIN target_dictionary td ON ass.tid = td.tid
LEFT JOIN drug_mechanism dm ON a.molregno = dm.molregno
LEFT JOIN molecule_dictionary md ON a.molregno = md.molregno
LEFT JOIN compound_structures cs ON md.molregno = cs.molregno
LEFT JOIN compound_properties cp ON cs.molregno = cp.molregno
LEFT JOIN molecule_hierarchy mh ON md.molregno = mh.molregno
LEFT JOIN molecule_dictionary md_parent ON mh.parent_molregno = md_parent.molregno
LEFT JOIN molecule_dictionary md_active ON mh.active_molregno = md_active.molregno
WHERE a.pchembl_value IS NOT NULL
  AND td.target_type = 'SINGLE PROTEIN'
  AND td.organism = 'Homo sapiens';
"""
# Progress messages bracket the query call so the reader can see when it
# starts and finishes.
print("Running query...")
# The result is used as a pandas DataFrame by the cells below.
df_raw = chembl_downloader.query(sql_chembl)
print("Query complete.")

Running query...
Query complete.


In [18]:
# molregno / parent_molregno / active_molregno all come from molecule_hierarchy,
# so they are NaN together when a molecule has no hierarchy row. A plain `!=`
# treats NaN as "different" (NaN != NaN is True), which previously
#   * put NaN into `active_ingredients`, and
#   * overwrote a molecule's own max_phase / availability_type with the NaN
#     values of a parent that does not exist.
# The masks below therefore require the referenced molregno to be present.
has_distinct_active = df_raw["active_molregno"].notna() & (
    df_raw["molregno"] != df_raw["active_molregno"]
)
has_distinct_parent = df_raw["parent_molregno"].notna() & (
    df_raw["molregno"] != df_raw["parent_molregno"]
)

# molregnos that are recorded as the active form of some *other* molecule.
active_ingredients = set(df_raw.loc[has_distinct_active, "active_molregno"])

# Non-parent forms inherit the parent's max_phase and availability_type;
# every other row keeps its own value. Vectorised instead of row-wise apply.
df_raw["max_phase"] = df_raw["max_phase"].mask(
    has_distinct_parent, df_raw["parent_max_phase"]
)
df_raw["availability_type"] = df_raw["availability_type"].mask(
    has_distinct_parent, df_raw["parent_availability_type"]
)

# Classify relationship
def classify_relationship(row):
    """Label how a molecule relates to its parent and active forms.

    Parameters
    ----------
    row : mapping
        Must provide ``molregno``, ``parent_molregno`` and ``active_molregno``
        (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        * ``"No parent info"``    - ``parent_molregno`` is missing.
        * ``"Other"``             - ``active_molregno`` is missing, so the
          prodrug status cannot be decided.
        * ``"Parent compound"``   - molecule is its own parent and active form.
        * ``"Salt form"``         - molecule differs from its parent; the parent
          is the active form.
        * ``"Prodrug"``           - molecule is its own parent; the active form
          is a different molecule.
        * ``"Prodrug salt form"`` - molecule differs from its parent and the
          active form differs from the parent.
    """
    molregno = row["molregno"]
    parent = row["parent_molregno"]
    active = row["active_molregno"]

    if pd.isna(parent):
        return "No parent info"
    # A missing active_molregno compares unequal to everything, which used to
    # push such rows into the "Prodrug" branches. Report them as "Other".
    if pd.isna(active):
        return "Other"

    is_own_parent = molregno == parent
    parent_is_active = active == parent

    if parent_is_active:
        return "Parent compound" if is_own_parent else "Salt form"
    return "Prodrug" if is_own_parent else "Prodrug salt form"

def resolve_molecule_type(row, min_weight=100, max_weight=1500):
    """Pick the best available molecule type for one row.

    Resolution order:

    1. the molecule's own ``molecule_type`` if present and not ``"Unknown"``;
    2. otherwise the parent's ``parent_molecule_type`` under the same rule;
    3. otherwise ``"Small molecule (inferred)"`` when the row has a SMILES
       string and a ``full_mwt`` within ``[min_weight, max_weight]``;
    4. otherwise the sentinel ``"REMOVE_PUBCHEM"`` when the source short name
       (case-insensitive) is ``PUBCHEM`` or ``PUBCHEM_BIOASSAY``;
    5. otherwise ``None``.

    Parameters
    ----------
    row : mapping
        Must provide ``molecule_type``, ``parent_molecule_type``,
        ``canonical_smiles``, ``full_mwt`` and ``src_short_name``.
    min_weight, max_weight : number, optional
        Inclusive molecular-weight bounds used for the small-molecule
        inference. Defaults (100 and 1500) match the previous hard-coded range.

    Returns
    -------
    str or None
    """
    own_type = row["molecule_type"]
    if pd.notnull(own_type) and own_type != "Unknown":
        return own_type

    parent_type = row["parent_molecule_type"]
    if pd.notnull(parent_type) and parent_type != "Unknown":
        return parent_type

    smiles = row["canonical_smiles"]
    weight = row["full_mwt"]
    if pd.notnull(smiles) and pd.notnull(weight) and min_weight <= weight <= max_weight:
        return "Small molecule (inferred)"

    # The source name may be None *or* NaN; NaN is truthy and has no .upper(),
    # so `(value or "").upper()` would raise. Only upper-case real strings.
    src_name = row["src_short_name"]
    src = src_name.upper() if isinstance(src_name, str) else ""
    if src in {"PUBCHEM", "PUBCHEM_BIOASSAY"}:
        return "REMOVE_PUBCHEM"
    return None

# Resolve one molecule_type per row, then discard rows carrying the
# "REMOVE_PUBCHEM" sentinel (rows resolved to None are kept).
df_raw["molecule_type"] = df_raw.apply(resolve_molecule_type, axis=1)
# .copy() detaches the filtered frame from the original so the column
# assignments below write to a real DataFrame rather than a slice
# (avoids SettingWithCopyWarning and any ambiguity about what is modified).
df_raw = df_raw[df_raw["molecule_type"] != "REMOVE_PUBCHEM"].copy()
df_raw["Relationship"] = df_raw.apply(classify_relationship, axis=1)
# "Yes" when this molecule is recorded as the active form of another molecule.
df_raw["Active ingredient of prodrug?"] = df_raw["molregno"].apply(
    lambda x: "Yes" if x in active_ingredients else None
)
# Drop helper columns that were only needed for the resolution steps above.
df_chembl = df_raw.drop(columns=[
    "parent_molecule_type", "parent_max_phase", "parent_availability_type",
    "molregno", "parent_molregno", "active_molregno", "src_short_name", "src_id"
])
df_chembl

Unnamed: 0,pchembl_value,potential_duplicate,data_validity_comment,Assay ChEMBL ID,confidence_score,mechanism_of_action,direct_interaction,disease_efficacy,Molecule ChEMBL ID,molecule_type,...,first_in_class,chemical_probe,orphan,canonical_smiles,full_mwt,Parent compound ChEMBL ID,Active ingredient ChEMBL ID,Target ChEMBL ID,Relationship,Active ingredient of prodrug?
0,5.60,0,,CHEMBL872937,8,,,,CHEMBL324340,Small molecule,...,-1,0,-1,Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)...,398.37,CHEMBL324340,CHEMBL324340,CHEMBL3921,Parent compound,
1,5.05,0,,CHEMBL872937,8,,,,CHEMBL109600,Small molecule,...,-1,0,-1,COc1ccccc1-c1ccc2oc(-c3ccc(OC)c(N4C(=O)c5ccc(C...,520.50,CHEMBL109600,CHEMBL109600,CHEMBL3921,Parent compound,
2,5.22,0,,CHEMBL666153,8,,,,CHEMBL152968,Small molecule,...,-1,0,-1,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccc...,516.67,CHEMBL152968,CHEMBL152968,CHEMBL3356,Parent compound,
3,4.43,0,,CHEMBL665756,8,,,,CHEMBL152968,Small molecule,...,-1,0,-1,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccc...,516.67,CHEMBL152968,CHEMBL152968,CHEMBL340,Parent compound,
4,4.62,0,,CHEMBL660388,8,,,,CHEMBL152968,Small molecule,...,-1,0,-1,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccc...,516.67,CHEMBL152968,CHEMBL152968,CHEMBL3397,Parent compound,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2100854,6.89,0,,CHEMBL5482947,9,,,,CHEMBL4744041,Small molecule (inferred),...,-1,0,-1,Cc1cc(-c2ccnc(Nc3cnn(C)c3)n2)ccc1CNC(=O)N1CC(O...,435.53,CHEMBL4744041,CHEMBL4744041,CHEMBL6191,Parent compound,
2100855,6.34,0,,CHEMBL5482945,9,,,,CHEMBL4744041,Small molecule (inferred),...,-1,0,-1,Cc1cc(-c2ccnc(Nc3cnn(C)c3)n2)ccc1CNC(=O)N1CC(O...,435.53,CHEMBL4744041,CHEMBL4744041,CHEMBL2971,Parent compound,
2100856,9.52,0,,CHEMBL5482939,9,,,,CHEMBL4744041,Small molecule (inferred),...,-1,0,-1,Cc1cc(-c2ccnc(Nc3cnn(C)c3)n2)ccc1CNC(=O)N1CC(O...,435.53,CHEMBL4744041,CHEMBL4744041,CHEMBL5251,Parent compound,
2100857,9.00,0,,CHEMBL5482940,9,,,,CHEMBL4744041,Small molecule (inferred),...,-1,0,-1,Cc1cc(-c2ccnc(Nc3cnn(C)c3)n2)ccc1CNC(=O)N1CC(O...,435.53,CHEMBL4744041,CHEMBL4744041,CHEMBL5251,Parent compound,


In [19]:
# Write the cleaned table to 'c.csv' in the current working directory.
# With default arguments the DataFrame index is written as the first column.
df_chembl.to_csv('c.csv')

In [21]:
# Sanity check: list the distinct molecule_type labels left after resolution
# and filtering (None marks rows whose type could not be resolved).
df_chembl['molecule_type'].unique()

array(['Small molecule', 'Protein', 'Small molecule (inferred)', None,
       'Oligosaccharide', 'Oligonucleotide'], dtype=object)