In [1]:
import sys
sys.path.append("../src")

from utils.chembl_rescue import rescue_from_chembl, load_or_init_chembl_cache, save_chembl_cache
from utils.molecule_cleanup import classify_relationship, resolve_molecule_type
from utils.pubchem_utils import fetch_pubchem_cid_from_chembl, check_peptide_by_pubchem, process_id, load_or_init_pubchem_cache, save_pubchem_cache

import chembl_downloader
import re
import pandas as pd
import time
import os
import subprocess
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
from IPython.core.magic import register_cell_magic  # Needed for %%skip Jupyter cell magic

@register_cell_magic
def skip(line, cell):
    """A cell magic that skips execution of the cell body."""
    print("Skipped cell")

if not os.path.exists("../data/df_chembl.csv") or not os.path.exists("../data/sql_chembl_cache.parquet"):
    print("Downloading cache files...")
    result = subprocess.run(["bash", "../get_cache.sh"], capture_output=True, text=True)
    print(result.stdout)
    if result.stderr:
        print("Errors:", result.stderr)

In [2]:
sql_chembl_cache_path = "../data/sql_chembl_cache.parquet"

sql_chembl = """
SELECT
    a.pchembl_value AS "pChEMBL",
    a.potential_duplicate AS "Duplicate? (1/0)",
    a.data_validity_comment AS "Data validity comment",
    ass.chembl_id AS "Assay ChEMBL ID",
    ass.confidence_score,
    dm.mechanism_of_action AS "MOA",
    dm.direct_interaction AS "Direct interaction (1/0)?",
    dm.disease_efficacy AS "Disease efficacy (1/0)?",
    md.chembl_id AS "Molecule ChEMBL ID",
    md.pref_name AS "Preferred name",
    md.molecule_type AS "Molecule type",
    md.max_phase AS "Max phase",
    md.availability_type AS "Availability (2/1/0/-1/-2)?",
    md.therapeutic_flag AS "Therapeutic (1/0)?",
    md.first_in_class AS "First-in-class (1/0/-1)?",
    md.chemical_probe AS "Chemical probe (1/0)?",
    md.orphan AS "Orphan (1/0/-1)?",
    cs.canonical_smiles AS "Canonical SMILES",
    cp.full_mwt AS "Molecular weight of full compound",
    cp.full_molformula AS "Molecular formula of full compound",
    md_parent.chembl_id AS "Parent compound ChEMBL ID",
    md_parent.molecule_type AS parent_molecule_type,
    md_active.chembl_id AS "Active ingredient ChEMBL ID",
    mh.molregno,
    mh.parent_molregno,
    mh.active_molregno,
    td.chembl_id AS "Target ChEMBL ID"
FROM activities a
LEFT JOIN assays ass ON a.assay_id = ass.assay_id
LEFT JOIN target_dictionary td ON ass.tid = td.tid
LEFT JOIN drug_mechanism dm ON a.molregno = dm.molregno
LEFT JOIN molecule_dictionary md ON a.molregno = md.molregno
LEFT JOIN compound_structures cs ON md.molregno = cs.molregno
LEFT JOIN compound_properties cp ON cs.molregno = cp.molregno
LEFT JOIN molecule_hierarchy mh ON md.molregno = mh.molregno
LEFT JOIN molecule_dictionary md_parent ON mh.parent_molregno = md_parent.molregno
LEFT JOIN molecule_dictionary md_active ON mh.active_molregno = md_active.molregno
WHERE a.pchembl_value IS NOT NULL
  AND td.target_type = 'SINGLE PROTEIN'
  AND td.organism = 'Homo sapiens';
"""
if os.path.exists(sql_chembl_cache_path):
    print("Loading cached ChEMBL data...")
    df_raw = pd.read_parquet(sql_chembl_cache_path)
    print("Data loaded.")
else:
    print("Running SQL query...")
    df_raw = chembl_downloader.query(sql_chembl)
    print("Query complete. Saving to cache...")
    df_raw.to_parquet(sql_chembl_cache_path, index=False)
    print("Save complete.")

for col in df_raw.columns:
    print('\nColumn:', df_raw[col].value_counts(dropna=False).head(10))  # Show top 10 frequent values

Loading cached ChEMBL data...
Data loaded.

Column: pChEMBL
4.40    62704
4.50    52019
4.60    51015
4.45    41123
4.90    40848
4.80    37562
5.00    37292
4.55    35516
4.70    30871
4.85    26617
Name: count, dtype: int64

Column: Duplicate? (1/0)
0    2005843
1      95016
Name: count, dtype: int64

Column: Data validity comment
None                  2100539
Manually validated        320
Name: count, dtype: int64

Column: Assay ChEMBL ID
CHEMBL1614458    76019
CHEMBL1614421    49886
CHEMBL1613914    39824
CHEMBL1614544    36448
CHEMBL1613842    33737
CHEMBL1614038    24248
CHEMBL1614502    24121
CHEMBL1614079    23084
CHEMBL1613838    22000
CHEMBL1614250    20674
Name: count, dtype: int64

Column: confidence_score
9    1623093
8     477766
Name: count, dtype: int64

Column: MOA
None                                                     1961774
Tyrosine-protein kinase receptor FLT3 inhibitor             7102
Stem cell growth factor receptor inhibitor                  4698
Tyrosine-pro

In [3]:
df_raw = df_raw[df_raw["Duplicate? (1/0)"] != 1].copy()  # Drop likely duplicates 
active_ingredients = set(df_raw.loc[df_raw["molregno"] != df_raw["active_molregno"], "active_molregno"])  # Flag prodrugs
df_raw["Active ingredient of prodrug?"] = df_raw["molregno"].apply(
    lambda x: "Yes" if x in active_ingredients else "No"
)
df_raw["Molecule type"] = df_raw.apply(resolve_molecule_type, axis=1)  # Resolve molecule type

# Identify molecules needing rescue
df_dropped = df_raw[df_raw["Molecule type"].isna()].copy()  
dropped_ids = df_dropped["Molecule ChEMBL ID"].dropna().unique().tolist()

# Load or initialise cache
cache_path = "../data/df_rescued_cache.csv"
rescue_cache, _ = load_or_init_chembl_cache(cache_path)

rescued_rows = []
unrecoverable_ids = []
for chembl_id in dropped_ids:
    if chembl_id in rescue_cache:
        row = rescue_cache[chembl_id]
        rescued = (chembl_id, row["Rescued Formula"], row["Rescued SMILES"], row["Rescued Name"], row["Rescued MW"])
        source = "CACHE"
    else:
        rescued = rescue_from_chembl(chembl_id)
        time.sleep(0.5)
        rescue_cache[chembl_id] = {
            "Molecule ChEMBL ID": chembl_id,
            "Rescued Formula": rescued[1],
            "Rescued SMILES": rescued[2],
            "Rescued Name": rescued[3],
            "Rescued MW": rescued[4]
        }
        source = "SCRAPED"
    if any(pd.notna(v) for v in rescued[1:]):
        print(f"[{source}] Rescued {chembl_id}: Formula={rescued[1]}, SMILES={rescued[2]}, Name={rescued[3]}, MW={rescued[4]}")
    else:
        unrecoverable_ids.append(chembl_id)
    rescued_rows.append(rescued)

# Save updated cache
save_chembl_cache(rescue_cache, cache_path)

# Merge rescued data into df_raw
rescue_cols = ["Rescued Formula", "Rescued SMILES", "Rescued Name", "Rescued MW"]
df_rescued = pd.DataFrame(rescued_rows, columns=[
    "Molecule ChEMBL ID", "Rescued Formula", "Rescued SMILES", "Rescued Name", "Rescued MW"
])
if not df_rescued.empty:
    # Remove existing rescue columns if they exist before merging again
    for col in rescue_cols:
        if col in df_raw.columns:
            df_raw.drop(columns=col, inplace=True)
    df_raw = df_raw.merge(df_rescued, on="Molecule ChEMBL ID", how="left")

# Ensure rescue columns exist before combine_first for idempotency
for col in rescue_cols:
    if col not in df_raw.columns:
        df_raw[col] = pd.NA

# Proceed with safe combine
df_raw["Molecular formula of full compound"] = df_raw["Molecular formula of full compound"].combine_first(df_raw["Rescued Formula"])
df_raw["Canonical SMILES"] = df_raw["Canonical SMILES"].combine_first(df_raw["Rescued SMILES"])
df_raw["Preferred name"] = df_raw["Preferred name"].combine_first(df_raw["Rescued Name"])
df_raw["Molecular weight of full compound"] = df_raw["Molecular weight of full compound"].combine_first(df_raw["Rescued MW"])

# Re-resolve molecule type after rescue
df_raw["Molecule type"] = df_raw.apply(resolve_molecule_type, axis=1)

# Final filtering
final_dropped_df = df_raw[df_raw["Molecule type"].isna()]
final_dropped_ids = final_dropped_df["Molecule ChEMBL ID"].dropna().unique()
pd.Series(final_dropped_ids).to_csv("../data/final_dropped_ids.csv", index=False)
print(
    f"Dropped {final_dropped_df.shape[0]} activity rows where Molecule type was None and unrecoverable.\n"
    f"This translates to {final_dropped_ids.shape[0]} unique molecules dropped."
)

# Clean final dataset and save into df_chembl
columns_to_drop = [
    "parent_molecule_type", "molregno", "parent_molregno", "active_molregno",
    "Rescued Formula", "Rescued SMILES", "Rescued Name", "Rescued MW", "Duplicate? (1/0)"
]
df_chembl = df_raw.drop(columns=columns_to_drop, errors="ignore")
for col in df_chembl.columns:
    print('\nColumn:', col)
    print(df_chembl[col].value_counts(dropna=False).head(10))
df_chembl.to_csv("../data/df_chembl.csv", index=False)

[CACHE] Rescued CHEMBL3138733: Formula=H34Cl6N14O2Ru3, SMILES=nan, Name=nan, MW=778.3
[CACHE] Rescued CHEMBL5086057: Formula=C95H131Cu2IN20O27S2, SMILES=nan, Name=nan, MW=2303.72
[CACHE] Rescued CHEMBL5087277: Formula=C97H127Cu2IN20O27S2, SMILES=nan, Name=nan, MW=2323.71
[CACHE] Rescued CHEMBL5089315: Formula=C97H128Cu2IN21O26S2, SMILES=nan, Name=nan, MW=2322.73
[CACHE] Rescued CHEMBL5087925: Formula=C98H131Cu2N21O26S2, SMILES=nan, Name=nan, MW=2210.86
[CACHE] Rescued CHEMBL5092357: Formula=C31H46Cu2IN5O10, SMILES=nan, Name=nan, MW=903.11
[CACHE] Rescued CHEMBL5196854: Formula=C21H14FN5O2V, SMILES=nan, Name=nan, MW=438.32
Dropped 1133 activity rows where Molecule type was None and unrecoverable.
This translates to 508 unique molecules dropped.

Column: pChEMBL
pChEMBL
4.40    62635
4.50    51983
4.60    50865
4.45    41109
4.90    40808
4.80    37485
5.00    36934
4.55    35441
4.70    30743
4.85    26539
Name: count, dtype: int64

Column: Data validity comment
Data validity comment
No

In [4]:
%%skip

df_chembl = pd.read_csv("../data/df_chembl.csv")
unique_chembl_ids = (
    df_chembl[df_chembl["Molecule type"] == "TBD"]["Molecule ChEMBL ID"]
    .dropna()
    .drop_duplicates()
    .tolist()
)

cache_path = "../data/pubchem_peptide_inference_cache.csv"
if os.path.exists(cache_path):
    df_cache = pd.read_csv(cache_path)
    df_cache = df_cache.drop_duplicates("Molecule ChEMBL ID")
    cached_ids = set(df_cache["Molecule ChEMBL ID"])
else:
    df_cache = pd.DataFrame(columns=["Molecule ChEMBL ID", "PubChem CID", "Is Peptide", "Last Updated"])
    cached_ids = set()

uncached_ids = [chembl_id for chembl_id in unique_chembl_ids if chembl_id not in cached_ids]
print(f"Total molecules to check: {len(uncached_ids)}")

# Process uncached ChEMBL IDs
batch_size = 1000
total_batches = (len(uncached_ids) + batch_size - 1) // batch_size
new_results = []
print(f"Total molecules to check: {len(uncached_ids)}")

for i in range(0, len(uncached_ids), batch_size):
    batch_number = i // batch_size + 1
    batch = uncached_ids[i:i + batch_size]
    print(f"Processing batch {batch_number} out of {total_batches} ({len(batch)} molecules)")
    for chembl_id in batch:
        cid = fetch_pubchem_cid_from_chembl(chembl_id)
        is_peptide = check_peptide_by_pubchem(cid) if cid else False
        timestamp = datetime.utcnow().isoformat()
        new_results.append((chembl_id, cid, is_peptide, timestamp))
        time.sleep(0.3)
    # Append to cache
    df_new = pd.DataFrame(new_results, columns=["Molecule ChEMBL ID", "PubChem CID", "Is Peptide", "Last Updated"])
    df_cache = pd.concat([df_cache, df_new], ignore_index=True).drop_duplicates("Molecule ChEMBL ID")
    df_cache.to_csv(cache_path, index=False)
    print(f"Added {len(df_new)} entries to cache.")
    new_results = []
print("PubChem peptide inference complete.")

Skipped cell


In [5]:
df_chembl = pd.read_csv("../data/df_chembl.csv", low_memory=False)
unique_chembl_ids = (
    df_chembl[df_chembl["Molecule type"] == "TBD"]["Molecule ChEMBL ID"]
    .dropna()
    .drop_duplicates()
    .tolist()
)

cache_path = "../data/pubchem_peptide_inference_cache.csv"
df_cache, cached_ids = load_or_init_pubchem_cache(cache_path)

uncached_ids = sorted(set(unique_chembl_ids) - cached_ids)
batch_size = 25
max_workers = 10

for i in range(0, len(uncached_ids), batch_size):
    batch = uncached_ids[i:i + batch_size]
    print(f"Processing batch {i//batch_size + 1} of {len(uncached_ids) // batch_size + 1}...")
    new_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_id, chembl_id) for chembl_id in batch]
        for future in as_completed(futures):
            try:
                new_results.append(future.result())
            except Exception as e:
                print(f"Error: {e}")
    df_new = pd.DataFrame(new_results, columns=["Molecule ChEMBL ID", "PubChem CID", "Is Peptide", "Last Updated"])
    df_cache = pd.concat([df_cache, df_new], ignore_index=True).drop_duplicates("Molecule ChEMBL ID")
    save_pubchem_cache(df_cache, cache_path)
print("PubChem peptide inference complete.")

Processing batch 1 of 1943...
CID fetch failed for CHEMBL5076154 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076154/cids/JSONCID fetch failed for CHEMBL5076148 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076148/cids/JSON
CID fetch failed for CHEMBL5076161 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076161/cids/JSON

CID fetch failed for CHEMBL5076146 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076146/cids/JSON
CID fetch failed for CHEMBL5076151 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076151/cids/JSON
CID fetch failed for CHEMBL5076149 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076149/ci

  df_cache = pd.concat([df_cache, df_new], ignore_index=True).drop_duplicates("Molecule ChEMBL ID")


Processing batch 2 of 1943...
CID fetch failed for CHEMBL5076212 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076212/cids/JSON
CID fetch failed for CHEMBL5076213 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076213/cids/JSON
CID fetch failed for CHEMBL5076226 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076226/cids/JSON
CID fetch failed for CHEMBL5076223 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076223/cids/JSON
CID fetch failed for CHEMBL5076218 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076218/cids/JSON
CID fetch failed for CHEMBL5076220 — 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL5076220/ci

KeyboardInterrupt: 

In [None]:
print("Running update_cache.sh to upload latest cache files...")
result = subprocess.run(["../update_cache.sh"], capture_output=True, text=True)
print(result.stdout)
if result.stderr:
    print("Errors:", result.stderr)