In [2]:
import requests
import pandas as pd
import json
import os
import re
import time
import seaborn as sns
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from chembl_webresource_client.new_client import new_client

  __version__ = __import__('pkg_resources').get_distribution('chembl_webresource_client').version


In [3]:
assay = new_client.assay
activity = new_client.activity
molecule = new_client.molecule

#### Extract assay type A in Human

In [None]:
batch_size = 20
offset = 0
max_records = 100000  # or None for full
results = []
rawfile = "../data/raw/test_100k.csv"

while True:
    print(f"Fetching records {offset} to {offset + batch_size}...")
    
    batch = assay.filter(
        assay_type='A',
        assay_organism__iexact='Homo sapiens'
    ).only(['assay_type', 'description', 'assay_chembl_id', 'assay_organism'])[offset:offset + batch_size]

    if not batch:
        break

    df_batch = pd.DataFrame(batch)
    results.append(df_batch)

    offset += batch_size
    if max_records and offset >= max_records:
        break

    time.sleep(0.5)  # adjust delay if needed

if results:
    final_df = pd.concat(results, ignore_index=True)
    final_df.to_csv(rawfile, index=False)
    print(f"\n✅ Wrote {len(final_df)} records to {rawfile}")
else:
    print("\n⚠️ No data retrieved.")

#### Retrieve HLM activities

In [6]:
%%time
df = pd.read_csv(rawfile)
#pattern = r'(?i)\b(?:CL|microsome|HLM_CL|HLM_clearance|HLM_half_life|HLM_stability)\b' # more specific
pattern = r'(?i)\b(?:CL|microsome|HLM|stability)\b' # broader
hlm_df = df[df['description'].str.contains(pattern, na=False)]
hlm_assay_ids = hlm_df['assay_chembl_id'].tolist()
hlm_df.shape

CPU times: user 263 ms, sys: 19.6 ms, total: 283 ms
Wall time: 282 ms


(7037, 4)

In [None]:
batch_size = 1000  # Adjust as needed
all_activities = []

for i in range(0, len(hlm_assay_ids), batch_size):
    batch = hlm_assay_ids[i:i + batch_size]
    print(f"Fetching batch {i // batch_size + 1} of {len(hlm_assay_ids) // batch_size + 1}")
    res = activity.filter(assay_chembl_id__in=batch).only([
        'molecule_chembl_id', 'assay_chembl_id',
        'standard_value', 'standard_units', 'standard_type',
        'standard_relation', 'document_chembl_id', 'description'
    ])
    all_activities.extend(res)
    time.sleep(0.5)  # Avoid hammering the API

activities_df = pd.DataFrame(all_activities)
activities_df.shape

In [None]:
activities_df.to_csv('../data/mols_hlm.csv', index=False)

#### Retrieve molecules

In [7]:
activities_df = pd.read_csv("../data/mols_hlm.csv")

In [9]:
unique_molecule_ids = activities_df['molecule_chembl_id'].unique().tolist()
len(unique_molecule_ids)

26608

In [11]:
res = molecule.filter(chembl_id__in=['CHEMBL152844']).only([
    'molecule_chembl_id',
    'molecule_structures'
])

In [12]:
len(res)

2496335

In [None]:
mols_df = pd.DataFrame(res)
mols_df['mol_smi'] = mols_df['molecule_structures'].apply(lambda x: x.get('canonical_smiles') if isinstance(x, dict) else None)
mols_df

In [None]:
batch_size = 1000
all_molecules = []

for i in range(0, len(unique_molecule_ids), batch_size):
    batch = unique_molecule_ids[i:i + batch_size]
    print(f"Fetching batch {i // batch_size + 1} of {(len(unique_molecule_ids) + batch_size - 1) // batch_size}")
    
    res = molecule.filter(chembl_id__in=batch).only([
        'molecule_chembl_id',
        'molecule_structures'
    ])
    
    all_molecules.extend(res)
    time.sleep(0.5)  # Rate limit friendly

mols_df = pd.DataFrame(all_molecules)
print(f"Retrieved {len(mols_df)} molecules.")

#### Analysis + Plot

In [None]:
activities_df.columns

In [None]:
activities_df['standard_type'].value_counts().head(29)

In [None]:
tmp['document_chembl_id'].value_counts().head(3)

In [None]:
plt.figure(figsize=(12, 6))
ax = sns.countplot(full_data, x='standard_type')
plt.xticks(rotation=45, ha='right')

In [None]:
%%time
# 10k = 98s
assays = assay.filter(
    assay_type='A',
    assay_organism__iexact='Homo sapiens'
).only(['assay_type', 'description', 'assay_chembl_id', 'assay_organism'])[:10000]
df = pd.DataFrame(assays)

#### Filter subset

In [None]:
filt = activities_df['standard_type'] == 'CL'
tmp = activities_df[filt]

In [None]:
filt = activities_df['document_chembl_id'] == 'CHEMBL4342426'
tmp = activities_df[filt]

In [None]:
tmp.head(5)

#### Look at molecules

In [None]:
full_data['mol_smi'] = full_data['molecule_structures'].apply(lambda x: x.get('canonical_smiles') if isinstance(x, dict) else None)

In [None]:
PandasTools.AddMoleculeColumnToFrame(
    tmp, 
    smilesCol='mol_smi',  # Column containing SMILES strings
    molCol='Molecule',             # Name of new column to create
    includeFingerprints=False      # Set to True if you need fingerprints
)

#### Extract hERG

In [None]:
from chembl_webresource_client.new_client import new_client

target = new_client.target
activity = new_client.activity
herg = target.filter(pref_name__iexact='hERG').only('target_chembl_id')[0]
herg_activities = activity.filter(target_chembl_id=herg['target_chembl_id']).filter(standard_type="IC50")

herg_activities

#### Extract HLM data

In [None]:
def fetch_metabolic_assays(limit=1000):
    url = f"https://www.ebi.ac.uk/chembl/api/data/assay.json?assay_type=ADME&limit={limit}"
    assays = []
    while url:
        res = requests.get(url)
        res.raise_for_status()
        data = res.json()
        for a in data["assays"]:
            if "metabolic" in (a.get("description") or "").lower():
                assays.append(a["assay_chembl_id"])
        url = data["page_meta"]["next"]
    return assays

def fetch_activities_for_assays(assay_ids, limit=1000):
    activities = []
    for assay_id in assay_ids:
        url = f"https://www.ebi.ac.uk/chembl/api/data/activity.json?assay_chembl_id={assay_id}&limit={limit}"
        while url:
            res = requests.get(url)
            res.raise_for_status()
            data = res.json()
            activities.extend(data["activities"])
            url = data["page_meta"]["next"]
    return activities

In [None]:
assay_ids = fetch_metabolic_assays()
print(f"Found {len(assay_ids)} assays likely related to metabolic stability.")

activities = fetch_activities_for_assays(assay_ids[:5])  # You can increase the slice
print(f"Retrieved {len(activities)} activity records.")

#### Get molecules w/ logP

In [None]:
def fetch_molecules_with_logp_and_docs(limit=100, max_pages=None, verbose=True):
    url = f"{base_url}/chembl/api/data/molecule.json?logp__isnull=false&molecule_documents__isnull=false&limit={limit}"
    molecules = []
    page = 0
    
    while url and (max_pages is None or page < max_pages):
        if verbose:
            print(f"Fetching page {page + 1}...")
        res = requests.get(url)
        res.raise_for_status()
        data = res.json()
        molecules.extend(data['molecules'])

        next_url = data['page_meta']['next']
        url = f"{base_url}{next_url}" if next_url else None
        page += 1

    return molecules

In [None]:
molecules = fetch_molecules_with_logp_and_docs(limit=50, max_pages=1)

with open("../data/raw/chembl_logP_molecules.json", "w") as f:
    json.dump(molecules, f, indent=2)

print(f"Fetched {len(molecules)} molecules with logP.")

In [None]:
df = pd.DataFrame(molecules)
df.columns

In [None]:
df = df[['molecule_structures', 'molecule_chembl_id', 'molecule_properties']]
df.head(2)

In [None]:
# Test API with a known molecule Chembl ID
chembl_id = "CHEMBL6329"
url = f"https://www.ebi.ac.uk/chembl/api/data/document.json?molecule_chembl_id={chembl_id}"
res = requests.get(url)
res.raise_for_status()
data = res.json()
documents = data.get("documents", [])
df = pd.DataFrame(documents)
df.head(1)

In [None]:
# Get all activities in CHemBL 
url = "https://www.ebi.ac.uk/chembl/api/data/activity.json?limit=1000"
res = requests.get(url)
res.raise_for_status()
data = res.json()

types = {a["standard_type"] for a in data["activities"] if a.get("standard_type")}
types = sorted(types)
for t in types:
    print(t)

#### Get document_id for subset

In [None]:
# Function to fetch documents associated with a molecule
def get_documents_for_molecule(chembl_id):
    url = f"https://www.ebi.ac.uk/chembl/api/data/document.json?molecule_chembl_id={chembl_id}"
    res = requests.get(url)
    res.raise_for_status()
    return res.json().get("documents", [])

In [None]:
chembl_id = "CHEMBL25"
documents = get_documents_for_molecule(chembl_id)
documents

In [None]:
# Add document IDs to each molecule
def fetch_documents_for_molecules(molecules):
    doc_map = {}
    for mol in molecules:
        chembl_id = mol.get("molecule_chembl_id")
        if chembl_id:
            docs = get_documents_for_molecule(chembl_id)
            doc_ids = [d["document_chembl_id"] for d in docs]
            doc_map[chembl_id] = doc_ids
    return doc_map

In [None]:
doc_map = fetch_documents_for_molecules(molecules)

In [None]:
doc_map

In [None]:
df = pd.DataFrame(molecules)

In [None]:
df["logP"] = df["molecule_properties"].apply(
    lambda x: x.get("alogp") if isinstance(x, dict) else None
)

In [None]:
df["document_ids"] = df["molecule_chembl_id"].map(doc_map)

# Show merged DataFrame
df_merged = df[["molecule_chembl_id", "logP", "document_ids"]]
print(df_merged.head())

#### Get molecules from document_id

In [None]:
def get_molecules_for_document(document_id):
    url = f"https://www.ebi.ac.uk/chembl/api/data/molecule_document.json?document_chembl_id={document_id}"
    res = requests.get(url)
    res.raise_for_status()
    return res.json().get("molecule_documents", [])

In [None]:
# Example: Get molecules associated with a document
document_id = "CHEMBL1158643"
molecule_docs = get_molecules_for_document(document_id)

# Print out the molecule ChEMBL IDs
molecule_ids = [doc["molecule_chembl_id"] for doc in molecule_docs]
print(f"Molecule ChEMBL IDs for document {document_id}: {molecule_ids}")

In [None]:
def fetch_molecules_with_logp_and_patents(limit=100, max_pages=None, verbose=True):
    base_url = "https://www.ebi.ac.uk"
    url = f"{base_url}/chembl/api/data/molecule.json?logp__isnull=false&molecule_patents__isnull=false&limit={limit}"
    molecules = []
    page = 0

    while url and (max_pages is None or page < max_pages):
        if verbose:
            print(f"Fetching page {page + 1}...")
        res = requests.get(url)
        res.raise_for_status()
        data = res.json()
        molecules.extend(data['molecules'])

        next_url = data['page_meta'].get('next')
        url = f"{base_url}{next_url}" if next_url else None
        page += 1

    return molecules

def fetch_molecules_with_logp_smiles_and_patents(limit=100, max_pages=None, verbose=True):
    base_url = "https://www.ebi.ac.uk"
    url = f"{base_url}/chembl/api/data/molecule.json?logp__isnull=false&molecule_patents__isnull=false&limit={limit}"
    molecules = []
    page = 0

    while url and (max_pages is None or page < max_pages):
        if verbose:
            print(f"Fetching page {page + 1}...")
        res = requests.get(url)
        res.raise_for_status()
        data = res.json()
        molecules.extend(data['molecules'])

        next_url = data['page_meta'].get('next')
        url = f"{base_url}{next_url}" if next_url else None
        page += 1

    # Fetch SMILES and Patent IDs for each molecule
    molecules_with_smiles_and_patents = []
    for molecule in molecules:
        patent_ids = []
        smiles = None
        
        # Get SMILES from molecule_structures
        if 'molecule_structures' in molecule:
            smiles = molecule['molecule_structures'].get('canonical_smiles', None)
        
        # Get patent IDs from molecule_patents
        if 'molecule_patents' in molecule:
            patent_ids = [patent['patent_chembl_id'] for patent in molecule['molecule_patents']]
        
        # Add SMILES and patent IDs to molecule data
        molecule['smiles'] = smiles
        molecule['patent_ids'] = patent_ids
        molecules_with_smiles_and_patents.append(molecule)

    return molecules_with_smiles_and_patents