In [6]:
import json
import os
from urllib.parse import urljoin

import pandas as pd
import requests
from tqdm.notebook import tqdm

In [15]:
# Make sure the raw-data output directory exists; exist_ok=True makes re-running this cell a no-op.
os.makedirs("../data/raw/", exist_ok=True)

In [19]:
def fetch_molecules_with_logp_and_docs(limit=1000, max_pages=None, verbose=True, timeout=30):
    """Download molecule records from the ChEMBL REST API, one page at a time.

    Starts at the ``molecule.json`` endpoint and keeps following the
    ``page_meta.next`` link of each response until there is no next page or
    ``max_pages`` pages have been fetched.

    Args:
        limit: Page size sent to the API as the ``limit`` query parameter.
        max_pages: Maximum number of pages to request; ``None`` means follow
            the ``next`` links until the API reports no further page.
        verbose: When true, print a ``Fetching page N...`` line per request.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        A list with the concatenated ``molecules`` entries of every page
        fetched (at most ``limit * max_pages`` records when ``max_pages`` is set).

    Raises:
        requests.HTTPError: If any response has an error status code.
        requests.Timeout: If a request exceeds ``timeout``.
        KeyError: If a response lacks the ``molecules`` or ``page_meta`` keys.
    """
    base_url = "https://www.ebi.ac.uk"
    # NOTE(review): the returned records keep their properties nested under
    # `molecule_properties` and have no top-level `logp` field -- confirm that the
    # `logp__isnull` / `molecule_documents__isnull` filters are actually honoured
    # by the API rather than silently ignored.
    url = f"{base_url}/chembl/api/data/molecule.json?logp__isnull=false&molecule_documents__isnull=false&limit={limit}"
    molecules = []

    page = 0
    # One Session for the whole crawl so the connection is reused across pages.
    with requests.Session() as session:
        while url and (max_pages is None or page < max_pages):
            if verbose:
                print(f"Fetching page {page + 1}...")
            res = session.get(url, timeout=timeout)
            res.raise_for_status()
            data = res.json()
            molecules.extend(data['molecules'])

            next_url = data['page_meta']['next']
            # urljoin copes with both a host-relative path and an absolute URL.
            url = urljoin(base_url, next_url) if next_url else None
            page += 1

    return molecules


In [20]:
# Fetch up to limit * max_pages = 200 * 10 = 2,000 molecule records.
# NOTE(review): the recorded output below reports 10000 molecules, which does not
# match these arguments -- it looks like it came from an earlier run with a larger
# page size; re-run the cell to refresh it.
molecules = fetch_molecules_with_logp_and_docs(max_pages=10, limit=200)

# Write the records exactly as returned by the API, pretty-printed as JSON.
with open("../data/raw/chembl_logp_molecules.json", "w") as out_file:
    out_file.write(json.dumps(molecules, indent=2))

print(f"Fetched {len(molecules)} molecules with logP.")


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetched 10000 molecules with logP.


In [None]:
def get_documents_for_molecule(chembl_id, timeout=30):
    """Fetch the document records the ChEMBL API returns for one molecule ID.

    Args:
        chembl_id: Molecule ChEMBL identifier, sent as the
            ``molecule_chembl_id`` query parameter.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot hang the calling loop.

    Returns:
        The ``documents`` list from the JSON response, or an empty list when
        the response has no such key.

    Raises:
        requests.HTTPError: If the response has an error status code.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    # NOTE(review): only the first response page is read. If this endpoint
    # paginates, molecules with many documents will be truncated; also confirm
    # the API really filters on `molecule_chembl_id` before following `next` links.
    url = "https://www.ebi.ac.uk/chembl/api/data/document.json"
    # Passing the ID via `params` lets requests URL-encode it.
    res = requests.get(url, params={"molecule_chembl_id": chembl_id}, timeout=timeout)
    res.raise_for_status()
    return res.json().get("documents", [])

# Build a lookup from each molecule's ChEMBL ID to the IDs of its documents.
doc_map = {}
for mol in molecules:
    chembl_id = mol.get("molecule_chembl_id")
    if not chembl_id:
        # A record without an identifier cannot be queried; leave it out.
        continue
    doc_map[chembl_id] = [
        doc["document_chembl_id"] for doc in get_documents_for_molecule(chembl_id)
    ]


In [22]:
# Load the fetched molecule records into a DataFrame (expects `molecules` from the fetch cell).
df = pd.DataFrame(molecules)

In [24]:
# Show the column names to see which top-level fields each molecule record carries.
df.columns

Index(['atc_classifications', 'availability_type', 'biotherapeutic',
       'cross_references', 'dosed_ingredient', 'first_approval',
       'first_in_class', 'helm_notation', 'indication_class', 'inorganic_flag',
       'max_phase', 'molecule_chembl_id', 'molecule_hierarchy',
       'molecule_properties', 'molecule_structures', 'molecule_synonyms',
       'molecule_type', 'natural_product', 'oral', 'orphan', 'parenteral',
       'polymer_flag', 'pref_name', 'prodrug', 'structure_type',
       'therapeutic_flag', 'topical', 'usan_stem', 'usan_stem_definition',
       'usan_substem', 'usan_year', 'withdrawn_flag'],
      dtype='object')