In [4]:
import requests
import pandas as pd
import json
import os
from tqdm.notebook import tqdm

In [13]:
os.makedirs("../data/raw/", exist_ok=True)
base_url = "https://www.ebi.ac.uk"

In [6]:
# Workflow: get logP +doc_id mols -> get docids from mols ids

#### Get molecules w/ logP

In [33]:
def fetch_molecules_with_logp_and_docs(limit=100, max_pages=None, verbose=True):
    url = f"{base_url}/chembl/api/data/molecule.json?logp__isnull=false&molecule_documents__isnull=false&limit={limit}"
    molecules = []
    page = 0
    
    while url and (max_pages is None or page < max_pages):
        if verbose:
            print(f"Fetching page {page + 1}...")
        res = requests.get(url)
        res.raise_for_status()
        data = res.json()
        molecules.extend(data['molecules'])

        next_url = data['page_meta']['next']
        url = f"{base_url}{next_url}" if next_url else None
        page += 1

    return molecules

def fetch_molecules_with_logp_and_patents(limit=100, max_pages=None, verbose=True):
    base_url = "https://www.ebi.ac.uk"
    url = f"{base_url}/chembl/api/data/molecule.json?logp__isnull=false&molecule_patents__isnull=false&limit={limit}"
    molecules = []
    page = 0

    while url and (max_pages is None or page < max_pages):
        if verbose:
            print(f"Fetching page {page + 1}...")
        res = requests.get(url)
        res.raise_for_status()
        data = res.json()
        molecules.extend(data['molecules'])

        next_url = data['page_meta'].get('next')
        url = f"{base_url}{next_url}" if next_url else None
        page += 1

    return molecules

In [42]:
def fetch_molecules_with_logp_smiles_and_patents(limit=100, max_pages=None, verbose=True):
    base_url = "https://www.ebi.ac.uk"
    url = f"{base_url}/chembl/api/data/molecule.json?logp__isnull=false&molecule_patents__isnull=false&limit={limit}"
    molecules = []
    page = 0

    while url and (max_pages is None or page < max_pages):
        if verbose:
            print(f"Fetching page {page + 1}...")
        res = requests.get(url)
        res.raise_for_status()
        data = res.json()
        molecules.extend(data['molecules'])

        next_url = data['page_meta'].get('next')
        url = f"{base_url}{next_url}" if next_url else None
        page += 1

    # Fetch SMILES and Patent IDs for each molecule
    molecules_with_smiles_and_patents = []
    for molecule in molecules:
        patent_ids = []
        smiles = None
        
        # Get SMILES from molecule_structures
        if 'molecule_structures' in molecule:
            smiles = molecule['molecule_structures'].get('canonical_smiles', None)
        
        # Get patent IDs from molecule_patents
        if 'molecule_patents' in molecule:
            patent_ids = [patent['patent_chembl_id'] for patent in molecule['molecule_patents']]
        
        # Add SMILES and patent IDs to molecule data
        molecule['smiles'] = smiles
        molecule['patent_ids'] = patent_ids
        molecules_with_smiles_and_patents.append(molecule)

    return molecules_with_smiles_and_patents


In [43]:
# Fetch the first 100 molecules with logP
molecules = fetch_molecules_with_logp_and_patents(limit=100, max_pages=1)

# Save raw data
with open("../data/raw/chembl_logp_molecules.json", "w") as f:
    json.dump(molecules, f, indent=2)

print(f"Fetched {len(molecules)} molecules with logP.")

Fetching page 1...
Fetched 100 molecules with logP.


In [44]:
df = pd.DataFrame(molecules)
df.columns

Index(['atc_classifications', 'availability_type', 'biotherapeutic',
       'cross_references', 'dosed_ingredient', 'first_approval',
       'first_in_class', 'helm_notation', 'indication_class', 'inorganic_flag',
       'max_phase', 'molecule_chembl_id', 'molecule_hierarchy',
       'molecule_properties', 'molecule_structures', 'molecule_synonyms',
       'molecule_type', 'natural_product', 'oral', 'orphan', 'parenteral',
       'polymer_flag', 'pref_name', 'prodrug', 'structure_type',
       'therapeutic_flag', 'topical', 'usan_stem', 'usan_stem_definition',
       'usan_substem', 'usan_year', 'withdrawn_flag', 'patent_ids'],
      dtype='object')

In [45]:
df = df[['molecule_structures', 'molecule_chembl_id', 'patent_ids']]

In [46]:
df

Unnamed: 0,molecule_structures,molecule_chembl_id,patent_ids
0,{'canonical_smiles': 'Cc1cc(-n2ncc(=O)[nH]c2=O...,CHEMBL6329,[]
1,{'canonical_smiles': 'Cc1cc(-n2ncc(=O)[nH]c2=O...,CHEMBL6328,[]
2,{'canonical_smiles': 'Cc1cc(-n2ncc(=O)[nH]c2=O...,CHEMBL265667,[]
3,{'canonical_smiles': 'Cc1ccc(C(=O)c2ccc(-n3ncc...,CHEMBL6362,[]
4,{'canonical_smiles': 'Cc1cc(-n2ncc(=O)[nH]c2=O...,CHEMBL267864,[]
...,...,...,...
95,{'canonical_smiles': 'CCCC(C)NC(=O)OCC1CN(C(=O...,CHEMBL269133,[]
96,{'canonical_smiles': 'O=C(C1CCCCN1)N1CCN(Cc2cc...,CHEMBL6346,[]
97,{'canonical_smiles': 'c1cncc(CN2CCN(C[C@@H]3CC...,CHEMBL414181,[]
98,{'canonical_smiles': 'c1cc(CN2CCN(C[C@@H]3CCCN...,CHEMBL6334,[]


In [47]:
# Test with a known molecule Chembl ID (replace with one you know should have patents)
chembl_id = "CHEMBL6329"
url = f"https://www.ebi.ac.uk/chembl/api/data/document.json?molecule_chembl_id={chembl_id}"
res = requests.get(url)
res.raise_for_status()
data = res.json()
print(data)


{'documents': [{'abstract': '', 'authors': None, 'chembl_release': {'chembl_release': 'CHEMBL_7', 'creation_date': '2010-09-29'}, 'contact': None, 'doc_type': 'DATASET', 'document_chembl_id': 'CHEMBL1158643', 'doi': None, 'doi_chembl': None, 'first_page': None, 'issue': None, 'journal': None, 'journal_full_title': None, 'last_page': None, 'patent_id': None, 'pubmed_id': None, 'src_id': 0, 'title': 'Unpublished dataset', 'volume': None, 'year': None}, {'abstract': '', 'authors': 'Clader JW.', 'chembl_release': {'chembl_release': 'CHEMBL_1', 'creation_date': '2009-09-03'}, 'contact': None, 'doc_type': 'PUBLICATION', 'document_chembl_id': 'CHEMBL1139451', 'doi': '10.1021/jm030283g', 'doi_chembl': None, 'first_page': '1', 'issue': '1', 'journal': 'J Med Chem', 'journal_full_title': 'Journal of medicinal chemistry.', 'last_page': '9', 'patent_id': None, 'pubmed_id': 14695813, 'src_id': 1, 'title': 'The discovery of ezetimibe: a view from outside the receptor.', 'volume': '47', 'year': 2004}

#### Get document_id for subset

In [21]:
# Function to fetch documents associated with a molecule
def get_documents_for_molecule(chembl_id):
    url = f"https://www.ebi.ac.uk/chembl/api/data/document.json?molecule_chembl_id={chembl_id}"
    res = requests.get(url)
    res.raise_for_status()
    return res.json().get("documents", [])

In [None]:
chembl_id = "CHEMBL25"
documents = get_documents_for_molecule(chembl_id)
documents

In [24]:
# Add document IDs to each molecule
def fetch_documents_for_molecules(molecules):
    doc_map = {}
    for mol in molecules:
        chembl_id = mol.get("molecule_chembl_id")
        if chembl_id:
            docs = get_documents_for_molecule(chembl_id)
            doc_ids = [d["document_chembl_id"] for d in docs]
            doc_map[chembl_id] = doc_ids
    return doc_map

In [25]:
doc_map = fetch_documents_for_molecules(molecules)

In [None]:
doc_map

In [27]:
df = pd.DataFrame(molecules)

In [28]:
df["logP"] = df["molecule_properties"].apply(
    lambda x: x.get("alogp") if isinstance(x, dict) else None
)

In [30]:
df["document_ids"] = df["molecule_chembl_id"].map(doc_map)

# Show merged DataFrame
df_merged = df[["molecule_chembl_id", "logP", "document_ids"]]
print(df_merged.head())

  molecule_chembl_id  logP                                       document_ids
0         CHEMBL6329  2.11  [CHEMBL1158643, CHEMBL1139451, CHEMBL1148466, ...
1         CHEMBL6328  1.33  [CHEMBL1158643, CHEMBL1139451, CHEMBL1148466, ...
2       CHEMBL265667  2.27  [CHEMBL1158643, CHEMBL1139451, CHEMBL1148466, ...
3         CHEMBL6362  1.46  [CHEMBL1158643, CHEMBL1139451, CHEMBL1148466, ...
4       CHEMBL267864  2.11  [CHEMBL1158643, CHEMBL1139451, CHEMBL1148466, ...


#### Get molecules from document_id

In [31]:
def get_molecules_for_document(document_id):
    url = f"https://www.ebi.ac.uk/chembl/api/data/molecule_document.json?document_chembl_id={document_id}"
    res = requests.get(url)
    res.raise_for_status()
    return res.json().get("molecule_documents", [])

In [32]:
# Example: Get molecules associated with a document
document_id = "CHEMBL1158643"
molecule_docs = get_molecules_for_document(document_id)

# Print out the molecule ChEMBL IDs
molecule_ids = [doc["molecule_chembl_id"] for doc in molecule_docs]
print(f"Molecule ChEMBL IDs for document {document_id}: {molecule_ids}")

HTTPError: 404 Client Error: Not Found for url: https://www.ebi.ac.uk/chembl/api/data/molecule_document.json?document_chembl_id=CHEMBL1158643