In [2]:
import requests
import pandas as pd
import json
import os

In [3]:
# Host of the EBI web services; API paths are appended to this value.
base_url = "https://www.ebi.ac.uk"

# Make sure the raw-data output directory exists (a no-op when it already does).
os.makedirs("../data/raw/", exist_ok=True)

In [4]:
# Workflow: fetch molecules that have a logP value and linked documents -> look up the document IDs for each molecule ID

#### Get molecules with logP

In [7]:
def fetch_molecules_with_logp_and_docs(limit=100, max_pages=None, verbose=True, timeout=30):
    """Page through the ChEMBL molecule endpoint and collect the returned molecules.

    The request carries the filters ``logp__isnull=false`` and
    ``molecule_documents__isnull=false``. After each response the
    ``page_meta.next`` link is followed until it is missing/empty or
    ``max_pages`` pages have been fetched. The host is taken from the
    notebook-level ``base_url``.

    NOTE(review): the filter names are sent exactly as written; confirm against
    the ChEMBL API documentation that the molecule resource recognises both.

    Args:
        limit: Page size sent as the ``limit`` query parameter.
        max_pages: Maximum number of pages to request; ``None`` keeps going
            until the API reports no next page.
        verbose: When true, print a progress line before every request.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the notebook indefinitely.

    Returns:
        list: The concatenated ``molecules`` entries of every fetched page.

    Raises:
        requests.HTTPError: If a response has an error status code.
    """
    url = (
        f"{base_url}/chembl/api/data/molecule.json"
        f"?logp__isnull=false&molecule_documents__isnull=false&limit={limit}"
    )
    molecules = []
    page = 0

    while url and (max_pages is None or page < max_pages):
        if verbose:
            print(f"Fetching page {page + 1}...")
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        data = res.json()
        molecules.extend(data['molecules'])

        # .get() so a page without a 'next' entry ends the loop instead of
        # raising KeyError (same handling as the patents fetcher).
        next_url = data['page_meta'].get('next')
        # 'next' is a host-relative path, so prefix it with the API host.
        url = f"{base_url}{next_url}" if next_url else None
        page += 1

    return molecules

def fetch_molecules_with_logp_and_patents(limit=100, max_pages=None, verbose=True, timeout=30):
    """Page through the ChEMBL molecule endpoint using the patent filter.

    The request carries the filters ``logp__isnull=false`` and
    ``molecule_patents__isnull=false``. After each response the
    ``page_meta.next`` link is followed until it is missing/empty or
    ``max_pages`` pages have been fetched. The host is taken from the
    notebook-level ``base_url`` rather than a second hard-coded copy.

    NOTE(review): the filter names are sent exactly as written; confirm against
    the ChEMBL API documentation that the molecule resource recognises both.

    Args:
        limit: Page size sent as the ``limit`` query parameter.
        max_pages: Maximum number of pages to request; ``None`` keeps going
            until the API reports no next page.
        verbose: When true, print a progress line before every request.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the notebook indefinitely.

    Returns:
        list: The concatenated ``molecules`` entries of every fetched page.

    Raises:
        requests.HTTPError: If a response has an error status code.
    """
    url = (
        f"{base_url}/chembl/api/data/molecule.json"
        f"?logp__isnull=false&molecule_patents__isnull=false&limit={limit}"
    )
    molecules = []
    page = 0

    while url and (max_pages is None or page < max_pages):
        if verbose:
            print(f"Fetching page {page + 1}...")
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        data = res.json()
        molecules.extend(data['molecules'])

        # 'next' is a host-relative path (or empty on the last page).
        next_url = data['page_meta'].get('next')
        url = f"{base_url}{next_url}" if next_url else None
        page += 1

    return molecules

In [14]:
# NOTE(review): fetch_molecules_with_logp_smiles_and_patents is defined in the
# "Deprecated" section further down this notebook, so on a fresh kernel that
# cell has to be executed before this one.
molecules = fetch_molecules_with_logp_smiles_and_patents(max_pages=1, limit=100)

# Persist the untouched API records so later steps can reload them from disk.
with open("../data/raw/chembl_logp_molecules.json", "w") as f:
    f.write(json.dumps(molecules, indent=2))

print(f"Fetched {len(molecules)} molecules with logP.")

Fetching page 1...
Fetched 100 molecules with logP.


In [None]:
# Load the fetched molecule records into a DataFrame and list the available fields.
df = pd.DataFrame(molecules)
df.columns

In [None]:
# Keep only the structure, ChEMBL ID and patent-ID columns (overwrites `df`).
# NOTE(review): in this notebook 'patent_ids' is added by
# fetch_molecules_with_logp_smiles_and_patents; confirm the column exists when
# `molecules` was produced by one of the other fetchers.
df = df[['molecule_structures', 'molecule_chembl_id', 'patent_ids']]

In [None]:
# Probe the document endpoint with a known molecule ChEMBL ID.
# NOTE(review): the output recorded for this cell starts with an "Unpublished
# dataset" entry followed by consecutive J Med Chem 2004 papers, which looks like
# the unfiltered head of the document table rather than documents specific to
# this molecule. Confirm that the endpoint honours `molecule_chembl_id`.
chembl_id = "CHEMBL6329"
url = f"https://www.ebi.ac.uk/chembl/api/data/document.json?molecule_chembl_id={chembl_id}"
res = requests.get(url)
res.raise_for_status()  # raise on an HTTP error status instead of parsing an error body
data = res.json()
documents = data.get("documents", [])  # empty list when the key is absent
# Note: this rebinds `df`, which previously held the molecule table.
df = pd.DataFrame(documents)
df.head()

Unnamed: 0,abstract,authors,chembl_release,contact,doc_type,document_chembl_id,doi,doi_chembl,first_page,issue,journal,journal_full_title,last_page,patent_id,pubmed_id,src_id,title,volume,year
0,,,"{'chembl_release': 'CHEMBL_7', 'creation_date'...",,DATASET,CHEMBL1158643,,,,,,,,,,0,Unpublished dataset,,
1,,Clader JW.,"{'chembl_release': 'CHEMBL_1', 'creation_date'...",,PUBLICATION,CHEMBL1139451,10.1021/jm030283g,,1.0,1.0,J Med Chem,Journal of medicinal chemistry.,9.0,,14695813.0,1,The discovery of ezetimibe: a view from outsid...,47.0,2004.0
2,Okadaic acid (OA) is a toxin responsible for d...,"Daranas AH, Fernández JJ, Morales EQ, Norte M,...","{'chembl_release': 'CHEMBL_1', 'creation_date'...",,PUBLICATION,CHEMBL1148466,10.1021/jm034189b,,10.0,1.0,J Med Chem,Journal of medicinal chemistry.,13.0,,14695814.0,1,Self-association of okadaic acid upon complexa...,47.0,2004.0
3,A variety of novel heterocyclic compounds havi...,"Cho H, Murakami K, Nakanishi H, Fujisawa A, Is...","{'chembl_release': 'CHEMBL_1', 'creation_date'...",,PUBLICATION,CHEMBL1139452,10.1021/jm030287l,,101.0,1.0,J Med Chem,Journal of medicinal chemistry.,109.0,,14695824.0,1,Synthesis and structure-activity relationships...,47.0,2004.0
4,The hemoglobin-degrading aspartic proteases pl...,"Ersmark K, Feierberg I, Bjelic S, Hamelink E, ...","{'chembl_release': 'CHEMBL_1', 'creation_date'...",,PUBLICATION,CHEMBL1139453,10.1021/jm030933g,,110.0,1.0,J Med Chem,Journal of medicinal chemistry.,122.0,,14695825.0,1,Potent inhibitors of the Plasmodium falciparum...,47.0,2004.0


#### Get document_id for subset

In [21]:
def get_documents_for_molecule(chembl_id, timeout=30):
    """Query the ChEMBL document endpoint with a ``molecule_chembl_id`` filter.

    Only the first page of the response is read; pagination links are not
    followed.

    NOTE(review): the sample output recorded earlier in this notebook for
    CHEMBL6329 looks like generic, unfiltered documents. Confirm that the
    document resource actually applies the ``molecule_chembl_id`` filter before
    relying on the result (and before adding pagination here).

    Args:
        chembl_id: Molecule ChEMBL ID interpolated into the query string.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the notebook indefinitely.

    Returns:
        list: The ``documents`` entry of the JSON response, or ``[]`` when the
        key is absent.

    Raises:
        requests.HTTPError: If the response has an error status code.
    """
    url = f"https://www.ebi.ac.uk/chembl/api/data/document.json?molecule_chembl_id={chembl_id}"
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return res.json().get("documents", [])

In [None]:
# Example lookup for a single molecule; the trailing bare name displays the raw result.
chembl_id = "CHEMBL25"
documents = get_documents_for_molecule(chembl_id)
documents

In [24]:
def fetch_documents_for_molecules(molecules):
    """Build a mapping from molecule ChEMBL ID to its list of document ChEMBL IDs.

    The input records are not modified. Each distinct ID is looked up once via
    ``get_documents_for_molecule``; records without a ``molecule_chembl_id``
    (missing or empty) are skipped.

    Args:
        molecules: Iterable of dict-like molecule records.

    Returns:
        dict: ``{molecule_chembl_id: [document_chembl_id, ...]}``.

    Raises:
        KeyError: If a returned document lacks ``document_chembl_id``.
    """
    doc_map = {}
    for mol in molecules:
        chembl_id = mol.get("molecule_chembl_id")
        # Skip records without an ID, and IDs that are already resolved, so a
        # repeated molecule does not trigger a second network request.
        if not chembl_id or chembl_id in doc_map:
            continue
        docs = get_documents_for_molecule(chembl_id)
        doc_map[chembl_id] = [d["document_chembl_id"] for d in docs]
    return doc_map

In [25]:
# Resolve document IDs for every fetched molecule (one API request per lookup).
doc_map = fetch_documents_for_molecules(molecules)

In [None]:
# Inspect the molecule ID -> document IDs mapping.
doc_map

In [27]:
# Rebuild the molecule table from the raw records; earlier cells rebound `df`
# to a column subset and then to the document probe result.
df = pd.DataFrame(molecules)

In [28]:
# Pull the "alogp" entry out of each molecule's nested property dict into a flat
# "logP" column; rows whose molecule_properties is not a dict get None.
# NOTE(review): the values are stored as returned by the API; confirm whether
# they arrive as numbers or strings before doing arithmetic on this column.
df["logP"] = df["molecule_properties"].apply(
    lambda x: x.get("alogp") if isinstance(x, dict) else None
)

In [None]:
# Attach each molecule's document IDs by looking its ChEMBL ID up in doc_map
# (IDs that are not keys of doc_map come out as NaN).
df["document_ids"] = df["molecule_chembl_id"].map(doc_map)

# Narrow to the columns of interest; ending the cell on the expression lets the
# notebook render the preview as a table instead of printing plain text.
df_merged = df[["molecule_chembl_id", "logP", "document_ids"]]
df_merged.head()

#### Get molecules from document_id

In [31]:
def get_molecules_for_document(document_id, timeout=30):
    """Query the ``molecule_document`` endpoint with a ``document_chembl_id`` filter.

    Only the first page of the response is read; pagination links are not
    followed.

    NOTE(review): this notebook records no output for a call to this function.
    Confirm that the ChEMBL API exposes a ``molecule_document`` resource and
    that it accepts ``document_chembl_id`` as a filter.

    Args:
        document_id: Document ChEMBL ID interpolated into the query string.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the notebook indefinitely.

    Returns:
        list: The ``molecule_documents`` entry of the JSON response, or ``[]``
        when the key is absent.

    Raises:
        requests.HTTPError: If the response has an error status code.
    """
    url = f"https://www.ebi.ac.uk/chembl/api/data/molecule_document.json?document_chembl_id={document_id}"
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return res.json().get("molecule_documents", [])

In [None]:
# Example: Get molecules associated with a document
document_id = "CHEMBL1158643"
molecule_docs = get_molecules_for_document(document_id)

# Collect the molecule ChEMBL ID of every returned record and print them
# (raises KeyError if a record has no "molecule_chembl_id" field).
molecule_ids = [doc["molecule_chembl_id"] for doc in molecule_docs]
print(f"Molecule ChEMBL IDs for document {document_id}: {molecule_ids}")

#### Deprecated

In [17]:
def fetch_molecules_with_logp_smiles_and_patents(limit=100, max_pages=None, verbose=True, timeout=30):
    """Fetch molecules via the patent filter and add flat SMILES / patent-ID fields.

    Pages through the ChEMBL molecule endpoint with the filters
    ``logp__isnull=false`` and ``molecule_patents__isnull=false``, following
    ``page_meta.next`` until it is empty or ``max_pages`` pages have been
    fetched. Each returned molecule dict is then extended in place with:

    * ``smiles``: ``molecule_structures['canonical_smiles']``, or ``None``
      when the structures entry is missing or null.
    * ``patent_ids``: the ``patent_chembl_id`` of every entry in
      ``molecule_patents``, or ``[]`` when that entry is missing or null.

    The host is taken from the notebook-level ``base_url``.

    NOTE(review): the filter names are sent exactly as written; confirm against
    the ChEMBL API documentation that the molecule resource recognises both.

    Args:
        limit: Page size sent as the ``limit`` query parameter.
        max_pages: Maximum number of pages to request; ``None`` keeps going
            until the API reports no next page.
        verbose: When true, print a progress line before every request.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the notebook indefinitely.

    Returns:
        list: The molecule dicts of every fetched page, each carrying the
        extra ``smiles`` and ``patent_ids`` keys.

    Raises:
        requests.HTTPError: If a response has an error status code.
        KeyError: If a patent entry lacks ``patent_chembl_id``.
    """
    url = (
        f"{base_url}/chembl/api/data/molecule.json"
        f"?logp__isnull=false&molecule_patents__isnull=false&limit={limit}"
    )
    molecules = []
    page = 0

    while url and (max_pages is None or page < max_pages):
        if verbose:
            print(f"Fetching page {page + 1}...")
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        data = res.json()
        molecules.extend(data['molecules'])

        # 'next' is a host-relative path (or empty on the last page).
        next_url = data['page_meta'].get('next')
        url = f"{base_url}{next_url}" if next_url else None
        page += 1

    for molecule in molecules:
        # Either key may be present with a null value (decoded as None);
        # calling .get() on it or iterating it directly would raise, so fall
        # back to an empty container first.
        structures = molecule.get('molecule_structures') or {}
        patents = molecule.get('molecule_patents') or []

        molecule['smiles'] = structures.get('canonical_smiles')
        molecule['patent_ids'] = [patent['patent_chembl_id'] for patent in patents]

    return molecules