In [2]:
import requests
import pandas as pd
import json
import os

In [3]:
# Host prefix for the ChEMBL API URLs that later cells build from it.
base_url = "https://www.ebi.ac.uk"

# Create the raw-data output directory up front (no error if it already exists).
os.makedirs("../data/raw/", exist_ok=True)

#### Get molecules with logP

In [25]:
def fetch_molecules_with_logp_and_docs(limit=100, max_pages=None, verbose=True, timeout=30):
    """Page through the ChEMBL molecule endpoint and collect the records.

    The first request goes to ``molecule.json`` under the module-level
    ``base_url`` with the ``logp__isnull`` / ``molecule_documents__isnull``
    query filters and the given page size. After each page the loop follows
    ``page_meta.next`` (treated as a path appended to ``base_url``) until there
    is no next link or ``max_pages`` pages have been read.

    NOTE(review): the molecule records shown in this notebook keep ``alogp``
    inside ``molecule_properties`` and have no top-level ``logp`` or
    ``molecule_documents`` field, so it is unclear whether the API actually
    applies these two filters -- confirm against the ChEMBL API docs.

    Args:
        limit: Page size sent as the ``limit`` query parameter.
        max_pages: Maximum number of pages to fetch; ``None`` means keep going
            until the API reports no next page.
        verbose: When true, print a progress line before each request.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the kernel indefinitely.

    Returns:
        A list with the ``molecules`` entries of every fetched page, in order.

    Raises:
        requests.HTTPError: If any page answers with an error status.
        KeyError: If a response has no ``molecules`` key.
    """
    url = f"{base_url}/chembl/api/data/molecule.json?logp__isnull=false&molecule_documents__isnull=false&limit={limit}"
    molecules = []
    page = 0

    while url and (max_pages is None or page < max_pages):
        if verbose:
            print(f"Fetching page {page + 1}...")
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        data = res.json()
        molecules.extend(data['molecules'])

        # A missing or null page_meta / next simply ends the pagination
        # instead of raising KeyError on the last page.
        next_url = (data.get('page_meta') or {}).get('next')
        url = f"{base_url}{next_url}" if next_url else None
        page += 1

    return molecules

In [26]:
# Keep the exploratory pull small: a single page of 50 records.
molecules = fetch_molecules_with_logp_and_docs(max_pages=1, limit=50)

# Save raw data: write the fetched records to disk as indented JSON.
with open("../data/raw/chembl_logp_molecules.json", "w") as f:
    f.write(json.dumps(molecules, indent=2))

print(f"Fetched {len(molecules)} molecules with logP.")

Fetching page 1...
Fetched 50 molecules with logP.


In [30]:
# Load the fetched records into a DataFrame and list the fields each record carries.
df = pd.DataFrame(molecules)
df.columns

Index(['atc_classifications', 'availability_type', 'biotherapeutic',
       'cross_references', 'dosed_ingredient', 'first_approval',
       'first_in_class', 'helm_notation', 'indication_class', 'inorganic_flag',
       'max_phase', 'molecule_chembl_id', 'molecule_hierarchy',
       'molecule_properties', 'molecule_structures', 'molecule_synonyms',
       'molecule_type', 'natural_product', 'oral', 'orphan', 'parenteral',
       'polymer_flag', 'pref_name', 'prodrug', 'structure_type',
       'therapeutic_flag', 'topical', 'usan_stem', 'usan_stem_definition',
       'usan_substem', 'usan_year', 'withdrawn_flag'],
      dtype='object')

In [32]:
# Narrow df to the structure, ID and property columns and preview two rows.
# Note: this rebinds df, so the full-width frame is no longer available under
# that name after the cell runs.
df = df[['molecule_structures', 'molecule_chembl_id', 'molecule_properties']]
df.head(2)

Unnamed: 0,molecule_structures,molecule_chembl_id,molecule_properties
0,{'canonical_smiles': 'Cc1cc(-n2ncc(=O)[nH]c2=O...,CHEMBL6329,"{'alogp': '2.11', 'aromatic_rings': 3, 'cx_log..."
1,{'canonical_smiles': 'Cc1cc(-n2ncc(=O)[nH]c2=O...,CHEMBL6328,"{'alogp': '1.33', 'aromatic_rings': 3, 'cx_log..."


In [None]:
# Test the document endpoint with a known molecule ChEMBL ID.
# NOTE(review): confirm that document.json really filters on molecule_chembl_id;
# if the parameter is not recognised the reply may just be the first page of
# all documents.
chembl_id = "CHEMBL6329"
url = f"{base_url}/chembl/api/data/document.json?molecule_chembl_id={chembl_id}"
# Bound the wait so a stalled connection cannot hang the kernel.
res = requests.get(url, timeout=30)
res.raise_for_status()
data = res.json()
documents = data.get("documents", [])
df = pd.DataFrame(documents)
df.head(1)

In [33]:
# List the activity types seen in a sample of ChEMBL activities.
# Only a single page (limit=1000) of the activity endpoint is requested, so
# this is the set of non-empty standard_type values on that page, not a
# catalogue of every type in ChEMBL.
url = "https://www.ebi.ac.uk/chembl/api/data/activity.json?limit=1000"
# Large page: allow a generous but finite wait instead of blocking forever.
res = requests.get(url, timeout=60)
res.raise_for_status()
data = res.json()

types = sorted({a["standard_type"] for a in data["activities"] if a.get("standard_type")})
for t in types:
    print(t)

% remaining
AOC
AUC
Absorption
Active dose
Activity
Analgesia
Average
Average lesion score
BP
Binding affinity
Blood glucose
C50
CAR
CC50
CCh
CLogP
CPA
Cell survived
Change
Change in blood pressure
Cmax
Control
DOSE
DP
Delta CPP
Delta H-V
Delta S-H
Delta TC
Dose
Duration
EC10
EC50
ED50
ED90
Efficacy
Emax
Glycaemia evolution
Growth of cells
HD50
Half duration
IC50
IC80
IC90
ID50
ILS
Increase in CBF
Inhibition
Inhibitory response
Intrinsic activity
K inact/Ki
Kb
Kd
Ki
Ki high
Ki low
Ki ratio
LD50
Log 1/K
Log CFU
Log K'
Log SP
LogD
LogP
Lysis
MED
MFC
MIC
MIC50
MIC90
MP
Max effect
Max stimulation
No. deaths/treated
No. of rats
PI
PT
Permeability rate
Potency ratio
Protection
RBA
Rate constant
Ratio
Recovery
Reduction
Relative activity
Relative affinity
Relative potency
Remaining
Reversal
Reversal type
Saluretic potency
Selective toxicity
Selectivity
Selectivity index
Selectivity ratio
Solubility
Solubility ratio
Symptoms
Synergy
T/C
T1/2
TCS50
TD50
TGI
TP
TPE
Tmax
Toxic dose
Toxicity ratio

#### Get document_id for subset

In [21]:
def get_documents_for_molecule(chembl_id, timeout=30):
    """Query the ChEMBL document endpoint for one molecule ID.

    Sends a single GET to ``document.json`` with ``molecule_chembl_id`` as a
    query parameter and returns the ``documents`` list of the JSON reply.
    Only that one response is read; further pages are not followed.

    NOTE(review): confirm that the document endpoint honours the
    ``molecule_chembl_id`` filter. If it does not, every call returns the same
    first page of documents regardless of the ID.

    Args:
        chembl_id: Molecule ChEMBL ID, e.g. ``"CHEMBL25"``.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking indefinitely.

    Returns:
        The list under the ``documents`` key, or ``[]`` when the key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    url = "https://www.ebi.ac.uk/chembl/api/data/document.json"
    # Passing the ID through params lets requests URL-encode it rather than
    # splicing raw text into the query string.
    res = requests.get(url, params={"molecule_chembl_id": chembl_id}, timeout=timeout)
    res.raise_for_status()
    return res.json().get("documents", [])

In [None]:
# Try the helper on a single molecule ID and show what comes back.
chembl_id = "CHEMBL25"
documents = get_documents_for_molecule(chembl_id)
documents

In [24]:
def fetch_documents_for_molecules(molecules):
    """Map each molecule's ChEMBL ID to the IDs of its documents.

    Records without a truthy ``molecule_chembl_id`` are skipped. For every
    other record ``get_documents_for_molecule`` is called once and the
    ``document_chembl_id`` of each returned document is collected. If an ID
    occurs more than once, the later lookup replaces the earlier entry.

    Args:
        molecules: Iterable of molecule dicts.

    Returns:
        dict of molecule ChEMBL ID -> list of document ChEMBL IDs.
    """
    candidate_ids = (record.get("molecule_chembl_id") for record in molecules)
    return {
        mol_id: [doc["document_chembl_id"] for doc in get_documents_for_molecule(mol_id)]
        for mol_id in candidate_ids
        if mol_id
    }

In [25]:
# Look up the documents for every fetched molecule (one request per molecule that has an ID).
doc_map = fetch_documents_for_molecules(molecules)

In [None]:
# Inspect the molecule ID -> document IDs mapping.
doc_map

In [27]:
# Rebuild the full molecule table from the raw records (rebinds df).
df = pd.DataFrame(molecules)

In [28]:
# Pull alogp out of the nested properties dict. The API delivers it as a
# string (the preview above shows 'alogp': '2.11'), so convert it to a float
# column; rows without a properties dict or with an unparseable value become NaN.
df["logP"] = pd.to_numeric(
    df["molecule_properties"].apply(
        lambda x: x.get("alogp") if isinstance(x, dict) else None
    ),
    errors="coerce",
)

In [None]:
# Attach each molecule's document IDs; IDs that are missing from doc_map become NaN.
df["document_ids"] = df["molecule_chembl_id"].map(doc_map)

# Show merged DataFrame. Leaving head() as the cell's last expression makes the
# notebook render it as a table instead of print()'s plain-text dump.
df_merged = df[["molecule_chembl_id", "logP", "document_ids"]]
df_merged.head()

#### Get molecules from document_id

In [31]:
def get_molecules_for_document(document_id, timeout=30):
    """Query ChEMBL's ``molecule_document.json`` endpoint for one document ID.

    Sends a single GET with ``document_chembl_id`` as a query parameter and
    returns the ``molecule_documents`` list of the JSON reply. Only that one
    response is read; further pages are not followed.

    NOTE(review): confirm that this endpoint exists and filters on
    ``document_chembl_id`` as intended.

    Args:
        document_id: Document ChEMBL ID, e.g. ``"CHEMBL1158643"``.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking indefinitely.

    Returns:
        The list under the ``molecule_documents`` key, or ``[]`` when the key
        is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    url = "https://www.ebi.ac.uk/chembl/api/data/molecule_document.json"
    # Passing the ID through params lets requests URL-encode it rather than
    # splicing raw text into the query string.
    res = requests.get(url, params={"document_chembl_id": document_id}, timeout=timeout)
    res.raise_for_status()
    return res.json().get("molecule_documents", [])

In [None]:
# Example: look up the molecule/document link records for one document.
document_id = "CHEMBL1158643"
molecule_docs = get_molecules_for_document(document_id)

# Pull the molecule ChEMBL ID out of each link record and report them.
molecule_ids = [record["molecule_chembl_id"] for record in molecule_docs]
print(f"Molecule ChEMBL IDs for document {document_id}: {molecule_ids}")

#### To test

In [24]:
def fetch_molecules_with_logp_and_patents(limit=100, max_pages=None, verbose=True, timeout=30):
    """Page through the ChEMBL molecule endpoint using the logP/patent filters.

    The first request goes to ``molecule.json`` with the ``logp__isnull`` /
    ``molecule_patents__isnull`` query filters and the given page size. After
    each page the loop follows ``page_meta.next`` (treated as a path appended
    to the host) until there is no next link or ``max_pages`` pages were read.

    NOTE(review): the molecule records listed earlier in this notebook have no
    top-level ``logp`` or ``molecule_patents`` field, so it is unclear whether
    the API applies these filters -- confirm against the ChEMBL API docs.

    Args:
        limit: Page size sent as the ``limit`` query parameter.
        max_pages: Maximum number of pages to fetch; ``None`` means keep going
            until the API reports no next page.
        verbose: When true, print a progress line before each request.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking indefinitely.

    Returns:
        A list with the ``molecules`` entries of every fetched page, in order.

    Raises:
        requests.HTTPError: If any page answers with an error status.
        KeyError: If a response has no ``molecules`` key.
    """
    base_url = "https://www.ebi.ac.uk"
    url = f"{base_url}/chembl/api/data/molecule.json?logp__isnull=false&molecule_patents__isnull=false&limit={limit}"
    molecules = []
    page = 0

    while url and (max_pages is None or page < max_pages):
        if verbose:
            print(f"Fetching page {page + 1}...")
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        data = res.json()
        molecules.extend(data['molecules'])

        # A missing or null page_meta / next ends the pagination.
        next_url = (data.get('page_meta') or {}).get('next')
        url = f"{base_url}{next_url}" if next_url else None
        page += 1

    return molecules


def fetch_molecules_with_logp_smiles_and_patents(limit=100, max_pages=None, verbose=True, timeout=30):
    """Fetch molecules and add flat ``smiles`` / ``patent_ids`` keys to each.

    The records come from ``fetch_molecules_with_logp_and_patents`` (same
    query, same paging). Each record dict is then extended in place with:

    * ``smiles``: ``molecule_structures['canonical_smiles']``, or ``None``
      when the structures entry is missing, null or has no such key;
    * ``patent_ids``: the ``patent_chembl_id`` of every entry under
      ``molecule_patents``, or ``[]`` when that entry is missing or null.

    NOTE(review): the molecule records listed earlier in this notebook show no
    ``molecule_patents`` field, so ``patent_ids`` may always be empty --
    confirm where patent links are actually exposed.

    Args:
        limit: Page size sent as the ``limit`` query parameter.
        max_pages: Maximum number of pages to fetch; ``None`` for no cap.
        verbose: When true, print a progress line before each request.
        timeout: Seconds to wait on each HTTP request.

    Returns:
        The list of molecule dicts, each carrying the two extra keys.

    Raises:
        requests.HTTPError: If any page answers with an error status.
    """
    molecules = fetch_molecules_with_logp_and_patents(
        limit=limit, max_pages=max_pages, verbose=verbose, timeout=timeout
    )

    for molecule in molecules:
        # `or` guards against keys that are present but null, which would
        # otherwise fail on `.get` / iteration.
        structures = molecule.get('molecule_structures') or {}
        patents = molecule.get('molecule_patents') or []

        molecule['smiles'] = structures.get('canonical_smiles')
        molecule['patent_ids'] = [patent['patent_chembl_id'] for patent in patents]

    return molecules