In [15]:
# Optionally install dependencies
# !pip install -r ../requirements.txt
# !pip freeze >> ../../requirements.txt

# Imports and constants
from typing import Any
import sys
import os
import warnings

from time import sleep
from tqdm.notebook import tqdm

import numpy as np
import polars as pl

from chembl_webresource_client.new_client import new_client as chembl_client

sys.path.append(os.path.abspath(".."))

from utils.secrets import DATA_PATH

warnings.filterwarnings("ignore", category=UserWarning)

os.environ['VERBOSE'] = 'True'
API_DELAY_SECONDS = 0.3

In [16]:
# Auxiliary Functions
def vprint(*args: Any, **kwargs: Any) -> None:
    """
    Conditionally prints output only if the 'VERBOSE' environmental
    variable is set to a truthy value (e.g., 'True', '1', 'YES').

    It accepts all the standard arguments of the built-in print() function.

    Args:
        *args: Positional arguments to be printed.
        **kwargs: Keyword arguments to be passed to the print() function.
    """
    verbose_value = os.environ.get("VERBOSE", "False").lower()

    if verbose_value in ("true", "1", "t", "y", "yes"):
        print(*args, **kwargs)

In [17]:
# Review all available files
# for file in os.listdir(os.path.join(DATA_PATH, "datdaf20271013")):
#     print(f"Reading file: {file}")
#     try:
#         df = pl.read_csv(
#             os.path.join(DATA_PATH, "datdaf20271013", file),
#             separator="\t",
#             encoding="latin-1",
#             ignore_errors=True,
#             null_values=["00000", "00000 "],
#         )
#         display(df.head())
#     except Exception as e:
#         print(f"Error reading {file}: {e}")

### Main Content

* **Action Types**: lookup table of application actions (e.g. original application, review, etc...);
* **Submissions**: details on the applications, matching IDs to status, submission types and dates;
* **Products**: describes the drugs, with names and active ingredients;
* **Applications**: contains types of applications and the company which submitted it;
* **Marketing Status**: table and lookup for the market status of the drugs (e.g. over the counter, discontinued, etc...).

In [25]:
products_df = pl.read_csv(
    os.path.join(DATA_PATH, "datdaf20271013", "Products.txt"),
    separator="\t",
    ignore_errors=True,
    null_values=["00000", "00000 "],
)
products_df = products_df.with_columns([
    pl.col("ActiveIngredient").str.strip_chars().str.to_lowercase().alias("active_ingredient"),
    pl.col("DrugName").str.strip_chars().str.to_lowercase().alias("drug_name"),
])

#TODO: split multiple active ingredients later (commas and semicolons) for now drop them
# active_ingredients = active_ingredients.filter(~active_ingredients.str.contains(",|;"))

In [46]:
# Acces ChEMBL to get details on active ingredients and their targets
molecule_api = chembl_client.molecule
activity_api = chembl_client.activity
target_api = chembl_client.target
mechanism_api = chembl_client.mechanism

# TODO: how do I select only for the intended target?
# TODO: get target annotations (e.g. type of protein)

def query_drug(drug_name: str) -> pl.DataFrame | None:
    """Queries ChEMBL for a given drug name and retrieves its details and targets.

    Args:
        drug_name (str): The name of the drug to query. 
    Returns:
        pl.DataFrame: A Polars DataFrame containing the drug details and its associated targets.
    """
    vprint(f"Processing drug: {drug_name}")

    # Search for the molecule by synonym and only take the first result
    molecules = molecule_api.search(drug_name).only(
        [
            "molecule_chembl_id",
            "pref_name",
            "molecule_structures",
            "max_phase",
            "standard_inchi_key",
        ]
    )
    if not molecules:
        vprint(f"No molecule found for {drug_name}.")
        return None

    molecules = pl.DataFrame(
        {
            "drug_name": drug_name,
            "molecule_chembl_id": molecules[0]["molecule_chembl_id"],
            "max_phase": molecules[0]["max_phase"],
            "canonical_smiles": molecules[0]["molecule_structures"]["canonical_smiles"],
            "standard_inchi_key": molecules[0]["molecule_structures"]["standard_inchi_key"],
        }
    )

    # Get mechanism of action data
    mechanisms = mechanism_api.filter(
        molecule_chembl_id=molecules["molecule_chembl_id"].item()
    ).only([
        "molecule_chembl_id",
        "target_chembl_id",
        "mechanism_of_action",
        "action_type",
    ])

    if not mechanisms:
        vprint(f"No mechanism of action found for {drug_name}, falling back to activities.")

        # Query for bioactivities (targets) associated with this ChEMBL ID
        # Only take the top 20 activities for human targets with pChEMBL values
        # assumes these are the most relevant
        activities = (
            activity_api.filter(
                molecule_chembl_id=molecules["molecule_chembl_id"].item(),
                target_organism="Homo sapiens",
                assay_type__in=["B"],  # Binding  #F or Functional assays
                pchembl_value__isnull=False,
                standard_type__in=["IC50", "Ki", "EC50"] #, "Potency"]
            )
            .order_by("-pchembl_value")
            .only(
                [
                    "molecule_chembl_id",
                    "target_chembl_id",
                    "standard_type",
                    "standard_value",
                    "standard_units",
                    "pchembl_value",
                    "target_type",
                    "assay_type",
                    "assay_description" 
                ]
            )
        )[:5]

        if not activities:
            vprint(f"No target data found for {drug_name}")
            return None

        # activities = pl.DataFrame([dict(act) for act in activities])
        mechanisms_df = pl.DataFrame([{
            "molecule_chembl_id": act["molecule_chembl_id"],
            "target_chembl_id": act["target_chembl_id"],
            "mechanism_of_action": f"Binding ({act['standard_type']})",
            "action_type": "BINDER",
        } for act in activities])
    else:
        mechanisms_df = pl.DataFrame([dict(mech) for mech in mechanisms])
    
    # Get the human-readable target name
    targets = target_api.filter(
        target_chembl_id__in=mechanisms_df["target_chembl_id"].unique().to_list(),
        terapeutic_flag=True,
    ).only(["target_chembl_id", "pref_name", "target_components", "target_type", "therapeutic_flag"])
    targets = pl.DataFrame([dict(trg) for trg in targets])
    targets = (
        targets.explode("target_components")
        .with_columns(
            [
                pl.col("target_components")
                .struct.field("component_description")
                .alias("component_name"),
                pl.col("target_components")
                .struct.field("component_type")
                .str.to_lowercase()
                .alias("component_type"),
                pl.col("target_components")
                .struct.field("accession")
                .alias("uniprot_id"),
            ]
        )
        .drop("target_components")
    )

    merged = mechanisms_df.join(
        targets,
        left_on="target_chembl_id",
        right_on="target_chembl_id",
        how="left",
    )
    merged = merged.join(
        molecules,
        left_on="molecule_chembl_id",
        right_on="molecule_chembl_id",
        how="left",
    )

    return merged


def query_multiple_drugs(drug_names: list[str]) -> pl.DataFrame:
    """
    Processes a list of drug names, applying a rate limit between calls
    to comply with API usage policies.

    Args:
        drug_names (list[str]): List of drug names to query.
    Returns:
        pl.DataFrame: Combined Polars DataFrame with results from all drugs.
    """
    all_results = []
    
    for drug_name in tqdm(drug_names, total=len(drug_names), desc="Querying drugs", unit="drug", leave=False):
        try:
            result = query_drug(drug_name)
            if result is not None:
                all_results.append(result)
            
            sleep(API_DELAY_SECONDS) 

        except Exception as e:
            print(f"ERROR processing {drug_name}: {e}")
            sleep(5) 
            continue

    if not all_results:
        print("No results were successfully fetched.")
        return pl.DataFrame()

    return pl.concat(all_results)

In [47]:
test_drug_names = ["imatinib", "aspirin", "metformin", "lisinopril", "atorvastatin"]
drug_table = query_multiple_drugs(test_drug_names)
# drug_table = query_multiple_drugs(products_df['drug_name'].unique().to_list())

Querying drugs:   0%|          | 0/5 [00:00<?, ?drug/s]

Processing drug: imatinib
No mechanism of action found for imatinib, falling back to activities.
Processing drug: aspirin
Processing drug: metformin
No mechanism of action found for metformin, falling back to activities.
Processing drug: lisinopril
Processing drug: atorvastatin
No mechanism of action found for atorvastatin, falling back to activities.


ShapeError: unable to vstack, column names don't match: "molecule_chembl_id" and "action_type"

In [21]:
active_ingredients

active_ingredient
str
"""tioconazole"""
"""etoposide"""
"""darbepoetin alfa"""
"""pretomanid"""
"""promazine hydrochloride"""
…
"""thiethylperazine malate"""
"""eravacycline dihydrochloride"""
"""duvelisib"""
"""omeprazole"""
