# Grounding clinical trials interventions using PubChem's API

Data from https://aact.ctti-clinicaltrials.org

In [None]:
from collections import defaultdict
import json

import pandas as pd

from pubchempy import get_compounds
from tqdm import tqdm

Read clinical trials data (downloaded from https://aact.ctti-clinicaltrials.org/download)

In [None]:
# Load the interventions table (pipe-delimited, gzip-compressed), keeping only
# the row id, the trial id, the intervention type and the intervention name.
interventions_df = pd.read_csv(
    "../../data/clinical_trials/interventions.txt.gz",
    sep="|",
    compression="gzip",
    usecols=["id", "nct_id", "intervention_type", "name"],
)

In [None]:
# Show the first row as a quick sanity check of the loaded columns.
interventions_df.head(1)

Load study metadata

In [None]:
# Load the trial id and phase of every study as strings, indexed by nct_id so
# that the phase of a trial can be looked up by its id.
studies_df = pd.read_csv(
    "../../data/clinical_trials/studies.txt.gz",
    sep="|",
    compression="gzip",
    dtype="str",
    usecols=["nct_id", "phase"],
    index_col="nct_id",
)

In [None]:
# Number of studies loaded.
len(studies_df)

Mapping clinical trials ids to phase

In [None]:
# Keep only the studies that carry a real phase: rows whose phase is missing
# (NaN) or labelled "Not Applicable" are dropped with a single boolean mask.
has_phase = studies_df["phase"].notna() & (studies_df["phase"] != "Not Applicable")
studies_df = studies_df[has_phase]

In [None]:
# Number of studies currently held in `studies_df`.
len(studies_df)

In [None]:
# Map nct_id (index) to phase with a dict.
# NOTE: this rebinds `studies_df` from a DataFrame to a plain dict
# ({nct_id: phase}); the name is kept because later cells refer to it.
studies_df = studies_df["phase"].to_dict()

In [None]:
# Map each intervention name to the set of phase numbers of the trials it
# appears in.
intervention_to_latest_phase = defaultdict(set)

# Number of intervention rows whose trial has no entry in the phase mapping.
skipped = 0

# Select the two needed columns by name rather than unpacking whole rows
# positionally: `usecols` does not control column order, so positional
# unpacking silently depends on the column order of the source file.
for nct_id, intervention_name in zip(
    interventions_df["nct_id"], interventions_df["name"]
):
    # `studies_df` is expected to be the {nct_id: phase} dict at this point.
    phase = studies_df.get(nct_id)
    if phase is None:
        skipped += 1
        continue

    # The last character of the phase label is taken as the phase number;
    # this assumes every remaining label ends in a digit.
    intervention_to_latest_phase[intervention_name].add(int(phase[-1]))

Count the interventions that have been used in at least a phase I trial and the intervention records that have not (the ones skipped above)

In [None]:
# Number of distinct intervention names with at least one recorded phase, and
# number of intervention rows skipped because their trial was not in the
# nct_id -> phase mapping.
len(intervention_to_latest_phase), skipped

Grounding intervention names to PubChem ids

In [None]:
def is_valid(name):
    """Return whether an intervention name is worth grounding.

    Missing values are rejected, as is any name that contains (ignoring
    case) one of the excluded substrings: placebo, antibody-like names
    ("mab"), globulin, saline infusion, vehicle, cells or control.
    """
    if pd.isna(name):
        return False

    # Lowercase once and test every excluded substring against it.
    lowered = name.lower()
    excluded_terms = (
        "placebo",
        # Antibodies
        "mab",
        "globulin",
        "saline infusion",
        "vehicle",
        "cells",
        "control",
    )
    return not any(term in lowered for term in excluded_terms)

The code below is meant to populate the JSON files that are already loaded, by searching for every intervention name in the clinical trials data and mapping it to a PubChem id. This is a very slow process and it is not necessary to run it again: the JSON files are already populated.

In [None]:
# Ground every drug intervention name to a PubChem compound.
# `drug_to_pubchems` maps a name to a list holding the CID and canonical SMILES
# of the first compound returned; names with no match are collected in
# `skipped`.
drug_to_pubchems = defaultdict(list)

skipped = set()

# Write a checkpoint of each JSON file every time its collection grows by this
# many entries.
checkpoint_every = 100

# filter df to have only drugs
interventions_df = interventions_df[interventions_df["intervention_type"] == "Drug"]

# Drop rows whose *name* mentions placebo or control. The check has to be on
# the "name" column: after the filter above "intervention_type" is always
# "Drug", so testing that column would never remove anything. `na=False` keeps
# rows with a missing name out of the mask; they are ignored in the loop below.
interventions_df = interventions_df[
    ~interventions_df["name"].str.contains("placebo|control", case=False, na=False)
]

# Make a set with all the intervention names
intervention_names = set(interventions_df["name"].values)

try:
    for intervention in tqdm(intervention_names, total=len(intervention_names)):
        if not isinstance(intervention, str):
            continue

        if not is_valid(intervention):
            continue

        compounds = get_compounds(intervention, namespace="name")

        if not compounds:
            skipped.add(intervention)

            # Checkpoint the skipped names right after the set grows, so the
            # file is written once per `checkpoint_every` new entries.
            if len(skipped) % checkpoint_every == 0:
                with open("skipped.json", "w") as f:
                    json.dump(sorted(skipped), f, indent=4)
            continue

        drug_to_pubchems[intervention].append(
            {
                "cid": compounds[0].cid,
                "smiles": compounds[0].canonical_smiles,
            }
        )

        # Checkpoint the grounded drugs once per `checkpoint_every` new entries
        if len(drug_to_pubchems) % checkpoint_every == 0:
            with open("drug_to_pubchems.json", "w") as f:
                json.dump(drug_to_pubchems, f, indent=4)
finally:
    # Always persist the final state, both after a complete run (the last
    # partial batch would otherwise never be written) and when the loop is
    # interrupted by an error, so that the slow lookups done so far are kept.
    with open("drug_to_pubchems.json", "w") as f:
        json.dump(drug_to_pubchems, f, indent=4)

    with open("skipped.json", "w") as f:
        json.dump(sorted(skipped), f, indent=4)