# Fetch FDA approval annotations

In [None]:
from __future__ import annotations

import json
import re
import requests
import pandas as pd
import typing as t

from bs4 import BeautifulSoup, Tag, NavigableString
from getpass import getpass
from langchain.llms import OpenAI
from langchain import PromptTemplate, LLMChain
from pathlib import Path
from urllib import parse

In [None]:
# Directory holding the processed GDSC tables, relative to this notebook.
gdsc_dir = Path("..", "..", "..", "data", "processed", "GDSC")

# Both identifier columns are parsed as integers.
drug_info = pd.read_csv(
    gdsc_dir / "DrugAnnotations.csv",
    dtype=dict.fromkeys(("drug_id", "pubchem_id"), int),
)
drug_info.head()

In [None]:
# PubChem PUG View endpoint that serves annotations by heading, as JSON.
BASE_URL = (
    "https://pubchem.ncbi.nlm.nih.gov"
    "/rest/pug_view/annotations/heading/JSON"
)


def get_drugs_at_fda() -> t.Generator[list[dict], None, None]:
    """Download the PubChem "FDA Approved Drugs" annotations from Drugs@FDA.

    The endpoint is paginated: pages are requested one at a time, starting at
    page 1, until the page count reported by the server has been exhausted.

    Yields:
        The ``"Annotation"`` payload of each response page, which
        ``parse_drugs_at_fda`` iterates as a sequence of annotation dicts.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    params = {
        "heading": "FDA Approved Drugs",
        "heading_type": "Compound",
        "page": 1,
        "source": "Drugs@FDA",
    }
    while True:
        # Without a timeout a stalled connection would block forever.
        resp = requests.get(BASE_URL, params=params, timeout=60)
        resp.raise_for_status()
        annot = resp.json()["Annotations"]
        yield annot["Annotation"]

        # Advance, then stop once the server-reported last page was fetched.
        params["page"] += 1
        if params["page"] > annot["TotalPages"]:
            break


def parse_drugs_at_fda(pages: t.Iterable[t.Iterable[dict]]) -> list[str]:
    """Extract the unique PubChem CIDs linked from Drugs@FDA annotations.

    Args:
        pages: Annotation pages as yielded by ``get_drugs_at_fda``; each page
            is an iterable of annotation dicts.

    Returns:
        The de-duplicated CIDs in ascending order. Annotations without
        ``"LinkedRecords"``, or whose linked records carry no ``"CID"``
        entry, are skipped instead of raising ``KeyError``.

    NOTE(review): the return annotation says ``str``, but the IDs are later
    matched against the integer ``pubchem_id`` column - confirm the element
    type PubChem actually returns.
    """
    pchem_ids = set()
    for page in pages:
        for annot in page:
            records = annot.get("LinkedRecords") or {}
            pchem_ids.update(records.get("CID") or ())
    return sorted(pchem_ids)

In [None]:
def get_nci_cancer_drugs() -> t.Generator[list[dict], None, None]:
    """Download the PubChem "Cancer Drugs" annotations from NCI Cancer Drugs.

    The endpoint is paginated: pages are requested one at a time, starting at
    page 1, until the page count reported by the server has been exhausted.

    Yields:
        The ``"Annotation"`` payload of each response page, which
        ``parse_nci_cancer_drugs`` iterates as a sequence of annotation dicts.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    params = {
        "heading": "Cancer Drugs",
        "heading_type": "Compound",
        "page": 1,
        "source": "NCI Cancer Drugs",
    }
    while True:
        # Without a timeout a stalled connection would block forever.
        resp = requests.get(BASE_URL, params=params, timeout=60)
        resp.raise_for_status()
        annot = resp.json()["Annotations"]
        yield annot["Annotation"]

        # Advance, then stop once the server-reported last page was fetched.
        params["page"] += 1
        if params["page"] > annot["TotalPages"]:
            break


def parse_nci_cancer_drugs(
    pages: t.Iterable[t.Iterable[dict]],
) -> pd.DataFrame:
    """Tabulate FDA status and indications from NCI Cancer Drugs annotations.

    Args:
        pages: Annotation pages as yielded by ``get_nci_cancer_drugs``; each
            page is an iterable of annotation dicts.

    Returns:
        A frame with one row per (annotation, linked CID) pair and columns
        ``pubchem_id``, ``NCICD__is_fda_approved`` (True only when the
        "FDA Approved" item reads "Yes"), ``NCICD__fda_indications`` (the
        "Drug Use" strings, or None when absent) and ``NCICD__url``.
        Annotations lacking ``"LinkedRecords"`` or ``"Data"`` are skipped.
        The columns are present even when no rows were extracted.
    """
    columns = [
        "pubchem_id",
        "NCICD__is_fda_approved",
        "NCICD__fda_indications",
        "NCICD__url",
    ]
    res = []
    for page in pages:
        for annot in page:
            records = annot.get("LinkedRecords")
            data = annot.get("Data")
            if records is None or data is None:
                continue

            # Reset for every annotation so that a record missing one of the
            # items never inherits the previous record's values.
            fda_status = None
            indications = None
            for item in data:
                name = item.get("Name")
                if name == "FDA Approved":
                    markup = item["Value"]["StringWithMarkup"]
                    fda_status = markup[0]["String"]
                elif name == "Drug Use":
                    markup = item["Value"]["StringWithMarkup"]
                    indications = [x["String"] for x in markup]

            for id_ in records.get("CID") or ():
                res.append(
                    {
                        "pubchem_id": id_,
                        "NCICD__is_fda_approved": fda_status == "Yes",
                        "NCICD__fda_indications": indications,
                        "NCICD__url": annot["URL"],
                    }
                )

    return pd.DataFrame(res, columns=columns)

In [None]:
# Flag every compound whose PubChem CID is listed in Drugs@FDA.
drugs_at_fda_annots = [*get_drugs_at_fda()]
drugs_at_fda_pchem_ids = parse_drugs_at_fda(drugs_at_fda_annots)
is_in_drugs_at_fda = drug_info["pubchem_id"].isin(drugs_at_fda_pchem_ids)
drug_info["Drugs@FDA__is_fda_approved"] = is_in_drugs_at_fda

# Left-join the NCI Cancer Drugs columns; compounds without a match are
# recorded as not approved.
nci_cancer_drugs_annots = [*get_nci_cancer_drugs()]
nci_cancer_drugs_annots = parse_nci_cancer_drugs(nci_cancer_drugs_annots)
drug_info = drug_info.merge(
    nci_cancer_drugs_annots, on="pubchem_id", how="left"
).fillna({"NCICD__is_fda_approved": False})

drug_info.head()

## END MAIN

In [None]:
# Prompt with two placeholders: {drug} (drug name) and {desc} (free-text
# indication description, inserted between double quotes).
template = (
    "Using the following description, please extract a list of all "
    "approved cancer types for {drug}. "
    "The result should be formatted as a list of primary cancer type "
    "indications and, for each primary indication, a list of cancer "
    "subtypes and/or sub-indications. "
    "The results should be in CSV format as a string."
    "\n\n"
    'Description: "{desc}"'
)

In [None]:
def parse_indication(items: list[str]) -> str:
    """Merge indication fragments into one whitespace-normalised string.

    The fragments are joined with newlines, a space is inserted after every
    period, runs of spaces are collapsed to a single space (newlines are
    kept), and leading/trailing whitespace is removed.
    """
    text = "\n".join(items).replace(".", ". ")
    collapsed = re.sub(r" {2,}", " ", text)
    return collapsed.strip()


# One row per FDA-approved compound that has NCI indication text.
# `.eq(True)` keeps the original `== True` semantics for a column that may
# hold non-boolean objects.
approved_with_inds = (
    drug_info[drug_info["NCICD__is_fda_approved"].eq(True)]
    .dropna(subset=["NCICD__fda_indications"])
    .drop_duplicates(subset="pubchem_id")
    .filter(items=["drug_name", "pubchem_id", "NCICD__fda_indications"])
)

# Map drug name -> list of raw indication strings. The mapping is built from
# the frame above (the previous code read an undefined name, `temp`).
drug_to_inds = dict(
    zip(
        approved_with_inds["drug_name"],
        approved_with_inds["NCICD__fda_indications"],
    )
)

# Map drug name -> fully rendered extraction prompt.
drug_to_prompt = {
    k: template.format(drug=k, desc=parse_indication(v))
    for k, v in drug_to_inds.items()
}

In [None]:
# Dump each drug name followed by its prompt and one blank separator line.
for drug, prompt in drug_to_prompt.items():
    print(drug, prompt, "", sep="\n")

In [None]:
"\n".join(drug_to_inds["5-Fluorouracil"])

In [None]:
# Show the rendered prompt for the same example drug.
print(drug_to_prompt["5-Fluorouracil"])

# Read the OpenAI API key from an interactive, non-echoing prompt.
OPENAI_API_KEY = getpass()

In [None]:


# Example drug used for the LLM experiments in the following cells.
drug = "Cisplatin"
col = "NCICD__fda_indications"

# Take the indication strings of the first row matching the drug name and
# normalise them into a single description.
is_selected_drug = drug_info["drug_name"] == drug
desc = drug_info.loc[is_selected_drug, col].iloc[0]
desc = parse_indication(desc)

# LangChain prompt built on the shared template and its two placeholders.
prompt = PromptTemplate(input_variables=["drug", "desc"], template=template)

In [None]:
# Build the completion model with the key entered at the getpass prompt;
# previously `llm` was used here while its construction was commented out.
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
llm_chain = LLMChain(prompt=prompt, llm=llm)
llm_chain.run(drug=drug, desc=desc)

In [None]:
import openai

# SECURITY: an API key used to be hard-coded on this line. Treat that key as
# leaked and revoke it. The key is now taken from the interactive getpass
# prompt so that no secret is stored in the file.
openai.api_key = OPENAI_API_KEY

response = openai.ChatCompletion.create(
    model="gpt-4-0613",
    messages=[
        # The instruction is the request to the model, so it is sent as the
        # user turn rather than as a previous assistant reply.
        {"role": "user", "content": template.format(drug=drug, desc=desc)}
    ],
)
print(response["choices"][0]["message"]["content"])

In [None]:
# Completion model authenticated with the key read via getpass.
llm = OpenAI(openai_api_key=OPENAI_API_KEY)

# Chain the prompt template with the model and run it for the selected drug;
# the result is left as the cell's last expression.
llm_chain = LLMChain(prompt=prompt, llm=llm)
llm_chain.run(drug=drug, desc=desc)


In [None]:
# The template's placeholder is `{desc}`; passing `description=` raised
# KeyError('desc').
print(template.format(drug=drug, desc=desc))

In [None]:
# save results
drug_info.to_csv(
    gdsc_dir / "DrugAnnotationsWithFDAApprovalStatus.csv", index=False
)

## Scraping NCI Cancer Drugs (DEPRECATED)

In [None]:
# NOTE(review): `base_url` is not defined anywhere in the visible file (only
# the PubChem `BASE_URL` is) - confirm where it is set before running this.
url = parse.urljoin(base_url, "about-cancer/treatment/drugs")
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")

# Every <ul> inside the article that is not nested in a <nav>.
drug_lists = soup.find("article").select("ul:not(nav ul)")

# Map link text -> absolute URL for links under the drugs section.
drug_to_link = {}
for drug_list in drug_lists:
    for a in drug_list.find_all("a", href=True):
        href = str(a["href"])
        if not href.startswith("/about-cancer/treatment/drugs"):
            continue
        drug_to_link[a.text] = parse.urljoin(base_url, href)

In [None]:
def get_fda_status(article: Tag | NavigableString) -> str | None:
    """Return the FDA status text found next to the "FDA Approved" label.

    Looks for a ``<div>`` whose string is exactly "FDA Approved" and returns
    the text of the second child of that div's parent. Returns None when no
    such label exists.
    """
    label = article.find("div", string="FDA Approved")
    if label is None:
        return None
    siblings = list(label.parent.children)
    return siblings[1].text


def get_daily_mail_link(article: Tag | NavigableString) -> str | None:
    """Return the href of the first DailyMed link in the article, if any.

    A link qualifies when its ``href`` matches ``dailymed.nlm.nih.gov``.
    Returns None when the article contains no such link.
    """
    dailymed_href = re.compile(r"dailymed\.nlm\.nih\.gov")
    anchor = article.find("a", {"href": dailymed_href})
    return None if anchor is None else anchor["href"]


def get_nci_drug_link(article: Tag | NavigableString) -> str | None:
    """Return the href of the first NCI drug dictionary link, if any.

    A link qualifies when its ``href`` matches
    ``publications/dictionaries/cancer-drug``. Returns None when the article
    contains no such link.
    """
    anchor = article.find(
        "a", {"href": re.compile(r"publications\/dictionaries\/cancer-drug")}
    )
    return None if anchor is None else anchor["href"]
    
def get_cancer_types(article: Tag | NavigableString) -> list[str] | None:
    """Extract the linked cancer types from the "Use in Cancer" section.

    The section is located through an ``<h2>`` whose string is exactly
    "Use in Cancer"; cancer types are the texts of links found inside
    ``<strong>`` elements of the ``<ul>`` that is a direct child of the
    heading's parent.

    Returns:
        The distinct cancer type names in sorted order (possibly empty), or
        None when the heading or its list is missing.
    """
    h2 = article.find("h2", string="Use in Cancer")
    if h2 is None:
        return None

    # `find` returns None when the section has no direct <ul>; treat that
    # like a missing section instead of failing on `None.find_all`.
    use_in_cancer_ul = h2.parent.find("ul", recursive=False)
    if use_in_cancer_ul is None:
        return None

    cancer_types = set()
    for el in use_in_cancer_ul.find_all("strong"):
        a = el.find("a")
        if a is not None:
            cancer_types.add(a.text)
    # Sorted so that repeated runs produce the same row order.
    return sorted(cancer_types)

In [None]:
# Download and parse every drug page, keyed by drug name.
# NOTE(review): responses are parsed without checking the HTTP status.
drug_to_soup = {}
for drug_name, drug_link in drug_to_link.items():
    resp = requests.get(drug_link)
    soup = BeautifulSoup(resp.content, "html.parser")
    drug_to_soup[drug_name] = soup

In [None]:
# Build one row per (drug, cancer type). Pages without an <article> are
# skipped; a drug whose cancer types could not be extracted (None) still gets
# a single row with an empty cancer type.
results = []
for drug_name, soup in drug_to_soup.items():
    article = soup.find("article")
    if article is None:
        continue

    fda_status = get_fda_status(article)
    nci_drug_link = get_nci_drug_link(article)
    daily_mail_link = get_daily_mail_link(article)
    cancer_types = get_cancer_types(article)

    row_cancer_types = [None] if cancer_types is None else cancer_types
    results.extend(
        [drug_name, cancer_type, fda_status, daily_mail_link, nci_drug_link]
        for cancer_type in row_cancer_types
    )

In [None]:
# Column names, in the order in which each row list was assembled.
result_columns = [
    "drug_name",
    "cancer_type",
    "fda_status",
    "daily_mail_link",
    "nci_drug_dict_link",
]
results = pd.DataFrame(results, columns=result_columns)
results.head()

In [None]:
# Save the drug / cancer-type table.
# NOTE(review): `out_dir` is not defined in the visible part of this file -
# confirm where it is set.
results.to_csv(out_dir / "FDAApprovedDrugCancerTypeCombos.csv", index=False)

In [None]:
# Save the distinct cancer types as a one-column table.
# NOTE(review): `out_dir` is not defined in the visible part of this file -
# confirm where it is set.
cancer_list = results[["cancer_type"]].drop_duplicates()
cancer_list.to_csv(out_dir / "CancerTypeList.csv", index=False)

In [None]:
# Distinct (drug name, NCI drug dictionary link) pairs.
drug_list = results[["drug_name", "nci_drug_dict_link"]].drop_duplicates()


In [None]:
# Fetch and parse the dictionary page stored under index label 0, following
# redirects.
link = drug_list["nci_drug_dict_link"][0]
resp = requests.get(link, allow_redirects=True)
soup = BeautifulSoup(resp.content, "html.parser")

In [None]:
# Save the drug list.
# NOTE(review): `out_dir` is not defined in the visible part of this file -
# confirm where it is set.
drug_list.to_csv(out_dir / "DrugList.csv", index=False)