# Fetch FDA approval annotations

In [None]:
from __future__ import annotations

import re
import requests
import pandas as pd

from bs4 import BeautifulSoup, Tag, NavigableString
from pathlib import Path
from urllib import parse

## Old: drugs scraped from the per-cancer-type pages

In [None]:
# Site root; the relative hrefs scraped from the pages are joined onto this.
base_url = "https://www.cancer.gov"

In [None]:
# Index page whose article links to the individual cancer-type drug pages.
url = parse.urljoin(base_url, "/about-cancer/treatment/drugs/cancer-type")

# The URL contains no format placeholders, so it is requested as-is.
# A timeout avoids hanging forever, and raise_for_status() surfaces HTTP
# errors here instead of as a confusing "NoneType" failure further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Keep the anchor tags themselves: the next cell reads both their text and href.
article = soup.find("article")
cancer_links = [
    link
    for link in article.find_all("a", href=True)
    if link["href"].startswith("/about-cancer/treatment/drugs/")
]

In [None]:
# Visit every cancer-type page and record one row per drug link found in
# its article, tagged with the nearest preceding <h2> heading.
results = []
for cancer_link in cancer_links:
    cancer_href = cancer_link["href"]
    cancer_name = cancer_link.text
    cancer_id = cancer_href.split("/")[-1]

    page = requests.get(parse.urljoin(base_url, cancer_href))
    article = BeautifulSoup(page.content, "html.parser").find("article")

    # Lists nested inside <nav> elements are excluded by the selector.
    for drug_ul in article.select("ul:not(nav ul)"):
        heading = drug_ul.find_previous("h2")
        for anchor in drug_ul.find_all("a", href=True):
            drug_href = anchor["href"]
            if not drug_href.startswith("/about-cancer/treatment/drugs"):
                continue
            drug_id = drug_href.split("/")[-1]
            row = [cancer_name, anchor.text, heading.text, cancer_id, drug_id]
            results.append(row)

In [None]:
# Tabulate the scraped rows, one per (cancer page, drug link).
column_names = [
    "cancer_name",
    "drug_name",
    "indication",
    "cancer_link_str",
    "drug_link_str",
]
results_df = pd.DataFrame(results, columns=column_names)

# Trim leading/trailing newlines from the heading text, drop the
# "Drugs Approved for " prefix from the cancer name, then order the rows.
results_df = results_df.assign(
    indication=results_df["indication"].str.strip("\n"),
    cancer_name=results_df["cancer_name"].str.replace("Drugs Approved for ", ""),
).sort_values(["cancer_name", "indication", "drug_name"])

results_df.head()

In [None]:
# Collapse the section headings into three coarse indication labels.
#
# Each label is written with a single .loc[mask, column] assignment. The
# chained form df[column][mask] = value can end up writing to a temporary
# copy (SettingWithCopyWarning) and has no effect under pandas Copy-on-Write.
indication = results_df["indication"]

# All masks are computed from the original headings before anything is
# overwritten; the prefixes are mutually exclusive, so order does not matter.
label_masks = {
    "prevention": indication.str.startswith("Drugs Approved to Prevent"),
    "combination": indication.str.startswith("Drug Combinations"),
    "treatment": (
        indication.str.startswith("Drugs Approved for")
        | indication.str.startswith("Drugs Approved to Treat")
    ),
}

for label, mask in label_masks.items():
    results_df.loc[mask, "indication"] = label

In [None]:
# Output directory for all CSVs written by this notebook.
out_dir = Path("../../../data/raw/NCICancerGov")
# parents=True: the target is several levels deep, and mkdir() would raise
# FileNotFoundError if an intermediate directory did not exist yet.
out_dir.mkdir(parents=True, exist_ok=True)

results_df.to_csv(out_dir / "FDAApprovedDrugsByCancerType.csv", index=False)

In [None]:
# Distinct (cancer name, link segment) pairs, saved as their own table.
cancer_type_columns = ["cancer_name", "cancer_link_str"]
cancer_types = results_df[cancer_type_columns].drop_duplicates()
cancer_types_csv = out_dir / "CancerTypes.csv"
cancer_types.to_csv(cancer_types_csv, index=False)

In [None]:
# Distinct (drug name, link segment) pairs.
# NOTE: a later cell writes a different table to this same "DrugList.csv" path.
drug_columns = ["drug_name", "drug_link_str"]
drug_list = results_df[drug_columns].drop_duplicates()
drug_list_csv = out_dir / "DrugList.csv"
drug_list.to_csv(drug_list_csv, index=False)

## Current: drugs scraped from the per-drug pages

In [None]:
# Fetch the drug index page and map each linked drug name to its absolute URL.
url = parse.urljoin(base_url, "about-cancer/treatment/drugs")
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")

# Lists nested inside <nav> elements are excluded by the selector.
drug_lists = soup.find("article").select("ul:not(nav ul)")

# If a drug name appears more than once, the last link seen wins.
drug_to_link = {
    anchor.text: parse.urljoin(base_url, str(anchor["href"]))
    for drug_ul in drug_lists
    for anchor in drug_ul.find_all("a", href=True)
    if str(anchor["href"]).startswith("/about-cancer/treatment/drugs")
}

In [None]:
def get_fda_status(article: Tag | NavigableString) -> str | None:
    """Return the text shown next to the "FDA Approved" label, if present.

    Looks for a ``div`` whose string is exactly "FDA Approved" and returns
    the text of the second child of that div's parent. Returns None when no
    such label exists in ``article``.
    """
    label = article.find("div", string="FDA Approved")
    if label is None:
        return None

    # The value is taken positionally: second child of the label's parent.
    siblings = list(label.parent.children)
    return siblings[1].text


def get_daily_mail_link(article: Tag | NavigableString) -> str | None:
    """Return the href of a DailyMed link found in ``article``, if any.

    A link qualifies when its ``href`` matches ``dailymed.nlm.nih.gov``.
    Returns None when the article contains no such link.
    """
    dailymed_href = re.compile(r"dailymed\.nlm\.nih\.gov")
    anchor = article.find("a", {"href": dailymed_href})
    if anchor is None:
        return None
    return anchor["href"]


def get_nci_drug_link(article: Tag | NavigableString) -> str | None:
    """Return the href of an NCI drug dictionary link in ``article``, if any.

    A link qualifies when its ``href`` contains
    ``publications/dictionaries/cancer-drug``. Returns None otherwise.
    """
    drug_dictionary_href = re.compile(r"publications\/dictionaries\/cancer-drug")
    anchor = article.find("a", {"href": drug_dictionary_href})
    if anchor is None:
        return None
    return anchor["href"]
    
def get_cancer_types(article: Tag | NavigableString) -> list[str] | None:
    """Extract approved cancer types from the "Use in Cancer" section.

    Finds the ``h2`` whose string is exactly "Use in Cancer", takes the
    ``ul`` that is a direct child of that heading's parent, and collects the
    text of the link inside each ``strong`` element of the list.

    Returns:
        The distinct cancer type names in document order (possibly empty),
        or None when the heading or its list is not present.
    """
    h2 = article.find("h2", string="Use in Cancer")
    if h2 is None:
        return None

    use_in_cancer_ul = h2.parent.find("ul", recursive=False)
    if use_in_cancer_ul is None:
        # Heading without a sibling list: treat like a missing section
        # rather than failing on the attribute access below.
        return None

    # A dict de-duplicates while keeping first-seen order, so the result
    # (and the CSV rows built from it) is the same on every run; a set would
    # yield an order that varies between interpreter runs.
    cancer_types: dict[str, None] = {}
    for strong in use_in_cancer_ul.find_all("strong"):
        a = strong.find("a")
        if a is not None:
            cancer_types[a.text] = None
    return list(cancer_types)

In [None]:
# Download and parse every drug page, keyed by drug name.
drug_to_soup = {}

# One Session reuses the underlying connection across the many requests to
# the same host; the timeout keeps a stalled request from hanging the run.
# HTTP error pages are still parsed and stored: the following cell skips
# pages that have no <article>.
with requests.Session() as session:
    for drug_name, drug_link in drug_to_link.items():
        resp = session.get(drug_link, timeout=30)
        soup = BeautifulSoup(resp.content, "html.parser")
        drug_to_soup[drug_name] = soup

In [None]:
# Build one row per (drug, cancer type) from the parsed drug pages.
results = []
for drug_name, soup in drug_to_soup.items():
    article = soup.find("article")
    if article is None:
        # Page has no <article> element: nothing to extract.
        continue

    fda_status = get_fda_status(article)
    nci_drug_link = get_nci_drug_link(article)
    daily_mail_link = get_daily_mail_link(article)

    # Both "section missing" (None) and "section present but no linked
    # cancer types" (empty list) produce a single row with cancer_type=None,
    # so the drug is never silently dropped from the table.
    cancer_types = get_cancer_types(article) or [None]

    for cancer_type in cancer_types:
        results.append(
            [
                drug_name,
                cancer_type,
                fda_status,
                daily_mail_link,
                nci_drug_link,
            ]
        )

In [None]:
# Turn the row lists into a DataFrame; `results` is rebound to the table.
result_columns = [
    "drug_name",
    "cancer_type",
    "fda_status",
    "daily_mail_link",
    "nci_drug_dict_link",
]
results = pd.DataFrame(results, columns=result_columns)
results.head()

In [None]:
# Persist the drug / cancer-type table.
combos_csv = out_dir / "FDAApprovedDrugCancerTypeCombos.csv"
results.to_csv(combos_csv, index=False)

In [None]:
# One row per distinct value of the cancer_type column.
cancer_list = results[["cancer_type"]].drop_duplicates()
cancer_list_csv = out_dir / "CancerTypeList.csv"
cancer_list.to_csv(cancer_list_csv, index=False)

In [None]:
# Distinct (drug name, NCI drug dictionary link) pairs.
drug_list_columns = ["drug_name", "nci_drug_dict_link"]
drug_list = results[drug_list_columns].drop_duplicates()


In [None]:
# Fetch one NCI drug dictionary page as a sample.
# Use the first drug that actually has a link: row 0 may hold None when the
# first drug page carries no dictionary link, which requests cannot fetch.
nci_links = drug_list["nci_drug_dict_link"].dropna()
# urljoin leaves absolute URLs untouched and resolves site-relative hrefs.
link = parse.urljoin(base_url, nci_links.iloc[0])
resp = requests.get(link, allow_redirects=True)
soup = BeautifulSoup(resp.content, "html.parser")

In [None]:
# Save the drug list (same file name as the one written in the "Old" section).
drug_list_csv = out_dir / "DrugList.csv"
drug_list.to_csv(drug_list_csv, index=False)