In [1]:
import lxml.etree as ET
import os
import requests
import urllib.parse
import requests
from bs4 import BeautifulSoup
import csv
import sys
import json
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch

csv.field_size_limit(sys.maxsize) # needed to read large csv files

# Paths to local XML and DTD files
DTD_FILE = "input/mplus_topics_2012-06-01.dtd"
XML_FILE = "input/mplus_topics_2025-03-04.xml" 

# Path to local metathesaurus files
MRCONSO = "umls_metathesaurus/MRCONSO.RRF"
MRSTY = "umls_metathesaurus/MRSTY.RRF"


# Load BioBERT model and tokenizer
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Example text
text = "COVID-19 was discovered in Wuhan, China."
print(ner_pipeline(text))

Device set to use mps:0
The model 'BertForSequenceClassification' is not supported for ner. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'BrosForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DiffLlamaForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'ErnieForTokenClassification', 'ErnieMForTokenClassification', 'EsmForTokenClassification', 'FalconForTokenClassification', 'FlaubertForTokenClassification', 'FNetForTokenClassification', 'FunnelForTokenClassification', 'GemmaForTokenClassification', 'Gemma2ForTokenClassification', 'GlmForTokenClassification', 'GPT2ForTokenClassification', 'GPT2ForTokenClassification', 'GPTBigCodeForT

IndexError: invalid index to scalar variable.

In [5]:
import xml.etree.ElementTree as ET

def filter_sites_if_included_in_string(diseas_string):
    """
    Parses the given MedlinePlus health topics XML file and returns all <site> elements (as dicts)
    whose 'title' attribute contains the diseas_string string (case-sensitive) and which match:
      1) The site title includes the provided diseas_string parameter.
      2) The site contains both <information-category>Patient Handouts</information-category> 
         and <information-category>Encyclopedia</information-category>.
      3) The site has an <organization> element with the text 'Medical Encyclopedia'.
    
    The results are sorted alphabetically by the site's title.
    """
    tree = ET.parse(XML_FILE)
    root = tree.getroot()  # The root should be <health-topics>
    
    all_matching_sites = []
    
    # Iterate over all <health-topic> elements.
    for health_topic in root.findall("health-topic"):
        # Each <health-topic> may have multiple <site> children.
        for site_elem in health_topic.findall("site"):
            # Get the site's title attribute.
            current_site_title = site_elem.get("title", "").strip()
            # Check if the provided site_title parameter is a substring of current_site_title.
            if current_site_title.lower() not in diseas_string.lower():
                continue  # Skip if the parameter is not found in the site's title.
            
            # Gather all <information-category> texts.
            info_categories = [
                ic.text.strip() for ic in site_elem.findall("information-category") if ic.text
            ]
            
            # Get the text for the <organization> element.
            org_elem = site_elem.find("organization")
            org_text = org_elem.text.strip() if org_elem is not None and org_elem.text else ""
            
            # Check conditions:
            #   a) Must have 'Patient Handouts' and 'Encyclopedia' among the information categories.
            #   b) Must have <organization> exactly equal to 'Medical Encyclopedia'.
            if ("Patient Handouts" in info_categories and
                "Encyclopedia" in info_categories and
                org_text == "Medical Encyclopedia"):
                
                site_url = site_elem.get("url", "").strip()
                lmapped_url = site_elem.get("language-mapped-url", "").strip()
                
                all_matching_sites.append({
                    "title": current_site_title,
                    "url": site_url,
                    "language_mapped_url": lmapped_url,
                    "information_categories": info_categories,
                    "organization": org_text
                })
    
    # Sort results alphabetically by title (case-insensitive).
    all_matching_sites.sort(key=lambda s: s["title"].lower())
    
    return all_matching_sites

In [6]:
 
def filter_sites_by_title(site_title):
    """
    Parses the given MedlinePlus health topics XML file,
    and returns all <site> elements (as dicts) that match:
      1) <site title=site_title> (exact attribute match)
      2) Contain <information-category>Patient Handouts</information-category>
         AND <information-category>Encyclopedia</information-category>
      3) <organization>Medical Encyclopedia</organization>

    The results are sorted alphabetically by <site> title.
    """
    tree = ET.parse(XML_FILE)
    root = tree.getroot()  # <health-topics> is the root

    all_matching_sites = []

    # Iterate over all <health-topic> elements
    for health_topic in root.findall("health-topic"):
        # Each <health-topic> may have multiple <site> children
        for site_elem in health_topic.findall("site"):
            # Check if this <site> has the exact title attribute
            current_site_title = site_elem.get("title", "").strip()
            if current_site_title != site_title:
                continue  # Skip if the title doesn't match

            # Gather <information-category> texts
            info_categories = [
                ic.text.strip() for ic in site_elem.findall("information-category") if ic.text
            ]

            # Check for <organization>
            org_elem = site_elem.find("organization")
            org_text = org_elem.text.strip() if org_elem is not None and org_elem.text else ""

            # Condition:
            #   a) Must have 'Patient Handouts' and 'Encyclopedia' among info_categories
            #   b) Must have <organization> == 'Medical Encyclopedia'
            #   c) Already matched the site title above
            if ("Patient Handouts" in info_categories and
                "Encyclopedia" in info_categories and
                org_text == "Medical Encyclopedia"):

                # This site matches the desired criteria
                site_url = site_elem.get("url", "").strip()
                lmapped_url = site_elem.get("language-mapped-url", "").strip()

                all_matching_sites.append({
                    "title": current_site_title,
                    "url": site_url,
                    "language_mapped_url": lmapped_url,
                    "information_categories": info_categories,
                    "organization": org_text
                })

    # Sort results by title (case-insensitive)
    all_matching_sites.sort(key=lambda s: s["title"].lower())

    return all_matching_sites

In [7]:
def extraxct_html(site, title_query, cui, download=False, folder="html"):
    """
    Downloads the HTML from a given URL and saves it as a .html file
    in the specified 'folder' (default: 'html').

    Args:
        url (str): The URL to fetch.
        folder (str): The folder where the .html file will be saved.

    Returns:
        str: The full file path of the saved HTML file.
    """
    # save each CUI in a separate folder
    folder_cui = os.path.join(folder, cui)
    os.makedirs(folder_cui, exist_ok=True)

    # URL-encode the entire URL to make a valid filename, then append .html
    #safe_filename = urllib.parse.quote_plus(url) + ".html"
    file_path = os.path.join(folder_cui, site["title"] + ".html")

    # Send the GET request
    response = requests.get(site["url"])
    response.raise_for_status()  # Raise an error if request failed

    # Write the response text (HTML content) to a file
    if download:
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(response.text)

    return file_path, response.text

In [8]:
def get_diseases_from_umls():
    """
    Reads UMLS MRSTY.RRF and MRCONSO.RRF files in /umls_metathesaurus to extract diseases (excluding symptoms)
    and returns a dictionary where keys are CUIs and values are the preferred English names.
    """
    # Build a mapping of each CUI to its set of semantic types
    cui_semtypes = {}
    with open(MRSTY, 'r', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='|')
        for row in reader:
            if len(row) > 3:
                cui = row[0].strip()
                semtype = row[3].strip()
                cui_semtypes.setdefault(cui, set()).add(semtype)
    
    # Create a set of CUIs that have "Disease or Syndrome" and do NOT have "Sign or Symptom"
    disease_cuis = {cui for cui, types in cui_semtypes.items()
                    if 'Disease or Syndrome' in types and 'Sign or Symptom' not in types}
    
    # Now, map each filtered CUI to its preferred English name using MRCONSO.RRF.
    diseases = {}
    with open(MRCONSO, 'r', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='|')
        for row in reader:
            # MRCONSO.RRF columns:
            # 0: CUI, 1: LAT (language), 2: TS, 3: LUI, 4: STT, 5: SUI, 6: ISPREF,
            # 7: AUI, 8: SAUI, 9: SCUI, 10: SDUI, 11: SAB, 12: TTY, 13: CODE,
            # 14: STR (the term), 15: SRL, 16: SUPPRESS, 17: CVF
            if row[0].strip() in disease_cuis and row[1].strip() == 'ENG' and row[6].strip() == 'Y':
                diseases[row[0].strip()] = row[14].strip()
    
    return diseases

In [9]:
import csv

def get_signs_symptoms_from_umls():
    """
    Reads UMLS MRSTY.RRF and MRCONSO.RRF files and extracts all concepts 
    with the semantic type "Sign or Symptom". It returns a dictionary 
    mapping each CUI to its preferred English name.
        
    Returns:
        dict: A dictionary where keys are CUIs and values are the preferred English names.
    """
    # First, build a set of CUIs that are assigned the semantic type "Sign or Symptom"
    sign_symptom_cuis = set()
    with open(MRSTY, 'r', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='|')
        for row in reader:
            # In MRSTY.RRF, the 4th column (index 3) is the semantic type name.
            if len(row) > 3 and row[3].strip() == 'Sign or Symptom':
                sign_symptom_cuis.add(row[0].strip())
    
    # Next, read MRCONSO.RRF to map each of those CUIs to its preferred English name.
    signs_symptoms = {}
    with open(MRCONSO, 'r', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='|')
        for row in reader:
            # MRCONSO.RRF columns:
            # 0: CUI, 1: LAT (language), 2: TS, 3: LUI, 4: STT, 5: SUI, 6: ISPREF,
            # 7: AUI, 8: SAUI, 9: SCUI, 10: SDUI, 11: SAB, 12: TTY, 13: CODE,
            # 14: STR (the term), 15: SRL, 16: SUPPRESS, 17: CVF
            if (row[0].strip() in sign_symptom_cuis and 
                row[1].strip() == 'ENG' and 
                row[6].strip() == 'Y'):
                signs_symptoms[row[0].strip()] = row[14].strip()
    
    return signs_symptoms

In [10]:
diseas_dict = get_diseases_from_umls()


In [11]:
symptoms_dict = get_signs_symptoms_from_umls()

In [12]:
""" for key, value in diseas_dict.items():
    print(key, value) """
    

' for key, value in diseas_dict.items():\n    print(key, value) '

In [13]:
def is_symptom_match(bullet, umls_name):
    """
    Returns True if the token set of one string is a subset of the token set of the other.
    This allows a symptom like "Fewer" to match "High Fewer" while preventing a partial match
    like "age" in "blockage".
    """
    # Normalize and tokenize both strings
    bullet_tokens = re.findall(r'\w+', bullet.lower())
    umls_tokens = re.findall(r'\w+', umls_name.lower())
    
    # Convert lists to sets for easy subset checking
    bullet_set = set(bullet_tokens)
    umls_set = set(umls_tokens)
    
    # If all tokens in one are contained in the other, consider it a match.
    return bullet_set.issubset(umls_set) or umls_set.issubset(bullet_set)

In [14]:

def extract_symptoms_bullets(html_text):
    """
    Given an HTML string of a MedlinePlus page (or similar),
    find the section whose <h2> text == 'Symptoms' and
    return all the bullet-point text <li> from that section.

    Returns:
        A list of strings, where each string is the text of a bullet point.
    """

    soup = BeautifulSoup(html_text, "html.parser")
    
    # 1. Find the section that has <h2>Symptoms</h2>
    #    Each <section> typically has structure:
    #    <section><div class="section"><div class="section-header">...
    #    We'll locate the <h2> with text "Symptoms" (case-insensitive match).
    
    # Approach: Look for an <h2> whose .text == "Symptoms" (stripping/case-insensitive).
    target_section = None
    all_h2 = soup.find_all("h2")
    for h2_tag in all_h2:
        if h2_tag.get_text(strip=True).lower() == "symptoms":
            # The <h2> is inside <div class="section-title">, which is inside <div class="section">
            # We'll go up to the nearest <section> or parent that encloses this heading
            # In the snippet, it's two levels up <section><div class="section"><div class="section-title"><h2>Symptoms</h2></div>
            # but let's be flexible in case of slight structure changes.
            # We'll climb up to the <section> ancestor:
            parent_section = h2_tag.find_parent("section")
            if parent_section:
                target_section = parent_section
                break

    if not target_section:
        print("No 'Symptoms' section found.")
        return []  # No "Symptoms" section found
    
    # 2. Extract bullet points from the section.
    bullet_points = []
    for li_tag in target_section.find_all("li"):
        text = li_tag.get_text(" ", strip=True)
        bullet_points.append(text)
    
    # 3. For each bullet, attempt to match against the UMLS symptoms dictionary.
    results = []
    for bullet in bullet_points:
        bullet_lower = bullet.lower()
        matched_cuis = []
        
        # Iterate over the UMLS dictionary: keys = CUI, values = UMLS name.
        for cui, umls_name in symptoms_dict.items():
            # Check bidirectionally for substring match.
            if is_symptom_match(bullet, umls_name):
                matched_cuis.append({
                    "CUI": cui,
                    "UMLS_name": umls_name
                })
        
        # If no matches were found, assign "NONE"
        if not matched_cuis:
            matched_cuis = ["NONE"]
        
        results.append({
            "symptom": bullet,
            "matched_umls": matched_cuis
        })
    return results

In [None]:
def recognize_entities(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    logits = outputs.logits
    propabilities = torch.softmax(logits, dim=1).tolist()[0]


In [None]:
for cui, diseas in diseas_dict.items():
    matching_sites = filter_sites_if_included_in_string(diseas)

    # Print the results
    if matching_sites:
        print(f"Found {len(matching_sites)} matching site(s) with diseas='{diseas}':\n")
        for site in matching_sites:
            file_path, html_content = extraxct_html(site,diseas,cui, download=False)
            print(f"- Title: {site['title']}")
            print(f"  URL: {site['url']}")
            print(f"  Language-Mapped URL: {site['language_mapped_url']}")
            print(f"  Categories: {site['information_categories']}")
            print(f"  Organization: {site['organization']}")
            print("--------------------------------------------------")
            # If extract_html(..., download=True) was used, the HTML content can be loaded with the code below
            #with open(file_path, "r", encoding="utf-8") as file:
            #    html_content = file.read()
                
            # Call our function to extract symptoms from the HTML content
            symptoms_list = extract_symptoms_bullets(html_content)
            print(f"Extracted symptoms from the 'Symptoms' section in {file_path}:")
            print("--------------------------------------------------")
            # Prepare the output dictionary
            output = {
                "CUI": cui,
                "UMLS Disease Name": diseas,
                "Site Title": site["title"],
                "Site URL": site["url"],
                "Symptoms": symptoms_list
            }

            # Save the output to a JSON file
            with open("output/diseases_and_symptoms_v2.json", "a", encoding="utf-8") as json_file:
                json.dump(output, json_file, ensure_ascii=False, indent=4)
                json_file.write("\n")
    else:
        print(f"No <site> elements found for Diseas='{diseas}'.")

No <site> elements found for Diseas='abetalipoproteinemia (diagnosis)'.
No <site> elements found for Diseas='Abnormality of secretion of gastrin (disorder)'.
Found 1 matching site(s) with diseas='Recurrent miscarriage (disorder)':

- Title: Miscarriage
  URL: https://medlineplus.gov/ency/article/001488.htm
  Language-Mapped URL: https://medlineplus.gov/spanish/ency/article/001488.htm
  Categories: ['Patient Handouts', 'Encyclopedia']
  Organization: Medical Encyclopedia
--------------------------------------------------
Extracted symptoms from the 'Symptoms' section in html/C0000809/Miscarriage.html:
--------------------------------------------------
No <site> elements found for Diseas='missed abortion (history)'.
No <site> elements found for Diseas='ABORTION VET'.
Found 1 matching site(s) with diseas='abscess (diagnosis)':

- Title: Abscess
  URL: https://medlineplus.gov/ency/article/001353.htm
  Language-Mapped URL: https://medlineplus.gov/spanish/ency/article/001353.htm
  Categories

ConnectionError: HTTPSConnectionPool(host='medlineplus.gov', port=443): Max retries exceeded with url: /ency/article/000593.htm (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x337efdfd0>: Failed to resolve 'medlineplus.gov' ([Errno 8] nodename nor servname provided, or not known)"))

In [None]:
""" import glob
import json

# Iterate through all HTML files in the 'html' folder
for html_file in glob.glob("html/*.html"):
    with open(html_file, "r", encoding="utf-8") as file:
        html_content = file.read()
    # Call our function
    symptoms_list = extract_symptoms_bullets(html_content)

    print(f"Extracted bullet points from the 'Symptoms' section in {html_file}:")
    for bp in symptoms_list:
        print("-", bp)
    print("--------------------------------------------------")
    output = {title_query: symptoms_list}

    # Save the output to a JSON file
    with open("output/diseases_and_symptoms.json", "w", encoding="utf-8") as json_file:
        json.dump(output, json_file, ensure_ascii=False, indent=4)
 """

Extracted bullet points from the 'Symptoms' section in html/Diabetes.html:
- Blurry vision
- Excess thirst
- Fatigue
- Frequent urination
- Hunger
- Weight loss
- Eye problems , including trouble seeing (especially at night), light sensitivity, cataracts, and blindness
- Sores and infections of the leg or foot, which if untreated, can lead to amputation of the leg or foot
- Damage to nerves in the body , causing pain, tingling, a loss of feeling, problems digesting food, and erectile dysfunction
- Kidney problems , which can lead to kidney failure
- Weakened immune system, which can lead to more frequent infections
- Increased chance of having a heart attack or stroke
--------------------------------------------------
