In [3]:
import os
import json
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import fitz  
import requests
from urllib.parse import urlparse

In [5]:
# Knowledge-base folders, all relative to the notebook's working directory.
_DOCS_ROOT = Path("../docs")

RAW_DIR = _DOCS_ROOT / "kb_raw"              # downloaded HTML pages
PROCESSED_DIR = _DOCS_ROOT / "kb_processed"  # one JSON record per page
CHUNKS_DIR = _DOCS_ROOT / "kb_chunks"        # one JSON file per text chunk

In [7]:
def extract_text_from_pdf(filepath):
    """Return the concatenated text of every page in a PDF.

    :param filepath: Location of the PDF; passed unchanged to ``fitz.open``.
    :return: Each page's ``get_text()`` output, in page order, with nothing
        inserted between pages.
    """
    # The context manager closes the document even when a page fails to
    # extract; previously the handle was never released.
    with fitz.open(filepath) as doc:
        # join() builds the result once instead of re-copying a growing string.
        return "".join(page.get_text() for page in doc)

def clean_text(text):
    """Trim every line of ``text`` and discard the lines that end up empty.

    :param text: A (possibly multi-line) string.
    :return: The remaining lines, each stripped of surrounding whitespace and
        joined with a single newline character.
    """
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    # filter(None, ...) keeps only the non-empty strings.
    return "\n".join(filter(None, stripped_lines))

In [9]:
def sanitize_filename(title, source):
    """Build a lowercase ``<title>_<source>.html`` file name without illegal characters.

    Spaces are replaced with underscores. Characters that cannot appear in a
    file name on Windows or POSIX (angle brackets, colon, double quote, both
    slashes, pipe, question mark, asterisk and ASCII control characters) are
    replaced with underscores as well, so a title such as ``"Flu/RSV"`` can no
    longer produce a path that points into a sub-directory. Every other
    character (commas, hyphens, ampersands, apostrophes, ...) is kept as is.

    :param title: Human-readable page title.
    :param source: Short name of the publisher, e.g. ``"CDC"``.
    :return: The file name, always ending in ``.html``.
    """
    unsafe_chars = '<>:"/\\|?* '
    replacements = {ord(ch): "_" for ch in unsafe_chars}
    replacements.update((code, "_") for code in range(32))  # control characters

    stem = f"{title.lower()}_{source.lower()}".translate(replacements)
    return f"{stem}.html"

def download_html_pages(url_list, titles_sources):
    """
    Download each URL into ``RAW_DIR`` under a name built by ``sanitize_filename``.

    Files that already exist are skipped. A failure on one URL is printed and
    does not stop the remaining downloads.

    :param url_list: URLs to download.
    :param titles_sources: (title, source) tuples, one per URL, in the same order.
    :raises ValueError: If the two sequences differ in length. ``zip`` would
        otherwise drop the unmatched tail silently.
    """
    url_list = list(url_list)
    titles_sources = list(titles_sources)
    if len(url_list) != len(titles_sources):
        raise ValueError(
            f"url_list has {len(url_list)} entries but titles_sources has "
            f"{len(titles_sources)}; they must match one-to-one."
        )

    # Make sure the target folder exists so the first run does not fail on every file.
    RAW_DIR.mkdir(parents=True, exist_ok=True)

    for url, (title, source) in zip(url_list, titles_sources):
        try:
            filename = sanitize_filename(title, source)
            filepath = RAW_DIR / filename

            if filepath.exists():
                print(f"⚠️ Already exists, skipping: {filename}")
                continue

            response = requests.get(url, timeout=10)
            response.raise_for_status()

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(response.text)

            print(f"✅ Downloaded: {filename}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next URL.
            print(f"❌ Failed to download {url}: {e}")

In [11]:
# One record per page: (url, title, source). Keeping the URL next to its title
# makes the pairing visible on a single line; the two lists consumed by
# download_html_pages are derived from it below.
KB_PAGES = [
    ("https://www.cdc.gov/asthma/about/index.html", "About Asthma", "CDC"),
    ("https://www.cdc.gov/asthma/control/index.html", "Controlling Asthma", "CDC"),
    ("https://www.cdc.gov/asthma/emergency/index.html", "What to Do When an Emergency Occurs", "CDC"),
    ("https://www.cdc.gov/asthma/living-with/index.html", "Living with Asthma", "CDC"),
    ("https://www.cdc.gov/asthma/respiratory-infections/index.html", "Respiratory Infections and Asthma", "CDC"),
    ("https://www.cdc.gov/asthma/hcp/clinical-guidance/index.html", "Clinical Guidance for Asthma, Other Respiratory Conditions, and or Mold Allergy After a Severe Weather Event", "CDC"),
    ("https://www.cdc.gov/copd/about/index.html", "About COPD", "CDC"),
    ("https://www.cdc.gov/copd/resources/index.html", "COPD Resources for Patients and Their Families", "CDC"),
    ("https://www.cdc.gov/copd/php/key-resources/index.html", "COPD Resources for Health Professionals", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/situation-summary/index.html", "What to Know for this Fall and Winter Virus Season", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/about/index.html", "About Respiratory Illnesses", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/risk-factors/index.html", "Risk Factors for Severe Illness from Respiratory Viruses", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/prevention/index.html", "Preventing Respiratory Viruses", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/prevention/immunizations.html", "Immunizations for Respiratory Viruses Prevention", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/prevention/hygiene.html", "Hygiene and Respiratory Viruses Prevention", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/prevention/air-quality.html", "Taking Steps for Cleaner Air for Respiratory Virus Prevention", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/prevention/precautions-when-sick.html", "Preventing Spread of Respiratory Viruses When You're Sick", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/prevention/masks.html", "Masks and Respiratory Viruses Prevention", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/prevention/physical-distancing.html", "About Physical Distancing and Respiratory Viruses", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/prevention/testing.html", "Testing and Respiratory Viruses", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/treatment/index.html", "Treatment of Respiratory Viruses", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/tools-resources/index.html", "Resources to Prepare for Flu, COVID-19, and RSV", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/hcp/conversation-tips/index.html", "Talking with Patients About Respiratory Virus Season", "CDC"),
    ("https://www.cdc.gov/respiratory-viruses/hcp/clinical-safety/index.html", "Best Practices for Patient Care", "CDC"),
    ("https://www.cdc.gov/covid/about/index.html", "About COVID-19", "CDC"),
    ("https://www.cdc.gov/covid/signs-symptoms/index.html", "Symptoms of COVID-19", "CDC"),
    ("https://www.cdc.gov/covid/risk-factors/index.html", "People with Certain Medical Conditions and COVID-19 Risk Factors", "CDC"),
    ("https://www.cdc.gov/covid/vaccines/index.html", "COVID-19 Vaccines", "CDC"),
    ("https://www.cdc.gov/covid/testing/index.html", "Testing for COVID-19", "CDC"),
    ("https://www.cdc.gov/covid/treatment/index.html", "Types of COVID-19 Treatment", "CDC"),
    ("https://www.cdc.gov/covid/prevention/index.html", "How to Protect Yourself and Others", "CDC"),
    ("https://www.cdc.gov/covid/vaccines/how-they-work.html", "COVID-19 Vaccine Basics", "CDC"),
    ("https://www.cdc.gov/covid/vaccines/benefits.html", "Benefits of Getting Vaccinated", "CDC"),
    ("https://www.cdc.gov/covid/vaccines/stay-up-to-date.html", "Staying Up to Date with COVID-19 Vaccines", "CDC"),
    ("https://www.cdc.gov/covid/vaccines/myths-facts.html", "Myths & Facts About COVID-19 Vaccines", "CDC"),
    ("https://www.cdc.gov/covid/vaccines/covid-19-vaccine-effectiveness.html", "COVID-19 Vaccine Effectiveness", "CDC"),
    ("https://www.cdc.gov/covid/long-term-effects/index.html", "Long COVID Basics", "CDC"),
    ("https://www.cdc.gov/covid/long-term-effects/living-with-long-covid.html", "Living with Long COVID", "CDC"),
    ("https://www.cdc.gov/covid/long-term-effects/long-covid-signs-symptoms.html", "Signs and Symptoms of Long COVID", "CDC"),
    ("https://www.cdc.gov/covid/vaccines/faq.html", "COVID-19 Vaccine Frequently Asked Questions", "CDC"),
]

# Parallel views in the shape download_html_pages expects.
urls = [page_url for page_url, _, _ in KB_PAGES]
titles_sources = [(page_title, page_source) for _, page_title, page_source in KB_PAGES]

# Fetch every listed page into RAW_DIR; files that already exist there are skipped.
download_html_pages(urls, titles_sources)

⚠️ Already exists, skipping: about_asthma_cdc.html
⚠️ Already exists, skipping: controlling_asthma_cdc.html
⚠️ Already exists, skipping: what_to_do_when_an_emergency_occurs_cdc.html
⚠️ Already exists, skipping: living_with_asthma_cdc.html
⚠️ Already exists, skipping: respiratory_infections_and_asthma_cdc.html
⚠️ Already exists, skipping: clinical_guidance_for_asthma,_other_respiratory_conditions,_and_or_mold_allergy_after_a_severe_weather_event_cdc.html
⚠️ Already exists, skipping: about_copd_cdc.html
⚠️ Already exists, skipping: copd_resources_for_patients_and_their_families_cdc.html
⚠️ Already exists, skipping: copd_resources_for_health_professionals_cdc.html
⚠️ Already exists, skipping: what_to_know_for_this_fall_and_winter_virus_season_cdc.html
⚠️ Already exists, skipping: about_respiratory_illnesses_cdc.html
⚠️ Already exists, skipping: risk_factors_for_severe_illness_from_respiratory_viruses_cdc.html
⚠️ Already exists, skipping: preventing_respiratory_viruses_cdc.html
⚠️ Already 

In [13]:
def build_metadata(filename):
    """Derive ``title`` and ``source`` metadata from a ``<title_words>_<source>`` file name.

    The stem is split on underscores. The last piece becomes the upper-cased
    source and the remaining pieces, joined with spaces, become the title. A
    stem without any underscore is used as the title and the source is
    reported as ``"Unknown"``.

    Capitalisation follows ``str.title()`` except that an apostrophe no longer
    starts a new word, so ``you're`` becomes ``You're`` rather than ``You'Re``.

    :param filename: File name or path; only its stem is inspected.
    :return: ``{"title": ..., "source": ...}``.
    """
    import re  # local import: only this helper needs it

    def _title_case(words):
        # A word is a run of letters, optionally followed by an apostrophe and
        # more letters; digits and punctuation still act as word boundaries.
        return re.sub(
            r"[^\W\d_]+(?:['’][^\W\d_]+)?",
            lambda match: match.group(0).capitalize(),
            words,
        )

    stem = Path(filename).stem  # e.g., "asthma_who"
    *title_parts, last_part = stem.split("_")
    if title_parts:
        title = _title_case(" ".join(title_parts))
        source = last_part.upper()
    else:
        title = _title_case(stem)
        source = "Unknown"
    return {
        "title": title,
        "source": source,
    }

In [15]:
def extract_content_from_html(filepath):
    """Return the readable text of a saved HTML page, one non-empty line per text fragment.

    The file is read as UTF-8. If that fails it is re-read as ``utf-8-sig``
    with undecodable bytes dropped. ``script``, ``style``, ``header``,
    ``footer``, ``nav`` and ``aside`` elements are removed before the text is
    taken from ``<main>``, or ``<body>`` when there is no ``<main>``, or the
    whole document when there is neither.

    :param filepath: Location of the HTML file.
    :return: Extracted text, normalised by ``clean_text``.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            html = f.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: keep what can be decoded instead of failing the page.
        with open(filepath, "r", encoding="utf-8-sig", errors="ignore") as f:
            html = f.read()

    soup = BeautifulSoup(html, "html.parser")

    # Remove non-content elements before extracting text.
    for tag in soup(["script", "style", "header", "footer", "nav", "aside"]):
        tag.decompose()

    # Narrowest available container first.
    root = soup.find("main") or soup.find("body") or soup

    # clean_text performs the same strip-and-drop-blank-lines pass that used to
    # be duplicated here.
    return clean_text(root.get_text(separator="\n"))

def process_all_downloaded_HTML_pages():
    """Convert every ``*.html`` file in ``RAW_DIR`` into a JSON record in ``PROCESSED_DIR``.

    Each record holds the metadata returned by ``build_metadata`` plus a
    ``"content"`` key with the text returned by ``extract_content_from_html``.
    A page whose JSON file already exists is skipped. Pages are handled in
    sorted file-name order so repeated runs print the same sequence.
    """
    # Create the output folder up front so a fresh checkout does not fail on the first write.
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

    for file in sorted(RAW_DIR.glob("*.html")):
        out_path = PROCESSED_DIR / (file.stem + ".json")

        if out_path.exists():
            print(f"⚠️ Already processed, skipping: {out_path.name}")
            continue

        text = extract_content_from_html(file)
        metadata = build_metadata(file)
        metadata["content"] = text

        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)
        print(f"✅ Processed and saved: {out_path.name}")


In [17]:
# Turn each downloaded HTML page into a JSON record in PROCESSED_DIR; pages already processed are skipped.
process_all_downloaded_HTML_pages()

⚠️ Already processed, skipping: about_asthma_cdc.json
⚠️ Already processed, skipping: about_copd_cdc.json
⚠️ Already processed, skipping: about_covid-19_cdc.json
⚠️ Already processed, skipping: about_physical_distancing_and_respiratory_viruses_cdc.json
⚠️ Already processed, skipping: about_respiratory_illnesses_cdc.json
⚠️ Already processed, skipping: benefits_of_getting_vaccinated_cdc.json
⚠️ Already processed, skipping: best_practices_for_patient_care_cdc.json
⚠️ Already processed, skipping: clinical_guidance_for_asthma,_other_respiratory_conditions,_and_or_mold_allergy_after_a_severe_weather_event_cdc.json
⚠️ Already processed, skipping: controlling_asthma_cdc.json
⚠️ Already processed, skipping: copd_resources_for_health_professionals_cdc.json
⚠️ Already processed, skipping: copd_resources_for_patients_and_their_families_cdc.json
⚠️ Already processed, skipping: covid-19_vaccines_cdc.json
⚠️ Already processed, skipping: covid-19_vaccine_basics_cdc.json
⚠️ Already processed, skipping

In [131]:
def chunk_docs(chunk_size=500, chunk_overlap=50):
    """Split every processed document into chunks and write one JSON file per chunk.

    Each ``*.json`` file in ``PROCESSED_DIR`` is loaded, its ``"content"`` is
    split with ``RecursiveCharacterTextSplitter``, and every chunk is saved to
    ``CHUNKS_DIR`` as ``<document stem>_chunk<N>.json`` (``N`` starts at 1)
    together with the document's ``title`` and ``source``. Existing chunk
    files with the same name are overwritten.

    :param chunk_size: Passed to the splitter as ``chunk_size``.
    :param chunk_overlap: Passed to the splitter as ``chunk_overlap``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "]
    )

    processed_dir = Path(PROCESSED_DIR)
    chunks_dir = Path(CHUNKS_DIR)
    # Create the output folder so the first run does not fail on the first write.
    chunks_dir.mkdir(parents=True, exist_ok=True)

    # Only regular *.json files are documents; anything else in the folder
    # (checkpoint directories, OS metadata files) is ignored instead of crashing json.load.
    doc_paths = sorted(p for p in processed_dir.glob("*.json") if p.is_file())

    for doc_path in doc_paths:
        with open(doc_path, "r", encoding="utf-8") as src:
            data = json.load(src)

        chunks = splitter.split_text(data["content"])
        for chunk_id, chunk in enumerate(chunks, start=1):
            chunk_data = {
                "title": data["title"],
                "source": data["source"],
                "chunk_id": chunk_id,
                "content": chunk
            }
            chunk_file = chunks_dir / f"{doc_path.stem}_chunk{chunk_id}.json"
            with open(chunk_file, "w", encoding="utf-8") as dst:
                json.dump(chunk_data, dst, ensure_ascii=False, indent=2)

In [137]:
# Split every processed document into chunk files under CHUNKS_DIR.
chunk_docs()