In [1]:
import requests
from io import BytesIO
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import pandas as pd

In [2]:
# Hawaii DOH "Lahaina CHC" fact sheet PDFs, one per language.
# Every URL shares the same site root and file stem; only the path segment
# after "2023/" and the language suffix on the file name differ.
_FACT_SHEET_ROOT = "https://health.hawaii.gov/mauiwildfires/files/2023"
_FACT_SHEET_STEM = "DOH-Fact-Sheet-Lahaina-CHC"

# (language, path segment after "2023/", file-name suffix)
_FACT_SHEET_VARIANTS = (
    ("English", "08", ""),
    ("Hawaiian", "09", "_HAW"),
    ("Marshallese", "09", "_MH"),
    ("Chuukese", "09", "_CHK"),
    ("Pohnpeian", "09", "_PON"),
    ("Tongan", "09", "_TO"),
    ("Kosraean", "09", "_KOS"),
)

# Maps language name -> full PDF URL, in the order listed above.
pdf_urls = {
    language: f"{_FACT_SHEET_ROOT}/{segment}/{_FACT_SHEET_STEM}{suffix}.pdf"
    for language, segment, suffix in _FACT_SHEET_VARIANTS
}

In [3]:
def clean_paragraph(paragraph):
    """Strip bullet glyphs and line breaks from a piece of extracted text.

    Args:
        paragraph: Raw text string to clean.

    Returns:
        The text with every bullet glyph and newline deleted and with leading
        and trailing whitespace trimmed. Line breaks are deleted without
        inserting a space, so text on consecutive lines is joined directly.
    """
    # NOTE(review): the second bullet entry used to read as an empty string,
    # which str.replace treats as a no-op. U+F0B7 (the private-use code point
    # that Symbol-font bullets usually extract as) is assumed to be the glyph
    # that was meant - confirm against the source PDFs.
    unwanted = "•\uf0b7\n"
    # One translate pass deletes all unwanted characters at once.
    return paragraph.translate(str.maketrans("", "", unwanted)).strip()

In [4]:
def download_pdf(url, timeout=30):
    """Download a file over HTTP and return its body as an in-memory binary stream.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``BytesIO`` holding the raw response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Without a timeout requests can wait forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail here instead of handing an HTML error page to the PDF parser.
    response.raise_for_status()
    return BytesIO(response.content)

In [5]:
def extract_paragraphs_and_locations(pdf_file):
    """Collect the cleaned text of every text container in a PDF with its position.

    Args:
        pdf_file: PDF source handed to pdfminer's ``extract_pages``.

    Returns:
        A ``(paragraphs, locations)`` pair of equal-length lists.
        ``paragraphs[k]`` is the cleaned text of the k-th non-empty text
        container, in the order pdfminer yields them, and ``locations[k]`` is
        the first two values of that container's ``bbox``.
    """
    paragraphs, locations = [], []
    for page_layout in extract_pages(pdf_file):
        for element in page_layout:
            # Only text containers carry paragraph text.
            if not isinstance(element, LTTextContainer):
                continue
            raw_text = "".join(text_line.get_text() for text_line in element)
            cleaned_text = clean_paragraph(raw_text)
            # Containers that are empty after cleaning are dropped.
            if not cleaned_text:
                continue
            # bbox is (x0, y0, x1, y1). pdfminer measures from the page's
            # bottom-left, so (x0, y0) is the container's lower-left corner.
            left, bottom = element.bbox[:2]
            paragraphs.append(cleaned_text)
            locations.append((left, bottom))
    return paragraphs, locations

In [8]:
# Fetch each language's fact sheet and keep its paragraphs and their page positions
texts_by_language = {}
locations_by_language = {}

for language, url in pdf_urls.items():
    pdf_file = download_pdf(url)
    texts_by_language[language], locations_by_language[language] = (
        extract_paragraphs_and_locations(pdf_file)
    )

# The longest document sets the row count; shorter documents are padded with ""
max_paragraphs = max(map(len, texts_by_language.values()))


def _entry_or_blank(entries, index):
    """Return entries[index], or "" when entries has no item at that index."""
    return entries[index] if index < len(entries) else ""


# One row per paragraph index, with a text column and a location column for
# each language. Rows are matched purely by position within each document.
data = [
    {
        column: value
        for language in pdf_urls
        for column, value in (
            (f'{language}_Text', _entry_or_blank(texts_by_language[language], i)),
            (f'{language}_Location', _entry_or_blank(locations_by_language[language], i)),
        )
    }
    for i in range(max_paragraphs)
]

df = pd.DataFrame(data)

In [9]:
# Show the combined table as this cell's output
df

Unnamed: 0,English_Text,English_Location,Hawaiian_Text,Hawaiian_Location,Marshallese_Text,Marshallese_Location,Chuukese_Text,Chuukese_Location,Pohnpeian_Text,Pohnpeian_Location,Tongan_Text,Tongan_Location,Kosraean_Text,Kosraean_Location
0,WEST MAUI HEALTH SERVICES,"(185.4320274736652, 715.1743046834916)",WEST MAUI HEALTH SERVICES,"(191.04, 733.890774)",JIBAÑ IN EJMOUR KO ILO RĀK IN MAUI,"(178.04, 732.8352)",WEST MAUI ANGANGEN,"(251.54, 746.6351999999999)",SAWAS SANG WEST MAUI HEALTH,"(185.42, 732.36)",WEST MAUI HEALTH SERVICES,"(191.04, 733.0507739999999)",MWE KAHSRUH SIN WEST MAUI HEALTH,"(185.42, 733.3104)"
1,Lāhainā Comprehensive Health Center,"(102.17987169250517, 626.631967712209)",Lāhainā Comprehensive Health Center,"(118.56, 611.119725)",Jibañ ko ilo Jikin Taktõ ko,"(210.38, 690.9648000000001)",Angangen Aninisin Nenien,"(202.64, 682.3248)",Lāhainā Comprehensive Health Center,"(118.58, 654.9304)",Lāhainā Comprehensive Health Center,"(118.56, 625.399725)",Mwe Kahsruh Ke Local Clinic,"(204.17, 694.0600000000001)"
2,Medical Services,"(62.84125868161412, 574.3298858914625)",Nā Hana Lawelawe Olakino,"(45.96, 565.34844)",Lāhainā Comprehensive Health Center,"(99.36, 624.4552)",Lāhainā Comprehensive Health Center,"(99.36, 615.8152)",Sawas en Wini kan,"(63.024, 607.1704)",Ngaahi Ngāue,"(93.84, 580.5994890000001)",Lāhainā Comprehensive Health Center,"(93.984, 647.6296)"
3,Behavioral Health,"(355.14830480215335, 574.3993490885682)",Kūlana No‘ono‘o i ka Hanah,"(336.72, 565.34844)",Jibañ in Ejmour,"(81.96, 576.6448)",Aninisin Safei kena,"(60.48, 568.0047999999999)",Awa kan: Ni Ed lel Ni Alem,"(40.32, 581.0104)",Tokangaekina ‘o e,"(363.84, 579.639489)",Mwe Kahsruh luhn Ono,"(44.76, 601.06)"
4,Hours: Monday - Friday,"(42.695638221015, 540.2095455912682)",Nā Hola: Pō‘akahi – Pō‘alima,"(40.32, 538.2971550000001)",Nañinmij ko ilo Manit,"(335.56, 576.6448)",Semwenin Napenap,"(346.78, 568.0047999999999)",9:00 Ni Menseng lel 4:00 Ni Souwas,"(40.32, 563.9704)",‘Ū Houá: Mōnite - Falaite,"(40.56, 534.142452)",Ao: Monday nuhke Friday,"(40.32, 572.38)"
5,Appointments Recommended,"(71.93391825275329, 490.7095476537683)",9:00 AM – 4:00 PM,"(40.32, 522.8171550000001)",Awa ko: Mande - Bolaide,"(33.24, 550.4848)",Awan angang kena: Sarinfan - Animu,"(40.32, 542.0247999999999)",Eker Nempe wet: (808) 871-7772,"(40.32, 547.1704000000001)",‘Ū Houá: Mōnite - Sāpate,"(329.76, 531.3824520000001)",9:00 Litutacng nuhke 4:00 Infulwacn lwen,"(40.32, 553.78)"
6,"Insurance Accepted: Medicare,","(64.09407482941344, 465.3019199124195)",Kelepona: (808) 871-7772,"(40.32, 506.497155)",Awa ko: Mande - Jabot,"(319.06, 550.4848)",Ra kan Peseieno Appointment kena,"(72.0, 497.9848)",Anahne Koasoanehdi Rahn en Kohdo,"(67.584, 529.7404)",Ngaahi ‘Apoinimeni ‘Oku Fokotu’u Atú,"(66.48, 486.382452)",Ahkkweyeyck in Oraclah Appointment,"(66.384, 517.99)"
7,Services,"(113.0667290388861, 402.34630730119517)",Nā Hola: Pō‘akahi – Lāpule,"(329.64000000000004, 538.2971550000001)",9:00 Awa Jibbon – 4:00 Awa Raelap,"(33.24, 533.5047999999999)","Ra Asepa Insurens Medicare, Medicaid","(58.74, 477.3448)","Se Ale Insurance: Medicare, Medicaid","(62.064, 503.94039999999995)",Talitali Lelei ‘a e A’utonu Mai Ta’e’apoinimení,"(342.36, 483.742452)","Kuht Eis Insurance: Medicare, Medicaid ac","(44.88, 490.75)"
8,Adult Medicine: Monday-Friday,"(51.41037373790101, 378.90192351241956)",9:00 AM – 4:00 PM,"(329.64000000000004, 522.8171550000001)",9:00 Awa jibbon – 4:00 Awa Raelap,"(319.06, 533.5047999999999)",Angagen aninis kena,"(74.4, 420.8152)",oh insurance laud teikan (ahpw kaidehn,"(54.84, 486.9004)","Tali ‘a e Malu’i Mo’ui: Medicare, Medicaid","(60.84, 465.382452)",oakwuck in insurance sahyac (sahyacn,"(54.84, 472.27)"
9,Hours: Monday - Sunday,"(345.85425358940546, 540.2095455912682)",Kelepona: (808) 495-5113,"(329.64000000000004, 506.497155)",Nomba eo ñan Kebaak: (808) 871-7772,"(33.24, 516.6447999999999)",Safean Aramas Nap: Sarinfan-Animu,"(40.32, 400.0048)","Kaiser, Humana oh Tricare)","(87.624, 469.86039999999997)",Ngaahi Ngāué,"(100.8, 407.66844000000003)","Kaiser, Humana ac Tricare)","(87.144, 453.67)"


In [10]:
# Save the combined table as CSV (path is relative to the working directory).
# With pandas' defaults the row index is written too, as a leading column with
# an empty header.
df.to_csv('local_clinic_services_with_location.csv')