In [1]:
import requests
from io import BytesIO
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
import pandas as pd

In [2]:
# Language name -> URL of the Lahaina CHC fact-sheet PDF in that language.
# Insertion order is preserved, so the order of the pairs below is also the
# order in which the languages are iterated later on.
pdf_urls = dict([
    ("English", "https://health.hawaii.gov/mauiwildfires/files/2023/08/DOH-Fact-Sheet-Lahaina-CHC.pdf"),
    ("Hawaiian", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_HAW.pdf"),
    ("Marshallese", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_MH.pdf"),
    ("Chuukese", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_CHK.pdf"),
    ("Pohnpeian", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_PON.pdf"),
    ("Tongan", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_TO.pdf"),
    ("Kosraean", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_KOS.pdf"),
])

In [3]:
def download_pdf(url, timeout=30):
    """Download the document at ``url`` and return its body as an in-memory file.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``BytesIO`` wrapping the raw response body.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an error status instead of handing an error page to the
    # PDF parser as though it were a PDF.
    response.raise_for_status()
    return BytesIO(response.content)

In [4]:
def clean_paragraph(paragraph):
    """Strip bullet markers and newlines from ``paragraph`` and trim the ends.

    Every occurrence of each unwanted substring is deleted (not replaced by a
    space), then leading and trailing whitespace is removed.
    """
    # NOTE(review): the second entry is an empty string as written, so that
    # replace is a no-op. It may originally have been a bullet glyph that did
    # not survive copy/paste -- confirm against the PDFs.
    cleaned = paragraph
    for unwanted in ("•", "", "\n"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.strip()

In [5]:
def extract_text_with_location(pdf_file):
    """Extract every text line of a PDF together with a vertical position.

    Each page is run through pdfminer's layout analysis. For every top-level
    text element, each child text line is cleaned with ``clean_paragraph`` and
    paired with the ``y0`` value of the enclosing element's bounding box, so
    all lines of one element share the same location.

    Args:
        pdf_file: Binary file-like object containing the PDF.

    Returns:
        A list of ``(y_coordinate, text)`` tuples in layout order. Lines that
        are empty after cleaning are still included.
    """
    resource_manager = PDFResourceManager()
    device = PDFPageAggregator(resource_manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)

    text_with_locations = []  # (y_coordinate, text) pairs
    for page in PDFPage.get_pages(pdf_file):
        interpreter.process_page(page)
        layout = device.get_result()
        for element in layout:
            if not isinstance(element, (LTTextBox, LTTextLine)):
                continue
            # bbox is (x0, y0, x1, y1). Bind the coordinate once per element so
            # it is always defined and always belongs to this element, even
            # when the element's first line is blank after cleaning.
            y_coordinate = element.bbox[1]
            for text_line in element:
                if isinstance(text_line, LTTextLine):
                    text = clean_paragraph(text_line.get_text())
                    text_with_locations.append((y_coordinate, text))
    return text_with_locations

In [6]:
# Per-language extraction results, keyed by the language names in pdf_urls.
texts_by_language = {}
locations_by_language = {}

for language, url in pdf_urls.items():
    pdf_file = download_pdf(url)
    text_and_locations = extract_text_with_location(pdf_file)

    # Split the (location, text) pairs into two parallel tuples. zip(*[])
    # produces nothing to unpack, so a PDF that yields no text lines is handled
    # explicitly instead of raising ValueError.
    if text_and_locations:
        locations, texts = zip(*text_and_locations)
    else:
        locations, texts = (), ()
    locations_by_language[language] = locations
    texts_by_language[language] = texts

# The longest document decides how many rows the table needs.
max_paragraphs = max(
    (len(paragraphs) for paragraphs in texts_by_language.values()),
    default=0,
)

# One dict per row: the i-th extracted line of every language side by side.
data = []
for i in range(max_paragraphs):
    row = {'Location': None}
    for language in pdf_urls.keys():
        if i < len(texts_by_language[language]):
            row[language] = texts_by_language[language][i]
            # The location comes from the first language that has an i-th line.
            if row['Location'] is None:
                row['Location'] = locations_by_language[language][i]
        else:
            row[language] = ""  # Blank cell for languages with fewer lines
    data.append(row)

In [7]:
# Build the table from the per-row dicts: a 'Location' column followed by one
# text column per language.
df = pd.DataFrame(data)

In [8]:
# Show the assembled table as this cell's output.
df

Unnamed: 0,Location,English,Hawaiian,Marshallese,Chuukese,Pohnpeian,Tongan,Kosraean
0,675.410919,WEST MAUI HEALTH SERVICES,WEST MAUI HEALTH SERVICES,JIBAÑ IN EJMOUR KO ILO RĀK IN MAUI,WEST MAUI ANGANGEN,SAWAS SANG WEST MAUI HEALTH,WEST MAUI HEALTH SERVICES,MWE KAHSRUH SIN WEST MAUI HEALTH
1,675.410919,Local Clinic Services,Nā Hana Lawelawe o nā,Jibañ ko ilo Jikin Taktõ ko,ANINISIN SAFEI KENA,Sawas en Local Clinic,Ngaahi ngāue ‘o,Mwe Kahsruh Ke Local Clinic
2,626.631968,Lāhainā Comprehensive Health Center,Ke‘ena Kauka Kūloko,Redik ilo Jukjukinbed,Angangen Aninisin Nenien,Lāhainā Comprehensive Health Center,e Kilīniki Fakafeitu’ú,Lāhainā Comprehensive Health Center
3,574.329886,Medical Services,Lāhainā Comprehensive Health Center,Lāhainā Comprehensive Health Center,Safei Kena Non Neniach,Sawas en Wini kan,Lāhainā Comprehensive Health Center,Mwe Kahsruh luhn Ono
4,574.399349,Behavioral Health,Nā Hana Lawelawe Olakino,Jibañ in Ejmour,Lāhainā Comprehensive Health Center,Awa kan: Ni Ed lel Ni Alem,Ngaahi Ngāue,Ao: Monday nuhke Friday
5,507.209547,Hours: Monday - Friday,Kūlana No‘ono‘o i ka Hanah,Nañinmij ko ilo Manit,Aninisin Safei kena,9:00 Ni Menseng lel 4:00 Ni Souwas,Fakafaito’ó,9:00 Litutacng nuhke 4:00 Infulwacn lwen
6,507.209547,9:00 AM – 4:00 PM,Nā Hola: Pō‘akahi – Pō‘alima,Awa ko: Mande - Bolaide,Semwenin Napenap,Eker Nempe wet: (808) 871-7772,Tokangaekina ‘o e,Pahngohn nacmpuh inge: (808) 871-7772
7,507.209547,Contact: (808) 871-7772,9:00 AM – 4:00 PM,Awa ko: Mande - Jabot,Awan angang kena: Sarinfan - Animu,Anahne Koasoanehdi Rahn en Kohdo,Tō’onga Mo’uí,Ahkkweyeyck in Oraclah Appointment
8,490.709548,Appointments Recommended,Kelepona: (808) 871-7772,9:00 Awa Jibbon – 4:00 Awa Raelap,9:00 AM – 4:00 PM,"Se Ale Insurance: Medicare, Medicaid",‘Ū Houá: Mōnite - Falaite,"Kuht Eis Insurance: Medicare, Medicaid ac"
9,432.301921,"Insurance Accepted: Medicare,",Nā Hola: Pō‘akahi – Lāpule,9:00 Awa jibbon – 4:00 Awa Raelap,Ia sipwe kokono ie: (808) 871-7772,oh insurance laud teikan (ahpw kaidehn,9:00 AM – 4:00 PM,oakwuck in insurance sahyac (sahyacn
