In [1]:
import requests
from pdf2image import convert_from_bytes
import pytesseract
from PIL import Image
import pandas as pd

In [2]:
# Fact-sheet PDFs keyed by language. Every file sits under the same 2023
# upload root; the English sheet is in the "08" folder and has no suffix,
# while the other sheets are in "09" and carry a per-language suffix.
FACT_SHEET_ROOT = "https://health.hawaii.gov/mauiwildfires/files/2023"

pdf_urls = {
    "English": f"{FACT_SHEET_ROOT}/08/DOH-Fact-Sheet-Lahaina-CHC.pdf",
    "Hawaiian": f"{FACT_SHEET_ROOT}/09/DOH-Fact-Sheet-Lahaina-CHC_HAW.pdf",
    "Marshallese": f"{FACT_SHEET_ROOT}/09/DOH-Fact-Sheet-Lahaina-CHC_MH.pdf",
    "Chuukese": f"{FACT_SHEET_ROOT}/09/DOH-Fact-Sheet-Lahaina-CHC_CHK.pdf",
    "Pohnpeian": f"{FACT_SHEET_ROOT}/09/DOH-Fact-Sheet-Lahaina-CHC_PON.pdf",
    "Tongan": f"{FACT_SHEET_ROOT}/09/DOH-Fact-Sheet-Lahaina-CHC_TO.pdf",
    "Kosraean": f"{FACT_SHEET_ROOT}/09/DOH-Fact-Sheet-Lahaina-CHC_KOS.pdf",
}

In [3]:
def download_pdf(url, timeout=30):
    """Download a PDF from ``url`` and return its raw bytes.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on an unresponsive
            host and stall the whole notebook run.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail fast on 4xx/5xx instead of handing an error body to the PDF renderer.
    response.raise_for_status()
    return response.content

In [4]:
def pdf_to_images(pdf_content):
    """Render raw PDF bytes into a list of PIL Images.

    Delegates to ``convert_from_bytes`` with its default settings.
    """
    pages = convert_from_bytes(pdf_content)
    return pages

def extract_text_from_images(images, lang='eng'):
    """Run Tesseract OCR over each image and return the concatenated text.

    Args:
        images: Iterable of images to OCR (e.g. the list returned by
            ``pdf_to_images``).
        lang: Tesseract language code(s) forwarded to
            ``pytesseract.image_to_string``. Defaults to ``'eng'``, the value
            that was previously hard-coded, so existing callers are unaffected.

    Returns:
        The OCR text of every image in order, each followed by a newline.
        An empty string when ``images`` is empty.
    """
    # A single join avoids rebuilding the accumulated string on every page.
    return "".join(
        pytesseract.image_to_string(image, lang=lang) + "\n" for image in images
    )

In [5]:
def segment_text_to_paragraphs(text):
    """Split ``text`` into paragraphs on blank lines (two consecutive newlines).

    Each chunk is stripped of surrounding whitespace, and chunks that are
    empty after stripping are dropped. This is a deliberately simple rule;
    adjust as needed.
    """
    paragraphs = []
    for chunk in text.split("\n\n"):
        cleaned = chunk.strip()
        if cleaned:
            paragraphs.append(cleaned)
    return paragraphs

In [6]:
def process_pdf(url):
    """Download the PDF at ``url``, OCR it, and return its paragraphs.

    Runs the pipeline end to end: download -> page images -> OCR text ->
    paragraph list.
    """
    raw_bytes = download_pdf(url)
    page_images = pdf_to_images(raw_bytes)
    ocr_text = extract_text_from_images(page_images)
    return segment_text_to_paragraphs(ocr_text)

In [9]:
# One empty paragraph list per language. The comprehension creates a
# separate list object for every key, so filling one does not affect others.
data = {
    language: []
    for language in (
        "English",
        "Hawaiian",
        "Marshallese",
        "Chuukese",
        "Pohnpeian",
        "Tongan",
        "Kosraean",
    )
}

In [10]:
# Download and OCR every fact sheet. The result is assigned (not extended)
# so that re-running this cell replaces each language's paragraphs instead
# of appending a duplicate copy to `data`.
for language, url in pdf_urls.items():
    print(f"Processing {language} PDF...")
    paragraphs = process_pdf(url)
    data[language] = paragraphs

Processing English PDF...
Processing Hawaiian PDF...
Processing Marshallese PDF...
Processing Chuukese PDF...
Processing Pohnpeian PDF...
Processing Tongan PDF...
Processing Kosraean PDF...


In [11]:
# Bare expression: Jupyter renders the full `data` dict as this cell's output.
data

{'English': ['® seavoratreiricuwe WIEST MAUI HEALTH SERVICES',
  'MAUI COUNTY',
  'ama I Ke Local Clinic Services',
  'Lahaina Comprehensive Health Center',
  '& Malama | Ke Ola',
  'Medical Services',
  'Hours: Monday - Friday',
  '9:00 AM - 4:00 PM',
  'Contact: (808) 871-7772\nAppointments Recommended',
  'Insurance Accepted: Medicare,\nMedicaid & major insurance plans\n(except Kaiser, Humana & Tricare)',
  'Services',
  'e Adult Medicine: Monday-Friday\ne Pediatrics: Monday',
  'e Integrated Health: Tuesday',
  'e Dental Care: Wednesday',
  '9 @ Lahaina Civic Center',
  'Lahaina Civic\nCenter Tickets',
  'Lahaina Comprehensive\nHealth Center',
  '_ Location:\nAkoakoa Place (just below',
  'Lahaina Civic Center)\nWayside Park\nintipark with BBQs & pavilions',
  'fan\\',
  '*Hours and services subject to change',
  'Behavioral Health',
  'Hours: Monday - Sunday',
  '9:00 AM - 4:00 PM',
  'Contact: (808) 495-5113\nWalk-Ins Welcome',
  'Insurance Accepted But Not Required',
  'Adult & 