In [1]:
import requests
import fitz
import pandas as pd

In [2]:
# Language name -> URL of a fact-sheet PDF hosted on health.hawaii.gov.
# File names follow the pattern DOH-Fact-Sheet-Lahaina-CHC[_<code>].pdf.
pdf_urls = dict(
    [
        ("English", "https://health.hawaii.gov/mauiwildfires/files/2023/08/DOH-Fact-Sheet-Lahaina-CHC.pdf"),
        ("Hawaiian", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_HAW.pdf"),
        ("Marshallese", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_MH.pdf"),
        ("Chuukese", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_CHK.pdf"),
        ("Pohnpeian", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_PON.pdf"),
        ("Tongan", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_TO.pdf"),
        ("Kosraean", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_KOS.pdf"),
    ]
)

In [12]:
def fetch_and_extract_text_sorted(url, timeout=30):
    """Download a PDF and return its text blocks, position-sorted per page.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds to wait on the server, forwarded to ``requests.get``.
            Without a timeout an unresponsive server would block the cell
            indefinitely.

    Returns:
        A list with one entry per page. Each entry is a list of block text
        strings ordered by the block's top edge (y0), then its left edge (x0).

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    pages = []
    # The PDF bytes are opened from memory; nothing is written to disk.
    with fitz.open(stream=response.content, filetype="pdf") as doc:
        for page in doc:
            # Each block is (x0, y0, x1, y1, text, block_no, block_type).
            # Sorting purely by (y0, x0) interleaves side-by-side columns,
            # so the result is a best-effort reading order only.
            blocks = sorted(
                page.get_text("blocks"),
                key=lambda block: (block[1], block[0]),
            )
            pages.append([block[4] for block in blocks])
    return pages

In [None]:
def clean_and_preprocess(text_blocks):
    """Flatten each page's text blocks into one whitespace-normalised string.

    Args:
        text_blocks: Iterable of pages, where each page is an iterable of
            block strings (the shape returned by
            ``fetch_and_extract_text_sorted``).

    Returns:
        A list with one string per page. Within each string, every run of
        whitespace (newlines, repeated spaces) is collapsed to a single space
        and blocks that contain only whitespace are omitted.
    """
    cleaned_text = []
    for page in text_blocks:
        # str.split() with no argument splits on any whitespace run and drops
        # leading/trailing whitespace, so re-joining removes the repeated
        # interior spaces that replace('\n', ' ').strip() left behind.
        cleaned_blocks = (' '.join(text.split()) for text in page)
        # Implement additional cleaning here as needed.
        # Empty blocks are skipped so they do not add doubled separators.
        cleaned_text.append(' '.join(block for block in cleaned_blocks if block))
    return cleaned_text

In [13]:
# Inspect the position-sorted text blocks extracted from the English PDF.
fetch_and_extract_text_sorted(pdf_urls["English"])

[['WEST MAUI HEALTH SERVICES\n',
  'Local Clinic Services\n',
  'Lāhainā Comprehensive Health Center\n',
  'Behavioral Health \n',
  'Medical Services\n',
  'Hours: Monday - Friday \n9:00 AM – 4:00 PM\nContact: (808) 871-7772\n',
  'Hours: Monday - Sunday\n9:00 AM – 4:00 PM\nContact: (808) 495-5113\n',
  'Appointments Recommended\n',
  'Walk-Ins Welcome\n',
  'Insurance Accepted But Not Required\n',
  'Insurance Accepted: Medicare,\n',
  'Medicaid & major insurance plans\n(except Kaiser, Humana & Tricare)\n',
  'Adult & Youth Services\n',
  'Resource Guidance    \nSupportive Counseling: Individual,\nGroup, and Family    \nMedication Management     \nPsychiatric Services      \nOutpatient Substance Use Disorder\nServices\nWellness Support Groups – Parenting,\nStress Management & Mindfulness\n',
  'Services\n',
  'Adult Medicine: Monday-Friday\nPediatrics: Monday \nIntegrated Health: Tuesday                        \nDental Care: Wednesday                             \n',
  'Partners\n',


In [15]:
# Inspect the position-sorted text blocks extracted from the Hawaiian PDF.
fetch_and_extract_text_sorted(pdf_urls["Hawaiian"])

[['WEST MAUI HEALTH SERVICES \nNā Hana Lawelawe o nā \nKe‘ena Kauka Kūloko \n',
  'Lāhainā Comprehensive Health Center \n',
  'Nā Hana Lawelawe Olakino\n',
  'Kūlana No‘ono‘o i ka Hanah\n',
  'Nā Hola: Pō‘akahi – Pō‘alima \n',
  'Nā Hola: Pō‘akahi – Lāpule \n',
  '9:00 AM – 4:00 PM \n',
  '9:00 AM – 4:00 PM \n',
  'Kelepona: (808) 871-7772 \n',
  'Kelepona: (808) 495-5113 \n',
  'Paipai ‘ia e Ho‘opa‘a i ka Hālāwai \n',
  '‘Ae ‘ia ka Po‘e Komo Wale mai \n',
  '‘Ae ‘ia ka ‘Inikua: Medicare, Medicaid a me \n',
  '‘Ae ‘ia ka ‘Inikua, akā, ‘A‘ole Pono \n',
  'nā papa hana ‘inikua inoa kaulana (koe ke \n',
  'Nā Hana Lawelawe no nā Kānaka \nMākua a me nā ‘Ōpio \n',
  'Kaiser, Humana & Tricare) \n',
  'Nā Hana Lawelawe \n',
  '\uf03d Kuhikuhi Kumu Waiwai \n',
  '\uf03d Lā‘au Kānaka Makua: Pō‘akahi – Pō‘alima \n',
  '\uf03d Kauleo Kāko‘o: No ke Kanaka, ka Hui, a me ka \n',
  '\uf03d Lapa‘au Keiki: Pō‘akahi \n',
  '‘Ohana \n',
  '\uf03d Nā Nīnau Like ‘Ole o ke Olakino: Pō‘alua  \n',
  '\uf03d H

In [4]:
# Fetch every PDF and store its extracted paragraphs under the language name.
# NOTE(review): `fetch_and_extract_paragraphs` is not defined in any visible
# cell, so this fails on a fresh kernel unless it is defined elsewhere.
# Downstream cells index the result as language_paragraphs[lang][i] - confirm
# which helper is intended and that it returns a flat per-language sequence.
language_paragraphs = {lang: fetch_and_extract_paragraphs(url) for lang, url in pdf_urls.items()}

In [5]:
# Build one row per English paragraph; "English" is the reference language.
# Paragraphs are paired purely by position, so a row only lines up across
# languages when every PDF yields its paragraphs in the same order.
mappings = []
for i in range(len(language_paragraphs["English"])):
    # A language with fewer paragraphs than English gets None for the missing
    # positions instead of raising IndexError.
    row = {
        lang: (paragraphs[i] if i < len(paragraphs) else None)
        for lang, paragraphs in language_paragraphs.items()
    }
    mappings.append(row)

In [6]:
# One column per language (the keys of each row dict), one row per entry of `mappings`.
df_mappings = pd.DataFrame(mappings)

In [7]:
# Display the per-language paragraph table.
df_mappings

Unnamed: 0,English,Hawaiian,Marshallese,Chuukese,Pohnpeian,Tongan,Kosraean
0,WEST MAUI HEALTH SERVICES,LCHC,LCHC,LCHC,LCHC,LCHC,LCHC
1,Local Clinic Services,Ho‘oponopono ‘ia 10/18/23,*Awa in jerbal in jibañ ko remaron oktak,*Awa kena me pwan angangen aninis kena repwene...,*Awa oh sawas kan kak wekila,*’E ala liliu e ngaahi houá mo e ngaahi ngāué,*Ao ac kahsruh uh kuh na in eklac
2,Behavioral Health,WEST MAUI HEALTH SERVICES,Emõj kõkkāāl melele ilo 10/18/23,Asofono 10/18/23,"Kakapwla ni October 18, 2023",Liliu Fakamuimuitahá 10/18/23,"Ahksasucyeyucklac ke October 18, 2023"
3,Partners,Nā Hana Lawelawe o nā,JIBAÑ IN EJMOUR KO ILO RĀK IN MAUI,WEST MAUI ANGANGEN,SAWAS SANG WEST MAUI HEALTH,WEST MAUI HEALTH SERVICES,MWE KAHSRUH SIN WEST MAUI HEALTH
4,Medical Services,Ke‘ena Kauka Kūloko,Jibañ ko ilo Jikin Taktõ ko,ANINISIN SAFEI KENA,Sawas en Local Clinic,Ngaahi ngāue ‘o,Mwe Kahsruh Ke Local Clinic
5,Location:,Lāhainā Comprehensive Health Center,Redik ilo Jukjukinbed,Angangen Aninisin Nenien,Lāhainā Comprehensive Health Center,e Kilīniki Fakafeitu’ú,Lāhainā Comprehensive Health Center
6,Ākoakoa Place (just below,Nā Hana Lawelawe Olakino,Lāhainā Comprehensive Health Center,Safei Kena Non Neniach,Sawas en Wini kan,Lāhainā Comprehensive Health Center,Mwe Kahsruh luhn Ono
7,Lāhainā Civic Center),Nā Hola: Pō‘akahi – Pō‘alima,Jibañ in Ejmour,Lāhainā Comprehensive Health Center,Awa kan: Ni Ed lel Ni Alem,Ngaahi Ngāue,Ao: Monday nuhke Friday
8,Lāhainā Comprehensive Health Center,9:00 AM – 4:00 PM,Awa ko: Mande - Bolaide,Aninisin Safei kena,9:00 Ni Menseng lel 4:00 Ni Souwas,Fakafaito’ó,9:00 Litutacng nuhke 4:00 Infulwacn lwen
9,Hours: Monday - Friday,Kelepona: (808) 871-7772,9:00 Awa Jibbon – 4:00 Awa Raelap,Awan angang kena: Sarinfan - Animu,Eker Nempe wet: (808) 871-7772,‘Ū Houá: Mōnite - Falaite,Pahngohn nacmpuh inge: (808) 871-7772


In [6]:
# extracted_texts = []

In [7]:
# for url, language in pdf_links:
#     text = fetch_and_extract_text(url)
#     extracted_texts.append((language, text))

In [8]:
# aligned_data = [
   
# ]

In [9]:
# df = pd.DataFrame(extracted_data)

In [7]:
# Write the per-language paragraph table to CSV without the index column.
# The original called `df.to_csv`, but `df` is never assigned in an
# uncommented cell; `df_mappings` is the only DataFrame this notebook builds.
csv_to_path = "local_clinic_services.csv"
df_mappings.to_csv(csv_to_path, index=False)

In [10]:
# Display the DataFrame.
# NOTE(review): `df` is not assigned in any uncommented cell visible here;
# confirm which DataFrame is meant (the notebook builds `df_mappings`).
df

In [11]:
# NOTE(review): repeats the display from the previous cell; `df` is not
# assigned in any uncommented cell visible here - confirm or remove.
df