In [1]:
import io
import requests
from pdfminer.high_level import extract_text
import pandas as pd
from typing import List

In [2]:
def download_and_extract_text(url: str, timeout: float = 30.0) -> str:
    """Download the PDF at ``url`` and return the text extracted from it.

    Args:
        url: Address of the PDF document to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text produced by pdfminer's ``extract_text`` for the downloaded bytes.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # A bounded timeout keeps a stalled server from hanging the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an HTML error page
    # to the PDF parser, which would raise a confusing parse error (or return junk).
    response.raise_for_status()
    # pdfminer wants a file-like object, so wrap the in-memory bytes.
    with io.BytesIO(response.content) as open_pdf_file:
        return extract_text(open_pdf_file)

In [3]:
def split_text(text: str, by_paragraph=True) -> List[str]:
    """Break ``text`` into paragraphs or (roughly) into sentences.

    Args:
        text: The text to split.
        by_paragraph: When true (the default), split on blank lines
            (two consecutive newlines). Otherwise flatten newlines to spaces
            and split on ``'. '``.

    Returns:
        The list of pieces. Pieces are not stripped or filtered, so empty or
        whitespace-only entries can appear.
    """
    if not by_paragraph:
        # Naive sentence splitting: the period-plus-space separator is consumed,
        # and abbreviations such as "Dr. " will also cause a split.
        single_line = text.replace('\n', ' ')
        return single_line.split('. ')
    # Paragraphs are taken to be separated by a double newline.
    return text.split('\n\n')

In [4]:
# Language name -> URL of that language's version of the fact-sheet PDF.
# Built from ordered pairs; the order here becomes the column order later on.
pdf_urls = dict([
    ("English", "https://health.hawaii.gov/mauiwildfires/files/2023/08/DOH-Fact-Sheet-Lahaina-CHC.pdf"),
    ("Hawaiian", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_HAW.pdf"),
    ("Marshallese", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_MH.pdf"),
    ("Chuukese", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_CHK.pdf"),
    ("Pohnpeian", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_PON.pdf"),
    ("Tongan", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_TO.pdf"),
    ("Kosraean", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_KOS.pdf"),
])

In [5]:
# Fetch every PDF and split its text into paragraphs (split_text's default),
# keyed by language. This performs one HTTP download per entry in pdf_urls.
texts_by_language = {
    language: split_text(download_and_extract_text(pdf_url))
    for language, pdf_url in pdf_urls.items()
}

In [6]:
# Rows are aligned purely by position: row i holds the i-th segment of every
# language, with '' filling in where a language has fewer segments than the
# longest one.
max_length = max(map(len, texts_by_language.values()))
data = [
    {
        lang: (texts[i] if i < len(texts) else '')
        for lang, texts in texts_by_language.items()
    }
    for i in range(max_length)
]

In [7]:
# Each dict in `data` becomes one row; its keys (the language names) become the columns.
df = pd.DataFrame(data)

In [8]:
# Show the table as the cell's output.
df

Unnamed: 0,English,Hawaiian,Marshallese,Chuukese,Pohnpeian,Tongan,Kosraean
0,WEST MAUI HEALTH SERVICES\nLocal Clinic Services,WEST MAUI HEALTH SERVICES \nNā Hana Lawelawe o...,JIBAÑ IN EJMOUR KO ILO RĀK IN MAUI,WEST MAUI ANGANGEN \nANINISIN SAFEI KENA,SAWAS SANG WEST MAUI HEALTH \nSawas en Local C...,WEST MAUI HEALTH SERVICES \nNgaahi ngāue ‘o \...,MWE KAHSRUH SIN WEST MAUI HEALTH
1,Lāhainā Comprehensive Health Center,Lāhainā Comprehensive Health Center,Jibañ ko ilo Jikin Taktõ ko \nRedik ilo Jukjuk...,Angangen Aninisin Nenien \nSafei Kena Non Neni...,Lāhainā Comprehensive Health Center,Lāhainā Comprehensive Health Center,Mwe Kahsruh Ke Local Clinic
2,Medical Services,Nā Hana Lawelawe Olakino,Lāhainā Comprehensive Health Center,Lāhainā Comprehensive Health Center,Sawas en Wini kan,Ngaahi Ngāue \nFakafaito’ó,Lāhainā Comprehensive Health Center
3,Behavioral Health,Kūlana No‘ono‘o i ka Hanah,Jibañ in Ejmour,Aninisin Safei kena,Awa kan: Ni Ed lel Ni Alem,Tokangaekina ‘o e \nTō’onga Mo’uí,Mwe Kahsruh luhn Ono
4,Hours: Monday - Friday \n9:00 AM – 4:00 PM\nCo...,Nā Hola: Pō‘akahi – Pō‘alima,Nañinmij ko ilo Manit,Semwenin Napenap,9:00 Ni Menseng lel 4:00 Ni Souwas,‘Ū Houá: Mōnite - Falaite \n9:00 AM – 4:00 PM...,Ao: Monday nuhke Friday
5,Appointments Recommended,9:00 AM – 4:00 PM,Awa ko: Mande - Bolaide,Awan angang kena: Sarinfan - Animu \n9:00 AM –...,Eker Nempe wet: (808) 871-7772,‘Ū Houá: Mōnite - Sāpate \n9:00 AM – 4:00 PM \...,9:00 Litutacng nuhke 4:00 Infulwacn lwen \nPah...
6,"Insurance Accepted: Medicare,\nMedicaid & majo...",Kelepona: (808) 871-7772,Awa ko: Mande - Jabot,Ra kan Peseieno Appointment kena,Anahne Koasoanehdi Rahn en Kohdo,Ngaahi ‘Apoinimeni ‘Oku Fokotu’u Atú,Ahkkweyeyck in Oraclah Appointment
7,Services,Nā Hola: Pō‘akahi – Lāpule,9:00 Awa Jibbon – 4:00 Awa Raelap,"Ra Asepa Insurens Medicare, Medicaid \n& watte...","Se Ale Insurance: Medicare, Medicaid",Talitali Lelei ‘a e A’utonu Mai Ta’e’apoinimení,"Kuht Eis Insurance: Medicare, Medicaid ac"
8,Adult Medicine: Monday-Friday\nPediatrics: Mon...,9:00 AM – 4:00 PM,9:00 Awa jibbon – 4:00 Awa Raelap,Angagen aninis kena,oh insurance laud teikan (ahpw kaidehn,"Tali ‘a e Malu’i Mo’ui: Medicare, Medicaid \n&...",oakwuck in insurance sahyac (sahyacn
9,Hours: Monday - Sunday\n9:00 AM – 4:00 PM\nCon...,Kelepona: (808) 495-5113,Nomba eo ñan Kebaak: (808) 871-7772, Safean Aramas Nap: Sarinfan-Animu \n Tumu...,"Kaiser, Humana oh Tricare)",Ngaahi Ngāué,"Kaiser, Humana ac Tricare)"


In [15]:
# Alternative construction: one row per language (orient='index'), then flip
# so that languages become columns. Lists of unequal length are padded with
# missing values here rather than with ''.
# NOTE(review): this rebinds `df`, replacing the frame built from `data`.
# NOTE(review): the recorded output for this cell shows only English/Hawaiian
# columns from a different document than pdf_urls lists, so it appears to come
# from an earlier kernel state -- re-run from a fresh kernel to confirm.
df = pd.DataFrame.from_dict(texts_by_language, orient='index').T

In [16]:
# Show the rebuilt table as the cell's output.
df

Unnamed: 0,English,Hawaiian
0,FREE* Mental Health Services \nand Support Gr...,MANUAHI* Ke Kōkua no ka Pō‘ino o ka No‘ono‘o \...
1,Wailuku Health Center \n Crisis Menta...,Wailuku Health Center \nKōkua Pō‘ino o ka No‘o...
2,• For those experiencing distress as a result,Lāhaina Health Center \nKōkua Pō‘ino o ka No‘o...
3,of the Maui wildfires,•
4,"• Mental health counseling, trauma",• No ka po‘e i lo‘ohia i ka pilihua o ka mana‘...
5,"processing, psychiatric services, resources",•\n•\n•\n• MauiWellness@doh.hawaii.gov
6,• Mon-Fri 7:45am - 4:30pm \n• 121 Mahalani ...,• Nā hana lawelawe pōpilikia o ka mana‘o\nno n...
7,(808) 984-2150,•\n•\n•\n•\n•
8,Lahaina Health Center \nCrisis Mental Health S...,Maui Child & Adolescent
9,• Adult and youth behavioral health services ...,Māhele Pōpilikia o ka Mana‘o \n(Mental Health ...


In [9]:
# Write whichever frame `df` currently refers to; index=False leaves the row
# index out of the CSV.
df.to_csv('mappings2.csv', index=False)