In [1]:
import io
import requests
from pdfminer.high_level import extract_text
import pandas as pd
from typing import List

In [2]:
def download_and_extract_text(url: str) -> str:
    """Download the PDF at ``url`` and return the text extracted from it.

    The response body is kept in memory and handed to pdfminer's
    ``extract_text``; nothing is written to disk.

    Args:
        url: Address of the PDF document to fetch.

    Returns:
        The text pdfminer extracts from the downloaded document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server, which would hang the notebook cell.
    response = requests.get(url, timeout=30)
    # Fail here on an error status instead of handing an error page to the
    # PDF parser, where the failure would be much harder to diagnose.
    response.raise_for_status()
    with io.BytesIO(response.content) as open_pdf_file:
        return extract_text(open_pdf_file)

In [10]:
def split_text(text: str, by_paragraph=True) -> List[str]:
    """Break ``text`` into paragraph or sentence chunks.

    Args:
        text: The text to split.
        by_paragraph: When true (the default), split on blank lines
            (two consecutive newlines). Otherwise split into sentences.

    Returns:
        The chunks in their original order. Chunks are not stripped or
        filtered, so empty strings may be present.
    """
    if not by_paragraph:
        # Naive sentence segmentation: join the lines with spaces, then cut at
        # every period followed by a space. The period is consumed by the
        # split, and abbreviations or other terminators are not handled.
        single_line = text.replace('\n', ' ')
        return single_line.split('. ')

    # A paragraph boundary is a double newline.
    return text.split('\n\n')

In [4]:
# Maps each language name to the URL of that language's fact-sheet PDF.
# Insertion order here becomes the iteration order of the mapping.
pdf_urls = dict([
    ("English", "https://health.hawaii.gov/mauiwildfires/files/2023/08/DOH-Fact-Sheet-Lahaina-CHC.pdf"),
    ("Hawaiian", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_HAW.pdf"),
    ("Marshallese", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_MH.pdf"),
    ("Chuukese", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_CHK.pdf"),
    ("Pohnpeian", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_PON.pdf"),
    ("Tongan", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_TO.pdf"),
    ("Kosraean", "https://health.hawaii.gov/mauiwildfires/files/2023/09/DOH-Fact-Sheet-Lahaina-CHC_KOS.pdf"),
])

In [11]:
# For every language, download its PDF, extract the text, and split it into
# chunks using split_text's default mode. Languages are processed in the
# order they appear in pdf_urls.
texts_by_language = dict(
    (language, split_text(download_and_extract_text(pdf_url)))
    for language, pdf_url in pdf_urls.items()
)

In [12]:
# The languages can yield different numbers of chunks, so build one dict per
# row position and pad any language that has run out of chunks with ''.
data = []
max_length = max(map(len, texts_by_language.values()))
data.extend(
    {
        lang: (chunks[position] if position < len(chunks) else '')
        for lang, chunks in texts_by_language.items()
    }
    for position in range(max_length)
)

In [13]:
# Build the table from the list of per-row dicts in `data`; the dict keys
# (language names) become the columns.
df = pd.DataFrame(data)

In [14]:
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,English,Hawaiian
0,FREE* Mental Health Services \nand Support Gr...,MANUAHI* Ke Kōkua no ka Pō‘ino o ka No‘ono‘o \...
1,Wailuku Health Center \n Crisis Menta...,Wailuku Health Center \nKōkua Pō‘ino o ka No‘o...
2,• For those experiencing distress as a result,Lāhaina Health Center \nKōkua Pō‘ino o ka No‘o...
3,of the Maui wildfires,•
4,"• Mental health counseling, trauma",• No ka po‘e i lo‘ohia i ka pilihua o ka mana‘...
5,"processing, psychiatric services, resources",•\n•\n•\n• MauiWellness@doh.hawaii.gov
6,• Mon-Fri 7:45am - 4:30pm \n• 121 Mahalani ...,• Nā hana lawelawe pōpilikia o ka mana‘o\nno n...
7,(808) 984-2150,•\n•\n•\n•\n•
8,Lahaina Health Center \nCrisis Mental Health S...,Maui Child & Adolescent
9,• Adult and youth behavioral health services ...,Māhele Pōpilikia o ka Mana‘o \n(Mental Health ...


In [15]:
# Rebuild the table straight from the per-language lists: orient='index' makes
# each language a row, and the transpose turns the languages back into columns.
# NOTE(review): this rebinds `df`, replacing the frame built from `data`.
# Shorter lists are presumably filled with missing values here rather than ''
# — confirm before relying on either form downstream.
df = pd.DataFrame.from_dict(texts_by_language, orient='index').transpose()

In [16]:
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,English,Hawaiian
0,FREE* Mental Health Services \nand Support Gr...,MANUAHI* Ke Kōkua no ka Pō‘ino o ka No‘ono‘o \...
1,Wailuku Health Center \n Crisis Menta...,Wailuku Health Center \nKōkua Pō‘ino o ka No‘o...
2,• For those experiencing distress as a result,Lāhaina Health Center \nKōkua Pō‘ino o ka No‘o...
3,of the Maui wildfires,•
4,"• Mental health counseling, trauma",• No ka po‘e i lo‘ohia i ka pilihua o ka mana‘...
5,"processing, psychiatric services, resources",•\n•\n•\n• MauiWellness@doh.hawaii.gov
6,• Mon-Fri 7:45am - 4:30pm \n• 121 Mahalani ...,• Nā hana lawelawe pōpilikia o ka mana‘o\nno n...
7,(808) 984-2150,•\n•\n•\n•\n•
8,Lahaina Health Center \nCrisis Mental Health S...,Maui Child & Adolescent
9,• Adult and youth behavioral health services ...,Māhele Pōpilikia o ka Mana‘o \n(Mental Health ...


In [17]:
# Write the table to mappings2.csv (relative path, so it lands in the current
# working directory); index=False leaves the row index out of the file.
df.to_csv('mappings2.csv', index=False)