# Parsing AHCM-HRSN screening PDFs

I first tried several extractors (PyMuPDF, PyPDF2 and pdfminer), but none of them produced the exact extractions needed. I then tried two methods: pdfplumber and Tesseract OCR (PDFs converted to images, with the text then extracted from the images). So far pdfplumber has been the favorite.

In [None]:
pip install pymupdf pandas

In [None]:
!pip install pdf2image pytesseract pandas
!apt-get install -y poppler-utils  # Required for pdf2image

In [None]:
!apt-get update
!apt-get install -y tesseract-ocr

In [None]:
!pip install pytesseract
!pip install pdf2image
!pip install pillow

In [None]:
import pytesseract

# Set the correct Tesseract path
# Points pytesseract at the tesseract binary; the path is hard-coded.
# NOTE(review): presumably where the apt-get install above places the binary — confirm on other hosts.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Extraction using Tesseract OCR

## 1. Extraction from a single PDF

In [None]:
import pytesseract
import pandas as pd
import re
from pdf2image import convert_from_path

In [None]:
# Single sample PDF used by the one-file extraction cells below.
pdf_path = "/content/drive/MyDrive/ahcm_redacted/redacted_01cb6dae438c.pdf"

In [None]:
# Unwanted section headers and instruction blocks.
# clean_text() deletes each entry from the text with a plain, case-sensitive
# str.replace, in list order, so an entry only matches when the extracted text
# reproduces it exactly (including punctuation and curly quotes).
UNWANTED_TEXTS = [
    # Section headers
    "Living Situation", "Food", "Transportation", "Utilities", "Safety",
    "Financial Strain", "Employment", "Family and Community Support", "Education",
    "Physical Activity", "Substance Use", "Mental Health", "Disabilities",
    # Instruction / scoring sentences
    # NOTE(review): "the apply" may be a typo for "that apply" — confirm against the extracted text.
    "Choose all the apply",
    "Please answer whether the statements were OFTEN, SOMETIMES, or NEVER true for you and your household in the last 12 months.",
    "Calculate [“number of days” selected] x [“number of minutes” selected] = [number of minutes of exercise per week] 2. Apply the right age threshold: Under 6 years old: You can’t find the physical activity need for people under 6. Age 6 to 17: Less than an average of 60 minutes a day shows an HRSN. Age 18 or older: Less than 150 minutes a week shows an HRSN.",
    "Some people have made the following statements about their food situation",
    "Because violence and abuse happens to a lot of people and affects their health",
    "For example, starting or completing job training or getting a high school diploma, GED or equivalent.",
    "Point Total:()", "when the numerical values for answers to questions 3-10 are added shows that the person might not be safe.",
    "A score of 11 or more", "Follow these 2 steps to decide",
    "The next questions relate to your experience with alcohol, cigarettes, and other drugs",
    "If you get 3 or more when you add the answers to questions 23a and 23b",
    "One drink is 12 ounces of beer, 5 ounces of wine, or 1.5 ounces of 80-proof spirits."
]

In [None]:
# Convert PDF pages to images and extract text using OCR.

def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF and return the combined text.

    The PDF is rendered to images with ``convert_from_path`` and each image is
    passed to ``pytesseract.image_to_string`` (English). Page texts are joined
    with a newline.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The OCR text of all pages, or ``None`` if anything raised while
        processing (the error is printed, not re-raised).
    """
    try:
        page_images = convert_from_path(pdf_path)  # one image per PDF page
        combined = "\n".join(
            pytesseract.image_to_string(page_image, lang="eng")
            for page_image in page_images
        )
    except Exception as err:
        # Best-effort: report the failure and let the caller skip this file.
        print(f"Error processing {pdf_path}: {err}")
        return None
    return combined

In [None]:
# Clean extracted text by removing unwanted symbols, spaces, and instructional texts.

def clean_text(text):
    """Strip noise characters, page footers and boilerplate phrases from text.

    Steps, in order: remove the characters ``* + » ~ —``; replace every run of
    two or more whitespace characters with one space (a single newline is left
    untouched); remove "Powered by Kipu Systems Page N of M" footers; delete
    every entry of ``UNWANTED_TEXTS``; trim both ends.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned string.
    """
    regex_steps = (
        (r"[\*+»~—]", ""),
        (r"\s{2,}", " "),
        (r"Powered by Kipu Systems Page \d+ of \d+", ""),
    )
    for pattern, replacement in regex_steps:
        text = re.sub(pattern, replacement, text)

    # Phrase removal is exact-match and happens after whitespace collapsing.
    for phrase in UNWANTED_TEXTS:
        text = text.replace(phrase, "")

    return text.strip()

In [None]:
def extract_questions_answers(text):
    """Parse numbered questions and their answers out of extracted PDF text.

    Parsing starts at "1. What is your living situation today?" when that
    marker is present and stops at the first question numbered above 26.
    A question is ``<number>. <text up to the first '?'>``; its answer is
    everything up to the next line that starts with ``<number>.`` (or the end
    of the text). Both parts are passed through ``clean_text``.

    Question 23 carries two lettered sub-questions (``a.`` / ``b.``). When
    they are found, two rows are emitted instead of one, each labelled with
    the parent question followed by the sub-question.

    Args:
        text: Full text extracted from one PDF.

    Returns:
        pandas.DataFrame with columns ``Question`` and ``Answer``, one row
        per parsed (sub-)question.
    """
    # Starting from the first valid question
    start_section = "1. What is your living situation today?"
    if start_section in text:
        text = start_section + "\n" + text.split(start_section, 1)[1]

    question_pattern = re.compile(r"(\d+)\.\s(.*?\?)\s*(.*?)(?=\n\d+\.|\Z)", re.DOTALL)

    # Splits "a. <sub-question>? <answer> b. <sub-question>? <answer>".
    # The previous pattern ended in a lazy group with nothing after it, which
    # always matched the empty string, so the answer to (b) was lost. It also
    # required a newline before "b.", which fails once whitespace has been
    # collapsed; any whitespace is accepted here.
    sub_question_pattern = re.compile(
        r"\ba\.\s*(.*?)\?\s*(.*?)\s+b\.\s*(.*?)\?\s*(.*)", re.DOTALL
    )

    questions = []
    answers = []

    for match in question_pattern.finditer(text):
        q_number, question, answer = match.groups()

        if int(q_number) > 26:
            break  # Stop at question 26

        question = clean_text(question.strip())
        answer = clean_text(answer.strip())

        # Handling Question 23 sub-questions correctly
        if q_number == "23":
            sub_match = sub_question_pattern.search(answer)
            if sub_match:
                a_text, a_answer, b_text, b_answer = sub_match.groups()
                questions.append(f"{question} {a_text}?")
                answers.append(clean_text(a_answer))
                questions.append(f"{question} {b_text}?")
                answers.append(clean_text(b_answer))
                continue
            # No a./b. structure found: fall through and keep Q23 as one row.

        questions.append(question)
        answers.append(answer)

    return pd.DataFrame({"Question": questions, "Answer": answers})

In [None]:
# Run the extraction on the single sample PDF. df is only assigned when the
# extractor returned non-empty text (it returns None on failure).
text = extract_text_from_pdf(pdf_path)
if text:
    df = extract_questions_answers(text)

In [None]:
df

# Extraction using PDFPlumber

## 2. Extraction from a single PDF

In [None]:
pip install pdfplumber

In [None]:
import pdfplumber

In [None]:
# Extract text from PDF using pdfplumber

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The text of all pages that yielded text, each followed by a newline,
        with leading/trailing whitespace stripped; ``None`` if anything
        raised while processing (the error is printed, not re-raised).
    """
    try:
        page_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # A page without extractable text yields an empty/None result;
                # concatenating that unguarded raised and discarded the whole
                # PDF, so such pages are skipped instead.
                if page_text:
                    page_texts.append(page_text + "\n")
        return "".join(page_texts).strip()
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this file.
        print(f"Error processing {pdf_path}: {e}")
        return None

In [None]:
# Re-run the single-PDF extraction; assuming cells are run in order,
# extract_text_from_pdf is now the pdfplumber version defined above.
# df is only reassigned when non-empty text came back.
text = extract_text_from_pdf(pdf_path)
if text:
    df = extract_questions_answers(text)

In [None]:
df

### Tried manually entering the 26 questions to create the columns, plus a PII column extracted from the filename to identify the patient.



In [None]:
import os

In [None]:
# Folder scanned for *.pdf files by the batch-processing functions below.
pdf_folder = "/content/drive/MyDrive/ahcm_redacted"

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract all page text from a PDF and report whether question 1 is present.

    Pages that yield no text are skipped. A status line naming the file is
    printed depending on whether "1. What is your living situation today?"
    occurs in the extracted text.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The concatenated page text (stripped), or ``None`` if anything raised
        while processing (the error is printed, not re-raised).
    """
    first_question = "1. What is your living situation today?"
    try:
        with pdfplumber.open(pdf_path) as pdf:
            raw_pages = [page.extract_text() for page in pdf.pages]
        text = "".join(chunk + "\n" for chunk in raw_pages if chunk)

        # Check if questions exist
        file_name = os.path.basename(pdf_path)
        if first_question in text:
            print(f"Found Q&A section in {file_name}")
        else:
            print(f"No Q&A found in {file_name}. The questions might be on later pages.")

        return text.strip()
    except Exception as err:
        print(f"Error processing {pdf_path}: {err}")
        return None

In [None]:
def clean_text(text):
    """Strip noise characters, page footers and boilerplate phrases from text.

    Steps, in order: remove the characters ``* + » ~ —``; collapse every run
    of whitespace (newlines included) to one space; remove
    "Powered by Kipu Systems Page N of M" footers; delete every entry of
    ``UNWANTED_TEXTS``; trim both ends.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned string.
    """
    without_symbols = re.sub(r"[\*+»~—]", "", text)
    single_spaced = re.sub(r"(\s)+", " ", without_symbols)
    cleaned = re.sub(r"Powered by Kipu Systems Page \d+ of \d+", "", single_spaced)

    # Exact-match removal of headers/instructions, in list order.
    for phrase in UNWANTED_TEXTS:
        cleaned = cleaned.replace(phrase, "")

    return cleaned.strip()

In [None]:
def extract_questions_answers(text):
    """Map each expected question to the answer found in the extracted text.

    Args:
        text: Full text extracted from one PDF.

    Returns:
        A dict with one key per entry of ``QUESTION_COLUMNS`` (value ``None``
        where no answer was matched), or ``None`` when the text does not
        contain "1. What is your living situation today?".
    """
    start_section = "1. What is your living situation today?"
    if start_section not in text:
        print("No valid Q&A section found, skipping this PDF.")
        return None

    # Drop everything that precedes the first question.
    text = start_section + "\n" + text.split(start_section, 1)[1]

    # An answer runs up to the next line starting with "<number>." or to the
    # end of the text. The terminator is a pure lookahead: the previous form
    # demanded a literal newline before end-of-text, which never exists when
    # the input has been stripped, so the final question was silently dropped.
    question_pattern = re.compile(r"(\d+)\.\s(.*?)\?(.*?)(?=\n\d+\.|\Z)", re.DOTALL)

    # NOTE(review): QUESTION_COLUMNS is not defined in the visible part of this
    # file. The pattern captures the question WITHOUT its trailing "?", so the
    # keys presumably omit it too — confirm.
    qa_dict = {question: None for question in QUESTION_COLUMNS}

    for match in question_pattern.finditer(text):
        _, question, answer = match.groups()
        question = question.strip()

        # Questions that are not expected columns are ignored.
        if question in qa_dict:
            qa_dict[question] = answer.strip()

    return qa_dict

In [None]:
def extract_pii_from_filename(filename):
    """Return the token embedded in a ``redacted_<token>.pdf`` filename.

    Args:
        filename: File name (or path) to inspect.

    Returns:
        The ``<token>`` part (word characters only), or ``None`` when the
        name does not contain the ``redacted_<token>.pdf`` pattern.
    """
    found = re.search(r"redacted_(\w+)\.pdf", filename)
    if found is None:
        return None
    return found.group(1)

In [None]:
def process_all_pdfs(pdf_folder):
    """Extract the question/answer dict of every PDF in a folder into one table.

    Args:
        pdf_folder: Directory whose ``*.pdf`` files are processed.

    Returns:
        pandas.DataFrame with a ``PII`` column (token taken from the file
        name) followed by ``QUESTION_COLUMNS``, one row per PDF that yielded
        a Q&A section.
    """
    data = []

    for filename in os.listdir(pdf_folder):
        if not filename.endswith(".pdf"):
            continue

        text = extract_text_from_pdf(os.path.join(pdf_folder, filename))
        if not text:
            continue

        answers_dict = extract_questions_answers(text)
        # extract_questions_answers returns None when the PDF has no Q&A
        # section; indexing into that result raised TypeError and aborted the
        # whole batch, so such files are skipped instead.
        if answers_dict is None:
            continue

        answers_dict["PII"] = extract_pii_from_filename(filename)
        data.append(answers_dict)

    return pd.DataFrame(data, columns=["PII"] + QUESTION_COLUMNS)

In [None]:
df

In [None]:
# Batch-process every PDF in the folder into one DataFrame.
df = process_all_pdfs(pdf_folder)

# Debug aid: re-extract one PDF and print its raw text and parsed result.
# NOTE(review): pdf_path is the module-level single-PDF path assigned earlier,
# not a file selected by the batch run above.
text = extract_text_from_pdf(pdf_path)

if text:
    print(f"📝 Extracted Q&A Text for {pdf_path}:\n{text[:1000]}...\n")  # Printing first 1000 characters
    extracted_data = extract_questions_answers(text)
    print(extracted_data)

The data extraction is successful, but I am still fixing the DataFrame issue: debugging shows that each PDF's data is stored as a dictionary, and only the values from that dictionary's key:value pairs should be stored in the DataFrame.

In [None]:
extracted_data

In [None]:
text

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of one PDF, or ``None`` on failure.

    Pages without text are skipped. A status line is printed saying whether
    "1. What is your living situation today?" occurs in the extracted text.
    Any exception is printed and converted into a ``None`` return.
    """
    try:
        pieces = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                content = page.extract_text()
                if not content:
                    continue
                pieces.append(content + "\n")
        text = "".join(pieces)

        base_name = os.path.basename(pdf_path)
        status = (
            f"Found Q&A section in {base_name}"
            if "1. What is your living situation today?" in text
            else f"No Q&A found in {base_name}. It might be on later pages."
        )
        print(status)

        return text.strip()
    except Exception as err:
        print(f" Error processing {pdf_path}: {err}")
        return None

def extract_text_from_folder(pdf_folder):
    """Extract text from every ``*.pdf`` file directly inside a folder.

    Args:
        pdf_folder: Directory to scan (not recursive).

    Returns:
        Dict mapping PDF file name to its extracted text. Files for which
        ``extract_text_from_pdf`` returned nothing are left out.
    """
    texts_by_name = {}

    for entry in os.listdir(pdf_folder):
        if not entry.endswith(".pdf"):
            continue
        content = extract_text_from_pdf(os.path.join(pdf_folder, entry))
        if content:
            texts_by_name[entry] = content

    return texts_by_name

# Extract the raw text of every PDF in the folder and report how many
# files produced text.
all_pdfs_text = extract_text_from_folder(pdf_folder)

print(f"\n Extracted text from {len(all_pdfs_text)} PDFs.")

In [None]:
all_pdfs_text

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of one PDF, or ``None`` on failure.

    Pages without text are skipped; each remaining page is followed by a
    newline and the result is stripped. Any exception is printed and
    converted into a ``None`` return.
    """
    try:
        with pdfplumber.open(pdf_path) as document:
            raw_pages = [page.extract_text() for page in document.pages]
        return "".join(chunk + "\n" for chunk in raw_pages if chunk).strip()
    except Exception as err:
        print(f"⚠️ Error processing {pdf_path}: {err}")
        return None

def extract_qna_from_text(text):
    """Build a ``{question: answer}`` dict from extracted text.

    A question is ``<number>. <text up to the first '?'>`` on a single line.
    Its answer is the rest of that line or, when the question ends the line,
    the following non-blank line. An empty answer is stored as ``None``. If a
    question string occurs more than once, the last answer wins.
    """
    # Define question pattern (modify regex if needed)
    numbered_question = re.compile(r"(\d+\.\s.*?\?)\s*(.*)")

    return {
        raw_question.strip(): (raw_answer.strip() if raw_answer else None)
        for raw_question, raw_answer in numbered_question.findall(text)
    }

def process_all_pdfs(pdf_folder):
    """Extract Q&A dicts from every ``*.pdf`` file directly inside a folder.

    Args:
        pdf_folder: Directory to scan (not recursive).

    Returns:
        List of dicts, one per PDF that yielded at least one Q&A pair; each
        dict also carries the source file name under ``"Filename"``.
    """
    records = []

    for name in os.listdir(pdf_folder):
        if not name.endswith(".pdf"):  # Ensure it's a PDF
            continue

        raw_text = extract_text_from_pdf(os.path.join(pdf_folder, name))
        if not raw_text:  # Only process if text was extracted
            continue

        print(f"✅ Processing {name}")
        pairs = extract_qna_from_text(raw_text)

        if not pairs:  # Ensure we got valid Q&A pairs
            print(f"⚠️ No Q&A found in {name}")
            continue

        pairs["Filename"] = name  # Add filename for reference
        records.append(pairs)

    return records

# 📂 Set your folder path
pdf_folder = "/content/drive/MyDrive/ahcm_redacted/"

# 🔄 Process all PDFs
qna_data_list = process_all_pdfs(pdf_folder)

# 📊 Convert to DataFrame
# Each dict becomes one row; columns are the question strings found across
# the PDFs plus "Filename".
df = pd.DataFrame(qna_data_list)

# 🛠 Fill missing values with 'N/A' for consistency
# Covers both answers stored as None and questions absent from a given PDF.
df.fillna("N/A", inplace=True)



In [None]:
df

In [None]:
# Section headers and instruction/scoring sentences to strip from extracted
# text. Entries are matched exactly (case-sensitive), so they must reproduce
# the extracted wording character for character.
UNWANTED_TEXTS = [
    # Section headers
    "Living Situation", "Food", "Transportation", "Utilities", "Safety",
    "Financial Strain", "Employment", "Family and Community Support", "Education",
    "Physical Activity", "Substance Use", "Mental Health", "Disabilities",
    # Instruction / scoring sentences
    # NOTE(review): "the apply" may be a typo for "that apply" — confirm against the extracted text.
    "Choose all the apply",
    "Please answer whether the statements were OFTEN, SOMETIMES, or NEVER true for you and your household in the last 12 months.",
    "Calculate [“number of days” selected] x [“number of minutes” selected] = [number of minutes of exercise per week] 2. Apply the right age threshold: Under 6 years old: You can’t find the physical activity need for people under 6. Age 6 to 17: Less than an average of 60 minutes a day shows an HRSN. Age 18 or older: Less than 150 minutes a week shows an HRSN.",
    "Some people have made the following statements about their food situation",
    "Because violence and abuse happens to a lot of people and affects their health",
    "For example, starting or completing job training or getting a high school diploma, GED or equivalent.",
    "Point Total:()", "when the numerical values for answers to questions 3-10 are added shows that the person might not be safe.",
    "A score of 11 or more", "Follow these 2 steps to decide",
    "The next questions relate to your experience with alcohol, cigarettes, and other drugs",
    "If you get 3 or more when you add the answers to questions 23a and 23b",
    "One drink is 12 ounces of beer, 5 ounces of wine, or 1.5 ounces of 80-proof spirits."
]


def clean_text(text):
    """Strip noise characters, page footers and boilerplate phrases from text.

    Steps, in order: remove the characters ``* + » ~ —``; collapse every run
    of whitespace to one space; remove "Powered by Kipu Systems Page N of M"
    footers; delete every entry of ``UNWANTED_TEXTS``; trim both ends.

    NOTE(review): a second ``clean_text`` is defined further down in this
    cell and replaces this one once the cell has run — confirm which is meant.
    """
    text = re.sub(
        r"Powered by Kipu Systems Page \d+ of \d+",  # Remove page numbers
        "",
        re.sub(
            r"(\s)+",  # Normalize spaces
            " ",
            re.sub(r"[\*+»~—]", "", text),  # Remove special characters
        ),
    )

    for phrase in UNWANTED_TEXTS:
        text = text.replace(phrase, "")

    return text.strip()

def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of one PDF, or ``None`` on failure.

    Pages without text are skipped. A status line is printed saying whether
    "1. What is your living situation today?" occurs in the extracted text.
    Any exception is printed and converted into a ``None`` return.
    """
    marker = "1. What is your living situation today?"
    try:
        with pdfplumber.open(pdf_path) as pdf:
            raw_pages = [page.extract_text() for page in pdf.pages]
        text = "".join(chunk + "\n" for chunk in raw_pages if chunk)

        file_name = os.path.basename(pdf_path)
        if marker in text:
            print(f"Found Q&A section in {file_name}")
        else:
            print(f"No Q&A found in {file_name}. It might be on later pages.")

        return text.strip()
    except Exception as err:
        print(f"Error processing {pdf_path}: {err}")
        return None

def extract_text_from_folder(pdf_folder):
    """Extract text from every ``*.pdf`` file directly inside a folder.

    Returns a dict of ``{pdf_filename: extracted_text}``; files for which
    ``extract_text_from_pdf`` returned nothing are omitted.
    """
    pdf_names = [name for name in os.listdir(pdf_folder) if name.endswith(".pdf")]

    collected = {}
    for name in pdf_names:
        body = extract_text_from_pdf(os.path.join(pdf_folder, name))
        if not body:
            continue
        collected[name] = body

    return collected

def clean_text(text):
    """Collapse every run of whitespace to a single space and trim both ends.

    NOTE(review): this definition follows another ``clean_text`` in the same
    cell and therefore replaces it; unlike that one it does not remove
    ``UNWANTED_TEXTS`` entries or page footers — confirm this is intended.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

def extract_questions_answers(text):
    """Extract numbered questions and their answers from extracted PDF text.

    Parsing starts at "1. What is your living situation today?" when that
    marker is present and stops at the first question numbered above 26.
    A question is ``<number>. <text up to the first '?'>``; its answer is
    everything up to the next line that starts with ``<number>.`` (or the end
    of the text). Both parts are passed through ``clean_text``.

    Question 23 carries two lettered sub-questions (``a.`` / ``b.``). When
    they are found, two rows are emitted instead of one, each labelled with
    the parent question followed by the sub-question.

    Args:
        text: Full text extracted from one PDF.

    Returns:
        pandas.DataFrame with columns ``Question`` and ``Answer``, one row
        per parsed (sub-)question.
    """
    # Starting from the first valid question
    start_section = "1. What is your living situation today?"
    if start_section in text:
        text = start_section + "\n" + text.split(start_section, 1)[1]

    question_pattern = re.compile(r"(\d+)\.\s(.*?\?)\s*(.*?)(?=\n\d+\.|\Z)", re.DOTALL)

    # Splits "a. <sub-question>? <answer> b. <sub-question>? <answer>".
    # Two defects of the previous pattern are fixed here:
    #   * it required a newline before "b.", but the answer has already been
    #     through clean_text, which may have collapsed newlines to spaces, so
    #     the split never happened; any whitespace is accepted now;
    #   * it ended in a lazy group with nothing after it, which always matched
    #     the empty string, so the answer to (b) was lost.
    sub_question_pattern = re.compile(
        r"\ba\.\s*(.*?)\?\s*(.*?)\s+b\.\s*(.*?)\?\s*(.*)", re.DOTALL
    )

    questions = []
    answers = []

    for match in question_pattern.finditer(text):
        q_number, question, answer = match.groups()

        if int(q_number) > 26:
            break  # Stop at question 26

        question = clean_text(question.strip())
        answer = clean_text(answer.strip())

        # Handling Question 23 sub-questions correctly
        if q_number == "23":
            sub_match = sub_question_pattern.search(answer)
            if sub_match:
                a_text, a_answer, b_text, b_answer = sub_match.groups()
                questions.append(f"{question} {a_text}?")
                answers.append(clean_text(a_answer))
                questions.append(f"{question} {b_text}?")
                answers.append(clean_text(b_answer))
                continue
            # No a./b. structure found: fall through and keep Q23 as one row.

        questions.append(question)
        answers.append(answer)

    return pd.DataFrame({"Question": questions, "Answer": answers})

def process_pdfs_in_folder(pdf_folder):
    """Extract a Q&A DataFrame for every PDF in a folder.

    Args:
        pdf_folder: Directory passed to ``extract_text_from_folder``.

    Returns:
        Dict mapping PDF file name to the DataFrame produced by
        ``extract_questions_answers`` for that file's text.
    """
    def parse_one(name, raw_text):
        # Announce the file, then parse it.
        print(f"Extracting Q&A from {name}...")
        return extract_questions_answers(raw_text)

    texts_by_file = extract_text_from_folder(pdf_folder)
    return {
        name: parse_one(name, raw_text)
        for name, raw_text in texts_by_file.items()
    }



# Parse every PDF in the folder into its own Question/Answer DataFrame.
pdf_folder = "/content/drive/MyDrive/ahcm_redacted/"
all_pdf_data = process_pdfs_in_folder(pdf_folder)


# Preview the first rows of each PDF's result.
for pdf_filename, df in all_pdf_data.items():
    print(f"\nExtracted Q&A from {pdf_filename}:")
    print(df.head())



In [None]:
print(all_pdf_data)

------


In [None]:
# Section headers and instruction/scoring sentences to strip from extracted
# text. Entries are matched exactly (case-sensitive), so they must reproduce
# the extracted wording character for character.
UNWANTED_TEXTS = [
    # Section headers
    "Living Situation", "Food", "Transportation", "Utilities", "Safety",
    "Financial Strain", "Employment", "Family and Community Support", "Education",
    "Physical Activity", "Substance Use", "Mental Health", "Disabilities",
    # Instruction / scoring sentences
    # NOTE(review): "the apply" may be a typo for "that apply" — confirm against the extracted text.
    "Choose all the apply",
    "Please answer whether the statements were OFTEN, SOMETIMES, or NEVER true for you and your household in the last 12 months.",
    "Calculate [“number of days” selected] x [“number of minutes” selected] = [number of minutes of exercise per week] 2. Apply the right age threshold: Under 6 years old: You can’t find the physical activity need for people under 6. Age 6 to 17: Less than an average of 60 minutes a day shows an HRSN. Age 18 or older: Less than 150 minutes a week shows an HRSN.",
    "Some people have made the following statements about their food situation",
    "Because violence and abuse happens to a lot of people and affects their health",
    "For example, starting or completing job training or getting a high school diploma, GED or equivalent.",
    "Point Total:()", "when the numerical values for answers to questions 3-10 are added shows that the person might not be safe.",
    "A score of 11 or more", "Follow these 2 steps to decide",
    "The next questions relate to your experience with alcohol, cigarettes, and other drugs",
    "If you get 3 or more when you add the answers to questions 23a and 23b",
    "One drink is 12 ounces of beer, 5 ounces of wine, or 1.5 ounces of 80-proof spirits."
]

def clean_text(text):
    """Strip noise characters, page footers and boilerplate phrases from text.

    Steps, in order: remove the characters ``* + » ~ —``; collapse every run
    of whitespace to one space; remove "Powered by Kipu Systems Page N of M"
    footers; delete every entry of ``UNWANTED_TEXTS``; trim both ends.
    Phrase removal happens after whitespace collapsing.
    """
    regex_steps = (
        (r"[\*+»~—]", ""),
        (r"(\s)+", " "),
        (r"Powered by Kipu Systems Page \d+ of \d+", ""),
    )
    for pattern, replacement in regex_steps:
        text = re.sub(pattern, replacement, text)

    for phrase in UNWANTED_TEXTS:
        text = text.replace(phrase, "")

    return text.strip()


# Flatten the per-PDF Question/Answer frames into one wide table:
# one row per PDF, one column per question.
all_responses = []


for pdf_filename, df in all_pdf_data.items():
    cleaned_questions = df['Question'].apply(clean_text).tolist()
    cleaned_answers = df['Answer'].apply(clean_text).tolist()
    # Only the answers are stored; pdf_filename is not kept in the row.
    all_responses.append(cleaned_answers)

# NOTE(review): cleaned_questions holds whatever the LAST loop iteration
# produced, so the column names come from the last PDF and answers are aligned
# by position only. If all_pdf_data is empty the name is never assigned in
# this block. Confirm every PDF yields the same questions in the same order.
final_df = pd.DataFrame(all_responses, columns=cleaned_questions)
print(final_df)


In [None]:
final_df

In [None]:
# Write the combined table to a CSV file (relative path), without the index column.
final_df.to_csv("ahcm_output_data.csv", index=False)