In [1]:
pip install pdfplumber



In [2]:
import pdfplumber
import pandas as pd
import re
from pathlib import Path
import os
import warnings
import logging
warnings.filterwarnings("ignore")
logging.getLogger("pdfminer").setLevel(logging.ERROR)

In [3]:
def extract_pii_from_filename(filename):
    match = re.search(r"redacted_(\w+)\.pdf", filename)
    return match.group(1) if match else None

UNWANTED_TEXTS = [
    "Living Situation", "Food", "Transportation", "Utilities", "Safety",
    "Financial Strain", "Employment", "Family and Community Support", "Education",
    "Physical Activity", "Substance Use", "Mental Health", "Disabilities",
    "Choose all the apply",
    "Please answer whether the statements were OFTEN, SOMETIMES, or NEVER true for you and your household in the last 12 months.",
    "Calculate [“number of days” selected] x [“number of minutes” selected] = [number of minutes of exercise per week] 2. Apply the right age threshold: Under 6 years old: You can’t find the physical activity need for people under 6. Age 6 to 17: Less than an average of 60 minutes a day shows an HRSN. Age 18 or older: Less than 150 minutes a week shows an HRSN.",
    "Some people have made the following statements about their food situation",
    "Because violence and abuse happens to a lot of people and affects their health",
    "For example, starting or completing job training or getting a high school diploma, GED or equivalent.",
    "Point Total:()", "when the numerical values for answers to questions 3-10 are added shows that the person might not be safe.",
    "A score of 11 or more", "Follow these 2 steps to decide",
    "The next questions relate to your experience with alcohol, cigarettes, and other drugs",
    "If you get 3 or more when you add the answers to questions 23a and 23b",
    "One drink is 12 ounces of beer, 5 ounces of wine, or 1.5 ounces of 80-proof spirits."
]


def clean_text(text):
    """Cleans extracted text by removing unwanted characters and phrases."""
    text = re.sub(r"[\*+»~—]", "", text)  # Remove special characters
    text = re.sub(r"(\s)+", " ", text)  # Normalize spaces
    text = re.sub(r"Powered by Kipu Systems Page \d+ of \d+", "", text)  # Remove page numbers

    for unwanted in UNWANTED_TEXTS:
        text = text.replace(unwanted, "")

    return text.strip()

def extract_text_from_pdf(pdf_path):
    """Extracts text from a single PDF file."""
    try:
        text = ""
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"

        if "1. What is your living situation today?" in text:
            print(f"Found Q&A section in {os.path.basename(pdf_path)}")
        else:
            print(f"No Q&A found in {os.path.basename(pdf_path)}. It might be on later pages.")

        return text.strip()
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None

def extract_text_from_folder(pdf_folder):
    """Extracts text from all PDFs in a folder."""
    all_texts = {}  # Dictionary to store PDF filename -> extracted text

    # Loop through all PDFs in the folder
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            extracted_text = extract_text_from_pdf(pdf_path)
            if extracted_text:
                all_texts[pdf_file] = extracted_text

    return all_texts  # Returns a dictionary of {pdf_filename: extracted_text}


def extract_questions_answers(text):
    """Extracts questions and answers from the extracted text."""
    # Starting from the first valid question
    start_section = "1. What is your living situation today?"
    if start_section in text:
        text = text.split(start_section, 1)[1]
        text = start_section + "\n" + text

    question_pattern = re.compile(r"(\d+)\.\s(.*?\?)\s*(.*?)(?=\n\d+\.|\Z)", re.DOTALL)

    questions = []
    answers = []

    for match in question_pattern.finditer(text):
        q_number, question, answer = match.groups()

        if int(q_number) > 26:
            break  # Stop at question 26

        question = clean_text(question.strip())
        answer = clean_text(answer.strip())

        # Handling Question 23 sub-questions correctly
        if q_number == "23":
            sub_questions = re.findall(r"(a\.)\s*(.*?)\?(.*?)\n(b\.)\s*(.*?)\?(.*?)", answer, re.DOTALL)
            if sub_questions:
                for sub_q in sub_questions:
                    questions.append(f"{question} {sub_q[1]}?")
                    answers.append(clean_text(sub_q[2]))

                    questions.append(f"{question} {sub_q[4]}?")
                    answers.append(clean_text(sub_q[5]))
                continue

        questions.append(question)
        answers.append(answer)

    return pd.DataFrame({"Question": questions, "Answer": answers})

def process_pdfs_in_folder(pdf_folder):
    """Processes all PDFs in a folder, extracting questions and answers."""
    all_pdfs_text = extract_text_from_folder(pdf_folder)

    all_results = {}

    for pdf_filename, text in all_pdfs_text.items():
        print(f"Extracting Q&A from {pdf_filename}...")
        df = extract_questions_answers(text)
        all_results[pdf_filename] = df

    return all_results


pdf_folder = "/content/drive/MyDrive/ahcm_redacted/"
all_pdf_data = process_pdfs_in_folder(pdf_folder)


for pdf_filename in all_pdf_data.items():
    print(f"\nExtracted Q&A from {pdf_filename}:")


Found Q&A section in redacted_7917467f9754.pdf
Found Q&A section in redacted_d62c61917e2a.pdf
Found Q&A section in redacted_57c0bd5d694d.pdf
Found Q&A section in redacted_271e019eaca0.pdf
Found Q&A section in redacted_f6dd884b1bac.pdf
Found Q&A section in redacted_98e234ba5e17.pdf
Found Q&A section in redacted_ff837eea2798.pdf
Found Q&A section in redacted_70ea1972e7f9.pdf
Found Q&A section in redacted_5c5158c5a7d2.pdf
Found Q&A section in redacted_6fc48ee38a54.pdf
Found Q&A section in redacted_01cb6dae438c.pdf
Found Q&A section in redacted_a31e7df9c8ed.pdf
Found Q&A section in redacted_6ebe69c5f8a8.pdf
Found Q&A section in redacted_ceaee0523400.pdf
Found Q&A section in redacted_ec9ae72bd102.pdf
Found Q&A section in redacted_9fbaba2f4646.pdf
Found Q&A section in redacted_aca9ae161017.pdf
Found Q&A section in redacted_646e638b344c.pdf
Found Q&A section in redacted_df9d65c8a899.pdf
Found Q&A section in redacted_f9b9fc557c40.pdf
Found Q&A section in redacted_216266668847.pdf
Found Q&A sec

In [4]:
UNWANTED_TEXTS = [
    "Living Situation", "Food", "Transportation", "Utilities", "Safety",
    "Financial Strain", "Employment", "Family and Community Support", "Education",
    "Physical Activity", "Substance Use", "Mental Health", "Disabilities",
    "Choose all the apply",
    "Please answer whether the statements were OFTEN, SOMETIMES, or NEVER true for you and your household in the last 12 months.",
    "Calculate [“number of days” selected] x [“number of minutes” selected] = [number of minutes of exercise per week] 2. Apply the right age threshold: Under 6 years old: You can’t find the physical activity need for people under 6. Age 6 to 17: Less than an average of 60 minutes a day shows an HRSN. Age 18 or older: Less than 150 minutes a week shows an HRSN.",
    "Some people have made the following statements about their food situation",
    "Because violence and abuse happens to a lot of people and affects their health",
    "For example, starting or completing job training or getting a high school diploma, GED or equivalent.",
    "Point Total:()", "when the numerical values for answers to questions 3-10 are added shows that the person might not be safe.",
    "A score of 11 or more", "Follow these 2 steps to decide",
    "The next questions relate to your experience with alcohol, cigarettes, and other drugs",
    "If you get 3 or more when you add the answers to questions 23a and 23b",
    "One drink is 12 ounces of beer, 5 ounces of wine, or 1.5 ounces of 80-proof spirits."
]

def clean_text(text):
    """Cleans extracted text by removing unwanted characters and phrases."""
    text = re.sub(r"[\*+»~—]", "", text)
    text = re.sub(r"(\s)+", " ", text)
    text = re.sub(r"Powered by Kipu Systems Page \d+ of \d+", "", text)

    for unwanted in UNWANTED_TEXTS:
        text = text.replace(unwanted, "")

    return text.strip()



def extract_pii_from_filename(filename):
    match = re.search(r"redacted_(\w+)\.pdf", filename)
    return match.group(1) if match else filename  # fallback to filename if no match

# Extract only cleaned answers and the corresponding pii
rows = []

for pdf_filename, df in all_pdf_data.items():
    pii = extract_pii_from_filename(pdf_filename)
    cleaned_answers = df['Answer'].apply(clean_text).tolist()
    row = [pii] + cleaned_answers
    rows.append(row)

# Assuming all PDFs have the same question order → we use the first one to extract column names
first_pdf = next(iter(all_pdf_data.values()))
cleaned_questions = first_pdf['Question'].apply(clean_text).tolist()
columns = ['PII_ID'] + cleaned_questions

# Final DataFrame
final_df = pd.DataFrame(rows, columns=columns)


In [5]:
final_df

Unnamed: 0,PII_ID,What is your living situation today?,think about the place you live. Do you have problems with any of the following?,"Within the past 12 months, you worried that your food would run out before you got money to buy more. • Never true 4. Within the past 12 months, the food you bought just didn't last and you didn't have money to get more. • Never true 5. In the past 12 months, has lack of reliable transportation kept you from medical appointments, mettings, work or from getting things needed for daily living?","In the past 12 months has the electric, gas, oil, or water company threatened to shut off services in your home?","How often does anyone, including family and friends, physically hurt you?","How often does anyone, including family and friends, insult or talk down to you?","How often does anyone, including family and friends, threaten you with harm?","How often does anyone, including family and friends, scream or curse at you?","How hard is it for you to pay for the very basics like food, housing, medical care, and heating?",...,"In the last 30 days, other than the activities you did for work, on average, how many days per week did you engage in moderate exercise (like walking fast, running, jogging, dancing, swimming, biking, or other similar activities)?","On average, how many minutes did you usually spend exercising at this level on one of those days?","Calculate [“number of days” selected] x [“number of minutes” selected] = [number of minutes of exercise per week] 2. Apply the right age threshold: • Under 6 years old: You can’t find the physical activity need for people under 6. • Age 6 to 17: Less than an average of 60 minutes a day shows an HRSN. • Age 18 or older: Less than 150 minutes a week shows an HRSN. . Some of the substances are prescribed by a doctor (like pain medications), but only count those if you have taken them for reasons or in doses other than prescribed. One question is about illicit or illegal drug use, but we only ask in order to identify community services that may be available to help you. 19. How many times in the past 12 months have you had 5 or more drinks in a day (males) or 4 or more drinks in a day (females)?","How many times in the past 12 months have you used tobacco products (like cigarettes, cigars, snuff, chew, electronic cigarettes)?",How many times in the past year have you used prescription drugs for non-medical reasons?,How many times in the past year have you used illegal drugs?,"Over the past 2 weeks, how often have you been bothered by any of the following problems?","Stress means a situation in which a person feels tense, restless, nervous, or anxious, or is unable to sleep at night because his or her mind is troubled all the time. Do you feel this kind of stress these days?","Because of a physical, mental or emotional condition, do you have serious difficulty concentrating, remembering or making decisions?","Because of a physical, mental or emotional condition, do you have difficulty doing errands alone such as visiting a doctor's office or shopping?"
0,7917467f9754,• I have a steady place to live,• None of the above .,• No,• No we are asking the following questions.,Never (1),Never (1),Never (1),Never (1) Point Total: (4),Would you say it is: • Not hard at all,...,• 2,• 10 if the person has a physical activity need:,• Daily or Almost Daily,• Once or Twice,• Never,• Never,a.) Little interest or pleasure in doing thing...,• Somewhat,(5 years old or older) • No,(15 years old or older) • No
1,d62c61917e2a,• I have a steady place to live,• None of the above .,• No,• No we are asking the following questions.,Never (1),Never (1),Never (1),Never (1) Point Total: (4),Would you say it is: • Not hard at all,...,• 4,• 60 if the person has a physical activity need:,• Daily or Almost Daily,• Daily or Almost Daily,• Never,• Daily or Almost Daily,a.) Little interest or pleasure in doing thing...,• Quite a bit,(5 years old or older) • No,(15 years old or older) • No
2,57c0bd5d694d,• I do not have a steady place to live (I am t...,• None of the above .,• No,• No we are asking the following questions.,Never (1),Sometimes (3),Never (1),Sometimes (3) Point Total: (8),Would you say it is: • Very hard,...,• 4,• 60 if the person has a physical activity need:,• Never,• Once or Twice,• Never,• Never,a.) Little interest or pleasure in doing thing...,• Very much,(5 years old or older) • Yes,(15 years old or older) • No
3,271e019eaca0,• I have a steady place to live,• None of the above .,• No,• No we are asking the following questions.,Never (1),Never (1),Never (1),Never (1) Point Total: (4),Would you say it is: • Not hard at all,...,• 2,• 90 if the person has a physical activity need:,• Daily or Almost Daily,• Never,• Never,• Never,a.) Little interest or pleasure in doing thing...,• Not at all,(5 years old or older) • No,(15 years old or older) • No
4,f6dd884b1bac,"• I have a place to live today, but I am worri...",• None of the above .,• No,• No we are asking the following questions.,Never (1),Rarely (2),Never (1),Rarely (2) Point Total: (6),Would you say it is: • Not hard at all,...,• 1,• 50 if the person has a physical activity need:,• Daily or Almost Daily,• Never,• Never,• Never,a.) Little interest or pleasure in doing thing...,• Quite a bit,(5 years old or older) • Yes,(15 years old or older) • Yes
5,98e234ba5e17,• I have a steady place to live,• None of the above .,• No,• No we are asking the following questions.,Never (1),Fairly often (4),Never (1),Rarely (2) Point Total: (8),Would you say it is: • Not hard at all,...,• 5 • 6,• 60 • 90 if the person has a physical activit...,• Monthly,• Monthly,• Never,• Once or Twice,a.) Little interest or pleasure in doing thing...,• Somewhat,(5 years old or older) • Yes,(15 years old or older) • Yes
6,ff837eea2798,• I have a steady place to live,• None of the above .,• No,• No we are asking the following questions.,Rarely (2),Fairly often (4),Rarely (2),Sometimes (3) Point Total: (11),Would you say it is: • Somewhat hard,...,• 5,• 40 if the person has a physical activity need:,• Daily or Almost Daily,• Daily or Almost Daily,• Never,• Never,a.) Little interest or pleasure in doing thing...,• Very much,(5 years old or older) • Yes,(15 years old or older) • Yes
7,70ea1972e7f9,• I have a steady place to live,• None of the above .,• No,• No we are asking the following questions.,Never (1),Sometimes (3),Never (1),Rarely (2) Point Total: (7),Would you say it is: • Not hard at all,...,• 7,• 60 if the person has a physical activity need:,• Daily or Almost Daily,• Daily or Almost Daily,• Never,• Never,a.) Little interest or pleasure in doing thing...,• A little bit,(5 years old or older) • No,(15 years old or older) • No
8,5c5158c5a7d2,• I have a steady place to live,• None of the above .,• No,• No we are asking the following questions.,Never (1),Sometimes (3),Never (1),Sometimes (3) Point Total: (8),Would you say it is: • Somewhat hard,...,• 7,• 60 if the person has a physical activity need:,• Daily or Almost Daily,• Daily or Almost Daily,• Daily or Almost Daily,• Daily or Almost Daily,a.) Little interest or pleasure in doing thing...,• Somewhat,(5 years old or older) • Yes,(15 years old or older) • No
9,6fc48ee38a54,• I have a steady place to live,• None of the above .,• No,• No we are asking the following questions.,Never (1),Fairly often (4),Never (1),Sometimes (3) Point Total: (9),Would you say it is: • Not hard at all,...,• 1,• 10 if the person has a physical activity need:,• Once or Twice,• Daily or Almost Daily,• Once or Twice,• Weekly,a.) Little interest or pleasure in doing thing...,• Very much,(5 years old or older) • Yes,(15 years old or older) • Yes


In [6]:
column_renames = {
    "What is your living situation today?": "Living_Situation",
    "think about the place you live. Do you have problems with any of the following?": "Housing_Issues",
    "Within the past 12 months, you worried that your food would run out before you got money to buy more. • Never true 4. Within the past 12 months, the food you bought just didn't last and you didn't have money to get more. • Never true 5. In the past 12 months, has lack of reliable transportation kept you from medical appointments, mettings, work or from getting things needed for daily living?": "Food_Transport_Issues",
    "In the past 12 months has the electric, gas, oil, or water company threatened to shut off services in your home?": "Utility_Shutoff_Threat",
    "How often does anyone, including family and friends, physically hurt you?": "Physical_Abuse_Frequency",
    "How often does anyone, including family and friends, insult or talk down to you?": "Verbal_Abuse_Frequency",
    "How often does anyone, including family and friends, threaten you with harm?": "Threats_Frequency",
    "How often does anyone, including family and friends, scream or curse at you?": "Screaming_Cursing_Frequency",
    "How hard is it for you to pay for the very basics like food, housing, medical care, and heating?": "Financial_Difficulty",
    "Do you want help finding or keeping work or a job?": "Job_Assistance",
    "If for any reason you need help with day-to-day activities such as bathing, preparing meals, shopping, managing finances, etc., do you get the help you need?": "Daily_Assistance_Need",
    "How often do you feel lonely or isolated from those around you?": "Loneliness_Frequency",
    "Do you speak a language other than English at home?": "Non_English_Home",
    "Do you want help with school or training?": "School_Training_Assistance",
    "In the last 30 days, other than the activities you did for work, on average, how many days per week did you engage in moderate exercise (like walking fast, running, jogging, dancing, swimming, biking, or other similar activities)?": "Exercise_Days_Per_Week",
    "On average, how many minutes did you usually spend exercising at this level on one of those days?": "Exercise_Minutes_Per_Day",
    "Calculate [“number of days” selected] x [“number of minutes” selected] = [number of minutes of exercise per week] 2. Apply the right age threshold: • Under 6 years old: You can’t find the physical activity need for people under 6. • Age 6 to 17: Less than an average of 60 minutes a day shows an HRSN. • Age 18 or older: Less than 150 minutes a week shows an HRSN. . Some of the substances are prescribed by a doctor (like pain medications), but only count those if you have taken them for reasons or in doses other than prescribed. One question is about illicit or illegal drug use, but we only ask in order to identify community services that may be available to help you. 19. How many times in the past 12 months have you had 5 or more drinks in a day (males) or 4 or more drinks in a day (females)?" : "Drink_Use_Frequency",
    "How many times in the past 12 months have you used tobacco products (like cigarettes, cigars, snuff, chew, electronic cigarettes)?": "Tobacco_Use_Frequency",
    "How many times in the past year have you used prescription drugs for non-medical reasons?": "Prescription_Drug_Misuse",
    "How many times in the past year have you used illegal drugs?": "Illegal_Drug_Use",
    "Over the past 2 weeks, how often have you been bothered by any of the following problems?": "Recent_Stress_Frequency",
    "Stress means a situation in which a person feels tense, restless, nervous, or anxious, or is unable to sleep at night because his or her mind is troubled all the time. Do you feel this kind of stress these days?": "Current_Stress_Level",
    "Because of a physical, mental or emotional condition, do you have serious difficulty concentrating, remembering or making decisions?": "Cognitive_Difficulty",
    "Because of a physical, mental or emotional condition, do you have difficulty doing errands alone such as visiting a doctor's office or shopping?": "Errand_Difficulty"
}


final_df = final_df.rename(columns={k: v for k, v in column_renames.items() if k in final_df.columns})


final_df = final_df.map(lambda x: x.strip().lower() if isinstance(x, str) else x)


categorical_columns = list(column_renames.values())
for col in categorical_columns:
    if col in final_df.columns:
        final_df[col] = final_df[col].astype("category")

In [7]:
 def clean_text(text):
     if pd.isnull(text):
         return text
     text = re.sub(r'•\s*', '', str(text))
     text = re.sub(r'powered by kipu systems.*?(?=page \d+ of \d+)?', '', text, flags=re.IGNORECASE)
     text = re.sub(r'page \d+ of \d+', '', text, flags=re.IGNORECASE)
     return text.strip()

 df_cleaned = final_df.applymap(clean_text)

In [8]:
df_cleaned

Unnamed: 0,PII_ID,Living_Situation,Housing_Issues,Food_Transport_Issues,Utility_Shutoff_Threat,Physical_Abuse_Frequency,Verbal_Abuse_Frequency,Threats_Frequency,Screaming_Cursing_Frequency,Financial_Difficulty,...,Exercise_Days_Per_Week,Exercise_Minutes_Per_Day,Drink_Use_Frequency,Tobacco_Use_Frequency,Prescription_Drug_Misuse,Illegal_Drug_Use,Recent_Stress_Frequency,Current_Stress_Level,Cognitive_Difficulty,Errand_Difficulty
0,7917467f9754,i have a steady place to live,none of the above .,no,no we are asking the following questions.,never (1),never (1),never (1),never (1) point total: (4),would you say it is: not hard at all,...,2,10 if the person has a physical activity need:,daily or almost daily,once or twice,never,never,a.) little interest or pleasure in doing thing...,somewhat,(5 years old or older) no,(15 years old or older) no
1,d62c61917e2a,i have a steady place to live,none of the above .,no,no we are asking the following questions.,never (1),never (1),never (1),never (1) point total: (4),would you say it is: not hard at all,...,4,60 if the person has a physical activity need:,daily or almost daily,daily or almost daily,never,daily or almost daily,a.) little interest or pleasure in doing thing...,quite a bit,(5 years old or older) no,(15 years old or older) no
2,57c0bd5d694d,i do not have a steady place to live (i am tem...,none of the above .,no,no we are asking the following questions.,never (1),sometimes (3),never (1),sometimes (3) point total: (8),would you say it is: very hard,...,4,60 if the person has a physical activity need:,never,once or twice,never,never,a.) little interest or pleasure in doing thing...,very much,(5 years old or older) yes,(15 years old or older) no
3,271e019eaca0,i have a steady place to live,none of the above .,no,no we are asking the following questions.,never (1),never (1),never (1),never (1) point total: (4),would you say it is: not hard at all,...,2,90 if the person has a physical activity need:,daily or almost daily,never,never,never,a.) little interest or pleasure in doing thing...,not at all,(5 years old or older) no,(15 years old or older) no
4,f6dd884b1bac,"i have a place to live today, but i am worried...",none of the above .,no,no we are asking the following questions.,never (1),rarely (2),never (1),rarely (2) point total: (6),would you say it is: not hard at all,...,1,50 if the person has a physical activity need:,daily or almost daily,never,never,never,a.) little interest or pleasure in doing thing...,quite a bit,(5 years old or older) yes,(15 years old or older) yes
5,98e234ba5e17,i have a steady place to live,none of the above .,no,no we are asking the following questions.,never (1),fairly often (4),never (1),rarely (2) point total: (8),would you say it is: not hard at all,...,5 6,60 90 if the person has a physical activity need:,monthly,monthly,never,once or twice,a.) little interest or pleasure in doing thing...,somewhat,(5 years old or older) yes,(15 years old or older) yes
6,ff837eea2798,i have a steady place to live,none of the above .,no,no we are asking the following questions.,rarely (2),fairly often (4),rarely (2),sometimes (3) point total: (11),would you say it is: somewhat hard,...,5,40 if the person has a physical activity need:,daily or almost daily,daily or almost daily,never,never,a.) little interest or pleasure in doing thing...,very much,(5 years old or older) yes,(15 years old or older) yes
7,70ea1972e7f9,i have a steady place to live,none of the above .,no,no we are asking the following questions.,never (1),sometimes (3),never (1),rarely (2) point total: (7),would you say it is: not hard at all,...,7,60 if the person has a physical activity need:,daily or almost daily,daily or almost daily,never,never,a.) little interest or pleasure in doing thing...,a little bit,(5 years old or older) no,(15 years old or older) no
8,5c5158c5a7d2,i have a steady place to live,none of the above .,no,no we are asking the following questions.,never (1),sometimes (3),never (1),sometimes (3) point total: (8),would you say it is: somewhat hard,...,7,60 if the person has a physical activity need:,daily or almost daily,daily or almost daily,daily or almost daily,daily or almost daily,a.) little interest or pleasure in doing thing...,somewhat,(5 years old or older) yes,(15 years old or older) no
9,6fc48ee38a54,i have a steady place to live,none of the above .,no,no we are asking the following questions.,never (1),fairly often (4),never (1),sometimes (3) point total: (9),would you say it is: not hard at all,...,1,10 if the person has a physical activity need:,once or twice,daily or almost daily,once or twice,weekly,a.) little interest or pleasure in doing thing...,very much,(5 years old or older) yes,(15 years old or older) yes


In [9]:
def clean_frequency(val):
    if pd.isnull(val):
        return val
    val = str(val).strip().lower()
    keywords = ['never', 'rarely', 'sometimes', 'often', 'very often', 'frequently']
    for word in keywords:
        if word in val:
            return word.capitalize()
    return val.capitalize()  # fallback

frequency_columns = ['Verbal_Abuse_Frequency', 'Screaming_Cursing_Frequency',
                     'Tobacco_Use_Frequency', 'Recent_Stress_Frequency']
for col in frequency_columns:
    if col in df_cleaned.columns:
        df_cleaned[col] = df_cleaned[col].apply(clean_frequency)

In [10]:
def extract_number(val):
    if pd.isnull(val):
        return None
    match = re.search(r'\d+', str(val))
    return float(match.group()) if match else None


numeric_text_columns = ['Exercise_Minutes_Per_Day', 'Exercise_Days_Per_Week']

for col in numeric_text_columns:
    if col in df.columns:
        df_cleaned[col] = df_cleaned[col].apply(extract_number)

In [11]:
yn_columns = ['Prescription_Drug_Misuse', 'Illegal_Drug_Use', 'Utility_Shutoff_Threat',
              'Housing_Issues', 'Cognitive_Difficulty', 'Errand_Difficulty',
              'Financial_Difficulty']

def clean_yes_no(val):
    if pd.isnull(val):
        return val
    val = str(val).strip().lower()
    if 'yes' in val:
        return 'Yes'
    elif 'no' in val:
        return 'No'
    elif 'don' in val:
        return "Don't Know"
    return val.capitalize()

for col in yn_columns:
    if col in df.columns:
        df_cleaned[col] = df_cleaned[col].apply(clean_yes_no)

In [12]:
for col in df_cleaned.select_dtypes(include='object').columns:
    df_cleaned[col] = df_cleaned[col].astype('category')

In [13]:
df = df_cleaned

In [14]:
housing_map = {
    'i have a steady place to live': 'Stable housing',
    'i have a place to live today, but i am worried about losing it in the future': 'Unstable housing',
    'i do not have a steady place to live (i am temporarily staying with others, in a hotel, in a shelter, living outside on the street, on a beach, in a car, abandoned building, bus or train station, or in a park)': 'Homeless or temporary'
}
df['Living_Situation'] = df['Living_Situation'].astype(str).str.lower().map(housing_map).fillna(df['Living_Situation'])
df['Housing_Issues'] = df['Housing_Issues'].astype(str).str.replace(r'\s*\.$', '', regex=True)
df['Food_Transport_Issues'] = df['Food_Transport_Issues'].str.replace(r'(?i)utilities', '', regex=True)
df['Utility_Shutoff_Threat'] = df['Utility_Shutoff_Threat'].str.replace(r'(?i)we are asking the following questions\.?', '', regex=True).str.strip()
df['Financial_Difficulty'] = df['Financial_Difficulty'].str.replace(r'(?i)would you say it is:', '', regex=True)
df['Job_Assistance'] = df['Job_Assistance'].str.replace(r'(?i)family and community support', '', regex=True)
df['Loneliness_Frequency'] = df['Loneliness_Frequency'].str.replace(r'(?i)education', '', regex=True)
df['School_Training_Assistance'] = df['School_Training_Assistance'].str.replace(r"(?i)for example, starting or completing job training or getting a high school diploma, ged or equivalent\.|physical activity", "", regex=True)
df['Drink_Use_Frequency'] = df['Drink_Use_Frequency'].str.replace(r"(?i)one drink is 12 ounces of beer, 5 ounces of wine, or 1\.5 ounces of 80-proof spirits\.", "", regex=True)
df['Illegal_Drug_Use'] = df['Illegal_Drug_Use'].str.replace("(?i)mental health", "", regex=True).str.strip()
df['Exercise_Minutes_Per_Day'] = df['Exercise_Minutes_Per_Day'].str.replace(r'(?i)if the person has a physical activity need:', '', regex=True)
df['Current_Stress_Level'] = df['Current_Stress_Level'].str.replace(r'(?i)disabilities', '', regex=True)
df['Recent_Stress_Frequency'] = df['Recent_Stress_Frequency'].apply(lambda x: ' '.join(re.findall(r'\(\d+\)', str(x))))


phrases_to_remove = [
    r"choose all the apply",
    r"food some people have made the following statements about their food situation.*?true for you and your household in the last 12 months\.",
    r"utilities",
    r" safety because violence and abuse happens to a lot of people.*?we are asking the following questions\.",
    r"a score of 11 or more when the numerical values for answers to questions 3-10 are added shows that the person might not be safe\. financial strain",
    r"would you say it is:",
    r"family and community support",
    r"education",
    r"for example, starting or completing job training or getting a high school diploma, ged or equivalent\.",
    r"physical activity",
    r"follow these 2 steps to decide if the person has a physical activity need:",
    r"one drink is 12 ounces of beer, 5 ounces of wine, or 1.5 ounces of 80-proof spirits.",
    r"mental health",
    r"if you get 3 or more when you add the answers to questions 23a and 23b the person may have a mental health need\.",
    r"disabilities",
    r"\(5 years old or older\)",
    r"\(15 years old or older\)",
    r"powered by kipu systems page \d of \d"
]

pattern = re.compile("|".join(phrases_to_remove), flags=re.IGNORECASE)

def clean_and_extract(cell):
    if isinstance(cell, str):
        cleaned = pattern.sub("", cell)
        numbers = re.findall(r"\((\d+)\)", cleaned)
        if numbers:
            return int(numbers[-1])
        return cleaned.strip()
    return cell

df_cleaned = df.applymap(clean_and_extract)

df_cleaned.dropna(how="all", inplace=True)
df_cleaned.dropna(axis=1, how="all", inplace=True)


In [15]:
df_cleaned

Unnamed: 0,PII_ID,Living_Situation,Housing_Issues,Food_Transport_Issues,Utility_Shutoff_Threat,Physical_Abuse_Frequency,Verbal_Abuse_Frequency,Threats_Frequency,Screaming_Cursing_Frequency,Financial_Difficulty,...,Exercise_Days_Per_Week,Exercise_Minutes_Per_Day,Drink_Use_Frequency,Tobacco_Use_Frequency,Prescription_Drug_Misuse,Illegal_Drug_Use,Recent_Stress_Frequency,Current_Stress_Level,Cognitive_Difficulty,Errand_Difficulty
0,7917467f9754,Stable housing,none of the above,no,no,1,Never,1,Never,not hard at all,...,2,10,daily or almost daily,Once or twice,never,never,0,somewhat,no,no
1,d62c61917e2a,Stable housing,none of the above,no,no,1,Never,1,Never,not hard at all,...,4,60,daily or almost daily,Daily or almost daily,never,daily or almost daily,1,quite a bit,no,no
2,57c0bd5d694d,Homeless or temporary,none of the above,no,no,1,Sometimes,1,Sometimes,very hard,...,4,60,never,Once or twice,never,never,6,very much,yes,no
3,271e019eaca0,Stable housing,none of the above,no,no,1,Never,1,Never,not hard at all,...,2,90,daily or almost daily,Never,never,never,0,not at all,no,no
4,f6dd884b1bac,Unstable housing,none of the above,no,no,1,Rarely,1,Rarely,not hard at all,...,1,50,daily or almost daily,Never,never,never,6,quite a bit,yes,yes
5,98e234ba5e17,Stable housing,none of the above,no,no,1,Often,1,Rarely,not hard at all,...,5 6,60 90,monthly,Monthly,never,once or twice,2,somewhat,yes,yes
6,ff837eea2798,Stable housing,none of the above,no,no,2,Often,2,Sometimes,somewhat hard,...,5,40,daily or almost daily,Daily or almost daily,never,never,3,very much,yes,yes
7,70ea1972e7f9,Stable housing,none of the above,no,no,1,Sometimes,1,Rarely,not hard at all,...,7,60,daily or almost daily,Daily or almost daily,never,never,0,a little bit,no,no
8,5c5158c5a7d2,Stable housing,none of the above,no,no,1,Sometimes,1,Sometimes,somewhat hard,...,7,60,daily or almost daily,Daily or almost daily,daily or almost daily,daily or almost daily,0,somewhat,yes,no
9,6fc48ee38a54,Stable housing,none of the above,no,no,1,Often,1,Sometimes,not hard at all,...,1,10,once or twice,Daily or almost daily,once or twice,weekly,4,very much,yes,yes


In [16]:
df_cleaned.to_csv("ahcm_output_data.csv", index=False)

----
