In [27]:
import fitz
import re
import os
import PyPDF2
import pandas as pd
import pdfplumber
from datetime import datetime

In [28]:
pwd

'/Users/ethanpanal/Documents/UP/Capstone'

In [29]:
# Define paths
input_folder = "./bps/raw_bps_pdf"
output_folder = "./bps/bps_redacted"

In [30]:
# Define the start and end phrases
start_phrase = "X. TREATMENT ACCEPTANCE / RESISTANCE DIMENSION"
end_phrase = "3. Relapse/Continued Use Potential"

# Initialize an empty list to store results
data = []

# Define the column names for motivations and the scores
columns = [
    "group_identifier", "bps_problems", "bps_medical", "bps_employment", "bps_peer_support",
    "bps_drug_alcohol", "bps_legal", "bps_family", "bps_mh", "bps_total"
]

In [31]:
# Function to extract the date after "Biopsychosocial Assessment"
def extract_assessment_date(text):
    # Look for the phrase and try to extract the date
    match = re.search(r"Biopsychosocial Assessment\s*(\d{1,2}/\d{1,2}/\d{4})", text)
    if match:
        return match.group(1)
    else:
        return None

In [32]:
# Function to extract the birthdate after "Birthdate:"
def extract_birthdate(text):
    # Look for the phrase "Birthdate:" and extract the following date
    match = re.search(r"Birthdate:\s*(\d{1,2}/\d{1,2}/\d{4})", text)
    if match:
        return match.group(1)
    else:
        return None

In [33]:
# Function to calculate age from birthdate and assessment date
def calculate_age(assessment_date, birthdate):
    # Convert both dates to datetime objects
    assessment_date = datetime.strptime(assessment_date, "%m/%d/%Y")
    birthdate = datetime.strptime(birthdate, "%m/%d/%Y")
    
    # Calculate the difference in years
    age = (assessment_date - birthdate).days // 365
    return age

In [34]:
# Function to extract external and internal motivations from a PDF
def extract_motivations(pdf_path):
    doc = fitz.open(pdf_path)
    extracted_text = ""

    # Loop through all pages
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text = page.get_text("text")

        # Find the start and end phrase positions
        if start_phrase in text and end_phrase in text:
            start_index = text.find(start_phrase) + len(start_phrase)
            end_index = text.find(end_phrase)
            
            # Extract the text in between
            extracted_text = text[start_index:end_index].strip()
            break

    # Split the extracted text into lines
    lines = extracted_text.splitlines()

    # Initialize external and internal motivation variables
    external_motivation = None
    internal_motivation = None

    # We will look at the first two lines
    if len(lines) > 0:
        external_motivation = lines[0].strip()  # First line
    if len(lines) > 1:
        internal_motivation = lines[1].strip()  # Second line

    return external_motivation, internal_motivation

In [35]:
# Function to extract the drug craving score following the specific text
def extract_drug_craving_score(text):
    # Search for the specific phrase "(Range 0-10, 10 being highest)" followed by the number (with or without /10)
    match = re.search(r'\(Range 0-10, 10 being highest\)\s*(\d+)(?:/10)?', text)
    if match:
        # Return the captured number
        return int(match.group(1))
    else:
        return None

In [42]:
def extract_bps_scores(text):
    # Updated regex pattern to handle descriptions
    pattern = r"(\d+)\s*–\s*[\w\s]+\((\d+)\)|\((\d+)\)"
    matches = re.findall(pattern, text)

    scores = []
    for match in matches:
        # Extract the correct value (ignoring empty capture groups)
        score = next((int(x) for x in match if x.isdigit()), None)
        scores.append(score)

    # Ensure we have the correct number of matches
    if len(scores) == 9:  # Expecting 9 categories
        scores_dict = {columns[i+1]: scores[i] for i in range(9)}
        scores_dict["bps_total"] = scores[-1]  # Last one is the total
        return scores_dict
    else:
        print(f"Warning: Found {len(scores)} matches instead of 9 in text:\n{text}")
        return {col: None for col in columns[1:]}

In [43]:
# Function to extract the "List Drugs of Choice" answer
def extract_drugs_of_choice(text):
    # Look for the phrase "List Drugs of Choice:" and extract everything that follows it
    match = re.search(r"List Drugs of Choice:\s*(.*?)(?=\n\S|$)", text, re.DOTALL)
    if match:
        return match.group(1).strip()
    else:
        return None

In [44]:
# Function to extract the number of previous treatments
def extract_num_prev_treatments(text):
    # Search for the phrase "Number of Times:" followed by a number (with or without text after it)
    match = re.search(r'Number of Times:\s*(\d+)', text)
    if match:
        return int(match.group(1))  # Convert the extracted number to an integer
    else:
        return None

In [45]:
# Iterate through all files in the folder
for filename in os.listdir(input_folder):
    if filename.endswith(".pdf"):
        # Extract group identifier from filename (assuming the format is bps_xxxxxx.pdf)
        group_identifier = filename.split('_')[1].replace('.pdf', '')

        # Get the full PDF file path
        pdf_path = os.path.join(input_folder, filename)

        # Extract external and internal motivations
        ext_motivation, int_motivation = extract_motivations(pdf_path)

        # Extract text from the PDF for score extraction
        doc = fitz.open(pdf_path)
        extracted_text = ""
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            extracted_text += page.get_text("text")

        # Extract scores from the text
        bps_scores = extract_bps_scores(extracted_text)

        # Extract the assessment date/birthdate
        assessment_date = extract_assessment_date(extracted_text)
        birthdate = extract_birthdate(extracted_text)

        # Calculate the age if both dates are available
        if assessment_date and birthdate:
            age = calculate_age(assessment_date, birthdate)
        else:
            age = None

        # Extract the drugs of choice
        drugs_of_choice = extract_drugs_of_choice(extracted_text)

        #drug craving score
        drug_craving_score = extract_drug_craving_score(extracted_text)

        # num of treatments
        num_prev_treatments = extract_num_prev_treatments(extracted_text)

        # Combine the extracted data into a single dictionary
        result = {
            "group_identifier": group_identifier,
            "assmt_dt": assessment_date,
            "birthdate": birthdate,
            "age": age,
            "ext_motivation": ext_motivation,
            "int_motivation": int_motivation,
            "num_prev_treatments": num_prev_treatments,
            "drugs_of_choice": drugs_of_choice,
            "drug_craving_score": drug_craving_score
            
        }
        result.update(bps_scores)  # Add the scores to the result

        # Append the result to the data list
        data.append(result)

Location: EC Laguna Outpatient
(GMT-08:00) Pacific Time (US & Canada)
Date/Time:
Start time
06/18/2024
10:01 AM
End time
06/18/2024
11:31 AM
Duration
90
Minutes
1. Where were you raised and by whom?
2. Do you have any siblings?
3. How were the relationships between family members in the immediate family/in the household?
4. Who do you feel closest to in the family and why?
Mother:
None
Father:
Substance Abuse
Step-Parent:
None
Siblings:
Substance Abuse
Other:
None
If YES to any of the above, elaborate:
1. Are you involved in a significant relationship?
If YES, are you satisfied with relationship with partner?
2. Marriage History: ☑ None
Stefanee "Stefanee" Cardinale ♀ LO-2024-6 
Pronouns: She/Her
Birthdate: 11/19/1994
Allergies: No Known Allergies/NKA
Admission: 06/18/2024  Care Team
Biopsychosocial Assessment 06/18/2024 10:01 AM
I. SOCIAL AREA
A. Family of Origin
"California, mom, dad and siblings and my two half sisters."
Name
Age
Grew Up Together?
Adrian Half sister
36
yes
Christa 

In [46]:
df = pd.DataFrame(data)

In [47]:
df

Unnamed: 0,group_identifier,assmt_dt,birthdate,age,ext_motivation,int_motivation,num_prev_treatments,drugs_of_choice,drug_craving_score,bps_problems,bps_medical,bps_employment,bps_peer_support,bps_drug_alcohol,bps_legal,bps_family,bps_mh,bps_total
0,f6dd884b1bac,10/08/2024,10/24/1970,53,"""I can't continue living the way I am, I've be...","""I don't want to keep giving up. I don't want ...",0.0,"""alcohol and marijuana""",5.0,3.0,0.0,2.0,1.0,3.0,0.0,2.0,3.0,14.0
1,98e234ba5e17,01/07/2025,10/22/2003,21,better communciation with others,"coping skills, help manage my body",1.0,weed,0.0,3.0,1.0,1.0,3.0,0.0,0.0,3.0,4.0,15.0
2,271e019eaca0,10/07/2024,10/09/1973,51,,,3.0,"""vodka basically""",,3.0,2.0,3.0,2.0,4.0,0.0,0.0,2.0,16.0
3,d62c61917e2a,06/18/2024,11/19/1994,29,"""I want to for relationships around me and my ...","""I want to have some peace with who I am and t...",1.0,Alcohol & Marijuana,0.0,,,,,,,,,
4,57c0bd5d694d,09/19/2024,01/14/1995,29,"""I don't want to be moving all the time. I wan...",in a stable place where I can be there for my ...,0.0,na,0.0,3.0,1.0,3.0,3.0,1.0,0.0,3.0,3.0,17.0
5,c701eaecb601,08/15/2024,12/20/1991,32,"To live a better life be happy, and have fun w...",Just to be happy for once and not have a fake ...,10.0,"Xanax, coke and meth",0.0,2.0,2.0,3.0,3.0,3.0,4.0,4.0,4.0,25.0
6,6fc48ee38a54,02/14/2025,05/08/1994,30,"becoming financially independent, just having ...",I want to be proud of who I am,5.0,adderall/weed,0.0,3.0,0.0,3.0,2.0,3.0,0.0,4.0,3.0,18.0
7,01cb6dae438c,11/29/2024,09/20/1996,28,Family & friends,"I want to have a good peaceful life, connected...",5.0,,0.0,1.0,0.0,3.0,1.0,0.0,0.0,3.0,4.0,12.0
8,6ebe69c5f8a8,01/21/2025,01/11/1994,31,"my family, my friends",desire to get career back,8.0,"fentanyl, cocaine",1.0,4.0,4.0,3.0,1.0,4.0,4.0,1.0,2.0,23.0
9,a31e7df9c8ed,07/22/2024,07/08/2004,20,"""family and my health and my future""","""my mental health and wanting to feel better""",,Alcohol,0.0,3.0,1.0,3.0,3.0,3.0,0.0,3.0,3.0,19.0


In [48]:
# Define expected column names 
expected_columns = [
    "Drug","First Used", "Last Used", "Frequency/Duration", "Amount", 
    "Method", "Pattern of Use"
]

with pdfplumber.open("./bps/raw_bps_pdf/bps_df9d65c8a899.pdf") as pdf:
    page = pdf.pages[4]  # Extract from the first page (change index if needed)
    table = page.extract_table()

    if table and len(table) > 1:  # Ensure the table isn't empty and has rows
        df_table = pd.DataFrame(table[1:], columns=table[0])  # First row as headers

    # If extracted headers don't match expected ones, manually assign column names
    if not all(col in expected_columns for col in df_table.columns):
        df_table.columns = expected_columns
    else:
        df_table = pd.DataFrame(columns=expected_columns)  # Empty DataFrame with correct columns

# Convert extracted table to a DataFrame
df2 = pd.DataFrame(df_table)

In [53]:
doc2 = fitz.open('./bps/raw_bps_pdf/bps_271e019eaca0.pdf')
extracted_text = ""
for page_num in range(doc.page_count):
    page = doc.load_page(page_num)
    extracted_text += page.get_text("text")


In [56]:
print(extracted_text)

Location: EC Laguna Outpatient
(GMT-08:00) Pacific Time (US & Canada)
Date/Time:
Start time
05/09/2024
10:22 AM
End time
05/09/2024
11:52 AM
Duration
90
Minutes
1. Where were you raised and by whom?
2. Do you have any siblings?
3. How were the relationships between family members in the immediate family/in the household?
4. Who do you feel closest to in the family and why?
Mother:
Substance Abuse
Father:
Substance Abuse
Step-Parent:
None
Siblings:
None
Other:
None
If YES to any of the above, elaborate:
1. Are you involved in a significant relationship?
If YES, are you satisfied with relationship with partner?
2. Marriage History:
Robert "Bobby" Heredia ♂ LO-2024-4 
Pronouns: He/Him
Birthdate: 01/09/1990
Allergies: Ativan
Admission: 05/09/2024  Care Team
Biopsychosocial Assessment 05/09/2024 10:22 AM
I. SOCIAL AREA
A. Family of Origin
Clt shared, "I was raised in Palmdale by mom and dad and my older sister"
Name
Age
Grew Up Together?
Daniel
36
yes
"I had a great childhood not a bad lif