In [1]:
import fitz
import re
import os
import PyPDF2
import pandas as pd
from datetime import datetime
import logging

In [2]:
# Set up basic logging: configure the root logger at DEBUG level so that the
# logging.warning(...) messages emitted by the extraction helpers are shown.
# NOTE(review): DEBUG on the root logger may also surface debug output from
# imported libraries — confirm this verbosity is intended.
logging.basicConfig(level=logging.DEBUG)

In [3]:
# Define paths
# Folder that is scanned for the source PDF files. The path is relative, so it
# is resolved against the notebook's current working directory.
input_folder = "./bps/raw_bps_pdf"
# Disabled: no output folder is referenced by the visible cells of this notebook.
#output_folder = "./bps/bps_redacted"

In [4]:
# Column names for the biopsychosocial scores. The first entry is the record
# identifier; the remaining nine are the score columns filled per PDF.
columns = [
    "group_identifier",
    "bps_problems",
    "bps_medical",
    "bps_employment",
    "bps_peer_support",
    "bps_drug_alcohol",
    "bps_legal",
    "bps_family",
    "bps_mh",
    "bps_total",
]

# Accumulator for the per-PDF result dictionaries.
data = []

# Phrases that delimit the section of a page holding the motivations text:
# the text of interest sits between the start phrase and the end phrase.
start_phrase = "X. TREATMENT ACCEPTANCE / RESISTANCE DIMENSION"
end_phrase = "3. Relapse/Continued Use Potential"

In [5]:
def extract_assessment_date(text):
    """Return the date that follows "Biopsychosocial Assessment" in ``text``.

    The date must be written as one or two digits, a slash, one or two digits,
    a slash and four digits, optionally separated from the phrase by whitespace.

    Args:
        text: The text to search.

    Returns:
        The matched date string exactly as it appears in ``text``, or ``None``
        when the phrase followed by such a date is not found.
    """
    found = re.search(r"Biopsychosocial Assessment\s*(\d{1,2}/\d{1,2}/\d{4})", text)
    return found.group(1) if found else None

In [6]:
def extract_birthdate(text):
    """Return the date of birth that follows "Birthdate:" in ``text``.

    The date must be written as one or two digits, a slash, one or two digits,
    a slash and four digits, optionally separated from the label by whitespace.

    Args:
        text: The text to search.

    Returns:
        The matched date string exactly as it appears in ``text``, or ``None``
        when the label followed by such a date is not found.
    """
    found = re.search(r"Birthdate:\s*(\d{1,2}/\d{1,2}/\d{4})", text)
    if found is None:
        return None
    return found.group(1)

In [7]:
def calculate_age(assessment_date, birthdate):
    """Return the age in completed years on the assessment date.

    The age is computed by calendar comparison rather than by dividing a day
    count by 365: dividing ignores leap days and reports a birthday up to
    several days early (e.g. born 01/01/2000, assessed 12/31/2020 would be
    reported as 21 instead of 20).

    Args:
        assessment_date: Assessment date as a "%m/%d/%Y" string.
        birthdate: Date of birth as a "%m/%d/%Y" string.

    Returns:
        The number of full years between ``birthdate`` and ``assessment_date``
        as an int.

    Raises:
        ValueError: If either string does not parse as "%m/%d/%Y".
    """
    # Convert both dates to datetime objects
    assessed = datetime.strptime(assessment_date, "%m/%d/%Y")
    born = datetime.strptime(birthdate, "%m/%d/%Y")

    # Start from the difference in calendar years, then take one year off when
    # the birthday has not yet been reached in the assessment year.
    age = assessed.year - born.year
    if (assessed.month, assessed.day) < (born.month, born.day):
        age -= 1
    return age

In [8]:
def extract_motivations(pdf_path):
    """Extract the external and internal motivation lines from a PDF.

    Pages are scanned in order. On the first page whose text contains both the
    module-level ``start_phrase`` and ``end_phrase``, the text between the
    first occurrence of each is taken and split into lines. The first line is
    returned as the external motivation and the second as the internal one.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A tuple ``(external_motivation, internal_motivation)``. Either element
        is ``None`` when the corresponding line is not available (including
        when no page contains both phrases).
    """
    extracted_text = ""

    # Open the document in a context manager so its file handle is released
    # even if reading a page raises; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        # Loop through all pages
        for page_num in range(doc.page_count):
            text = doc.load_page(page_num).get_text("text")

            # Only a page holding both phrases is used
            if start_phrase in text and end_phrase in text:
                start_index = text.find(start_phrase) + len(start_phrase)
                end_index = text.find(end_phrase)

                # Extract the text in between and stop at the first such page
                extracted_text = text[start_index:end_index].strip()
                break

    # Split the extracted text into lines; only the first two are used
    lines = extracted_text.splitlines()
    external_motivation = lines[0].strip() if len(lines) > 0 else None
    internal_motivation = lines[1].strip() if len(lines) > 1 else None

    return external_motivation, internal_motivation

In [9]:
def extract_drug_craving_score(text):
    """Return the drug craving score that follows the range hint in ``text``.

    Looks for the literal "(Range 0-10, 10 being highest)" followed, after
    optional whitespace, by a run of digits; a trailing "/10" is tolerated.

    Args:
        text: The text to search.

    Returns:
        The digits as an int, or ``None`` when the phrase followed by a number
        is not found.
    """
    found = re.search(r'\(Range 0-10, 10 being highest\)\s*(\d+)(?:/10)?', text)
    if found is None:
        return None
    # Only the captured digits are converted; any "/10" suffix is not part of the group
    return int(found.group(1))

In [10]:
def extract_bps_scores(text):
    """Map parenthesised integers found in ``text`` onto the score columns.

    Every "(<digits>)" occurrence in ``text`` is collected in order. When at
    least nine are found, the first nine are assigned to the nine score names
    in the module-level ``columns`` list (everything after the identifier), and
    "bps_total" is then set from the last occurrence found, which replaces the
    ninth value whenever more than nine numbers are present.

    Args:
        text: The text to search.

    Returns:
        A dict keyed by the score column names. Values are ints when at least
        nine numbers were found; otherwise every value is ``None`` and a
        warning is logged.
    """
    found = re.findall(r"\((\d+)\)", text)

    if len(found) < 9:
        logging.warning(f"Unexpected number of matches: {len(found)}. Expected 9.")
        # Same keys as the success case, but with no values
        return dict.fromkeys(columns[1:])

    # Pair the nine score column names with the first nine numbers
    scores = {name: int(raw) for name, raw in zip(columns[1:10], found)}
    # The last number found is taken as the total score
    scores["bps_total"] = int(found[-1])
    return scores

In [11]:
def extract_drugs_of_choice(text):
    """Return the answer that follows "List Drugs of Choice:" in ``text``.

    Whitespace after the label (including newlines) is skipped, then text is
    captured up to the first newline that is immediately followed by a
    non-whitespace character, or up to the end of ``text``.

    Args:
        text: The text to search.

    Returns:
        The captured answer with surrounding whitespace stripped, or ``None``
        when the label is not found.
    """
    found = re.search(r"List Drugs of Choice:\s*(.*?)(?=\n\S|$)", text, re.DOTALL)
    if found is None:
        return None
    return found.group(1).strip()

In [12]:
def extract_num_prev_treatments(text):
    """Return the number that follows "Number of Times:" in ``text``.

    Args:
        text: The text to search.

    Returns:
        The run of digits after the label (optional whitespace in between) as
        an int, or ``None`` when the label followed by digits is not found.
    """
    found = re.search(r'Number of Times:\s*(\d+)', text)
    # Anything after the digits is ignored; only the captured group is converted
    return int(found.group(1)) if found else None

In [13]:
# Iterate through all PDF files in the input folder and collect one result
# dictionary per file in `data`.

# Start from an empty list so that re-running this cell does not append a
# second copy of every row.
data = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the output the same from run to run.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith(".pdf"):
        continue

    # Extract group identifier from filename (format is bps_xxxxxx.pdf)
    name_parts = filename.split('_')
    if len(name_parts) < 2:
        # Without an underscore there is no identifier part to take
        logging.warning(f"Skipping {filename}: filename does not match bps_xxxxxx.pdf")
        continue
    group_identifier = name_parts[1].replace('.pdf', '')

    # PDF file path
    pdf_path = os.path.join(input_folder, filename)

    # external and internal motivations
    ext_motivation, int_motivation = extract_motivations(pdf_path)

    # Full text of the PDF for the regex-based extractors. The context manager
    # closes the document once the text has been read.
    with fitz.open(pdf_path) as doc:
        extracted_text = "".join(
            doc.load_page(page_num).get_text("text")
            for page_num in range(doc.page_count)
        )

    # Extract scores from the text
    bps_scores = extract_bps_scores(extracted_text)

    # Extract the assessment date/birthdate
    assessment_date = extract_assessment_date(extracted_text)
    birthdate = extract_birthdate(extracted_text)

    # Calculate the age if both dates are available
    age = None
    if assessment_date and birthdate:
        try:
            age = calculate_age(assessment_date, birthdate)
        except ValueError:
            # The regexes accept any digits, so an impossible date such as
            # 13/45/2020 can get this far; keep the row and leave age empty
            # instead of aborting the whole batch.
            logging.warning(f"Could not compute age for {filename}: unparseable date")

    # Combine the extracted data into a single dictionary
    result = {
        "group_identifier": group_identifier,
        "assmt_dt": assessment_date,
        "birthdate": birthdate,
        "age": age,
        "ext_motivation": ext_motivation,
        "int_motivation": int_motivation,
        "num_prev_treatments": extract_num_prev_treatments(extracted_text),
        "drugs_of_choice": extract_drugs_of_choice(extracted_text),
        "drug_craving_score": extract_drug_craving_score(extracted_text),
    }
    result.update(bps_scores)  # Add the scores to the result

    # Append the result to the data list
    data.append(result)

In [14]:
# Build a DataFrame from the accumulated result dictionaries (one row per
# dictionary in `data`; the dictionary keys become the columns).
df = pd.DataFrame(data)

In [15]:
pwd

'/Users/ethanpanal/Documents/UP/Capstone'

In [16]:
# Write the extracted table to a CSV file in the current working directory.
# The DataFrame index is written as well, since to_csv is called with defaults.
# NOTE(review): the filename spells "anonimized" (presumably "anonymized");
# renaming it would affect anything that reads this file, so it is left as is.
# NOTE(review): each row built above carries a "birthdate" value — confirm that
# exporting it is acceptable for a file described as anonymized.
df.to_csv('bps_anonimized.csv')