In [1]:
import fitz
import re
import os
import PyPDF2
import pandas as pd
import pdfplumber
from datetime import datetime


In [2]:
# Define paths
input_folder = "./bps/raw_bps_pdf"
output_folder = "./bps/bps_redacted"

In [3]:
data = []

In [4]:
# Iterate through all files in the folder
for filename in os.listdir(input_folder):
    if filename.endswith(".pdf"):
        group_identifier = filename.split('_')[1].replace('.pdf', '')
        pdf_path = os.path.join(input_folder, filename)

        # Extract full text for score extraction (with PyMuPDF)
        doc = fitz.open(pdf_path)
        extracted_text = ""
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            extracted_text += page.get_text("text")

        # Initialize placeholders
        found_substance_section = False
        substance_tables = []
        
        # Search for section and extract tables using pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                text = page.extract_text()
                
                # If section title is found
                if not found_substance_section and "IV. SUBSTANCE USE HISTORY & ASSESSMENT" in text:
                    found_substance_section = True
        
                # Once we've found the section, extract tables
                if found_substance_section:
                    table = page.extract_table()
                    if table:
                        substance_tables.append(table)
                    
                    # Optional: break if you detect the next section heading
                    if "V." in text:  # crude way to stop at next main section
                        break

        # Flatten if multiple tables were found
        flat_table = []
        for table in substance_tables:
            flat_table.extend(table)
        # Combine extracted data
        result = {
            "group_identifier": group_identifier,
            "found_substance_section": found_substance_section,
            "substance_table": flat_table if flat_table else None,
        }

        data.append(result)

In [5]:
df = pd.DataFrame(data)

In [6]:
# Initialize a list to store the flattened data
flattened_data = []

In [7]:
# Iterate through all the rows in the DataFrame
for index, row in df.iterrows():
    group_identifier = row['group_identifier']
    substance_table = row['substance_table']
    
    # The header is the first row of the table
    if substance_table:
        header = substance_table[0]  # Get the header
        
        # Iterate through the substances in the table (skip the header)
        for substance_row in substance_table[1:]:
            # Create a dictionary for each substance row with the corresponding group_identifier
            substance_data = {
                "group_identifier": group_identifier,
                "substance": substance_row[0],  # First column is the substance name
                "first_used": substance_row[1],
                "last_used": substance_row[2],
                "frequency_duration": substance_row[3],
                "amount": substance_row[4],
                "method": substance_row[5],
                "pattern_of_use": substance_row[6]
            }
            
            # Add this substance data to the flattened list
            flattened_data.append(substance_data)

# df of flattened data
flattened_df = pd.DataFrame(flattened_data)

In [8]:
# 'use_flag' based on whether 'first_used' or 'last_used' is not blank
flattened_df['use_flag'] = (
    ((flattened_df['first_used'].notna() & (flattened_df['first_used'] != 'NA') & (flattened_df['first_used'].str.strip() != '')) |
     (flattened_df['last_used'].notna() & (flattened_df['last_used'] != 'NA') & (flattened_df['last_used'].str.strip() != '')))
    .astype(int)
)

In [9]:
sparse_df = flattened_df[['group_identifier','substance','use_flag','pattern_of_use']]

In [10]:
sparse_df['pattern_of_use'].value_counts()

pattern_of_use
                              393
continued                      18
experimental                   13
episodic                       12
binge                          10
daily                           8
socially                        5
social                          3
na                              2
continual                       2
Episodic/binge                  1
Continued                       1
Episodic or binge               1
prescribed prn                  1
recreationally                  1
for surgery                     1
an episode                      1
binge, continued                1
mentally to reduce anxiety      1
mental and emotional            1
once in a while                 1
N/A                             1
Binges                          1
ocationally                     1
trail                           1
Binge                           1
recreational                    1
contunued                       1
NA                              1

In [11]:
# mapping of patterns to consolidated categories
pattern_mapping = {
    'continued': 'Continued',
    'Continued': 'Continued',
    'contunued': 'Continued',
    'Binge, continued': 'Binge/Continued',
    'Binge episodes': 'Binge/Episodic',
    'binge/episodic': 'Binge/Episodic',
    'episodic': 'Binge/Episodic',
    'Episodic/binge': 'Binge/Episodic',
    'Episodic or binge': 'Binge/Episodic',
    'Binge': 'Binge/Episodic',
    'Binges': 'Binge/Episodic',
    'experimental': 'Experimental',
    'social': 'Experimental',
    'socially': 'Experimental',
    'recreational': 'Experimental',
    'recreationally': 'Experimental',
    'socially': 'Experimental',
    'daily': 'Daily',
    'na': 'NA',
    'NA': 'NA',
    'N/A': 'NA',
    'prescribed prn': 'Prescribed',
    'as prescribed for sleep': 'Prescribed',
    'for surgery': 'Prescribed',
    'once in a while': 'Experimental',
    'mental and emotional': 'Prescribed',
    'ocationally': 'Occasionally',  # typo
    'trail': 'Experimental'  # typo
}

In [12]:
# mapping to the 'pattern_of_use' column
sparse_df = sparse_df.copy()

# Now you can safely apply the mapping without the warning
sparse_df.loc[:, 'pattern_of_use_consolidated'] = sparse_df['pattern_of_use'].map(pattern_mapping).fillna(sparse_df['pattern_of_use'])

In [13]:
sparse_df.to_csv('patient_substance_history.csv')