# Imports

In [1]:
# prompt: Import libraries and open CSV

import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt
import random

# Directory that holds the input CSV files. File names are appended to this string by
# plain concatenation, so it must end with a path separator.
# NOTE(review): machine-specific absolute path - swap the active line per workstation.
my_computer_fpath = "C:\\Users\\dfber\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"
# my_computer_fpath = "C:\\Users\\User\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"

In [None]:
file_path = my_computer_fpath + "e26f9ccc-68a4-42b4-9d0d-508a83026a1c.csv"

raw_df = pd.read_csv(file_path)
raw_df.head()

# Initialize dataframe

In [3]:
df = raw_df.copy()

In [None]:
pd.set_option("display.max_columns", None)
df.head()

# Initial Data Cleaning

## Explode |-separated notes

In [5]:
anes_procedure_cols = ['anes_procedure_type_2253', 'anes_procedure_start_dts_2254', 'anes_procedure_anesthesiologist_2255', 'anes_procedure_resident_2256', 'anes_procedure_pt_position_2257', 'anes_procedure_approach_2258', 'anes_procedure_location_2259', 'anes_procedure_note_id_2260', 'anes_procedure_dos_dts_2261', 'anes_procedure_dpe_2262', 'anes_procedure_epidural_needle_2263', 'anes_procedure_epidural_needle_gauge_2264', 'anes_procedure_lor_depth_2265', 'anes_procedure_catheter_depth_2266', 'anes_procedure_spinal_needle_type_2267', 'anes_procedure_spinal_needle_gauge_2268', 'anes_procedure_spinal_needle_length_2269', 'anes_procedure_paresthesias_2270', 'anes_procedure_note_text_2271','anes_procedure_encounter_id_2273']


In [None]:
# Expand the items in anes_procedure_cols separated by "|" into a separate row
# Requires that within a row, each element in these columns has the same number of |-separated values

# Split the columns with '|' delimiter
# regex=False splits on the literal pipe character; the previous pattern '\|' relied on an
# invalid string escape sequence, which newer Python versions warn about.
for col in anes_procedure_cols:
    df[col] = df[col].str.split('|', regex=False)

# Explode the DataFrame (every listed column is exploded together, so within a row
# they must all hold the same number of |-separated values)
df = df.explode(anes_procedure_cols)

# Reset the index after exploding the DataFrame so each individual note will be its own unique row and index
df = df.reset_index(drop=True)
df[['id','anes_procedure_type_2253']].head(20)

Note that other procedure types, including Blood Patch but also A-lines, nerve blocks, and POCUS orders, are currently parsed by Merlin to NaN

In [None]:
df['anes_procedure_type_2253'].value_counts(dropna=False)

## Bring in RAW info

This is needed at the moment to get the NotePurposeDSC (to help eliminate near-duplicate notes)
and also to RegEx the Number of Attempts

In [8]:
# Read NoteID as text: it is used as a lookup key against anes_procedure_note_id_2260,
# whose values are strings (they come from str.split). Leaving the dtype to inference
# could produce integer keys that never match.
raw_identified_data = pd.read_csv(
    my_computer_fpath + "Full Identified raw anesthesia_procedure_notes.csv",
    dtype={'NoteID': str},
)

### NotePurposeDSC

In [9]:
note_to_purpose = raw_identified_data.set_index('NoteID')['NotePurposeDSC'].to_dict()
df['NotePurposeDSC'] = df['anes_procedure_note_id_2260'].map(note_to_purpose)

In [None]:
df.shape

In [11]:
df = df.dropna(subset=['NotePurposeDSC'])

In [None]:
df.shape

### NoteTXT

In [13]:
note_to_text = raw_identified_data.set_index('NoteID')['NoteTXT'].to_dict()
df['NoteTXT'] = df['anes_procedure_note_id_2260'].map(note_to_text)

In [14]:
def get_numeric_structured_info_from_full_note(regex, note_text):
  """Extract one numeric value from free-text note content using a regular expression.

  Args:
    regex: Pattern with a single capture group that captures the number.
    note_text: Full text of the note. Anything that is not a string
      (e.g. NaN or None for a note without text) yields NaN.

  Returns:
    The captured value as a float when the pattern matches once, or matches
    several times with identical captured text. np.nan when there is no match
    or when the matches disagree.
  """
  # Missing note text is not a string; re.findall would raise TypeError on it
  if not isinstance(note_text, str):
    return np.nan

  matches = re.findall(regex, note_text)
  if not matches:
    return np.nan

  # Several matches are only trusted when they all captured the same text
  if len(set(matches)) > 1:
    return np.nan

  return float(matches[0])



def get_number_of_neuraxial_attempts(note_text):
  """Return the value following 'Number of attempts:' in a procedure note.

  Delegates to get_numeric_structured_info_from_full_note.
  Note: the CSE template does not include this info.
  """
  attempts_pattern = 'Number of attempts: (\\d+)'
  attempts = get_numeric_structured_info_from_full_note(attempts_pattern, note_text)
  return attempts

In [15]:
df['number_of_neuraxial_attempts'] = df['NoteTXT'].apply(get_number_of_neuraxial_attempts)

## Handle datetime issues

Bug: Merlin is bringing anes_procedure_dos_dts_2261 as Eastern times when in fact they are UTC. I resolve this by editing the raw strings before conversion to datetime objects.

Bug: The same is true for delivery_time. I resolve this by editing the raw strings before conversion to datetime objects.

Bug: Because delivery_date is stored separately from delivery_time, if the UTC change causes the time to go to the next day, this is NOT reflected in the updated delivery_date.

Bug: Merlin ignores AM/PM in anes_procedure_start_dts_2254 and assumes all entries are AM. I resolve this (for now) by ignoring these written start times and just using dos_dts

In [None]:
df['delivery_time'].head()

In [17]:
df['delivery_time_stripped'] = df['delivery_time'].str.replace(r'[+-]\d{2}:*\d{2}$', '+0000', regex=True)

In [None]:
df['delivery_time_stripped'].head()

In [19]:
df['delivery_datetime_unadjusted'] = pd.to_datetime(df['delivery_date'] + ' ' + df['delivery_time_stripped'],utc=True)

In [None]:
df['delivery_datetime_unadjusted'].iloc[1].tz_convert('US/Eastern')

In [21]:
# Function to adjust by one day if time is before 0400 (DST) or 0500 (ST)
def adjust_time_based_on_dst(timestamp):
    """Add one day to a UTC timestamp whose time of day is before the Eastern-midnight cutoff.

    The cutoff is 04:00 when the timestamp falls in US/Eastern daylight saving
    time and 05:00 otherwise (midnight Eastern expressed in UTC).

    Args:
        timestamp: Timezone-aware pandas Timestamp, or NaT.

    Returns:
        The timestamp shifted forward 24 hours when its time of day is earlier
        than the cutoff, otherwise the timestamp unchanged. NaT is returned as is.
    """
    # NaT (missing delivery date/time) does not support .dst() or .time(); pass it through
    if pd.isna(timestamp):
        return timestamp

    # Adjust cutoff time based on DST or standard time
    in_dst = timestamp.tz_convert('US/Eastern').dst() != pd.Timedelta(0)
    cutoff_time = pd.Timestamp('04:00:00').time() if in_dst else pd.Timestamp('05:00:00').time()

    # Add 24 hours if the time is earlier than the cutoff
    if timestamp.time() < cutoff_time:
        return timestamp + pd.Timedelta(hours=24)
    return timestamp

In [22]:
df['delivery_datetime'] = df['delivery_datetime_unadjusted'].apply(adjust_time_based_on_dst)

In [None]:
df[['delivery_datetime', 'delivery_datetime_unadjusted']].head()

In [24]:
df['maternal_dob'] = pd.to_datetime(df['maternal_dob_2043'],utc=True)

In [None]:
df['anes_procedure_dos_dts_2261'].head()

In [26]:
df['dos_dts_tz_stripped'] = df['anes_procedure_dos_dts_2261'].str.replace(r'[+-]\d{2}:*\d{2}$', '+0000', regex=True)

In [None]:
df['dos_dts_tz_stripped'].head()

In [28]:
# utc=True guarantees a timezone-aware UTC column (as for the other parsed datetime columns),
# even if some value did not have its offset rewritten to +0000 by the regex above
df['dos_dts'] = pd.to_datetime(df['dos_dts_tz_stripped'], utc=True)

In [None]:
df[['dos_dts','anes_procedure_dos_dts_2261']].head()

In [30]:
df['start_dts'] = pd.to_datetime(df['anes_procedure_start_dts_2254'],format='mixed',utc=True)

In [None]:
# Extract the time part of the 'start_dts' column to check whether it covers all 24 h or only 12 h due to AM/PM bug
df[df['start_dts'].notna()]['start_dts'].dt.time.sort_values()


In [32]:
# This code has been changed to avoid the AM/PM bug

# df['best_timestamp'] = df['start_dts'].fillna(df['dos_dts'])
df['best_timestamp'] = df['dos_dts']

In [None]:
df['best_timestamp'].head()

## Handle near-duplicate notes

There is also a column "NotePurposeDSC" in the raw EDW data that can be "ADDENDUM" or "NORMAL" or blank. When there are duplicate notes, the first one will be blank and subsequent ones will be ADDENDUM. I use this fact upstream and delete all the ones that are blank.

IMPORTANT: It turns out to be the case that there are sometimes, genuinely in Epic, two procedures done within only a few mins of each other - and these represent a repeat procedure, not a duplicate note. Therefore I now only call notes duplicates if their timestamps EXACTLY match (minute_offset == 0)

In [None]:
# test behavior on a known double-note
df.loc[df['anes_procedure_note_id_2260'] == '1188076153']

In [35]:
# test behavior on a known near-duplicate note
df[df['anes_procedure_note_id_2260'] == '2250605132']
known_near_duplicate_encounter_id = df[df['anes_procedure_note_id_2260'] == '2250605132']['anes_procedure_encounter_id_2273'].iloc[0]


In [None]:
known_near_duplicate_group = df.groupby('anes_procedure_encounter_id_2273').get_group(known_near_duplicate_encounter_id)
known_near_duplicate_group

In [37]:
# prompt: add 'best_timestamp', 'dos_dts', and 'start_dts' to anes_procedure_cols

# Add only the names that are not already present, so re-running this cell
# does not put duplicate column names into the list
anes_procedure_cols.extend(
    [col for col in ['best_timestamp', 'dos_dts', 'start_dts'] if col not in anes_procedure_cols]
)

In [38]:
# need to narrow operations to a smaller group of columns for efficiency

# .copy() makes this an independent frame: later cells add columns to it, and assigning
# into a plain column selection of df triggers pandas' SettingWithCopyWarning
df_anes_procedure_cols = df[anes_procedure_cols].copy()

In [39]:
# Functions to label near_duplicate procedures

# Compare two rows and return True if their timestamps are within minute_offset
# and their compare_cols match
def check_if_near_duplicate(row1, row2, compare_cols, minute_offset):
  """Decide whether two notes are near-duplicates of each other.

  Args:
    row1, row2: Mappings (e.g. DataFrame rows) containing 'best_timestamp'
      and every column named in compare_cols.
    compare_cols: Columns whose values must agree. A column is skipped when
      either row has a null value in it.
    minute_offset: Largest allowed gap, in minutes, between the two timestamps.

  Returns:
    True when no compared column disagrees and both timestamps are present
    and at most minute_offset minutes apart; False otherwise.
  """
  for col in compare_cols:
    if not pd.isnull(row1[col]) and not pd.isnull(row2[col]):
      if row1[col] != row2[col]:
        return False

  # A missing timestamp cannot be "within" any offset. Without this guard the
  # NaT difference compares False against the threshold and the pair would be
  # reported as a duplicate.
  if pd.isnull(row1['best_timestamp']) or pd.isnull(row2['best_timestamp']):
    return False

  time_gap = abs(row1['best_timestamp'] - row2['best_timestamp'])
  return bool(time_gap <= pd.Timedelta(minutes=minute_offset))


# Label near_duplicate notes within an encounter using the check_if_near_duplicate function
def label_near_duplicate_notes(encounter):
  """Mark each note of one encounter that has a near-duplicate within that encounter.

  Every row is compared with every other row via check_if_near_duplicate
  (same procedure type, minute_offset of 0). For each row this writes:
    - 'has_near_duplicate': 1 if at least one other row matched, else 0
    - 'near_duplicate_note_ids': string form of the sorted list holding the
      row's own note ID plus the IDs of all matching rows

  The encounter frame is modified in place and returned.
  """
  note_id_col = 'anes_procedure_note_id_2260'
  row_labels = encounter.index.tolist()

  for base_pos, base_label in enumerate(row_labels):
    base_note = encounter.loc[base_label]
    matched_ids = []

    for other_pos, other_label in enumerate(row_labels):
      if other_pos == base_pos:
        continue # a note is never its own duplicate
      other_note = encounter.loc[other_label]
      if check_if_near_duplicate(base_note, other_note, ['anes_procedure_type_2253'], minute_offset = 0):
        matched_ids.append(other_note[note_id_col])

    group_ids = [base_note[note_id_col]] + matched_ids
    encounter.at[base_label, 'has_near_duplicate'] = 1 if matched_ids else 0
    encounter.at[base_label, 'near_duplicate_note_ids'] = str(sorted(group_ids))

  return encounter


In [None]:
# Label near_duplicate procedures
# Takes ~2 mins

df_anes_procedure_cols['has_near_duplicate'] = 0
df_anes_procedure_cols['near_duplicate_note_ids'] = None
df_anes_procedure_cols = df_anes_procedure_cols.groupby('anes_procedure_encounter_id_2273').apply(label_near_duplicate_notes, include_groups = False)
df_anes_procedure_cols = df_anes_procedure_cols.reset_index('anes_procedure_encounter_id_2273')

In [41]:
# prompt: sort df_anes_procedure_cols by index

df_anes_procedure_cols = df_anes_procedure_cols.sort_index()

In [42]:
# Count blank columns
df_anes_procedure_cols['blank_anes_procedure_element_col_counts'] = df_anes_procedure_cols[anes_procedure_cols].isnull().sum(axis=1)

In [43]:
# Within a group of duplicates, label the one with the fewest blank columns as NOT the worse duplicate (i.e., the best)
# Takes ~2 mins
def label_worse_near_duplicates(near_duplicate_set):
  """Clear the 'is_worse_near_duplicate' flag on the row with the fewest blank columns.

  The row is chosen by idxmin over 'blank_anes_procedure_element_col_counts'
  (the first such row on ties). The frame is modified in place and returned.
  """
  blank_counts = near_duplicate_set['blank_anes_procedure_element_col_counts']
  best_row_label = blank_counts.idxmin()
  near_duplicate_set.loc[best_row_label, 'is_worse_near_duplicate'] = 0
  return near_duplicate_set

df_anes_procedure_cols['is_worse_near_duplicate'] = df_anes_procedure_cols['has_near_duplicate']
df_anes_procedure_cols = df_anes_procedure_cols.groupby('near_duplicate_note_ids').apply(label_worse_near_duplicates, include_groups = False)
df_anes_procedure_cols = df_anes_procedure_cols.reset_index('near_duplicate_note_ids')

In [None]:
known_near_duplicate_group = df_anes_procedure_cols.groupby('anes_procedure_encounter_id_2273').get_group(known_near_duplicate_encounter_id)
known_near_duplicate_group

In [None]:
# test behavior on a known double-note
known_double_note = df_anes_procedure_cols.loc[df_anes_procedure_cols['anes_procedure_note_id_2260'] == '1188076153']
known_double_note

In [None]:
df_anes_procedure_cols['is_worse_near_duplicate'].value_counts(dropna=False)

In [None]:
df_anes_procedure_cols.shape

In [48]:
# Remove worse duplicates
df_anes_procedure_cols = df_anes_procedure_cols[df_anes_procedure_cols['is_worse_near_duplicate']==0]

In [None]:
df_anes_procedure_cols.shape

## Address cases where an epidural note followed by a spinal note is actually a planned CSE, not a failed catheter. Also address what 'epidural/intrathecal' really means.

Secret CSEs are spinal and epidural within 5 mins

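Epidural/intrathecal notes are declared epidural unless they are part of a secret CSE (in which case they become CSE). Reclassifying them as intrathecal catheters (e.g., by CSF aspiration) is not yet implemented.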

In [50]:
# Functions to label secret_CSE procedures

# Compare two rows and return True if exactly one is an epidural, exactly one is a spinal,
# and if their timestamps are within minute_offset
def check_if_secret_CSE(row1, row2, minute_offset):
  if abs(row1['best_timestamp'] - row2['best_timestamp']) < pd.Timedelta(minutes=minute_offset):
    if row1['anes_procedure_type_2253'] == 'epidural/intrathecal' or row1['anes_procedure_type_2253'] == 'epidural':
      if row2['anes_procedure_type_2253'] == 'spinal':
        return True
    if row2['anes_procedure_type_2253'] == 'epidural/intrathecal' or row2['anes_procedure_type_2253'] == 'epidural':
      if row1['anes_procedure_type_2253'] == 'spinal':
        return True
  return False

# Label secret_CSE notes within an encounter using the check_if_secret_CSE function
def label_secret_CSE_notes(encounter):
  """Mark each note of one encounter that pairs with another note as a secret CSE.

  Every row is compared with every other row via check_if_secret_CSE
  (minute_offset of 5). For each row this writes:
    - 'is_secret_CSE': 1 if at least one other row paired with it, else 0
    - 'secret_CSE_note_ids': string form of the sorted list holding the row's
      own note ID plus the IDs of all paired rows

  The encounter frame is modified in place and returned.
  """
  note_id_col = 'anes_procedure_note_id_2260'
  row_labels = encounter.index.tolist()

  for base_pos, base_label in enumerate(row_labels):
    base_note = encounter.loc[base_label]
    paired_ids = []

    for other_pos, other_label in enumerate(row_labels):
      if other_pos == base_pos:
        continue # never pair a note with itself
      other_note = encounter.loc[other_label]
      if check_if_secret_CSE(base_note, other_note, minute_offset = 5):
        paired_ids.append(other_note[note_id_col])

    group_ids = [base_note[note_id_col]] + paired_ids
    encounter.at[base_label, 'is_secret_CSE'] = 1 if paired_ids else 0
    encounter.at[base_label, 'secret_CSE_note_ids'] = str(sorted(group_ids))

  return encounter


In [51]:
# Label secret_CSE procedures
# Takes ~2 mins

df_anes_procedure_cols['is_secret_CSE'] = 0
df_anes_procedure_cols['secret_CSE_note_ids'] = None
df_anes_procedure_cols = df_anes_procedure_cols.groupby('anes_procedure_encounter_id_2273').apply(label_secret_CSE_notes, include_groups = False)
df_anes_procedure_cols = df_anes_procedure_cols.reset_index('anes_procedure_encounter_id_2273')

In [None]:
df_anes_procedure_cols['is_secret_CSE'].value_counts()

In [None]:
df_anes_procedure_cols[df_anes_procedure_cols['is_secret_CSE'] == 1].head(10)

In [54]:
# Eliminate the separately-documented spinals that are really part of CSEs

# Delete rows where procedure_type is spinal and is_secret_CSE is true
df_anes_procedure_cols = df_anes_procedure_cols[~((df_anes_procedure_cols['anes_procedure_type_2253'] == 'spinal') & (df_anes_procedure_cols['is_secret_CSE'] == 1))]

In [55]:
# Label true intrathecal catheters
# NOTE: DOES NOT YET RECLASSIFY EPIDURAL/INTRATHECALS BY CSF ASPIRATION OR ANY OTHER METHOD

df_anes_procedure_cols['is_intrathecal_catheter'] = (df_anes_procedure_cols['anes_procedure_type_2253'] == 'intrathecal').astype(int)

In [56]:
# prompt: label true_procedure_type by reclassifying based on is_secret_CSE and is_intrathecal_catheter

# Create the 'true_procedure_type' column based on the conditions
# Step 1: rows flagged is_secret_CSE become 'cse'; every other row keeps its
# documented anes_procedure_type_2253.
df_anes_procedure_cols['true_procedure_type'] = np.where(
    df_anes_procedure_cols['is_secret_CSE'] == True,'cse',
    df_anes_procedure_cols['anes_procedure_type_2253'])

# Update 'true_procedure_type' based on 'is_intrathecal_catheter'
# Step 2 (must run after step 1, since it reads true_procedure_type): rows still typed
# 'epidural/intrathecal' or 'intrathecal' that are flagged as intrathecal catheters
# are labelled 'intrathecal'.
df_anes_procedure_cols.loc[
    (df_anes_procedure_cols['true_procedure_type'].isin(['epidural/intrathecal', 'intrathecal'])) &
    (df_anes_procedure_cols['is_intrathecal_catheter'] == True),
    'true_procedure_type'] = 'intrathecal'

# Step 3: remaining 'epidural/intrathecal' rows that are not flagged as intrathecal
# catheters are labelled 'epidural'.
df_anes_procedure_cols.loc[
    (df_anes_procedure_cols['true_procedure_type'] == 'epidural/intrathecal') &
    (df_anes_procedure_cols['is_intrathecal_catheter'] == False),
    'true_procedure_type'] = 'epidural'

In [None]:
df['anes_procedure_type_2253'].value_counts()

In [None]:
df_anes_procedure_cols['true_procedure_type'].value_counts()

In [None]:
df_anes_procedure_cols.head()

# Classify failures

In [60]:
df_anes_procedure_cols['is_neuraxial_catheter'] = (df_anes_procedure_cols['true_procedure_type'].isin(['cse', 'epidural', 'intrathecal'])).astype(int)
df_anes_procedure_cols['is_spinal'] = (df_anes_procedure_cols['true_procedure_type'] == 'spinal').astype(int)
df_anes_procedure_cols['is_airway'] = (df_anes_procedure_cols['true_procedure_type'] == 'airway').astype(int)

In [61]:
# Classify each neuraxial catheter as a success or failure, one encounter at a time (groupby-apply with a per-catheter loop; not truly vectorized)
# takes ~10 mins

def classify_encounter_failures(encounter):
    """Flag neuraxial catheters in one encounter that were followed by a failure-defining event.

    Failure-defining events are rows where any of 'is_neuraxial_catheter',
    'is_spinal' or 'is_airway' is set. For every neuraxial catheter row that has
    at least one such event with a strictly later 'best_timestamp', this writes:
      - 'has_subsequent_neuraxial_catheter', 'has_subsequent_spinal',
        'has_subsequent_airway': 1 if a later event of that kind exists, else 0
      - 'failed_catheter': 1 if any of the three flags is 1
      - 'subsequent_proof_of_failure_note_id': string form of the list of note
        IDs of the later events

    Rows without a later event keep whatever values those columns already hold,
    so the caller is expected to initialise them beforehand.

    Args:
        encounter: DataFrame holding the rows of a single encounter.

    Returns:
        The same DataFrame, modified in place.
    """
    # Identify rows where 'is_neuraxial_catheter' == 1
    neuraxial_rows = encounter[encounter['is_neuraxial_catheter'] == 1]

    # If no neuraxial catheter procedures, return encounter as is
    if neuraxial_rows.empty:
        return encounter

    # Failure-defining events are neuraxial catheters, spinals, and airways
    failure_defining_event_mask = encounter[['is_neuraxial_catheter','is_spinal','is_airway']].any(axis=1)
    failure_defining_event_indices = encounter.index[failure_defining_event_mask]

    # Selected once, outside the loop: the loop only writes result columns,
    # none of which are read from this frame
    failure_defining_events = encounter.loc[failure_defining_event_indices]

    for idx in neuraxial_rows.index:
        current_time = encounter.at[idx, 'best_timestamp']

        # Later events are found by comparing timestamps, so row order does not matter
        subsequent_failure_defining_events = failure_defining_events[
            failure_defining_events['best_timestamp'] > current_time
        ]

        # Nothing happened afterwards: leave the pre-initialised values untouched
        if subsequent_failure_defining_events.empty:
            continue

        has_subsequent_neuraxial_catheter = int((subsequent_failure_defining_events['is_neuraxial_catheter'] == 1).any())
        has_subsequent_spinal = int((subsequent_failure_defining_events['is_spinal'] == 1).any())
        has_subsequent_airway = int((subsequent_failure_defining_events['is_airway'] == 1).any())
        failed_catheter = int(has_subsequent_neuraxial_catheter or has_subsequent_spinal or has_subsequent_airway)
        subsequent_proof_of_failure_note_id = subsequent_failure_defining_events['anes_procedure_note_id_2260'].tolist()

        encounter.at[idx, 'has_subsequent_neuraxial_catheter'] = has_subsequent_neuraxial_catheter
        encounter.at[idx, 'has_subsequent_spinal'] = has_subsequent_spinal
        encounter.at[idx, 'has_subsequent_airway'] = has_subsequent_airway
        encounter.at[idx, 'failed_catheter'] = failed_catheter
        encounter.at[idx, 'subsequent_proof_of_failure_note_id'] = str(subsequent_proof_of_failure_note_id)

    return encounter

df_anes_procedure_cols['has_subsequent_neuraxial_catheter'] = 0
df_anes_procedure_cols['has_subsequent_spinal'] = 0
df_anes_procedure_cols['has_subsequent_airway'] = 0
df_anes_procedure_cols['failed_catheter'] = 0
df_anes_procedure_cols['subsequent_proof_of_failure_note_id'] = None

df_anes_procedure_cols = df_anes_procedure_cols.groupby('anes_procedure_encounter_id_2273').apply(classify_encounter_failures, include_groups = False)
df_anes_procedure_cols = df_anes_procedure_cols.reset_index('anes_procedure_encounter_id_2273')

In [None]:
df_anes_procedure_cols.head(10)

# Rebuild df by merging with df_anes_procedure_cols

Note that rows have been eliminated from df_anes_procedure_cols in two steps: first the rows flagged is_worse_near_duplicate, then the spinal rows flagged is_secret_CSE (only the spinal half of each is_secret_CSE pair is removed, since the epidural half becomes the CSE).

In [None]:
df.shape

In [None]:
df_anes_procedure_cols.shape

In [65]:
# prompt: concatenate new columns from df_anes_procedure_cols into df. only bring the new columns, leave behind the matching ones. Select the new columns via code.

# Identify new columns in df_anes_procedure_cols that are not in df
new_cols = [col for col in df_anes_procedure_cols.columns if col not in df.columns]

# Merge df with df_anes_procedure_cols on the index, only keeping rows that exist in both
df = pd.merge(df, df_anes_procedure_cols[new_cols], left_index=True, right_index=True, how='inner')

In [None]:
df.shape

In [67]:
df['is_neuraxial_catheter'] = df['is_neuraxial_catheter'] == 1
df['failed_catheter'] = df['failed_catheter'] == 1

In [68]:
# Reorder columns to move 'is_neuraxial_catheter' and 'failed_catheter' to the front
cols = ['is_neuraxial_catheter', 'failed_catheter'] + [col for col in df.columns if col not in ['is_neuraxial_catheter', 'failed_catheter']]
df = df[cols]

In [None]:
df.head()

In [None]:
# test behavior on a known double-note
known_double_note = df.loc[df['anes_procedure_note_id_2260'] == '1188076153']
known_double_note

In [None]:
df[df['failed_catheter'] == 1].head(10)

In [72]:
known_failed_catheter_encounter_ids = ['3259099621','3081317750', '3081399139', '3081675427', '3081686082', '3081711691', '3081729928', '3081884584', '3081893356', '3082275619', '3082349091']

In [None]:
df[df['anes_procedure_encounter_id_2273'].isin(known_failed_catheter_encounter_ids)]

# Additional Data Cleaning and Feature Engineering

## Count prior failed neuraxials in this encounter and failed and total across all encounters

Takes ~4 mins for each

In [74]:
df = df.sort_values(by='best_timestamp', ascending=True)

In [75]:
# These are separate functions so that they can be applied to the DataFrame in a vectorized manner without needing to take the new_column_name as an argument,
# which doesn't work well with the .apply() method

def count_prior_failed_catheters_this_enc(group):
    """Add 'prior_failed_catheters_this_enc': failed catheters in the rows above each row.

    Computed as the running total of 'failed_catheter' minus the row's own
    value, stored as float. The group is modified in place and returned.
    """
    failed = group['failed_catheter']
    running_total = failed.cumsum()
    group['prior_failed_catheters_this_enc'] = (running_total - failed).astype(float)
    return group

def count_prior_failed_catheters_all_enc(group):
    """Add 'prior_failed_catheters_all_enc': failed catheters in the rows above each row.

    Computed as the running total of 'failed_catheter' minus the row's own
    value, stored as float. The group is modified in place and returned.
    """
    failed = group['failed_catheter']
    running_total = failed.cumsum()
    group['prior_failed_catheters_all_enc'] = (running_total - failed).astype(float)
    return group

def count_prior_all_catheters_all_enc(group):
    """Add 'prior_all_catheters_all_enc': neuraxial catheters in the rows above each row.

    Computed as the running total of 'is_neuraxial_catheter' minus the row's
    own value, stored as float. The group is modified in place and returned.
    """
    is_catheter = group['is_neuraxial_catheter']
    running_total = is_catheter.cumsum()
    group['prior_all_catheters_all_enc'] = (running_total - is_catheter).astype(float)
    return group

In [76]:
df = df.groupby('anes_procedure_encounter_id_2273').apply(count_prior_failed_catheters_this_enc, include_groups = False)
df = df.reset_index('anes_procedure_encounter_id_2273')

In [77]:
# The per-encounter groupby-apply before this cell returns rows grouped by encounter ID,
# not by time. Restore chronological order first: the running count is only a count of
# *prior* catheters if each patient's rows are ordered by best_timestamp.
# kind='stable' keeps rows with identical timestamps in their existing relative order.
df = df.sort_values(by='best_timestamp', ascending=True, kind='stable')
df = df.groupby('epic_pmrn').apply(count_prior_failed_catheters_all_enc, include_groups = False)
df = df.reset_index('epic_pmrn')

In [78]:
# A preceding groupby-apply leaves rows grouped by its key rather than by time, so sort
# chronologically again: the running count is only a count of *prior* catheters if each
# patient's rows are ordered by best_timestamp.
# kind='stable' keeps rows with identical timestamps in their existing relative order.
df = df.sort_values(by='best_timestamp', ascending=True, kind='stable')
df = df.groupby('epic_pmrn').apply(count_prior_all_catheters_all_enc, include_groups = False)
df = df.reset_index('epic_pmrn')

In [79]:
df['prior_failed_catheters_prev_enc'] = df['prior_failed_catheters_all_enc'] - df['prior_failed_catheters_this_enc']

In [None]:
df[df['anes_procedure_encounter_id_2273'] == '3258959083']

In [None]:
df[df['anes_procedure_encounter_id_2273'] == '3147096491']

In [None]:
df[df['anes_procedure_encounter_id_2273'] == '3227352323']


## Handle timeseries data (e.g., pain scores)

In [None]:
# Extracts the pain scores prior to the timestamp
# Takes ~ 1 minute
def get_pain_scores_prior_to_timestamp(row, best_timestamp_col="best_timestamp"):
    """
    Extract all pain scores that have timestamp < row[best_timestamp_col].

    row: a single row of your DataFrame (a pd.Series or other mapping) holding
         the |-separated columns 'timeseries_intrapartum_pain_score_datetime_2242'
         and 'timeseries_intrapartum_pain_score_2242'
    best_timestamp_col: name of the column in your DataFrame that contains
                       the 'best_timestamp' to compare against

    Returns a list of 'prior' scores (floats), or NaN if either column is
    missing, the timestamps cannot be parsed, or no score qualifies.
    Scores that are blank or not numeric are skipped.
    """
    # Extract the raw strings
    times_str = row["timeseries_intrapartum_pain_score_datetime_2242"]
    scores_str = row["timeseries_intrapartum_pain_score_2242"]

    # If either is missing, return NaN
    if pd.isna(times_str) or pd.isna(scores_str):
        return np.nan

    # Convert to lists
    times_list = times_str.split("|")
    scores_list = scores_str.split("|")

    # Parse both sides as UTC so that they are always comparable; mixing
    # timezone-naive and timezone-aware values would raise on comparison.
    # NOTE(review): values without an offset are assumed to be UTC - confirm against the export.
    try:
        times_dt = pd.to_datetime(times_list, utc=True)
        best_dt = pd.to_datetime(row[best_timestamp_col], utc=True)
    except (ValueError, TypeError, OverflowError):
        # If conversion fails, return NaN
        return np.nan

    # Keep the scores whose timestamp is strictly earlier than best_timestamp
    prior_scores = []
    for t, s in zip(times_dt, scores_list):
        if t < best_dt:
            try:
                score = float(s)
            except ValueError:
                # Blank or non-numeric entry: skip it instead of failing the whole row
                continue
            if not pd.isna(score):
                prior_scores.append(score)

    # If no scores remain, return NaN, else return them as a list
    return prior_scores if prior_scores else np.nan

df['prior_pain_scores'] = df.apply(get_pain_scores_prior_to_timestamp, axis=1)

In [84]:
df["prior_pain_scores_max"] = df["prior_pain_scores"].apply(
    lambda scores: max(map(float, scores)) if isinstance(scores, list) and scores else np.nan)

In [None]:
df['prior_pain_scores_max'].head(50)

## Clean DPE and LOR_Depth

In [86]:
# make 'dpe' True/False
df['dpe'] = df['anes_procedure_dpe_2262'] == 'yes'

In [87]:
df['true_procedure_type_incl_dpe'] = df['true_procedure_type']
df.loc[df['dpe'] == True, 'true_procedure_type_incl_dpe'] = 'dpe'

In [88]:
# make 'lor_depth' numeric
# errors='coerce' turns blanks and any other non-numeric entry into NaN instead of raising
df['lor_depth'] = pd.to_numeric(df['anes_procedure_lor_depth_2265'], errors='coerce').astype(float)

In [None]:
# Code to evaluate suspiciously high LORs
# For these, if we divide LOR by 10, the catheter is taped around 4-5 cm deeper
# So most likely these suspiciously high LORs are missing decimal points
high_LORs = df.sort_values(by='lor_depth',ascending=False).head(100)['lor_depth']
print(high_LORs.to_list())
plt.hist(high_LORs)

print(df.sort_values(by='lor_depth',ascending=False).head(100)['anes_procedure_catheter_depth_2266'].to_list())

In [90]:
# prompt: lor_depth = lor_depth / 10 if lor_depth > 20

df['lor_depth'] = np.where(df['lor_depth'] > 20, df['lor_depth'] / 10, df['lor_depth'])

In [None]:
# Code to evaluate suspiciously high LORs
high_LORs = df.sort_values(by='lor_depth',ascending=False).head(100)['lor_depth']
print(high_LORs.to_list())
plt.hist(high_LORs)

## Make numerical columns numerical

In [92]:
# prompt: set these columns to dtype float: bmi_end_pregnancy_2044, maternal_weight_end_pregnancy_2045, maternal_height_2046,gravidity_2047,parity_2048

# Convert specified columns to float dtype (values that cannot be parsed become NaN)
columns_to_convert = ['gestational_age_2052','bmi_end_pregnancy_2044', 'maternal_weight_end_pregnancy_2045', 'maternal_height_2046', 'gravidity_2047', 'parity_2048','baby_weight_2196','bmi_before_pregnancy_2161','secs_rom_thru_delivery_2197']

for col in columns_to_convert:
    # The membership check already guards the column lookup, so no KeyError handling is needed
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)
    else:
        print(f"Column '{col}' not found in the DataFrame.")

## Calculate and plausibilify elapsed times

In [None]:
df['rom_thru_delivery_hours'] = df['secs_rom_thru_delivery_2197'] / 3600
df['rom_thru_delivery_hours'].describe(percentiles=[0.01,0.05,0.25,0.5,0.75,0.95,0.98,0.99])

In [94]:
# If ROM through Delivery is more than 30 days, assume erroneous and make it NaN
df['rom_thru_delivery_hours'] = np.where(df['rom_thru_delivery_hours'] <= 30*24, df['rom_thru_delivery_hours'],np.nan)

In [None]:
df['maternal_age_years'] = (df['delivery_datetime'] - df['maternal_dob']).dt.days / 365.25
df['maternal_age_years'].describe()

In [96]:
df['placement_to_delivery_hours'] = (df['delivery_datetime'] - df['best_timestamp']).dt.total_seconds() / 3600

In [None]:
df[df['true_procedure_type'] == 'epidural']['placement_to_delivery_hours'].describe(percentiles=[0.01,0.02,0.05,0.10,0.15,0.25,0.2,0.5,0.75,0.95,0.98,0.99])

In [98]:
df['placement_to_delivery_hours'] = np.where((df['placement_to_delivery_hours'] > -1) & (df['placement_to_delivery_hours'] <= 7*24),
                                             df['placement_to_delivery_hours'], np.nan)

In [None]:
df[df['true_procedure_type'] == 'epidural']['placement_to_delivery_hours'].describe(percentiles=[0.01,0.02,0.05,0.10,0.15,0.25,0.2,0.5,0.75,0.95,0.98,0.99])

In [None]:
df.loc[df['placement_to_delivery_hours'].sort_values(ascending=True).head().index,:][['anes_procedure_note_id_2260','anes_procedure_type_2253','placement_to_delivery_hours','delivery_datetime','best_timestamp']]

In [None]:
df.loc[df['placement_to_delivery_hours'].sort_values(ascending=False).head().index,:]

From the above analyses, procedures where many days elapse between placement and delivery are NOT labor analgesia procedures. They can be totally unrelated procedures like knee surgery, or obstetrical procedures like ECVs, or (rarely) analgesia for false labor. In the latter case, if labor does not progress and the patient returns to antepartum, the anesthesia encounter will terminate and a new encounter will be used for subsequent labor. In that case, an epidural placed in the second encounter will NOT prove failure of the first since it will have a different encounter_id.

For these reasons, I eliminate rows where there is more than 7 days between placement and delivery.

Due to the UTC bug discussed above, a true 1859 EPL followed by 1900 delivery would be translated to 2359 EPL AFTER 0000 delivery (without the delivery_date incrementing appropriately)

A more thorough algorithm could look at the timing of Anesthesia Stop compared to delivery, and/or confirm that the title of the anesthesia encounter is Labor Epidural or Cesarean Section.

In [None]:
df.shape

In [None]:
df.dropna(subset=['placement_to_delivery_hours']).shape

Include other limits on plausible data for each feature

## Handle proceduralist names

In [104]:
# Function to regulate names
def regulate_name(name):
    """Normalize a provider name string to title-cased "First Last".

    Steps, in order: strip credential tokens (MD, DO, MBBS, PhD, MS, MBA, MPH,
    MSc, CRNA; case-insensitive), reorder "Last, First" to "First Last",
    collapse whitespace, drop middle names, and title-case the result.

    Parameters
    ----------
    name : str
        Raw provider name. Must be a string; callers drop missing values first.

    Returns
    -------
    str
        The normalized name. May be an empty string if nothing remains after
        credentials are removed.
    """

    # Remove degrees and titles.
    # The leading \b makes each credential match only as a whole word. Without it,
    # names that merely END in those letters were truncated
    # (e.g. "Williams" -> "Willia", "Fernando" -> "Fernan").
    # NOTE(review): a surname that is itself a credential token (e.g. "Do") is still
    # removed, as before — confirm whether any providers are affected.
    name = re.sub(r',?\s*\b(md|do|mbbs|phd|ms|mba|mph|msc|crna)\b', '', name, flags=re.IGNORECASE)

    # Split last name and first name if comma exists
    if ',' in name:
        last, first = name.split(',', 1)
        name = f"{first.strip()} {last.strip()}"

    # Remove extra spaces
    name = re.sub(r'\s+', ' ', name).strip()

    # Remove middle names: keep only the first and last tokens
    parts = name.split()
    if len(parts) > 2:
        name = f"{parts[0]} {parts[-1]}"

    # Capitalize each part of the name
    name = name.title()

    return name

# Normalize the non-missing provider names; rows with a missing name stay NaN
# because the assignment aligns on the index.
df['Regulated_Anesthesiologist_Name'] = (
    df['anes_procedure_anesthesiologist_2255'].dropna().map(regulate_name)
)
df['Regulated_Resident_Name'] = (
    df['anes_procedure_resident_2256'].dropna().map(regulate_name)
)

In [105]:
# prompt: set all blank Regulated_Anesthesiologist_Name and Regulated_Resident_Name to NaN

# Treat empty-string names as missing in both regulated-name columns
df[['Regulated_Anesthesiologist_Name', 'Regulated_Resident_Name']] = (
    df[['Regulated_Anesthesiologist_Name', 'Regulated_Resident_Name']].replace('', np.nan)
)

In [106]:
# prompt: For each catheter, count how many (i.e., earlier best_timestamp) catheters were done by that provider (including the current one)

# Order rows by best_timestamp so each cumulative sum below is a running total
# that includes the current row.
df = df.sort_values(by='best_timestamp')

# Running sum of is_neuraxial_catheter within each regulated anesthesiologist name
df['current_anesthesiologist_catheter_count'] = (
    df['is_neuraxial_catheter'].groupby(df['Regulated_Anesthesiologist_Name']).cumsum()
)

# Same running sum within each regulated resident name
df['current_resident_catheter_count'] = (
    df['is_neuraxial_catheter'].groupby(df['Regulated_Resident_Name']).cumsum()
)

In [107]:
# 'yes' when the anesthesiologist's running count exceeds 500, 'no' when it is at most 500,
# and 'none' when the count is missing (NaN fails both comparisons).
df['highly_experienced_anesthesiologist'] = np.select(
    [
        df['current_anesthesiologist_catheter_count'] > 500,
        df['current_anesthesiologist_catheter_count'] <= 500,
    ],
    ['yes', 'no'],
    default='none',
)

In [108]:
# 'yes' when the anesthesiologist's running count exceeds 100, 'no' when it is at most 100,
# and 'none' when the count is missing (NaN fails both comparisons).
df['moderately_experienced_anesthesiologist'] = np.select(
    [
        df['current_anesthesiologist_catheter_count'] > 100,
        df['current_anesthesiologist_catheter_count'] <= 100,
    ],
    ['yes', 'no'],
    default='none',
)

In [109]:
# prompt: set df['highly_experienced_resident'] to 1 if current_resident_catheter_count > 50, to 0 if <= 50, and to -1 if NaN

# Note: the labels actually written are the strings 'yes' / 'no' / 'none' (not 1 / 0 / -1),
# matching the anesthesiologist experience columns.
df['highly_experienced_resident'] = np.select(
    [
        df['current_resident_catheter_count'] > 50,
        df['current_resident_catheter_count'] <= 50,
    ],
    ['yes', 'no'],
    default='none',
)

## Feature engineering on categorical variables

In [110]:
# Strict boolean flag: True only where the ICD scoliosis column equals True (missing -> False)
df['has_scoliosis'] = df['icd_scoliosis_2053'].eq(True)

In [111]:
# Strict boolean flag: True only where the ICD dorsalgia column equals True (missing -> False)
df['has_dorsalgia'] = df['icd_dorsalgia_2104'].eq(True)

In [112]:
# prompt: create a column "has_back_problems" that is 1 where any of the following are True, else 0. Handle NaN.

# ICD-derived indicator columns that together define "back problems"
back_problem_cols = [
    'icd_scoliosis_2053',
    'icd_spinal_fusion_2056',
    'icd_congenital_deformity_spine_2059',
    'icd_ra_and_sctds_2086',
    'icd_kyphosis_and_lordosis_2089',
    'icd_spinal_osteochondrosis_2092',
    'icd_spondylopathies_and_deforming_dorsopathies_2095',
    'icd_intervertebral_disc_disorders_2098',
    'icd_ehlers_minus_danlos_2101',
]

# Note that spondyolopathies_and_deforming_dorsopathies are by far the biggest contributors

# Row-wise OR across the indicator columns
df['has_back_problems'] = df[back_problem_cols].any(axis='columns')

In [113]:
# Keep 'White', 'Asian' and 'Black' as-is; every other value (including missing)
# is collapsed into 'Other/Unknown'.
df['maternal_race'] = np.select(
    [df['maternal_race_2111'] == label for label in ('White', 'Asian', 'Black')],
    ['White', 'Asian', 'Black'],
    default='Other/Unknown',
)

In [114]:
# Indicator columns combined into a single psychosocial-problems flag
composite_social_columns = [
    "drug_abuse_during_parent_2144",
    "high_risk_social_problems_parent_2154",
    "high_risk_insufficient_antenatal_care_parent_2157",
    "icd_major_mental_health_disorder_2178",
    "education_problems_2203",
    "employment_problems_2206",
    "adverse_occupational_2209",
    "housing_problems_2212",
    "adjustment_problems_2215",
    "relationship_problems_2218",
    "other_psychosocial_2221",
    "smoker_during_pregnancy_parent_2117",
    "drug_abuse_before_parent_2142",
    "alcohol_during_parent_2147",
]

# Row-wise OR: True if any one of the indicators is truthy
df['composite_psychosocial_problems'] = df[composite_social_columns].any(axis='columns')

In [115]:
# prompt: create column 'only_private_insurance' for any row where public_insurance_2114 does NOT contains the string "public", ignore case

# True when the insurance text has no case-insensitive match for "public".
# Missing insurance values count as "no match" (na=False) and therefore come out True;
# NOTE(review): confirm that treating missing insurance as private-only is intended.
df['only_private_insurance'] = ~df['public_insurance_2114'].str.contains(
    'public', flags=re.IGNORECASE, na=False
)

In [116]:
# prompt: create a column maternal_language_english for any row where maternal_language is english

# Exact, case-sensitive match against 'english'; missing values yield False
df['maternal_language_english'] = df['maternal_language_2113'].eq('english')

In [117]:
# prompt: create a column marital_status_married_or_partner for any row where marital_status_2184 is 'married' or 'partner'

# Membership test against the two accepted statuses; anything else (including missing) is False
df['marital_status_married_or_partner'] = df['marital_status_2184'].isin(['married', 'partner'])

In [118]:
# prompt: create a column country_of_origin_USA that is country_of_origin_2186 == united states

# Exact, case-sensitive match against 'united states'; missing values yield False
df['country_of_origin_USA'] = df['country_of_origin_2186'].eq('united states')

In [119]:
# prompt: create a column employment_status_fulltime that is employment_status_2187 == full time

# Exact, case-sensitive match against 'full time'; missing values yield False
df['employment_status_fulltime'] = df['employment_status_2187'].eq('full time')

In [120]:
# Boolean SES indicators; the composite is True only when every one of them is True
composite_SES_columns = [
    "only_private_insurance",
    "maternal_language_english",
    "marital_status_married_or_partner",
    "country_of_origin_USA",
    "employment_status_fulltime",
]

# Row-wise AND across the indicators
df['composite_SES_advantage'] = df[composite_SES_columns].all(axis='columns')

In [121]:
# prompt: create a column epidural_needle_type based on anes_procedure_epidural_needle_2263 that can have values "tuohy","weiss", or "other"

# 'tuohy' and 'weiss' map to themselves; every other value (including missing) becomes 'other'
df['epidural_needle_type'] = (
    df['anes_procedure_epidural_needle_2263']
    .map({needle: needle for needle in ('tuohy', 'weiss')})
    .fillna('other')
)

In [122]:
# prompt: create a column paresthesias_present that is anes_procedure_paresthesias_2270 either "transient" or "yes"

# True for 'yes' or 'transient'; any other value (including missing) is False
df['paresthesias_present'] = df['anes_procedure_paresthesias_2270'].isin(['yes', 'transient'])

In [123]:
# Copy the delivery site, blanking out (NaN) rows whose site is the value 'mgb'
df['delivery_site'] = df['delivery_site_2188'].mask(df['delivery_site_2188'] == 'mgb')

In [None]:
# Frequency of each delivery site, with missing values counted as their own category
df['delivery_site'].value_counts(dropna=False)

In [125]:
# Labor counts as induced when any one of the induction-method indicators is truthy
df['labor_induction'] = df[
    [
        'induction_oxytocin_2189',
        'induction_cervical_balloon_2190',
        'induction_misoprostol_2191',
        'induction_arom_2192',
        'induction_foley_easy_2193',
    ]
].any(axis='columns')

## Create a new unique identifier based on epic_pmrn

In [126]:
# Define identifier range: all integers with exactly id_len digits (8 digits as set here)
id_len = 8
min_id = 10 ** (id_len - 1)
max_id = 10 ** id_len - 1

# Pair every distinct MRN with a distinct random identifier (sampled without replacement).
# NOTE(review): no random.seed call is visible in this cell, so the identifiers
# presumably differ from run to run — confirm this is intended.
unique_mrns = df['epic_pmrn'].unique()
mapping = {
    mrn: pseudo_id
    for mrn, pseudo_id in zip(
        unique_mrns,
        random.sample(range(min_id, max_id + 1), len(unique_mrns)),
    )
}

# Attach the identifier to every row via its MRN
df['unique_pt_id'] = df['epic_pmrn'].map(mapping)

# Save processed data prior to analysis

In [127]:
# Independent snapshot of the fully processed frame, taken before it is written to disk
complete_data = df.copy(deep=True)

In [128]:
# Persist the processed DataFrame as a pickle inside the project data folder
complete_data.to_pickle(f"{my_computer_fpath}processed_merlin_data.pkl")

In [129]:
# prompt: Import libraries and open CSV

# NOTE(review): this repeats the import/path cell at the top of the notebook — presumably so
# the notebook can be resumed from the saved pickle in a fresh kernel; confirm before removing.
import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt
import random

# Folder holding the project data files (machine-specific absolute path)
my_computer_fpath = "C:\\Users\\dfber\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"
# my_computer_fpath = "C:\\Users\\User\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"

In [None]:
# Read the processed DataFrame back from the pickle written above
complete_data = pd.read_pickle(f"{my_computer_fpath}processed_merlin_data.pkl")

# Preview the first rows to confirm the load
complete_data.head()

In [131]:
# Work on an independent copy so complete_data stays untouched by the reduction below
df = complete_data.copy(deep=True)

# Reduce Table to Chosen Features

In [None]:
# prompt: print all columns as a list and make it easy to read over multiple lines

# Numbered listing (starting at 1) of every column in df together with its dtype
all_columns = df.columns.tolist()

print("Columns of the DataFrame:")
for position, column_name in enumerate(all_columns, start=1):
    print(f"{position}. {column_name} ||| {df[column_name].dtype}")

In [133]:
# Columns retained when df is reduced via `df = df[chosen_features]`.
# Entries that are commented out are deliberately left out of the selection.
chosen_features = [
#    "id",
    "unique_pt_id",
    "anes_procedure_encounter_id_2273",
    "gestational_age_2052",
    "delivery_site",
    "baby_weight_2196",
    "rom_thru_delivery_hours",
    "fetal_presentation_category_2243",
    "fetal_presentation_position_2247",
    "bmi_end_pregnancy_2044",
    "maternal_weight_end_pregnancy_2045",
    "bmi_before_pregnancy_2161",
#    "zipcode_2185",
    "gravidity_2047",
    "parity_2048",
#    "anes_procedure_note_text_2271",
#    "best_timestamp",
#    "true_procedure_type",
    "is_neuraxial_catheter",
    "failed_catheter",
#    "dpe",
    "lor_depth",
    "current_resident_catheter_count",
    "highly_experienced_anesthesiologist",
    "highly_experienced_resident",
    "current_anesthesiologist_catheter_count",
    "moderately_experienced_anesthesiologist",
    "has_scoliosis",
    "has_dorsalgia",
    "has_back_problems",
    "maternal_race",
 #   "prior_pain_scores",
    "prior_pain_scores_max",
    "composite_psychosocial_problems",
    "only_private_insurance",
    "maternal_language_english",
    "marital_status_married_or_partner",
    "country_of_origin_USA",
    "employment_status_fulltime",
    "composite_SES_advantage",
    "epidural_needle_type",
    "paresthesias_present",
    "number_of_neuraxial_attempts",
    "prior_failed_catheters_this_enc",
    "prior_failed_catheters_prev_enc",
    "prior_all_catheters_all_enc",
    "true_procedure_type_incl_dpe",
    "maternal_age_years",
    "placement_to_delivery_hours",
    "labor_induction"
]

In [134]:
# Keep only the chosen feature columns, in the order listed in chosen_features
df = df.loc[:, chosen_features]

In [None]:
# Recode boolean values as integers (True -> 1, False -> 0) across the whole frame.
# NOTE(review): this goes through DataFrame.replace rather than a dtype cast, so the resulting
# column dtypes may depend on the pandas version — check them in the dtype listing that follows.
df = df.replace({True: 1, False: 0})

In [None]:
# prompt: print all columns as a list and make it easy to read over multiple lines

# Numbered listing (starting at 1) of every column in df together with its dtype
all_columns = df.columns.tolist()

print("Columns of the DataFrame:")
for position, column_name in enumerate(all_columns, start=1):
    print(f"{position}. {column_name} ||| {df[column_name].dtype}")

# Download

In [137]:
# Write the reduced table to CSV in the project data folder, without the index column
df.to_csv(f"{my_computer_fpath}minimal_merlin_data.csv", index=False)