# Imports

In [4]:
# prompt: Import libraries and open CSV

import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt


In [5]:
# Read the raw CSV export into raw_df.
# NOTE(review): this is an absolute, user-specific Windows path; on any other
# machine read_csv raises FileNotFoundError. Consider a configurable data directory.
file_path = 'C:\\Users\\dfber\\Downloads\\e26f9ccc-68a4-42b4-9d0d-508a83026a1c.csv'
raw_df = pd.read_csv(file_path)
# Preview the first rows of the raw data
raw_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\dfber\\Downloads\\e26f9ccc-68a4-42b4-9d0d-508a83026a1c.csv'

# Initialize dataframe

In [12]:
df = raw_df.copy()

In [None]:
pd.set_option("display.max_columns", None)
df.head()

# Initial Data Cleaning

## Explode |-separated notes

In [None]:
# Expand the items in anes_procedure_cols separated by "|" into a separate row
# Requires that within a row, each element in these columns has the same number of |-separated values

anes_procedure_cols = ['anes_procedure_type_2253', 'anes_procedure_start_dts_2254', 'anes_procedure_anesthesiologist_2255', 'anes_procedure_resident_2256', 'anes_procedure_pt_position_2257', 'anes_procedure_approach_2258', 'anes_procedure_location_2259', 'anes_procedure_note_id_2260', 'anes_procedure_dos_dts_2261', 'anes_procedure_dpe_2262', 'anes_procedure_epidural_needle_2263', 'anes_procedure_epidural_needle_gauge_2264', 'anes_procedure_lor_depth_2265', 'anes_procedure_catheter_depth_2266', 'anes_procedure_spinal_needle_type_2267', 'anes_procedure_spinal_needle_gauge_2268', 'anes_procedure_spinal_needle_length_2269', 'anes_procedure_paresthesias_2270', 'anes_procedure_note_text_2271','anes_procedure_encounter_id_2273']

# Split each |-delimited cell into a list holding one value per note.
# regex=False makes '|' a literal delimiter. The previous pattern '\|' was an
# invalid Python string escape (a SyntaxWarning on recent Python versions) that
# only worked because pandas compiled the two-character pattern as a regex.
for col in anes_procedure_cols:
    df[col] = df[col].str.split('|', regex=False)

# Explode all note columns together so that element i of every list lands on the
# same output row. pandas raises ValueError if, within a row, the columns do not
# hold the same number of elements.
df = df.explode(anes_procedure_cols)

# Reset the index after exploding the DataFrame so each individual note will be its own unique row and index
df = df.reset_index(drop=True)
df[['id','anes_procedure_type_2253']].head(20)

## Handle datetime issues

Bug: Merlin reports anes_procedure_dos_dts_2261 with an Eastern-time UTC offset even though the values are actually UTC. I resolve this by rewriting the offset in the raw strings to +0000 before converting them to datetime objects.

Bug: Merlin ignores AM/PM in anes_procedure_start_dts_2254 and assumes all entries are AM. I work around this (for now) by ignoring these written start times and using dos_dts alone.

In [None]:
df['anes_procedure_dos_dts_2261'].head()

In [16]:
df['dos_dts_tz_stripped'] = df['anes_procedure_dos_dts_2261'].str.replace(r'[+-]\d{4}$', '+0000', regex=True)

In [None]:
df['dos_dts_tz_stripped'].head()

In [18]:
df['dos_dts'] = pd.to_datetime(df['dos_dts_tz_stripped'])

In [None]:
df[['dos_dts','anes_procedure_dos_dts_2261']].head()

In [20]:
df['start_dts'] = pd.to_datetime(df['anes_procedure_start_dts_2254'],format='mixed',utc=True)

In [None]:
# prompt: df['start_dts'].max() but ignore the date, look only at the time

# Time-of-day of every non-missing start_dts, sorted ascending
start_dts_present = df['start_dts'].dropna()
start_dts_present.dt.time.sort_values()


In [22]:
# This code has been changed to avoid the AM/PM bug

# df['best_timestamp'] = df['start_dts'].fillna(df['dos_dts'])
df['best_timestamp'] = df['dos_dts']

## Handle near-duplicate notes

In [None]:
# test behavior on a known double-note
df.loc[df['anes_procedure_note_id_2260'] == '1188076153']

In [24]:
# test behavior on a known near-duplicate note
df[df['anes_procedure_note_id_2260'] == '2250605132']
known_near_duplicate_encounter_id = df[df['anes_procedure_note_id_2260'] == '2250605132']['anes_procedure_encounter_id_2273'].iloc[0]


In [None]:
known_near_duplicate_group = df.groupby('anes_procedure_encounter_id_2273').get_group(known_near_duplicate_encounter_id)
known_near_duplicate_group

In [26]:
# prompt: add 'best_timestamp', 'dos_dts', and 'start_dts' to anes_procedure_cols

anes_procedure_cols.extend(['best_timestamp', 'dos_dts', 'start_dts'])

In [27]:
# need to narrow operations to a smaller group of columns for efficiency
# .copy() makes this an independent frame, so the label columns assigned to it
# in later cells do not raise SettingWithCopyWarning.
df_anes_procedure_cols = df[anes_procedure_cols].copy()

In [28]:
# Functions to label near_duplicate procedures

# Compare two rows and return True if their timestamps are within minute_offset
# and their compare_cols match
def check_if_near_duplicate(row1, row2, compare_cols, minute_offset):
  """Return True when two note rows look like near-duplicates of each other.

  row1, row2: mappings / Series with a 'best_timestamp' entry plus every
              column named in compare_cols
  compare_cols: columns that must agree; a column is ignored when either row
                holds a null value in it
  minute_offset: largest allowed gap between the two best_timestamps, in
                 minutes (a gap of exactly minute_offset still counts)

  Rows with a missing best_timestamp are never reported as near-duplicates.
  """
  for col in compare_cols:
    if not pd.isnull(row1[col]) and not pd.isnull(row2[col]):
      if row1[col] != row2[col]:
        return False

  # A missing timestamp makes the gap NaT, and every comparison against NaT is
  # False, so without this guard such pairs were reported as near-duplicates.
  time_gap = row1['best_timestamp'] - row2['best_timestamp']
  if pd.isnull(time_gap):
    return False
  return bool(abs(time_gap) <= pd.Timedelta(minutes=minute_offset))


# Label near_duplicate notes within an encounter using the check_if_near_duplicate function
def label_near_duplicate_notes(encounter):
  """Flag every row of one encounter that has a near-duplicate in that encounter.

  encounter: DataFrame holding the rows of a single encounter

  For each row, every other row is compared against it with
  check_if_near_duplicate (same 'anes_procedure_type_2253', within 60 minutes).
  Writes 'has_near_duplicate' (1/0) and 'near_duplicate_note_ids' (string form
  of the sorted list of the row's own note id plus all matching note ids) and
  returns the mutated frame.
  """
  row_labels = encounter.index.tolist()

  for base_pos, base_idx in enumerate(row_labels):
    base_row = encounter.loc[base_idx]
    # The row's own note id always leads the list
    matching_note_ids = [base_row['anes_procedure_note_id_2260']]

    for other_pos, other_idx in enumerate(row_labels):
      if other_pos == base_pos:
        continue # a row is never its own duplicate
      other_row = encounter.loc[other_idx]

      if check_if_near_duplicate(base_row, other_row, ['anes_procedure_type_2253'], minute_offset = 60):
        matching_note_ids.append(other_row['anes_procedure_note_id_2260'])

    # More than one id in the list means at least one other row matched
    encounter.at[base_idx, 'has_near_duplicate'] = 1 if len(matching_note_ids) > 1 else 0
    encounter.at[base_idx, 'near_duplicate_note_ids'] = str(sorted(matching_note_ids))

  return encounter


In [None]:
# Label near_duplicate procedures
# Takes ~2 mins

# Seed the label columns with defaults, then let every encounter group fill them in
df_anes_procedure_cols['has_near_duplicate'] = 0
df_anes_procedure_cols['near_duplicate_note_ids'] = None
df_anes_procedure_cols = (
    df_anes_procedure_cols
    .groupby('anes_procedure_encounter_id_2273')
    .apply(label_near_duplicate_notes, include_groups = False)
    # the grouping key comes back as an index level; turn it into a column again
    .reset_index('anes_procedure_encounter_id_2273')
)

In [30]:
# prompt: sort df_anes_procedure_cols by index

df_anes_procedure_cols = df_anes_procedure_cols.sort_index()

In [31]:
# Count blank columns
df_anes_procedure_cols['blank_anes_procedure_element_col_counts'] = df_anes_procedure_cols[anes_procedure_cols].isnull().sum(axis=1)

In [32]:
# Within a group of duplicates, label the one with the fewest blank columns as NOT the worse duplicate (i.e., the best)
# Takes ~2 mins
def label_worse_near_duplicates(near_duplicate_set):
  """Clear is_worse_near_duplicate on the most complete row of the group.

  near_duplicate_set: DataFrame of rows forming one near-duplicate set; reads
                      'blank_anes_procedure_element_col_counts' and writes
                      'is_worse_near_duplicate'

  The first row with the smallest blank count gets is_worse_near_duplicate = 0;
  all other rows keep their current value. Returns the mutated frame.
  """
  blank_counts = near_duplicate_set['blank_anes_procedure_element_col_counts']
  most_complete_label = blank_counts.idxmin()
  near_duplicate_set.at[most_complete_label, 'is_worse_near_duplicate'] = 0
  return near_duplicate_set

# Start by treating every row that has a near-duplicate as a "worse" one;
# the per-set pass below then clears the flag on the best row of each set.
df_anes_procedure_cols['is_worse_near_duplicate'] = df_anes_procedure_cols['has_near_duplicate']
df_anes_procedure_cols = (
    df_anes_procedure_cols
    .groupby('near_duplicate_note_ids')
    .apply(label_worse_near_duplicates, include_groups = False)
    # the grouping key comes back as an index level; turn it into a column again
    .reset_index('near_duplicate_note_ids')
)

In [None]:
known_near_duplicate_group = df_anes_procedure_cols.groupby('anes_procedure_encounter_id_2273').get_group(known_near_duplicate_encounter_id)
known_near_duplicate_group

In [None]:
# test behavior on a known double-note
known_double_note = df_anes_procedure_cols.loc[df_anes_procedure_cols['anes_procedure_note_id_2260'] == '1188076153']
known_double_note

In [None]:
df_anes_procedure_cols['is_worse_near_duplicate'].value_counts()

In [36]:
# Remove worse duplicates
df_anes_procedure_cols = df_anes_procedure_cols[df_anes_procedure_cols['is_worse_near_duplicate']==0]

## Address cases where an epidural note followed by a spinal note is actually a planned CSE, not a failed catheter. Also address what 'epidural/intrathecal' really means.

A "secret CSE" is a spinal note and an epidural (or epidural/intrathecal) note documented within 5 minutes of each other in the same encounter.

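Epidural/intrathecal notes are classified as epidural. TODO: define the criteria (e.g., CSF aspiration) under which they should instead be reclassified as intrathecal.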

In [37]:
# Functions to label secret_CSE procedures

# Compare two rows and return True if exactly one is an epidural, exactly one is a spinal,
# and if their timestamps are within minute_offset
def check_if_secret_CSE(row1, row2, minute_offset):
  """Return True when one row is an epidural-type note and the other a spinal
  note, and their best_timestamps are strictly less than minute_offset minutes apart.

  row1, row2: mappings / Series with 'best_timestamp' and 'anes_procedure_type_2253'
  minute_offset: exclusive upper bound on the time gap, in minutes
  """
  epidural_like_types = ('epidural/intrathecal', 'epidural')

  time_gap = abs(row1['best_timestamp'] - row2['best_timestamp'])
  if not (time_gap < pd.Timedelta(minutes=minute_offset)):
    return False

  type1 = row1['anes_procedure_type_2253']
  type2 = row2['anes_procedure_type_2253']
  # Either ordering of (epidural-like, spinal) qualifies
  if type1 in epidural_like_types and type2 == 'spinal':
    return True
  if type2 in epidural_like_types and type1 == 'spinal':
    return True
  return False

# Label secret_CSE notes within an encounter using the check_if_secret_CSE function
def label_secret_CSE_notes(encounter):
  """Flag every row of one encounter that pairs up with another row as a secret CSE.

  encounter: DataFrame holding the rows of a single encounter

  For each row, every other row is tested against it with check_if_secret_CSE
  using a 5-minute window. Writes 'is_secret_CSE' (1/0) and
  'secret_CSE_note_ids' (string form of the sorted list of the row's own note
  id plus all partner note ids) and returns the mutated frame.
  """
  row_labels = encounter.index.tolist()

  for base_pos, base_idx in enumerate(row_labels):
    base_row = encounter.loc[base_idx]
    # The row's own note id always leads the list
    partner_note_ids = [base_row['anes_procedure_note_id_2260']]

    for other_pos, other_idx in enumerate(row_labels):
      if other_pos == base_pos:
        continue # never pair a row with itself
      other_row = encounter.loc[other_idx]

      if check_if_secret_CSE(base_row, other_row, minute_offset = 5):
        partner_note_ids.append(other_row['anes_procedure_note_id_2260'])

    # More than one id in the list means at least one partner was found
    encounter.at[base_idx, 'is_secret_CSE'] = 1 if len(partner_note_ids) > 1 else 0
    encounter.at[base_idx, 'secret_CSE_note_ids'] = str(sorted(partner_note_ids))

  return encounter


In [None]:
# Label secret_CSE procedures
# Takes ~2 mins

# Seed the label columns with defaults, then let every encounter group fill them in
df_anes_procedure_cols['is_secret_CSE'] = 0
df_anes_procedure_cols['secret_CSE_note_ids'] = None
df_anes_procedure_cols = (
    df_anes_procedure_cols
    .groupby('anes_procedure_encounter_id_2273')
    .apply(label_secret_CSE_notes, include_groups = False)
    # the grouping key comes back as an index level; turn it into a column again
    .reset_index('anes_procedure_encounter_id_2273')
)

In [None]:
df_anes_procedure_cols['is_secret_CSE'].value_counts()

In [None]:
df_anes_procedure_cols[df_anes_procedure_cols['is_secret_CSE'] == 1].head(10)

In [41]:
# Eliminate the separately-documented spinals that are really part of CSEs

# Delete rows where procedure_type is spinal and is_secret_CSE is true
df_anes_procedure_cols = df_anes_procedure_cols[~((df_anes_procedure_cols['anes_procedure_type_2253'] == 'spinal') & (df_anes_procedure_cols['is_secret_CSE'] == 1))]

In [None]:
# Label true intrathecal catheters
# NOTE: DOES NOT YET RECLASSIFY EPIDURAL/INTRATHECALS BY CSF ASPIRATION OR ANY OTHER METHOD

df_anes_procedure_cols['is_intrathecal_catheter'] = (df_anes_procedure_cols['anes_procedure_type_2253'] == 'intrathecal').astype(int)

In [None]:
# prompt: label true_procedure_type by reclassifying based on is_secret_CSE and is_intrathecal_catheter

# Step 1: secret CSEs become 'cse'; every other row starts from its documented type
flagged_secret_cse = df_anes_procedure_cols['is_secret_CSE'].eq(True)
df_anes_procedure_cols['true_procedure_type'] = np.where(
    flagged_secret_cse,
    'cse',
    df_anes_procedure_cols['anes_procedure_type_2253'])

# Step 2: rows flagged as intrathecal catheters are labelled 'intrathecal'
flagged_intrathecal = df_anes_procedure_cols['is_intrathecal_catheter'].eq(True)
intrathecal_candidate = df_anes_procedure_cols['true_procedure_type'].isin(['epidural/intrathecal', 'intrathecal'])
df_anes_procedure_cols.loc[intrathecal_candidate & flagged_intrathecal, 'true_procedure_type'] = 'intrathecal'

# Step 3: whatever is still 'epidural/intrathecal' and not flagged intrathecal becomes 'epidural'
# (mask is evaluated after step 2 so it sees the updated labels)
still_ambiguous = df_anes_procedure_cols['true_procedure_type'].eq('epidural/intrathecal')
not_flagged_intrathecal = df_anes_procedure_cols['is_intrathecal_catheter'].eq(False)
df_anes_procedure_cols.loc[still_ambiguous & not_flagged_intrathecal, 'true_procedure_type'] = 'epidural'

In [None]:
df_anes_procedure_cols.head()

# Classify failures

In [45]:
df_anes_procedure_cols['is_neuraxial_catheter'] = (df_anes_procedure_cols['true_procedure_type'].isin(['cse', 'epidural', 'intrathecal'])).astype(int)
df_anes_procedure_cols['is_spinal'] = (df_anes_procedure_cols['true_procedure_type'] == 'spinal').astype(int)
df_anes_procedure_cols['is_airway'] = (df_anes_procedure_cols['true_procedure_type'] == 'airway').astype(int)

In [46]:
# Classify each neuraxial catheter as a success or failure, one encounter at a time
# takes ~10 mins

def classify_encounter_failures(encounter):
    """Label the neuraxial catheters of one encounter that were followed by another event.

    encounter: DataFrame holding the rows of a single encounter. Reads
               'is_neuraxial_catheter', 'is_spinal', 'is_airway',
               'best_timestamp' and 'anes_procedure_note_id_2260'.

    For every row with is_neuraxial_catheter == 1 that has at least one
    failure-defining event (neuraxial catheter, spinal or airway) with a
    strictly later best_timestamp, writes:
      - has_subsequent_neuraxial_catheter / has_subsequent_spinal /
        has_subsequent_airway: 1 if a later event of that kind exists, else 0
      - failed_catheter: 1 if any of the three flags is 1
      - subsequent_proof_of_failure_note_id: string form of the list of later
        events' note ids
    Rows without a later event are left untouched. Returns the mutated frame.
    """
    catheter_labels = encounter.index[encounter['is_neuraxial_catheter'] == 1]

    # Nothing to classify when the encounter has no neuraxial catheter
    if len(catheter_labels) == 0:
        return encounter

    # Failure-defining events are neuraxial catheters, spinals, and airways.
    # None of the columns read from this frame are modified below, so it is built once.
    event_flag_cols = ['is_neuraxial_catheter','is_spinal','is_airway']
    is_failure_defining_event = encounter[event_flag_cols].any(axis=1)
    failure_defining_events = encounter.loc[encounter.index[is_failure_defining_event]]

    for idx in catheter_labels:
        placed_at = encounter.at[idx, 'best_timestamp']

        # Only events strictly after this catheter count as evidence of failure
        later_events = failure_defining_events[failure_defining_events['best_timestamp'] > placed_at]
        if later_events.empty:
            continue

        later_catheter = int((later_events['is_neuraxial_catheter'] == 1).any())
        later_spinal = int((later_events['is_spinal'] == 1).any())
        later_airway = int((later_events['is_airway'] == 1).any())
        later_note_ids = later_events['anes_procedure_note_id_2260'].tolist()

        encounter.at[idx, 'has_subsequent_neuraxial_catheter'] = later_catheter
        encounter.at[idx, 'has_subsequent_spinal'] = later_spinal
        encounter.at[idx, 'has_subsequent_airway'] = later_airway
        encounter.at[idx, 'failed_catheter'] = int(later_catheter or later_spinal or later_airway)
        encounter.at[idx, 'subsequent_proof_of_failure_note_id'] = str(later_note_ids)

    return encounter

# Defaults: no subsequent event, not failed, no proof-of-failure notes
for flag_col in ['has_subsequent_neuraxial_catheter', 'has_subsequent_spinal', 'has_subsequent_airway', 'failed_catheter']:
    df_anes_procedure_cols[flag_col] = 0
df_anes_procedure_cols['subsequent_proof_of_failure_note_id'] = None

df_anes_procedure_cols = (
    df_anes_procedure_cols
    .groupby('anes_procedure_encounter_id_2273')
    .apply(classify_encounter_failures, include_groups = False)
    # the grouping key comes back as an index level; turn it into a column again
    .reset_index('anes_procedure_encounter_id_2273')
)

In [None]:
df_anes_procedure_cols.head(10)

In [48]:
# prompt: concatenate new columns from df_anes_procedure_cols into df. only bring the new columns, leave behind the matching ones. Select the new columns via code.

# Columns present only on df_anes_procedure_cols, kept in their original order
already_in_df = df_anes_procedure_cols.columns.isin(df.columns)
new_cols = df_anes_procedure_cols.columns[~already_in_df].tolist()

# Append them to df side by side (axis=1)
df = pd.concat([df, df_anes_procedure_cols[new_cols]], axis=1)

In [49]:
df['is_neuraxial_catheter'] = df['is_neuraxial_catheter'] == 1
df['failed_catheter'] = df['failed_catheter'] == 1

In [None]:
df.head()

In [None]:
# test behavior on a known double-note
known_double_note = df.loc[df['anes_procedure_note_id_2260'] == '1188076153']
known_double_note

In [None]:
df[df['failed_catheter'] == 1].head(10)

In [53]:
known_failed_catheter_encounter_ids = ['3259099621','3081317750', '3081399139', '3081675427', '3081686082', '3081711691', '3081729928', '3081884584', '3081893356', '3082275619', '3082349091']

In [None]:
df[df['anes_procedure_encounter_id_2273'].isin(known_failed_catheter_encounter_ids)]

# Additional Data Cleaning and Feature Engineering

## Handle timeseries data (e.g., pain scores)

In [None]:
# Extracts the pain scores prior to the timestamp
# Takes ~ 1 minute
def get_pain_scores_prior_to_timestamp(row, best_timestamp_col="best_timestamp"):
    """
    Extract all pain scores that have timestamp < row[best_timestamp_col].

    row: a single row of your DataFrame (a pd.Series); must contain
         'timeseries_intrapartum_pain_score_datetime_2242' and
         'timeseries_intrapartum_pain_score_2242' as |-separated strings
    best_timestamp_col: name of the column in your DataFrame that contains
                       the 'best_timestamp' to compare against

    Returns a list of 'prior' scores (floats) or NaN if none exist. NaN is also
    returned when either timeseries field is missing or when the timestamps
    cannot be parsed. Entries whose score is blank or non-numeric are skipped
    instead of aborting the whole apply.
    """
    # Extract the raw strings
    times_str = row["timeseries_intrapartum_pain_score_datetime_2242"]
    scores_str = row["timeseries_intrapartum_pain_score_2242"]

    # If either is missing, return NaN
    if pd.isna(times_str) or pd.isna(scores_str):
        return np.nan

    # Convert to lists
    times_list = times_str.split("|")
    scores_list = scores_str.split("|")

    # Convert both times and best_timestamp to datetime. Only parsing failures
    # are treated as "no data"; anything else (e.g. a missing column) propagates.
    try:
        times_dt = pd.to_datetime(times_list)
        best_dt = pd.to_datetime(row[best_timestamp_col])
    except (ValueError, TypeError, OverflowError):
        return np.nan

    # Keep the scores whose timestamp is strictly earlier than best_timestamp
    prior_scores = []
    for t, s in zip(times_dt, scores_list):
        if t < best_dt:
            try:
                prior_scores.append(float(s))
            except ValueError:
                # blank or non-numeric score entry: skip just this reading
                continue

    # If no scores remain, return NaN, else return them as a list
    return prior_scores if prior_scores else np.nan

df['prior_pain_scores'] = df.apply(get_pain_scores_prior_to_timestamp, axis=1)

In [57]:
df["prior_pain_scores_max"] = df["prior_pain_scores"].apply(
    lambda scores: max(map(float, scores)) if isinstance(scores, list) and scores else np.nan)

In [None]:
df['prior_pain_scores_max'].head(50)

## Clean DPE and LOR_Depth

In [59]:
# make 'dpe' True/False
df['dpe'] = df['anes_procedure_dpe_2262'] == 'yes'

In [60]:
# make 'lor_depth' numeric
df['lor_depth'] = df['anes_procedure_lor_depth_2265'].replace('', np.nan).astype(float)

In [None]:
# Code to evaluate suspiciously high LORs
# For these, if we divide LOR by 10, the catheter is taped around 4-5 cm deeper
# So most likely these suspiciously high LORs are missing decimal points
# Take the 100 largest lor_depth values, print them and plot their histogram
high_LORs = df.sort_values(by='lor_depth',ascending=False).head(100)['lor_depth']
print(high_LORs.to_list())
plt.hist(high_LORs)

# Catheter depths recorded on those same 100 rows, for comparison with the LOR values
print(df.sort_values(by='lor_depth',ascending=False).head(100)['anes_procedure_catheter_depth_2266'].to_list())

In [62]:
# prompt: lor_depth = lor_depth / 10 if lor_depth > 20

df['lor_depth'] = np.where(df['lor_depth'] > 20, df['lor_depth'] / 10, df['lor_depth'])

In [None]:
# Code to evaluate suspiciously high LORs
high_LORs = df.sort_values(by='lor_depth',ascending=False).head(100)['lor_depth']
print(high_LORs.to_list())
plt.hist(high_LORs)

## Make numerical columns numerical and plausible

In [64]:
# prompt: set these columns to dtype float: bmi_end_pregnancy_2044, maternal_weight_end_pregnancy_2045, maternal_height_2046,gravidity_2047,parity_2048

# Convert specified columns to float dtype
columns_to_convert = ['gestational_age_2052','bmi_end_pregnancy_2044', 'maternal_weight_end_pregnancy_2045', 'maternal_height_2046', 'gravidity_2047', 'parity_2048','baby_weight_2196','bmi_before_pregnancy_2161','secs_rom_thru_delivery_2197']

for col in columns_to_convert:
    # Membership is checked up front, so the conversion itself cannot raise
    # KeyError and needs no try/except around it.
    if col not in df.columns:
        print(f"Column '{col}' not found in the DataFrame.")
        continue
    # errors='coerce' turns unparseable entries into NaN; astype(float) forces a
    # float dtype even when every value parsed as an integer.
    df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)

In [65]:
# If ROM through Delivery is more than 30 days, assume erroneous and make it NaN
df['rom_thru_delivery_hours'] = df['secs_rom_thru_delivery_2197'] / 3600
df['rom_thru_delivery_hours'] = np.where(df['rom_thru_delivery_hours'] <= 30*24, df['rom_thru_delivery_hours'],np.nan)

TODO: add plausibility limits for the other numeric features as well.

## Handle proceduralist names

In [66]:
# Function to regulate names
def regulate_name(name):
    """Normalize a provider name string to 'First Last' in title case.

    name: raw name, e.g. 'Last, First Middle, MD' or 'First Last MD'

    Steps: strip degree/title tokens, turn 'Last, First' into 'First Last',
    collapse whitespace, drop everything between the first and last word,
    and title-case the result. Returns the normalized string (possibly '').
    """
    # Remove degrees and titles. The word boundary in front of the token list
    # makes sure only whole words are removed; without it the pattern also ate
    # the end of names such as "Adams" (-> "Ada") or "Fernando" (-> "Fernan").
    name = re.sub(r',?\s*\b(?:md|do|mbbs|phd|ms|mba|mph|msc|crna)\b', '', name, flags=re.IGNORECASE)

    # Split last name and first name if comma exists
    if ',' in name:
        last, first = name.split(',', 1)
        name = f"{first.strip()} {last.strip()}"

    # Remove extra spaces
    name = re.sub(r'\s+', ' ', name).strip()

    # Remove middle names
    parts = name.split()
    if len(parts) > 2:
        name = f"{parts[0]} {parts[-1]}"

    # Capitalize each part of the name
    name = name.title()

    return name

# Apply the function to regulate names
df['Regulated_Anesthesiologist_Name'] = df['anes_procedure_anesthesiologist_2255'].dropna().apply(regulate_name)
df['Regulated_Resident_Name'] = df['anes_procedure_resident_2256'].dropna().apply(regulate_name)

In [67]:
# prompt: set all blank Regulated_Anesthesiologist_Name and Regulated_Resident_Name to NaN

df['Regulated_Anesthesiologist_Name'] = df['Regulated_Anesthesiologist_Name'].replace('', np.nan)
df['Regulated_Resident_Name'] = df['Regulated_Resident_Name'].replace('', np.nan)

In [71]:
# prompt: For each catheter, count how many (i.e., earlier best_timestamp) catheters were done by that provider (including the current one)

# Chronological order, so each running total covers rows up to and including the current one
df = df.sort_values('best_timestamp')

# Running count of neuraxial catheters per provider, one output column per provider role
for provider_col, count_col in [
    ('Regulated_Anesthesiologist_Name', 'current_anesthesiologist_catheter_count'),
    ('Regulated_Resident_Name', 'current_resident_catheter_count'),
]:
    df[count_col] = df.groupby(provider_col)['is_neuraxial_catheter'].cumsum()

In [76]:
df['highly_experienced_anesthesiologist'] = np.where(df['current_anesthesiologist_catheter_count'] > 500, 'yes',
                                                    np.where(df['current_anesthesiologist_catheter_count'] <= 500, 'no', 'none'))

In [77]:
df['moderately_experienced_anesthesiologist'] = np.where(df['current_anesthesiologist_catheter_count'] > 100, 'yes',
                                                        np.where(df['current_anesthesiologist_catheter_count'] <= 100, 'no', 'none'))

In [78]:
# prompt: set df['highly_experienced_resident'] to 1 if current_resident_catheter_count > 50, to 0 if <= 50, and to -1 if NaN
# NOTE: unlike the prompt above, the code produces string labels: 'yes' (> 50),
# 'no' (<= 50) and 'none' (count is NaN, so neither comparison is True).

df['highly_experienced_resident'] = np.where(df['current_resident_catheter_count'] > 50, 'yes',
                                                    np.where(df['current_resident_catheter_count'] <= 50, 'no', 'none'))

## Feature engineering on categorical variables

In [82]:
df['has_scoliosis'] = df['icd_scoliosis_2053'] == True

In [84]:
df['has_dorsalgia'] = df['icd_dorsalgia_2104'] == True

In [86]:
# prompt: create a column "has_back_problems" that is 1 where any of the following are True, else 0. Handle NaN.

# Define the columns related to back problems
back_problem_cols = [
    'icd_scoliosis_2053',
    'icd_spinal_fusion_2056',
    'icd_congenital_deformity_spine_2059',
    'icd_ra_and_sctds_2086',
    'icd_kyphosis_and_lordosis_2089',
    'icd_spinal_osteochondrosis_2092',
    'icd_spondylopathies_and_deforming_dorsopathies_2095',
    'icd_intervertebral_disc_disorders_2098',
    'icd_ehlers_minus_danlos_2101',
]

# Note that spondyolopathies_and_deforming_dorsopathies are by far the biggest contributors

# Create the 'has_back_problems' column
df['has_back_problems'] = df[back_problem_cols].any(axis=1)

In [88]:
df['maternal_race'] = np.select(
    [
        df['maternal_race_2111'] == 'White',
        df['maternal_race_2111'] == 'Asian',
        df['maternal_race_2111'] == 'Black'
    ],
    [
        'White',
        'Asian',
        'Black'
    ],
    default='Other/Unknown'
)

In [90]:
composite_social_columns = [
    "drug_abuse_during_parent_2144",
    "high_risk_social_problems_parent_2154",
    "high_risk_insufficient_antenatal_care_parent_2157",
    "icd_major_mental_health_disorder_2178",
    "education_problems_2203",
    "employment_problems_2206",
    "adverse_occupational_2209",
    "housing_problems_2212",
    "adjustment_problems_2215",
    "relationship_problems_2218",
    "other_psychosocial_2221",
    "smoker_during_pregnancy_parent_2117",
    "drug_abuse_before_parent_2142",
    "alcohol_during_parent_2147",
]

df['composite_psychosocial_problems'] = df[composite_social_columns].any(axis=1)

In [92]:
# prompt: create column 'any_public_insurance' for any row where public_insurance_2114 contains the string "public", ignore case

# Assuming 'df' is your DataFrame.
df['any_public_insurance'] = df['public_insurance_2114'].str.contains('public', case=False, na=False)

In [94]:
# prompt: create a column maternal_language_english for any row where maternal_language is english

# Assuming 'df' is your DataFrame.
df['maternal_language_english'] = df['maternal_language_2113'] == 'english'

In [97]:
# prompt: create a column marital_status_married_or_partner for any row where marital_status_2184 is 'married' or 'partner'

# True only for the exact values 'married' or 'partner'; anything else (including missing) is False
df['marital_status_married_or_partner'] = df['marital_status_2184'].isin(['married', 'partner'])

In [100]:
# prompt: create a column country_of_origin_USA that is country_of_origin_2186 == united states

# Assuming 'df' is your DataFrame.
df['country_of_origin_USA'] = df['country_of_origin_2186'] == 'united states'

In [103]:
# prompt: create a column employment_status_fulltime that is employment_status_2187 == full time

df['employment_status_fulltime'] = df['employment_status_2187'] == 'full time'

In [106]:
# prompt: create a column epidural_needle_type based on anes_procedure_epidural_needle_2263 that can have values "tuohy","weiss", or "other"

# Create the 'epidural_needle_type' column based on 'anes_procedure_epidural_needle_2263'
df['epidural_needle_type'] = df['anes_procedure_epidural_needle_2263'].map({
    'tuohy': 'tuohy',
    'weiss': 'weiss',
}).fillna('other')

In [109]:
# prompt: create a column paresthesias_present that is anes_procedure_paresthesias_2270 either "transient" or "yes"

# True only for the exact values 'yes' or 'transient'; anything else (including missing) is False
df['paresthesias_present'] = df['anes_procedure_paresthesias_2270'].isin(['yes', 'transient'])

# Save processed data prior to analysis

In [117]:
complete_data = df.copy()

In [118]:
# Save the DataFrame to a pickle file
# NOTE(review): absolute, user-specific path (user "dfber"); the load cell further
# down reads from a different user directory, so the two must be kept in sync by hand.
complete_data.to_pickle("C:\\Users\\dfber\\OneDrive - Mass General Brigham\\Epidural project\\processed_merlin_data.pkl")

In [3]:
# prompt: Import libraries and open CSV

import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt


In [5]:
# Load the pickled DataFrame.
# The location is built from the current user's home directory, so it resolves
# to the synced OneDrive folder for whichever account runs the notebook
# (previously two hard-coded user paths were swapped by commenting lines in/out).
# Only unpickle files produced by this notebook: pickle can execute arbitrary code.
from pathlib import Path

processed_data_path = (
    Path.home()
    / "OneDrive - Mass General Brigham"
    / "Epidural project"
    / "processed_merlin_data.pkl"
)
complete_data = pd.read_pickle(processed_data_path)

# Now you can work with the DataFrame
complete_data.head()

Unnamed: 0,id,epic_pmrn,delivery_date,delivery_time,pregnancy_outcome,newborn_gestational_age,gestational_age_2052,anesthesia_epidural_2062,anesthesia_general_2063,anesthesia_local_2064,...,has_back_problems,maternal_race,composite_psychosocial_problems,any_public_insurance,maternal_language_english,marital_status_married_or_partner,country_of_origin_USA,employment_status_fulltime,epidural_needle_type,paresthesias_present
112804,103917,10041002948,2015-07-29,16:01:11-04:00,term,277.0,277.0,True,False,False,...,False,Black,False,True,True,False,False,False,other,False
61731,56802,10093675293,2015-07-30,15:40:00-04:00,term,289.0,289.0,True,False,False,...,False,Other/Unknown,False,False,True,False,False,False,other,False
78152,71970,10047033192,2015-07-29,13:54:00-04:00,term,274.0,274.0,True,False,False,...,False,White,False,False,True,True,True,True,other,False
53664,49385,10021864456,2015-07-29,02:39:00-04:00,term,261.0,261.0,True,False,False,...,False,Black,False,False,True,False,False,True,other,False
118656,109315,10054753336,2015-07-29,02:47:27-04:00,term,293.0,293.0,True,False,False,...,False,White,False,False,True,True,True,False,other,False


In [6]:
# Work on a copy so the loaded complete_data frame itself is left unmodified.
df = complete_data.copy()

# Manually analyze some successes and failures

In [7]:
# prompt: Choose 10 random failed_catheters and 10 random non-failed_catheters

# Partition the rows on the 'failed_catheter' column (compared against 1 / 0).
failed_catheters = df.loc[df['failed_catheter'].eq(1)]
non_failed_catheters = df.loc[df['failed_catheter'].eq(0)]

# Reproducible random draws of 10 rows from each partition (fixed seed).
random_failed_catheters = failed_catheters.sample(n=10, random_state=42)
random_non_failed_catheters = non_failed_catheters.sample(n=10, random_state=42)

# Fixed list of encounter IDs; selects every row of `df` belonging to one of
# these encounters, whatever that row's own failed_catheter value is.
chosen_failed_catheter_encounter_ids = [
    '3324914343',
    '3272008150',
    '3234765502',
    '3305371022',
    '3216449190',
    '3186345033',
    '3493903332',
    '3285273066',
    '3320528828',
    '3191160118',
]
chosen_failed_catheters = df.loc[
    df['anes_procedure_encounter_id_2273'].isin(chosen_failed_catheter_encounter_ids)
]

In [8]:
# Subset of columns shown when inspecting individual catheter rows in the
# cells below (each display cell indexes its frame with this list).
column_names = [
    "anes_procedure_encounter_id_2273",
    "best_timestamp",
    "failed_catheter",
    "true_procedure_type",
    "anes_procedure_note_id_2260",
    "subsequent_proof_of_failure_note_id",
    "Regulated_Anesthesiologist_Name",
    "Regulated_Resident_Name"
]

In [9]:
# Review columns for the randomly sampled failed catheters.
random_failed_catheters.loc[:, column_names]

Unnamed: 0,anes_procedure_encounter_id_2273,best_timestamp,failed_catheter,true_procedure_type,anes_procedure_note_id_2260,subsequent_proof_of_failure_note_id,Regulated_Anesthesiologist_Name,Regulated_Resident_Name
5730,3180441306,2018-01-28 01:46:00+00:00,True,epidural,1769250967,['1769313978'],,
38718,3474946221,2022-12-04 16:58:00+00:00,True,epidural,8528783093,['8532304936'],Chih King,Philipp Gerner
6326,3333764157,2020-12-26 04:02:00+00:00,True,epidural,4759071558,['4759569951'],Erin Haggerty,Maximilian Lang
86382,3488132286,2023-02-06 08:55:00+00:00,True,epidural,8878690071,"['8878992998', '8881177488', '8884025863']",Ken Solt,Shaunte Butler
134025,3150082575,2017-05-19 02:26:00+00:00,True,epidural,1447039096,['1447040650'],David Hepner,
47648,3422259685,2022-03-21 03:05:00+00:00,True,epidural,7129707169,['7129768418'],Arthur Chyan,Alexander Yue
129882,3281763503,2020-01-20 21:43:00+00:00,True,epidural,3277562408,['3278247297'],,
147049,3526924968,2023-08-02 17:31:00+00:00,True,epidural,9878477118,['9878519614'],Kyle Jespersen,
74417,3244702101,2019-05-31 02:28:00+00:00,True,epidural,2394024005,['2394052236'],Kyle Byrne,
107023,3480644898,2023-01-01 01:30:00+00:00,True,epidural,8681489228,"['8683322534', '8685649234']",Zhongcong Xie,


In [10]:
# Review columns for all rows of the hand-listed encounters.
chosen_failed_catheters.loc[:, column_names]

Unnamed: 0,anes_procedure_encounter_id_2273,best_timestamp,failed_catheter,true_procedure_type,anes_procedure_note_id_2260,subsequent_proof_of_failure_note_id,Regulated_Anesthesiologist_Name,Regulated_Resident_Name
90428,3186345033,2018-03-19 06:39:00+00:00,True,epidural,1827698916,['1827810492'],Tong-Yan Chen,Kayla Florio
90427,3186345033,2018-03-19 08:17:00+00:00,False,cse,1827810492,,Tong-Yan Chen,
33948,3191160118,2018-04-26 14:12:00+00:00,False,spinal,1875519865,,Mihaela Podovei,
33949,3191160118,2018-04-26 15:57:00+00:00,True,epidural,1875532828,['1875559634'],Mihaela Podovei,
33947,3191160118,2018-04-26 19:24:00+00:00,False,epidural,1875559634,,Mihaela Podovei,
39816,3216449190,2018-11-06 18:26:00+00:00,True,epidural,2126903072,"['2127240380', '2127290497']",,
39817,3216449190,2018-11-07 05:35:00+00:00,True,epidural,2127240380,['2127290497'],Laura Chang,
39814,3216449190,2018-11-07 07:39:00+00:00,False,cse,2127290497,,Laura Chang,
26063,3234765502,2019-03-25 05:11:00+00:00,True,epidural,2302729884,['2306197611'],Eric Cappiello,Yun-Yun Chen
26064,3234765502,2019-03-27 08:45:00+00:00,False,epidural,2306197611,,Naida Cole,


In [11]:
# Review columns for the randomly sampled non-failed catheters.
random_non_failed_catheters.loc[:, column_names]

Unnamed: 0,anes_procedure_encounter_id_2273,best_timestamp,failed_catheter,true_procedure_type,anes_procedure_note_id_2260,subsequent_proof_of_failure_note_id,Regulated_Anesthesiologist_Name,Regulated_Resident_Name
109392,3536338018.0,2023-09-11 19:55:00+00:00,False,epidural,10100448245.0,,Eric Cappiello,
101855,3149917647.0,2017-05-16 12:46:00+00:00,False,cse,1445285541.0,,Nikolai Gonzales,
4519,3141969762.0,2017-03-08 09:13:00+00:00,False,epidural,1372346488.0,,Bhavani Kodali,
75956,3207897823.0,2018-09-03 04:28:00+00:00,False,epidural,2043633003.0,,May Pian-Smith,Amanda Xi
25809,,NaT,False,,,,,
112765,3401982175.0,2021-12-07 07:03:00+00:00,False,epidural,6584978808.0,,,
61838,3169469877.0,2017-10-31 22:56:00+00:00,False,,1645760340.0,,,Stefanie Navarro
53732,3484845315.0,2023-01-20 19:44:00+00:00,False,epidural,8788881053.0,,Vesela Kovacheva,Jasmine Robinson
89956,3421542352.0,2022-03-17 07:04:00+00:00,False,epidural,7111714177.0,,David Hepner,Sarah Osmulski
54671,3250159925.0,2019-07-06 12:17:00+00:00,False,spinal,2446743360.0,,Eric Cappiello,Joseph Cerasuolo


In [12]:
# All rows recorded for encounter '3191160118' (full width, every column).
df.loc[df['anes_procedure_encounter_id_2273'].eq('3191160118')]

Unnamed: 0,id,epic_pmrn,delivery_date,delivery_time,pregnancy_outcome,newborn_gestational_age,gestational_age_2052,anesthesia_epidural_2062,anesthesia_general_2063,anesthesia_local_2064,...,has_back_problems,maternal_race,composite_psychosocial_problems,any_public_insurance,maternal_language_english,marital_status_married_or_partner,country_of_origin_USA,employment_status_fulltime,epidural_needle_type,paresthesias_present
33948,31235,10122469949,2018-04-26,21:07:14-05:00,term,287.0,287.0,True,False,False,...,False,Black,False,True,True,False,True,False,other,False
33949,31235,10122469949,2018-04-26,21:07:14-05:00,term,287.0,287.0,True,False,False,...,False,Black,False,True,True,False,True,False,weiss,False
33947,31235,10122469949,2018-04-26,21:07:14-05:00,term,287.0,287.0,True,False,False,...,False,Black,False,True,True,False,True,False,tuohy,False


In [22]:
# Fixed lists of encounter IDs and note IDs for MGH failed catheters; the note
# IDs are used by the following cell to pull the matching rows out of `df`.
chosen_failed_catheters_mgh_encounter_ids = ["3268447806", "3396191507", "3258959083", "3581696894", "3271964781", "3583787789", "3402989492", "3476124055", "3304131417", "3522418740"]
chosen_failed_catheter_mgh_note_ids = ['2903598031', '6426160113', '2535157730', '11282242570',
       '3002237621', '11340428769', '6612736939', '8559605944',
       '3947063203', '9788012155']

# Filter the DataFrame for failed catheters and delivery location 'mgh',
# then draw a reproducible sample of 10 rows (fixed seed).
random_failed_catheters_mgh = df[(df['failed_catheter'] == True) & (df['delivery_site_2188'] == 'mgh')].sample(n=10, random_state=42)

# Display the chosen sample. A notebook only renders a bare expression when it
# is the LAST statement of the cell, so this line must stay at the end
# (it was previously followed by assignments and therefore never shown).
random_failed_catheters_mgh[column_names]

In [23]:
# Rows whose note ID is in the hand-listed MGH set, restricted to the review columns.
df.loc[
    df['anes_procedure_note_id_2260'].isin(chosen_failed_catheter_mgh_note_ids),
    column_names,
]

Unnamed: 0,anes_procedure_encounter_id_2273,best_timestamp,failed_catheter,true_procedure_type,anes_procedure_note_id_2260,subsequent_proof_of_failure_note_id,Regulated_Anesthesiologist_Name,Regulated_Resident_Name
98644,3258959083,2019-09-03 11:37:00+00:00,True,epidural,2535157730,"['2535640891', '2536024256', '2536182051']",Lisa Leffert,Daewoong Lee
46820,3268447806,2019-11-08 19:51:00+00:00,True,epidural,2903598031,['2905158941'],Rebecca Minehart,
15623,3271964781,2019-11-27 03:50:00+00:00,True,epidural,3002237621,['3002752923'],Kate Cohen,Rupeng Li
64344,3304131417,2020-06-22 08:49:00+00:00,True,epidural,3947063203,['3947625413'],Dan Ellis,Peter Ochieng
46899,3396191507,2021-11-09 10:34:00+00:00,True,epidural,6426160113,['6431936120'],,Yasmin Fatemi
1145,3402989492,2021-12-13 04:15:00+00:00,True,epidural,6612736939,['6612902724'],Emily Naoum,Lukas Matern
76464,3476124055,2022-12-11 08:12:00+00:00,False,,8559605944,,Dan Ellis,Allison Dorogi
76461,3476124055,2022-12-11 08:12:00+00:00,True,epidural,8559605944,"['8561892026', '8566718755']",Dan Ellis,Allison Dorogi
31643,3522418740,2023-07-17 14:22:00+00:00,True,epidural,9788012155,['9789776533'],Hilary Gallin,Ryan Norman
34330,3581696894,2024-03-30 23:01:00+00:00,True,epidural,11282242570,['11282354203'],Gregory Ginsburg,Carl Pierre


# Reduce Table to Chosen Features

In [None]:
# prompt: print all columns as a list and make it easy to read over multiple lines

# Collect the column labels once, then print a 1-based numbered listing with
# each column's dtype next to its name.
all_columns = df.columns.tolist()

print("Columns of the DataFrame:")
for position, col in enumerate(all_columns, start=1):
    print(f"{position}. {col} ||| {df[col].dtype}")

In [141]:
# Columns kept for the reduced table; `df` is restricted to exactly this list
# (in this order) by the next cell. Entries that are commented out below are
# deliberately left out of the selection.
chosen_features = [
#    "id",
    "gestational_age_2052",
    "delivery_site_2188",
    "baby_weight_2196",
    "rom_thru_delivery_hours",
    "fetal_presentation_category_2243",
    "fetal_presentation_subcategory_2244",
    "fetal_presentation_position_2247",
    "bmi_end_pregnancy_2044",
    "maternal_weight_end_pregnancy_2045",
    "bmi_before_pregnancy_2161",
#    "zipcode_2185",
    "gravidity_2047",
    "parity_2048",
#    "anes_procedure_note_text_2271",
#    "best_timestamp",
    "true_procedure_type",
    "is_neuraxial_catheter",
    "failed_catheter",
    "dpe",
    "lor_depth",
    "current_resident_catheter_count",
    "highly_experienced_anesthesiologist",
    "highly_experienced_resident",
    "current_anesthesiologist_catheter_count",
    "moderately_experienced_anesthesiologist",
    "has_scoliosis",
    "has_dorsalgia",
    "has_back_problems",
    "maternal_race",
 #   "prior_pain_scores",
    "prior_pain_scores_max",
    "composite_psychosocial_problems",
    "any_public_insurance",
    "maternal_language_english",
    "marital_status_married_or_partner",
    "country_of_origin_USA",
    "employment_status_fulltime",
    "epidural_needle_type",
    "paresthesias_present",
]

In [142]:
# Restrict the working frame to the chosen feature columns, in list order.
df = df.loc[:, chosen_features]

In [1]:
# Recode True/False values as 1/0 across the whole frame (no column subset is given).
# NOTE(review): the saved output of this cell is a NameError and its execution
# count is 1, i.e. it was last run in a fresh kernel before `df` existed —
# re-run the notebook from the load cell before relying on this step.
df = df.replace({True: 1, False: 0})

NameError: name 'df' is not defined

In [None]:
# prompt: print all columns as a list and make it easy to read over multiple lines

# Same numbered name/dtype listing as before, now for the current `df`.
all_columns = df.columns.tolist()

print("Columns of the DataFrame:")
for position, col in enumerate(all_columns, start=1):
    print(f"{position}. {col} ||| {df[col].dtype}")

# Export reduced dataset to CSV

In [150]:
# Write the reduced table to the current user's Desktop without the index column.
# The folder is derived from the home directory instead of a hard-coded
# "C:\Users\<name>" prefix so the export works for whichever account runs it.
from pathlib import Path

minimal_csv_path = Path.home() / "Desktop" / "minimal_merlin_data.csv"
df.to_csv(minimal_csv_path, index=False)