# Imports

In [1]:
# prompt: Import libraries and open CSV

import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt
import random
import modules.testing as testing
import modules.data_cleaning_utils as dcu
from importlib import reload

# Data directory for all input and output files.
# Set the EPIDURAL_DATA_DIR environment variable to point at your own copy of the
# data; when it is unset, the original hard-coded OneDrive location is used.
# my_computer_fpath = "C:\\Users\\dfber\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"
import os

my_computer_fpath = os.environ.get(
    "EPIDURAL_DATA_DIR",
    "C:\\Users\\User\\OneDrive - Mass General Brigham\\Epidural project\\Data\\",
)
# Later cells build file paths by plain string concatenation
# (my_computer_fpath + "<file name>"), so the directory must end with a separator.
if not my_computer_fpath.endswith(("\\", "/")):
    my_computer_fpath += os.sep

In [None]:
# Choose which Merlin pull to load. Each *_is_incorrect flag is read by the
# datetime-handling cells below to decide whether a corrective helper is applied.

# Data from October Merlin pull
file_path = f"{my_computer_fpath}e26f9ccc-68a4-42b4-9d0d-508a83026a1c.csv"
delivery_datetime_is_incorrect = True
procedure_datetime_is_incorrect = True
procedure_starttime_is_incorrect = True

# # Data from January Merlin pull (uncomment this group and comment out the one above)
# file_path = f"{my_computer_fpath}befa2320-c0e0-476c-b66c-7fd2fb90179e.csv"
# delivery_datetime_is_incorrect = False
# procedure_datetime_is_incorrect = True
# procedure_starttime_is_incorrect = True

merlin_df = pd.read_csv(file_path)

In [None]:
# Show every column when a frame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

# Re-import the helper module, then start from a copy so merlin_df itself is not modified.
reload(dcu)
df = merlin_df.copy()
df.head()

In [None]:
df.shape

In [None]:
df['delivery_site_2188'].value_counts(dropna=False)

# Initial Data Cleaning

In [6]:
# Names of the anesthesia-procedure columns. This list is passed to the dcu helpers
# below (explode_separated_procedure_notes, label_and_drop_worse_versions_of_duplicates)
# and is extended with derived timestamp columns later in the notebook.
anes_procedure_cols: list = [
    "anes_procedure_type_2253",
    "anes_procedure_start_dts_2254",
    "anes_procedure_anesthesiologist_2255",
    "anes_procedure_resident_2256",
    "anes_procedure_pt_position_2257",
    "anes_procedure_approach_2258",
    "anes_procedure_location_2259",
    "anes_procedure_note_id_2260",
    "anes_procedure_dos_dts_2261",
    "anes_procedure_dpe_2262",
    "anes_procedure_epidural_needle_2263",
    "anes_procedure_epidural_needle_gauge_2264",
    "anes_procedure_lor_depth_2265",
    "anes_procedure_catheter_depth_2266",
    "anes_procedure_spinal_needle_type_2267",
    "anes_procedure_spinal_needle_gauge_2268",
    "anes_procedure_spinal_needle_length_2269",
    "anes_procedure_paresthesias_2270",
    "anes_procedure_note_text_2271",
    "anes_procedure_encounter_id_2273",
]

In [7]:
# Explode |-separated notes across the anesthesia-procedure columns
df = dcu.explode_separated_procedure_notes(
    df,
    anes_procedure_cols=anes_procedure_cols,
    delimiter="|",
)

In [None]:
# Count number of procedures by type
# Note that other procedure types, including Blood Patch but also A-lines,
# nerve blocks, and POCUS orders, are currently parsed by Merlin to NaN
df['anes_procedure_type_2253'].value_counts(dropna=False)

In [9]:
# Bring in RAW info
# The raw export is needed at the moment for two things:
#   - NotePurposeDSC, which helps eliminate near-duplicate notes
#   - the text used to RegEx the Number of Attempts
raw_info_fpath = f"{my_computer_fpath}Full Identified raw anesthesia_procedure_notes.csv"

# NOTE(review): raw_df is not referenced again in this notebook, and add_raw_info is
# given the file path rather than this frame -- confirm whether this read is still needed.
raw_df = pd.read_csv(raw_info_fpath)

df = dcu.add_raw_info(
    df,
    raw_info_fpath,
    processed_note_id_col='anes_procedure_note_id_2260',
    raw_info_cols=['NotePurposeDSC', 'NoteTXT'],
)
df = dcu.regex_note_text(df, desired_col='number_of_neuraxial_attempts')

## Handle datetime issues

Bug: Merlin is bringing anes_procedure_dos_dts_2261 as Eastern times when in fact they are UTC. I resolve this by editing the raw strings before conversion to datetime objects.

Bug: The same WAS true for delivery_time before it was corrected in Merlin in January. I resolve this by editing the raw strings before conversion to datetime objects.

Bug: Because delivery_date is stored separately from delivery_time, if the UTC change causes the time to go to the next day, this is NOT reflected in the updated delivery_date. This was also fixed in Merlin in January.

Bug: Merlin ignores AM/PM in anes_procedure_start_dts_2254 and assumes all entries are AM. I resolve this (for now) by ignoring these written start times and using only dos_dts (derived from anes_procedure_dos_dts_2261).

Validated times:
https://partnershealthcare-my.sharepoint.com/:x:/r/personal/dberenson_bwh_harvard_edu/_layouts/15/Doc.aspx?sourcedoc=%7BD674A3E1-815E-46B8-9AA4-16558C09411A%7D&file=Manually%20Verified%20Catheters.xlsx&action=default&mobileredirect=true&wdOrigin=OUTLOOK-METAOS.FILEBROWSER.FILES-FOLDER

In [None]:
df.loc[df['anes_procedure_note_id_2260'] == '2981389717',['delivery_date','delivery_time','anes_procedure_start_dts_2254','anes_procedure_dos_dts_2261']]

In [11]:
# Choose the delivery-datetime helper that matches the loaded pull: the "fix" variant
# when the pull's delivery timestamps are flagged incorrect, the "add" variant otherwise.
build_delivery_datetime = (
    dcu.fix_delivery_datetime
    if delivery_datetime_is_incorrect
    else dcu.add_delivery_datetime
)
df = build_delivery_datetime(df)

In [12]:
df['maternal_dob'] = pd.to_datetime(df['maternal_dob_2043'],utc=True)

In [13]:
# Produce df['dos_dts'] from anes_procedure_dos_dts_2261.
if procedure_datetime_is_incorrect:
    # The pull is flagged as having the dos_dts timezone bug: let the helper correct it.
    # (Presumably the helper also creates df['dos_dts'], which later cells read -- verify.)
    df = dcu.fix_procedure_dos_datetime(df)
else:
    # Parse as timezone-aware UTC, like maternal_dob and start_dts in the neighbouring
    # cells, so that filling start_dts from dos_dts and any later datetime arithmetic
    # never mixes tz-naive and tz-aware values.
    df['dos_dts'] = pd.to_datetime(df['anes_procedure_dos_dts_2261'], utc=True)

In [14]:
df['start_dts'] = pd.to_datetime(df['anes_procedure_start_dts_2254'],format='mixed',utc=True)

In [None]:
# Sorted time-of-day of every non-missing 'start_dts', to check whether the values
# cover all 24 h or only 12 h (the latter would indicate the AM/PM bug).
df['start_dts'].dropna().dt.time.sort_values()

In [16]:
# This code has been changed to avoid the AM/PM bug:
# when the written start times are flagged unreliable, use dos_dts alone;
# otherwise prefer start_dts and fall back to dos_dts where start_dts is missing.
df['best_timestamp'] = (
    df['dos_dts']
    if procedure_starttime_is_incorrect
    else df['start_dts'].fillna(df['dos_dts'])
)


In [17]:
# Register the derived timestamp columns as procedure columns.
# Guarded so that re-running this cell does not append the same names twice
# (a plain .extend() would leave duplicate column names in the list).
for derived_col in ('best_timestamp', 'dos_dts', 'start_dts'):
    if derived_col not in anes_procedure_cols:
        anes_procedure_cols.append(derived_col)

## Handle near-duplicate notes

There is also a column "NotePurposeDSC" in the raw EDW data that can be "ADDENDUM" or "NORMAL" or blank. When there are duplicate notes, the first one will be blank and subsequent ones will be ADDENDUM. I use this fact upstream and delete all the ones that are blank.

Then, I go through and delete other notes that appear to be duplicates. The majority of these are apparently due to TWINS: a single NoteID appears twice in the dataset because Merlin generates rows per birth rather than per mother. However, this CANNOT be done by simply eliminating non-unique NoteIDs, because "double-notes" (which appear in Epic as one note containing two concatenated procedure descriptions) also share a single NoteID. Instead, I drop rows where both the NoteID and the ProcedureType match.

Finally, I look within each encounter and check whether there are two notes of the same procedure type within a short minute_offset of each other. If so, I delete the less-complete note.

IMPORTANT: It turns out to be the case that there are sometimes, genuinely in Epic, two procedures done within only a few mins of each other. I AM CURRENTLY WORKING ON FIGURING OUT WHAT IS GOING ON WITH THESE.

In [None]:
df.shape

In [19]:
# Drop near-duplicate notes with blank NotePurposeDSC
df = df.dropna(subset=['NotePurposeDSC'])

In [None]:
df.shape

In [None]:
dcu.print_encounter(df,'3128029077') # double note
dcu.print_encounter(df,'3451276171') # known near-duplicate note (that is genuinely duplicated (actually, triplicated) in Epic)
dcu.print_encounter(df,'3188356337') # known near-duplicate note (that is duplicated due to twins)

In [22]:
# Drop near-duplicate notes with identical Procedure Type and NoteID (i.e., duplicated twins)
df = df.drop_duplicates(subset=['anes_procedure_type_2253','anes_procedure_note_id_2260'])

In [None]:
df.shape

In [None]:
dcu.print_encounter(df,'3128029077') # double note
dcu.print_encounter(df,'3451276171') # known near-duplicate note (that is genuinely duplicated (actually, triplicated) in Epic)
dcu.print_encounter(df,'3188356337') # known near-duplicate note (that is duplicated due to twins)

In [25]:
saved_df = df

In [26]:
reload(dcu)
df = saved_df.copy()

In [None]:
df.shape

In [None]:
df = dcu.label_and_drop_worse_versions_of_duplicates(df, anes_procedure_cols, minute_offset=10, drop=True)

When minute_offset = 60, there are 564 near-duplicates identified.\
When minute_offset = 30, there are 310 near-duplicates identified.\
When minute_offset = 10, there are 175 near-duplicates identified.\
When minute_offset = 2, there are 97 near-duplicates identified.\
When minute_offset = 1, there are 63 near-duplicates identified.\
When minute_offset = 0, there are 12 near-duplicates identified.

I manually evaluated about twenty. If the minute_offset is 0-10, there is a mix of duplicate notes and replacements/multiple attempts. If the minute_offset is > 10, I found only true replacements (commonly due to a positive test dose). Therefore I will use minute_offset = 10.

In [None]:
df.shape

In [None]:
dcu.print_encounter(df,'3128029077') # double note
dcu.print_encounter(df,'3451276171') # known near-duplicate note (that is genuinely duplicated (actually, triplicated) in Epic)
dcu.print_encounter(df,'3188356337') # known near-duplicate note (that is duplicated due to twins)

## Address cases where an epidural note followed by a spinal note is actually a planned CSE, not a failed catheter. Also address what 'epidural/intrathecal' really means.

"Secret CSEs" are a spinal note and an epidural note recorded within 5 minutes of each other.

Epidural/intrathecal notes are declared epidural unless *** (**TODO:** fill in the exception).

In [31]:
saved_df = df

In [32]:
reload(dcu)
df = saved_df.copy()

In [None]:
df = dcu.process_secret_CSEs(df, minute_offset=5)

In [34]:
saved_df = df

In [35]:
reload(dcu)
df = saved_df.copy()

In [36]:
df = dcu.classify_true_procedure_type(df, intelligent=False)

# Classify failures

In [37]:
saved_df = df

In [53]:
reload(dcu)
df = saved_df.copy()

In [None]:
df = dcu.label_failed_catheters(df)

# Additional Data Cleaning and Feature Engineering

## Count prior failed neuraxials in this encounter and failed and total across all encounters

Takes ~8 mins

In [40]:
df = dcu.count_prior_catheters(df)

## Handle timeseries data (e.g., pain scores)

In [None]:
df = dcu.handle_pain_scores(df)

## Clean DPE and LOR_Depth

In [46]:
df = dcu.handle_dpe(df)

In [47]:
df = dcu.handle_lor_depth(df)

## Make numerical columns numerical

In [None]:
# Columns handed to dcu.numerify_columns for conversion, one per line for readability.
df = dcu.numerify_columns(
    df,
    columns_to_convert=[
        'gestational_age_2052',
        'bmi_end_pregnancy_2044',
        'maternal_weight_end_pregnancy_2045',
        'maternal_height_2046',
        'gravidity_2047',
        'parity_2048',
        'baby_weight_2196',
        'bmi_before_pregnancy_2161',
        'secs_rom_thru_delivery_2197',
    ],
)

## Calculate and plausibilify elapsed times

In [52]:
df = dcu.handle_elapsed_times(df)

Include other limits on plausible data for each feature

## Handle proceduralist names

In [None]:
df = dcu.handle_anesthesiologists(df)

## Feature engineering on categorical variables

In [None]:
df = dcu.engineer_categorical_variables(df)

## Create a new unique identifier based on epic_pmrn

In [None]:
# create_unique_id is neither defined nor imported in this notebook, so the bare call
# raises NameError on a fresh kernel. Every other cleaning helper is called through the
# dcu module, so call it there as well.
# NOTE(review): confirm that create_unique_id lives in modules.data_cleaning_utils.
df = dcu.create_unique_id(df)

# Save processed data prior to analysis

In [137]:
complete_data = df.copy()

In [138]:
# Save the DataFrame to a pickle file
complete_data.to_pickle(my_computer_fpath + "processed_merlin_data.pkl")

In [139]:
# prompt: Import libraries and open CSV

import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt
import random


In [None]:
# Load the pickled DataFrame
complete_data = pd.read_pickle(my_computer_fpath + "processed_merlin_data.pkl")

# Now you can work with the DataFrame
complete_data.head()

In [67]:
df = complete_data.copy()

# Reduce Table to Chosen Features

In [None]:
# List every column of df with its 1-based position and dtype, one per line.
all_columns = df.columns.tolist()

print("Columns of the DataFrame:")
for position, col in enumerate(all_columns, start=1):
    print(f"{position}. {col} ||| {df[col].dtype}")

In [69]:
# Columns kept in the reduced table: the next cell selects df[chosen_features].
# Commented-out entries are deliberately left out of that selection.
chosen_features = [
#    "id",
    "unique_pt_id",
    "anes_procedure_encounter_id_2273",
    "gestational_age_weeks",
    "delivery_site",
    "delivery_site_bwh",
    "baby_weight_2196",
    "rom_thru_delivery_hours",
    "fetal_presentation_category_2243",
    "fetal_presentation_position_2247",
    "bmi_end_pregnancy_2044",
    "maternal_weight_end_pregnancy_2045",
    "bmi_before_pregnancy_2161",
#    "zipcode_2185",
    "gravidity_2047",
    "parity_2048",
#    "anes_procedure_note_text_2271",
#    "best_timestamp",
#    "true_procedure_type",
    "is_neuraxial_catheter",
    "failed_catheter",
    "has_subsequent_neuraxial_catheter",
    "has_subsequent_spinal",
    "has_subsequent_airway",
#    "dpe",
    "lor_depth",
    "current_resident_catheter_count",
    "highly_experienced_anesthesiologist",
    "highly_experienced_resident",
    "current_anesthesiologist_catheter_count",
    "moderately_experienced_anesthesiologist",
    "has_scoliosis",
    "has_dorsalgia",
    "has_back_problems",
    "maternal_race",
    "maternal_ethnicity",
#    "prior_pain_scores",
    "prior_pain_scores_max",
    "composite_psychosocial_problems",
    "only_private_insurance",
    "maternal_language_english",
    "marital_status_married_or_partner",
    "country_of_origin_USA",
    "employment_status_fulltime",
    "composite_SES_advantage",
    "epidural_needle_type",
    "paresthesias_present",
    "number_of_neuraxial_attempts",
    "prior_failed_catheters_this_enc",
    "prior_failed_catheters_prev_enc",
    "prior_all_catheters_all_enc",
    "true_procedure_type_incl_dpe",
    "maternal_age_years",
    "placement_to_delivery_hours",
    "labor_induction",
    "position_posterior_or_transverse",
    "presentation_cephalic"
]

In [70]:
df = df[chosen_features]

In [None]:
df = df.replace({True: 1, False: 0})

In [None]:
# List every column of df with its 1-based position and dtype, one per line.
all_columns = df.columns.tolist()

print("Columns of the DataFrame:")
for position, col in enumerate(all_columns, start=1):
    print(f"{position}. {col} ||| {df[col].dtype}")

# Download

In [73]:
df.to_csv(my_computer_fpath + 'processed_merlin_data.csv', index=False)