# Imports

In [None]:
# prompt: Import libraries and open CSV

import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt
import random
import modules.testing as testing
import modules.data_cleaning_utils as dcu
from importlib import reload

# my_computer_fpath = "C:\\Users\\dfber\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"
my_computer_fpath = "C:\\Users\\User\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"

In [None]:
# Select which Merlin data pull to load: exactly one of the sections below
# should be uncommented. Each pull comes with three flags recording which of
# the known Merlin datetime bugs are still present in that pull; later cells
# branch on these flags to decide whether a column needs repairing.

# # Data from October Merlin pull
# file_path = my_computer_fpath + "e26f9ccc-68a4-42b4-9d0d-508a83026a1c.csv"
# delivery_datetime_is_incorrect = True
# procedure_datetime_is_incorrect = True
# procedure_starttime_is_incorrect = True

# # Data from January Merlin pull
# file_path = my_computer_fpath + "ccfaad4e-0523-4fe8-bc87-150370deef90.csv"
# delivery_datetime_is_incorrect = False
# procedure_datetime_is_incorrect = True
# procedure_starttime_is_incorrect = True

# Data from March Merlin pull
file_path = my_computer_fpath + "55670a61-a439-48a2-9f1e-acf1e4156730.csv"
delivery_datetime_is_incorrect = False
procedure_datetime_is_incorrect = True
procedure_starttime_is_incorrect = True


# Raw pull, kept untouched; later cells work on a copy of it.
merlin_df = pd.read_csv(file_path)

In [None]:
reload(dcu)
df = merlin_df.copy()
pd.set_option("display.max_columns", None)
df.head()

In [None]:
df.shape

In [None]:
df['delivery_site_2188'].value_counts(dropna=False)

# Initial Data Cleaning

In [None]:
# Columns that describe one anesthesia procedure note. The list is handed to
# the dcu helpers that explode the delimiter-separated notes and that compare
# near-duplicate notes, and it is extended later with derived timestamp columns.
anes_procedure_cols: list = [
    'anes_procedure_type_2253', 
    'anes_procedure_start_dts_2254', 
    'anes_procedure_anesthesiologist_2255', 
    'anes_procedure_resident_2256', 
    'anes_procedure_pt_position_2257', 
    'anes_procedure_approach_2258', 
    'anes_procedure_location_2259', 
    'anes_procedure_note_id_2260', 
    'anes_procedure_dos_dts_2261', 
    'anes_procedure_dpe_2262', 
    'anes_procedure_epidural_needle_2263', 
    'anes_procedure_epidural_needle_gauge_2264', 
    'anes_procedure_lor_depth_2265', 
    'anes_procedure_catheter_depth_2266', 
    'anes_procedure_spinal_needle_type_2267', 
    'anes_procedure_spinal_needle_gauge_2268', 
    'anes_procedure_spinal_needle_length_2269', 
    'anes_procedure_paresthesias_2270', 
    'anes_procedure_note_text_2271',
    'anes_procedure_encounter_id_2273'
]

In [None]:
# Explode |-separated notes
df = dcu.explode_separated_procedure_notes(df, anes_procedure_cols=anes_procedure_cols, delimiter="|")

In [None]:
# Count number of procedures by type
# Note that other procedure types, including Blood Patch but also A-lines,
# nerve blocks, and POCUS orders, are currently parsed by Merlin to NaN
df['anes_procedure_type_2253'].value_counts(dropna=False)

In [None]:
# Bring in RAW info
# This is needed at the moment to get the NotePurposeDSC (to help eliminate near-duplicate notes)
# and also to RegEx the Number of Attempts

raw_info_fpath = my_computer_fpath + "Full Identified raw anesthesia_procedure_notes.csv"
# NOTE(review): raw_df is not referenced again in the visible notebook --
# add_raw_info below is given the file path, not this frame. Presumably kept
# for ad-hoc inspection; confirm before removing, since it reads the whole
# identified raw file into memory.
raw_df = pd.read_csv(raw_info_fpath)
# Attach the raw NotePurposeDSC and NoteTXT columns, using
# anes_procedure_note_id_2260 as the processed note ID column.
df = dcu.add_raw_info(df, raw_info_fpath, processed_note_id_col = 'anes_procedure_note_id_2260', raw_info_cols = ['NotePurposeDSC','NoteTXT'])
# Produce number_of_neuraxial_attempts (presumably parsed from NoteTXT -- confirm in dcu).
df = dcu.regex_note_text(df, desired_col = 'number_of_neuraxial_attempts')

## Handle datetime issues

Bug: Merlin is bringing anes_procedure_dos_dts_2261 as Eastern times when in fact they are UTC. I resolve this by editing the raw strings before conversion to datetime objects.

Bug: The same WAS true for delivery_time before it was corrected in Merlin in January. I resolve this by editing the raw strings before conversion to datetime objects.

Bug: Because delivery_date is stored separately from delivery_time, if the UTC change causes the time to go to the next day, this is NOT reflected in the updated delivery_date. This was also fixed in Merlin in January.

Bug: Merlin ignores AM/PM in anes_procedure_start_dts_2254 and assumes all entries are AM. I resolve this (for now) by ignoring these written start times and using dos_dts alone as the best timestamp.

Validated times:
https://partnershealthcare-my.sharepoint.com/:x:/r/personal/dberenson_bwh_harvard_edu/_layouts/15/Doc.aspx?sourcedoc=%7BD674A3E1-815E-46B8-9AA4-16558C09411A%7D&file=Manually%20Verified%20Catheters.xlsx&action=default&mobileredirect=true&wdOrigin=OUTLOOK-METAOS.FILEBROWSER.FILES-FOLDER

In [None]:
df.loc[df['anes_procedure_note_id_2260'] == '2981389717',['delivery_date','delivery_time','anes_procedure_start_dts_2254','anes_procedure_dos_dts_2261']]

In [None]:
# Pulls that still carry the delivery-time bug go through the repairing
# helper; corrected pulls go through the plain helper instead.
delivery_datetime_handler = (
    dcu.fix_delivery_datetime
    if delivery_datetime_is_incorrect
    else dcu.add_delivery_datetime
)
df = delivery_datetime_handler(df)

In [None]:
df['maternal_dob'] = pd.to_datetime(df['maternal_dob_2043'],utc=True)

In [None]:
# Build dos_dts from anes_procedure_dos_dts_2261.
# While Merlin still mislabels these timestamps, the dcu helper repairs them.
# Once a pull is correct, parse them directly -- but as timezone-aware UTC,
# matching how start_dts and maternal_dob are parsed (utc=True). best_timestamp
# fills gaps in start_dts from dos_dts, so the two columns must not mix
# tz-naive and tz-aware values.
# NOTE(review): assumes fix_procedure_dos_datetime also yields UTC-aware
# values -- confirm in dcu.
if procedure_datetime_is_incorrect:
    df = dcu.fix_procedure_dos_datetime(df)
else:
    df['dos_dts'] = pd.to_datetime(df['anes_procedure_dos_dts_2261'], utc=True)

In [None]:
df['start_dts'] = pd.to_datetime(df['anes_procedure_start_dts_2254'],format='mixed',utc=True)

In [None]:
# Extract the time part of the 'start_dts' column to check whether it covers all 24 h or only 12 h due to AM/PM bug
df[df['start_dts'].notna()]['start_dts'].dt.time.sort_values()

In [None]:
# The written start time cannot be trusted while the AM/PM bug is present, so
# in that case dos_dts is used on its own; otherwise start_dts takes priority
# and dos_dts only fills its gaps.
df['best_timestamp'] = (
    df['dos_dts']
    if procedure_starttime_is_incorrect
    else df['start_dts'].fillna(df['dos_dts'])
)


In [None]:
anes_procedure_cols.extend(['best_timestamp', 'dos_dts', 'start_dts'])

In [None]:
df.shape

It would be great to look at the title of the anesthesia encounter and eliminate ones other than Labor Epidural or CS, rather than relying on the time narrowing below

In [None]:
df = dcu.calculate_and_narrow_time_from_placement_to_delivery(df)

In [None]:
df.shape

## Handle near-duplicate notes

There is also a column "NotePurposeDSC" in the raw EDW data that can be "ADDENDUM" or "NORMAL" or blank. When there are duplicate notes, the first one will be blank and subsequent ones will be ADDENDUM. I use this fact below and drop every row whose NotePurposeDSC is blank.

Then, I go through and delete other notes that appear to be duplicates. The majority of these are apparently due to TWINS, where a single NoteID appears twice in the dataset due to how Merlin generates it birthwise rather than momwise. However, it CANNOT be done by just eliminating non-unique NoteIDs, as "double-notes" (which appear in Epic as one note that has two procedure descriptions concatenated together) also have the same NoteID. Instead, I drop rows where both the NoteID and the ProcedureType match.

Finally, I look within each encounter and check if there are two notes that are the same procedure type and within a short minute_offset of each other. If so, I delete the less-complete note.

IMPORTANT: It turns out to be the case that there are sometimes, genuinely in Epic, two procedures done within only a few mins of each other.

In [None]:
df.shape

In [None]:
# Drop near-duplicate notes with blank NotePurposeDSC
df = df.dropna(subset=['NotePurposeDSC'])

In [None]:
df.shape

In [None]:
dcu.print_encounter(df,'3128029077') # double note
dcu.print_encounter(df,'3451276171') # known near-duplicate note (that is genuinely duplicated (actually, triplicated) in Epic)
dcu.print_encounter(df,'3188356337') # known near-duplicate note (that is duplicated due to twins)

In [None]:
# Drop near-duplicate notes with identical Procedure Type and NoteID (i.e., duplicated twins)
df = df.drop_duplicates(subset=['anes_procedure_type_2253','anes_procedure_note_id_2260'])

In [None]:
df.shape

In [None]:
dcu.print_encounter(df,'3128029077') # double note
dcu.print_encounter(df,'3451276171') # known near-duplicate note (that is genuinely duplicated (actually, triplicated) in Epic)
dcu.print_encounter(df,'3188356337') # known near-duplicate note (that is duplicated due to twins)

In [None]:
saved_df = df

In [None]:
reload(dcu)
df = saved_df.copy()

In [None]:
df.shape

In [None]:
df = dcu.label_and_drop_worse_versions_of_duplicates(df, anes_procedure_cols, minute_offset=10, drop=True)

When minute_offset = 60, there are 564 near-duplicates identified.\
When minute_offset = 30, there are 310 near-duplicates identified.\
When minute_offset = 10, there are 175 near-duplicates identified.\
When minute_offset = 2, there are 97 near-duplicates identified.\
When minute_offset = 1, there are 63 near-duplicates identified.\
When minute_offset = 0, there are 12 near-duplicates identified.

I manually evaluated about twenty. If the minute_offset is 0-10, there are a mix of duplicate notes vs replacements/multiple attempts. If the minute_offset > 10, I found only true replacements (commonly due to positive test dose). Therefore I will use minute_offset = 10.

In [None]:
df.shape

In [None]:
dcu.print_encounter(df,'3128029077') # double note
dcu.print_encounter(df,'3451276171') # known near-duplicate note (that is genuinely duplicated (actually, triplicated) in Epic)
dcu.print_encounter(df,'3188356337') # known near-duplicate note (that is duplicated due to twins)

## Address cases where an epidural note followed by a spinal note is actually a planned CSE, not a failed catheter. Also address what 'epidural/intrathecal' really means.

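A "secret CSE" is a spinal note and an epidural note within 5 minutes of each other (`minute_offset=5` below).

Epidural/intrathecal notes are declared epidural unless *** (TODO: this exception has not been written down yet; the classification below currently runs with `intelligent=False`).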

In [None]:
df = dcu.process_secret_CSEs(df, minute_offset=5)

In [None]:
df = dcu.classify_true_procedure_type(df, intelligent=False)

# Classify failures

In [None]:
df = dcu.label_failed_catheters(df)

# Additional Data Cleaning and Feature Engineering

## Count prior failed neuraxials in this encounter and failed and total across all encounters

Takes ~8 mins

In [None]:
df = dcu.count_prior_catheters(df)

## Handle timeseries data (e.g., pain scores)

In [None]:
df = dcu.handle_pain_scores(df)

In [None]:
df['prior_pain_scores_max'].value_counts(dropna=False)

In [None]:
reload(dcu)

In [None]:
df = dcu.handle_cmi_scores(df)

## Clean DPE and LOR_Depth

In [None]:
df = dcu.handle_dpe(df)

In [None]:
df = dcu.handle_lor_depth(df)

## Make numerical columns numerical

In [None]:
# Raw Merlin columns to pass to numerify_columns as columns_to_convert.
columns_needing_numeric_dtype = [
    'gestational_age_2052',
    'bmi_end_pregnancy_2044',
    'maternal_weight_end_pregnancy_2045',
    'maternal_height_2046',
    'gravidity_2047',
    'parity_2048',
    'baby_weight_2196',
    'bmi_before_pregnancy_2161',
    'secs_rom_thru_delivery_2197',
]
df = dcu.numerify_columns(df, columns_to_convert=columns_needing_numeric_dtype)

## Engineer unexpected_delta_LOR

In [None]:
df = dcu.engineer_unexpected_delta_LOR(df)

## Calculate and plausibilify elapsed times

In [None]:
(df['secs_rom_thru_delivery_2197']/3600).describe(percentiles=[0.05,0.25,0.5,0.75,0.95,0.99,0.999])

In [None]:
df = dcu.convert_elapsed_times(df)

## Handle proceduralist names

In [None]:
df = dcu.handle_anesthesiologists(df)

## Feature engineering on categorical variables

In [None]:
df = dcu.engineer_categorical_variables(df)

## Create a new unique identifier based on epic_pmrn

In [None]:
df = dcu.create_unique_id(df)

# Save processed data prior to analysis

In [None]:
complete_data = df.copy()

In [None]:
# Save the DataFrame to a pickle file
complete_data.to_pickle(my_computer_fpath + "processed_merlin_data.pkl")

In [None]:
# prompt: Import libraries and open CSV

import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt
import random
import modules.testing as testing
import modules.data_cleaning_utils as dcu
import modules.data_table_utils as dtu
from importlib import reload

# my_computer_fpath = "C:\\Users\\dfber\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"
my_computer_fpath = "C:\\Users\\User\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"

In [None]:
# Load the pickled DataFrame
complete_data = pd.read_pickle(my_computer_fpath + "processed_merlin_data.pkl")

# Now you can work with the DataFrame
complete_data.head()

In [None]:
df = complete_data.copy()

# Reduce Table to Chosen Features

In [None]:
# Show every column of df with its dtype, numbered from 1, one per line.
all_columns = df.columns.tolist()

print("Columns of the DataFrame:")
for position, column_name in enumerate(all_columns, start=1):
    print(f"{position}. {column_name} ||| {df[column_name].dtype}")

In [None]:
# Columns kept for the analysis table: df is subset to exactly this list in a
# later cell. Commented-out entries are currently excluded (presumably kept
# here for reference -- confirm before deleting them).
chosen_features = [
    #    "id",
    "unique_pt_id",
    "anes_procedure_encounter_id_2273",
    "is_neuraxial_catheter",
    "failed_catheter",
    "has_subsequent_neuraxial_catheter",
    "has_subsequent_spinal",
    "has_subsequent_airway",
    #    "best_timestamp",
    "placement_to_delivery_hours",
    "rom_thru_delivery_hours",
    "rom_to_placement_hours",
    "maternal_age_years",
    "gravidity_2047",
    "parity_2048",
    "multiple_gestation",
    "labor_induction",
    "gestational_age_weeks",
    "baby_weight_2196",
    "fetal_position_is_posterior_or_transverse",
    "fetal_position",
    "fetal_presentation_is_cephalic",
    "fetal_presentation",
    "bmi_end_pregnancy_2044",
    "bmi_greater_than_40",
    "maternal_weight_end_pregnancy_2045",
    "bmi_before_pregnancy_2161",
    "delivery_site_is_bwh",
    "delivery_site",
    "has_resident",
    "has_anesthesiologist",
    "current_anesthesiologist_catheter_count",
    "current_resident_catheter_count", 
    "total_team_catheter_count",
    "anesthesiologist_experience_category",
    "resident_experience_category",
    "high_bmi_and_highly_experienced_resident",
    "high_bmi_and_lowly_experienced_resident",
    "high_bmi_and_no_resident",
    "high_bmi_and_highly_experienced_anesthesiologist",
    "scoliosis_and_highly_experienced_resident",
    "scoliosis_and_lowly_experienced_resident",
    "scoliosis_and_no_resident",
    "scoliosis_and_highly_experienced_anesthesiologist",
    "high_bmi_and_scoliosis",
    "has_scoliosis",
    "has_dorsalgia",
    "has_back_problems",
    "maternal_race",
    "maternal_ethnicity",
    "prior_ob_cmi_scores_max",
    "CS_hx",
    "high_risk_current_pregnancy",
    "high_risk_hx",
    "iufd",
    "composite_psychosocial_problems",
    "only_private_insurance",
    "maternal_language_english",
    "marital_status_married_or_partner",
    "country_of_origin_USA",
    "employment_status_fulltime",
    "composite_SES_advantage",
    #    "anes_procedure_note_text_2271",
    #    "true_procedure_type",
    #    "dpe",
    "true_procedure_type_incl_dpe",
    "lor_depth",
    "predicted_lor_depth",
    "unexpected_delta_lor",
    "unexpected_delta_lor_squared",
    "epidural_needle_type",
    "prior_pain_scores_max",
    "paresthesias_present",
    "number_of_neuraxial_attempts",
    "prior_failed_catheters_this_enc",
    "prior_failed_catheters_prev_enc",
    "prior_all_catheters_all_enc",
]

In [None]:
# List the columns of df that did not make it into chosen_features,
# numbered from 1, each with its dtype.
print("Excluded Columns:")
excluded_columns = [name for name in all_columns if name not in chosen_features]
for rank, name in enumerate(excluded_columns, start=1):
    print(f"{rank}. {name} ||| {df[name].dtype}")

In [None]:
df = df[chosen_features]

In [None]:
# Recode True/False values as 1/0 across the whole frame.
# NOTE(review): pandas >= 2.2 deprecates the implicit dtype downcasting that
# replace() performs on object columns; after a pandas upgrade, confirm that
# the recoded columns still come out as numeric dtypes rather than object.
df = df.replace({True: 1, False: 0})

In [None]:
# Re-list the columns and dtypes now that df holds only the chosen features
# and booleans have been recoded.
all_columns = df.columns.tolist()

print("Columns of the DataFrame:")
for position, column_name in enumerate(all_columns, start=1):
    print(f"{position}. {column_name} ||| {df[column_name].dtype}")

# Download

In [None]:
df.to_csv(my_computer_fpath + 'processed_merlin_data.csv', index=False)

# Limit to neuraxial catheters

In [None]:
df.shape

In [None]:
# Keep only the neuraxial-catheter rows, then remove the indicator column,
# which is constant (== 1) after the filter.
# The drop is chained and returns a new frame instead of running in place on
# the result of .loc: an in-place drop on a row slice raises
# SettingWithCopyWarning and leaves df flagged as a slice for later assignments.
df = (
    df.loc[df['is_neuraxial_catheter'] == 1, :]
    .drop(columns=['is_neuraxial_catheter'])
)

In [None]:
df.shape

# Create Table One

In [None]:
dtu.describe_dataframe(df)

In [None]:
cat_table, num_table = dtu.describe_as_tables(df)
table_one = dtu.create_table_one(cat_table, num_table)

In [None]:
# Build Table One separately for failed and for successful catheters.
# (The spelling of succeses_num_table is kept as-is in case other cells use it.)
failures_cat_table, failures_num_table = dtu.describe_as_tables(df[df['failed_catheter'] == 1])
successes_cat_table, succeses_num_table = dtu.describe_as_tables(df[df['failed_catheter'] == 0])
failures_table_one = dtu.create_table_one(failures_cat_table, failures_num_table)
successes_table_one = dtu.create_table_one(successes_cat_table, succeses_num_table)

# Put the overall table first, then the failure and success tables beside it,
# joined on 'Variable'. Left joins keep every row of the overall table even
# when a variable is missing from a subgroup; clashing column names from the
# subgroup tables get the '_failures' / '_successes' suffixes.
# An earlier success-vs-failure-only merge assigned to the same name was
# overwritten immediately and has been removed as dead code.
table_one_by_failure_status = (
    table_one
    .merge(failures_table_one, on='Variable', suffixes=('', '_failures'), how='left')
    .merge(successes_table_one, on='Variable', suffixes=('', '_successes'), how='left')
)


# Noteworthy columns

## Identifier columns

In [None]:
identifier_cols = ['anes_procedure_encounter_id_2273','unique_pt_id']

## Outcome column

In [None]:
outcome_col = 'failed_catheter'

## Highly correlated columns

Depending on the algorithm used, it may be wise to drop these prior to regression.


In [None]:
# Columns that are strongly correlated with another retained column; each
# comment names the column it overlaps with.
# Every entry must be a column of the reduced df (i.e. a member of
# chosen_features), otherwise dropping this list raises KeyError. The last two
# entries previously used the raw Merlin names (fetal_presentation_position_2247,
# fetal_presentation_category_2243), which are not in chosen_features; they now
# use the engineered names that are.
# NOTE(review): assumes fetal_position / fetal_presentation are the engineered
# counterparts of those raw columns -- confirm in dcu.
correlated_cols = [
    'current_anesthesiologist_catheter_count', # correlated with categorical experience variables
    'current_resident_catheter_count', # correlated with categorical experience variables
    'gravidity_2047', # correlated with parity
    'bmi_before_pregnancy_2161', # correlated with BMI end pregnancy
    'maternal_weight_end_pregnancy_2045', # correlated with BMI end pregnancy
    "only_private_insurance", # correlated with composite_SES_advantage
    "maternal_language_english", # correlated with composite_SES_advantage
    "marital_status_married_or_partner", # correlated with composite_SES_advantage
    "country_of_origin_USA", # correlated with composite_SES_advantage
    "employment_status_fulltime", # correlated with composite_SES_advantage
    'epidural_needle_type', # correlated with delivery location
    'maternal_ethnicity', # correlated with race
    "delivery_site", # correlated with delivery_site_is_bwh
    "fetal_position", # correlated with fetal_position_is_posterior_or_transverse
    "fetal_presentation", # correlated with fetal_presentation_is_cephalic
    ]

## Non-predictive columns

In [None]:
non_predictive_columns = ['maternal_race','has_scoliosis','composite_SES_advantage']

## Data leakage columns

In [None]:
data_leakage_columns = ['rom_thru_delivery_hours','placement_to_delivery_hours']

# Finalize data

## Impute missing data

In [None]:
from sklearn.impute import SimpleImputer

# Work on an independent copy: df was produced by row-filtering an earlier
# frame, and assigning columns into such a result can raise
# SettingWithCopyWarning.
df = df.copy()

# NOTE(review): columns are selected purely by dtype, so identifier_cols and
# outcome_col would be imputed too if they are float64/object and contain
# gaps -- confirm that this is intended.

# For numeric columns, impute with the median
num_imputer = SimpleImputer(strategy='median')

# Identify numeric columns
numeric_cols = df.select_dtypes(include=['float64']).columns

# Fit and transform the numeric columns (skipped when there are none, because
# SimpleImputer refuses an input with zero features)
if len(numeric_cols) > 0:
    df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

# For categorical columns, impute with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')

# Identify categorical columns (adjust if your dtypes are different)
categorical_cols = df.select_dtypes(include=['object']).columns

# Fit and transform the categorical columns (same empty-selection guard)
if len(categorical_cols) > 0:
    df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

# Note that Boolean columns, which are converted to int64, are not imputed because they have no missing values

# Download Imputed Data

In [None]:
df.to_csv(my_computer_fpath + 'processed_and_imputed_merlin_data.csv', index=False)