In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

# Machine-specific data directory. The string ends with a path separator so that
# file names can be appended to it directly.
# Alternate machine:
# my_computer_fpath = "C:\\Users\\dfber\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"
my_computer_fpath = "C:\\Users\\User\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"

# Load Data

In [63]:
# Read the dataset from the machine-specific data directory configured above
df = pd.read_csv(my_computer_fpath + 'minimal_merlin_data.csv') 

In [64]:
# Recode boolean True/False values as 1/0 throughout the frame
df = df.replace({True: 1, False: 0})

In [65]:
# Prepend '#' to the most common entry in each categorical (object) column so that it
# sorts first alphabetically and is therefore the level dropped first when dummies are made.
for column in df.select_dtypes(include='object').columns:
    # Identifier columns are not categorical predictors; leave them untouched.
    if column in ('anes_procedure_encounter_id_2273', 'unique_pt_id'):
        continue

    modes = df[column].mode()
    # A column with no non-missing values has no mode; indexing it would raise.
    if modes.empty:
        continue
    most_common = modes.iloc[0]  # Find the most common entry

    # Re-running this cell must not stack markers ('##value'), which would break
    # later comparisons against the marked label. Skip columns already marked.
    if isinstance(most_common, str) and most_common.startswith('#'):
        continue

    df[column] = df[column].apply(lambda x: f'#{x}' if x == most_common else x)

In [66]:
# Move the outcome column 'failed_catheter' to the front; every other column
# keeps its original relative position.
cols = ['failed_catheter']
cols.extend(name for name in df.columns if name != 'failed_catheter')
df = df.loc[:, cols]

In [67]:
# Filter the DataFrame to include only neuraxial catheter (ie, epidural + CSE + intrathecal) or epidural-only catheter procedures
neuraxial_catheter_df = df[df['is_neuraxial_catheter'] == 1].drop('is_neuraxial_catheter',axis=1)

# The '#' marker is attached to whichever level happens to be the most common one,
# so match 'epidural' / 'dpe' with or without the marker instead of hard-coding
# which of the two carries it. Missing values become the string 'nan' and never match.
procedure_type = df['true_procedure_type_incl_dpe'].astype(str).str.lstrip('#')
epidural_df = df[procedure_type.isin(['epidural', 'dpe'])].drop('is_neuraxial_catheter',axis=1)

# Correlation Matrix

In [None]:
# Correlation heatmap of the neuraxial-catheter cohort with categorical columns one-hot encoded.

# Categorical columns = object/category dtypes, after removing the two identifier columns
categorical_cols = (
    neuraxial_catheter_df
    .drop(columns=['anes_procedure_encounter_id_2273', 'unique_pt_id'])
    .select_dtypes(include=['object', 'category'])
    .columns
)

# One indicator column per category level (no level is dropped)
neuraxial_catheter_dummies = pd.get_dummies(
    neuraxial_catheter_df.drop(columns=['anes_procedure_encounter_id_2273', 'unique_pt_id']),
    columns=categorical_cols,
    drop_first=False,
)

# DataFrame.corr() uses Pearson correlation unless told otherwise
correlation_matrix = neuraxial_catheter_dummies.corr()

plt.figure(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=False, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Matrix for neuraxial_catheter_df (with dummies)")
plt.show()


# Delete boring highly colinear columns

In [69]:
# Columns removed because each is highly correlated with another predictor that is kept
list_of_colinear_columns = [
    'prior_all_catheters_all_enc',  # correlated with prior failed catheters
    'bmi_before_pregnancy_2161',    # correlated with BMI end pregnancy
]

# errors='ignore' means a column that is already absent is skipped rather than raising
neuraxial_catheter_df = neuraxial_catheter_df.drop(columns=list_of_colinear_columns, errors='ignore')
epidural_df = epidural_df.drop(columns=list_of_colinear_columns, errors='ignore')

# Correlation Matrix 2

In [None]:
# Correlation heatmap recomputed from the current neuraxial_catheter_df,
# with categorical columns one-hot encoded.

# Categorical columns = object/category dtypes, after removing the two identifier columns
categorical_cols = (
    neuraxial_catheter_df
    .drop(columns=['anes_procedure_encounter_id_2273', 'unique_pt_id'])
    .select_dtypes(include=['object', 'category'])
    .columns
)

# One indicator column per category level (no level is dropped)
neuraxial_catheter_dummies = pd.get_dummies(
    neuraxial_catheter_df.drop(columns=['anes_procedure_encounter_id_2273', 'unique_pt_id']),
    columns=categorical_cols,
    drop_first=False,
)

# DataFrame.corr() uses Pearson correlation unless told otherwise
correlation_matrix = neuraxial_catheter_dummies.corr()

plt.figure(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=False, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Matrix for neuraxial_catheter_df (with dummies)")
plt.show()


# Describe Dataframe

There are 134997 total rows, of which a fraction have NaN true_procedure_type.

Every row receives a value for every Boolean variable: if no value is present, the variable is set to False. Furthermore, rows whose procedure type is NaN are coded False for both is_neuraxial_catheter and failed_catheter.

is_neuraxial_catheter includes epidurals + CSEs + intrathecals

failed_catheter is defined for ALL procedures: for neuraxial catheters it may be True or False, while for procedures that are not neuraxial catheters it is always False.

In [None]:
# (rows, columns) of the full dataset
df.shape

In [None]:
# (rows, columns) of the neuraxial-catheter subset
neuraxial_catheter_df.shape

In [None]:
def describe_dataframe(df):
    """
    Print a plain-text summary of every column in ``df``.

    Columns are dispatched by dtype *family* (via ``pd.api.types``) rather than
    by exact dtype name, so int32 / float32 / nullable columns are summarised
    the same way as int64 / float64 ones instead of being reported as unhandled:

      - ID columns ("anes_procedure_encounter_id_2273", "unique_pt_id"):
        number of unique values only.
      - object, integer or bool columns: each unique value and its count
        (NaN is counted as its own value).
      - float columns: NaN count, min, Q1, median, Q3 and max.
      - datetime columns: a placeholder line.
      - any other dtype: a "no specific handling" line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    id_columns = ("anes_procedure_encounter_id_2273", "unique_pt_id")
    types = pd.api.types

    for col in df.columns:
        col_type = df[col].dtype

        print(f"Column: {col}")
        print(f"  Data Type: {col_type}")

        if col in id_columns:
            # Identifiers have far too many levels to list; report cardinality only
            print(f"  Number unique: {len(df[col].unique())}")

        elif (types.is_object_dtype(col_type)
              or types.is_integer_dtype(col_type)
              or types.is_bool_dtype(col_type)):
            # Show unique values and their counts
            value_counts = df[col].value_counts(dropna=False)
            print("  Value counts:")
            for val, count in value_counts.items():
                print(f"    {val}: {count}")

        elif types.is_float_dtype(col_type):
            # Show min, Q1 (25%), median (50%), Q3 (75%), and max
            desc = df[col].describe(percentiles=[0.25, 0.5, 0.75])
            na_count = df[col].isna().sum()
            print("  Summary stats:")
            print(f"    NaN:    {na_count}")
            print(f"    Min:    {desc['min']}")
            print(f"    Q1:     {desc['25%']}")
            print(f"    Median: {desc['50%']}")
            print(f"    Q3:     {desc['75%']}")
            print(f"    Max:    {desc['max']}")

        elif types.is_datetime64_any_dtype(df[col]):
            print("  (Datetime column – no numeric summary or value counts shown.)")

        else:
            print("  (No specific handling implemented for this data type.)")

        print("-" * 50)

# Print the per-column summary for the neuraxial-catheter subset
describe_dataframe(neuraxial_catheter_df)


## Describe as tables

In [74]:
def describe_as_tables(df):
    """
    Summarise ``df`` as two DataFrames, one for categorical and one for numeric columns.

    Columns are classified by dtype *family* (via ``pd.api.types``), so int32 /
    float32 / nullable columns are included rather than silently dropped:

      - "anes_procedure_encounter_id_2273" and "unique_pt_id" are skipped.
      - object, integer and bool columns are treated as categorical.
      - float columns are treated as numeric.
      - every other dtype (datetime, category, ...) is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    (cat_df, num_df) : tuple of pandas.DataFrame
        cat_df: one row per categorical column, with a single 'value_counts'
        column holding the text "value: count, value: count, ..." (NaN counted
        as its own value, most frequent first).
        num_df: one row per numeric column, with columns 'count', 'count_nan',
        'min', 'Q1', 'median', 'Q3' and 'max'.
    """
    id_columns = ("anes_procedure_encounter_id_2273", "unique_pt_id")
    types = pd.api.types

    # Separate columns by dtype family
    categorical_cols = []
    numeric_cols = []
    for col in df.columns:
        if col in id_columns:
            continue
        col_type = df[col].dtype
        if (types.is_object_dtype(col_type)
                or types.is_integer_dtype(col_type)
                or types.is_bool_dtype(col_type)):
            categorical_cols.append(col)
        elif types.is_float_dtype(col_type):
            numeric_cols.append(col)
        # anything else (datetime, etc.) is intentionally skipped

    # --- Build table for categorical variables ---
    cat_data = {}
    for col in categorical_cols:
        # Get value counts (including NaN as a separate category)
        vc = df[col].value_counts(dropna=False)
        vc_str = ", ".join(f"{val}: {count}" for val, count in vc.items())
        cat_data[col] = {
            'value_counts': vc_str
        }
    cat_df = pd.DataFrame(cat_data).T  # Transpose so rows = columns, col = 'value_counts'

    # --- Build table for numeric variables ---
    num_data = {}
    for col in numeric_cols:
        desc = df[col].describe(percentiles=[0.25, 0.5, 0.75])
        na_count = df[col].isna().sum()
        num_data[col] = {
            'count': desc['count'],
            'count_nan': na_count,
            'min': desc['min'],
            'Q1': desc['25%'],
            'median': desc['50%'],
            'Q3': desc['75%'],
            'max': desc['max']
        }
    num_df = pd.DataFrame(num_data).T  # Transpose so rows = columns

    return cat_df, num_df

# Categorical and numeric summary tables for the neuraxial-catheter subset
cat_table, num_table = describe_as_tables(neuraxial_catheter_df)


In [None]:
# Display the categorical summary (one row per variable, value counts as text)
cat_table

In [None]:
# Display the numeric summary (one row per variable)
num_table

## Create Table One

In [None]:
def parse_value_counts_str(value_counts_str):
    """
    Convert a string like:
       "bwh: 44730, mgh: 26549, nwh: 22476, slm: 5680..."
    into a dict, e.g.:
       {"bwh": 44730.0, "mgh": 26549.0, "nwh": 22476.0, "slm": 5680.0}

    A trailing '...' is ignored and every count is parsed as float.

    Category labels may themselves contain ',' or ':' (for example
    "Black, non-Hispanic: 10" or "10:30: 5"):
      - the count is whatever follows the LAST colon of a fragment, and
      - a comma-separated fragment that does not end in ": <number>" is treated
        as the beginning of the next label rather than being discarded.
    Fragments left over at the end of the string that never reach a
    ": <number>" are dropped.

    Parameters
    ----------
    value_counts_str : str
        Text of the form "label: count, label: count, ...".

    Returns
    -------
    dict
        Maps label (str, surrounding whitespace removed) to count (float).
    """
    # Strip and remove trailing ellipsis (if present)
    text = value_counts_str.strip()
    if text.endswith("..."):
        text = text[:-3].strip()

    out_dict = {}
    pending = []  # fragments of a label that contains commas, not yet completed
    for piece in text.split(","):
        if not pending and not piece.strip():
            # Empty fragment between two complete items (e.g. "a: 1,, b: 2")
            continue

        # Split on the last colon so that colons inside the label survive
        label_part, colon, count_part = piece.rpartition(":")
        count = None
        if colon:
            try:
                count = float(count_part.strip())
            except ValueError:
                count = None

        if count is None:
            # Not "<label>: <number>" yet, so this fragment belongs to the next label
            pending.append(piece)
            continue

        label = ",".join(pending + [label_part]).strip()
        pending = []
        out_dict[label] = count
    return out_dict


def _binary_level(key):
    """Return 0 or 1 if ``key`` spells exactly that number (e.g. "1" or "1.0"), else None."""
    try:
        number = float(key)
    except (TypeError, ValueError):
        return None
    return int(number) if number in (0.0, 1.0) else None


def create_table_one(cat_table, num_table):
    """
    Build a two-column "Table One" (Variable, Summary).

    Parameters
    ----------
    cat_table : pandas.DataFrame
        index = categorical variable names;
        column "value_counts" = string describing categories & counts
        (parsed with parse_value_counts_str).
    num_table : pandas.DataFrame
        index = numeric variable names;
        columns include "median", "Q1" and "Q3".

    Returns
    -------
    pandas.DataFrame
        Columns ["Variable", "Summary"], numeric variables first:
          - numeric variable: one row, "median [Q1 - Q3]" with two decimals;
          - binary variable (exactly two categories, 0 and 1): one row,
            "n (pct%)" for category 1;
          - any other categorical variable: one row per category, labelled
            "<variable> = <category>", sorted by category label, "n (pct%)".
        Percentages use the sum of all parsed counts as denominator.
    """
    table_rows = []

    # 1) Numeric variables: median [Q1 - Q3]
    for var_name in num_table.index:
        median_val = num_table.loc[var_name, "median"]
        q1 = num_table.loc[var_name, "Q1"]
        q3 = num_table.loc[var_name, "Q3"]
        table_rows.append([var_name, f"{median_val:.2f} [{q1:.2f} - {q3:.2f}]"])

    # 2) Categorical variables
    for var_name in cat_table.index:
        value_counts_dict = parse_value_counts_str(cat_table.loc[var_name, "value_counts"])
        total_n = sum(value_counts_dict.values())

        # A variable is binary only if its two labels are exactly 0 and 1.
        # Truncating with int() would also accept e.g. "0.5" / "1.5".
        levels = {key: _binary_level(key) for key in value_counts_dict}

        if len(levels) == 2 and set(levels.values()) == {0, 1}:
            # Single row reporting the count and share of category 1
            n_ones = next(value_counts_dict[key] for key, level in levels.items() if level == 1)
            pct_ones = 100.0 * n_ones / total_n if total_n else 0.0
            table_rows.append([var_name, f"{int(n_ones)} ({pct_ones:.2f}%)"])
        else:
            # Multi-category: one row per category, in string order of the labels
            for cat_val in sorted(value_counts_dict, key=str):
                n_cat = value_counts_dict[cat_val]
                pct_cat = 100.0 * n_cat / total_n if total_n else 0.0
                table_rows.append([f"{var_name} = {cat_val}", f"{int(n_cat)} ({pct_cat:.2f}%)"])

    # Build final DataFrame
    table_one = pd.DataFrame(table_rows, columns=["Variable", "Summary"])
    return table_one

# Build Table One from the summary tables of the neuraxial-catheter subset and display it
table_one = create_table_one(cat_table, num_table)
table_one

In [78]:
# Summary tables computed separately for failed (failed_catheter == 1) and
# successful (failed_catheter == 0) catheters.
# NOTE: 'succeses_num_table' (sic) is the spelling a later cell refers to; rename both together.
failures_cat_table,failures_num_table = describe_as_tables(neuraxial_catheter_df[neuraxial_catheter_df['failed_catheter'] == 1])
successes_cat_table,succeses_num_table = describe_as_tables(neuraxial_catheter_df[neuraxial_catheter_df['failed_catheter'] == 0])

In [79]:
# Table One restricted to failed catheters
failures_table_one = create_table_one(failures_cat_table, failures_num_table)

In [80]:
# Table One restricted to successful catheters ('succeses_num_table' is spelled as where it is defined)
successes_table_one = create_table_one(successes_cat_table, succeses_num_table)

In [81]:
# Join of the success and failure summaries on 'Variable' (merge default: only rows present in both).
# NOTE(review): this name is reassigned by a later cell, so this result is not what gets displayed.
table_one_by_failure_status = successes_table_one.merge(failures_table_one, on='Variable', suffixes=('_success', '_failure'))

In [82]:
# Overall, failure-only and success-only summaries side by side, one row per 'Variable'.
# Left joins keep every row of table_one. The overall column keeps the name 'Summary';
# the other two become 'Summary_failures' and 'Summary_successes'.
table_one_by_failure_status = (
    table_one
    .merge(failures_table_one, on='Variable', suffixes=('', '_failures'), how='left')
    .merge(successes_table_one, on='Variable', suffixes=('', '_successes'), how='left')
)


In [None]:
# Display the combined table
table_one_by_failure_status

# Data Visualization

## Procedure Types

In [None]:
# Bar chart of how often each procedure type occurs in the full dataset,
# with a different colour for each bar.
procedure_type_counts = df['true_procedure_type_incl_dpe'].value_counts()

plt.figure(figsize=(6, 6))
plt.bar(
    x=procedure_type_counts.index,
    height=procedure_type_counts.values,
    color=['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange'],
)
plt.title('Histogram of Procedure Note Types')
plt.xlabel('Procedure Type')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Stacked bars of successful vs. failed catheters for each procedure type

# Rows = procedure type, columns = failed_catheter value
procedure_counts = pd.crosstab(
    index=neuraxial_catheter_df['true_procedure_type_incl_dpe'],
    columns=neuraxial_catheter_df['failed_catheter'],
)

# Order the procedure types by their count in column 0, largest first
procedure_counts = procedure_counts.sort_values(by=0, ascending=False)

ax = procedure_counts.plot.bar(stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Procedure Type')
ax.set_ylabel('Count')
ax.set_title('Histogram of Successful/Failed')
plt.xticks(rotation=45, ha='right')
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()


In [None]:
# Print the procedure_counts crosstab (procedure type by failed_catheter) as plain text
print("Table of Neuraxial Catheter Procedures by Success/Failure:")
print(procedure_counts)


## Anesthesiologist Experience

In [None]:
# Stacked bars of successful vs. failed catheters by 'highly_experienced_anesthesiologist'

# Rows = experience level, columns = failed_catheter value
experience_failure_counts = pd.crosstab(
    index=neuraxial_catheter_df['highly_experienced_anesthesiologist'],
    columns=neuraxial_catheter_df['failed_catheter'],
)

ax = experience_failure_counts.plot.bar(stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Anesthesiologist Experience')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Anesthesiologist Experience')
# Assumes the crosstab has exactly three rows in this order - TODO confirm against the data
plt.xticks(
    ticks=[0,1,2],
    labels=['No Anesthesiologist','Not Highly Experienced', 'Highly Experienced'],
    rotation=0,
    ha='center',
)
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Failure Rate vs. Anesthesiologist Experience:")
experience_failure_counts

In [None]:
# Stacked bars of successful vs. failed catheters by 'moderately_experienced_anesthesiologist'

# Rows = experience level, columns = failed_catheter value
experience_failure_counts = pd.crosstab(
    index=neuraxial_catheter_df['moderately_experienced_anesthesiologist'],
    columns=neuraxial_catheter_df['failed_catheter'],
)

ax = experience_failure_counts.plot.bar(stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Anesthesiologist Experience')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Moderately Experienced Anesthesiologist')
# Assumes the crosstab has exactly three rows in this order - TODO confirm against the data
plt.xticks(
    ticks=[0,1,2],
    labels=['No Anesthesiologist','Not Moderately Experienced', 'Moderately Experienced'],
    rotation=0,
    ha='center',
)
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Failure Rate vs. Moderately Experienced Anesthesiologist:")
experience_failure_counts

In [None]:
# Stacked bars of successful vs. failed catheters by 'highly_experienced_resident'

# Rows = experience level, columns = failed_catheter value
experience_failure_counts = pd.crosstab(
    index=neuraxial_catheter_df['highly_experienced_resident'],
    columns=neuraxial_catheter_df['failed_catheter'],
)

ax = experience_failure_counts.plot.bar(stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Resident Experience')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Resident Experience')
# Assumes the crosstab has exactly three rows in this order - TODO confirm against the data
plt.xticks(
    ticks=[0,1,2],
    labels=['No Resident','Not Highly Experienced', 'Highly Experienced'],
    rotation=0,
    ha='center',
)
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Failure Rate vs. Resident Experience:")
experience_failure_counts

In [None]:
# Stacked bars of successful vs. failed catheters for every combination of
# anesthesiologist and resident experience, with vertical x-axis labels.
import itertools

# Rows = (anesthesiologist level, resident level), columns = failed_catheter value
experience_failure_counts = pd.crosstab(
    index=[
        neuraxial_catheter_df['highly_experienced_anesthesiologist'],
        neuraxial_catheter_df['highly_experienced_resident'],
    ],
    columns=neuraxial_catheter_df['failed_catheter'],
)

ax = experience_failure_counts.plot.bar(stacked=True, figsize=(8, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Anesthesiologist and Resident Experience')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Anesthesiologist and Resident Experience')

# One tick label per (anesthesiologist, resident) pair.
# Assumes all 3 x 3 combinations appear in the crosstab, in this order - TODO confirm
anesth_levels = ["Anes=None", "Anes=Not Exp", "Anes=Exp"]
resident_levels = ["Res=None", "Res=Not Exp", "Res=Exp"]
labels = list(itertools.product(anesth_levels, resident_levels))
plt.xticks(ticks=range(len(labels)), labels=labels, rotation=90, ha='center')

ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Failure Rate vs. Anesthesiologist and Resident Experience:")
experience_failure_counts

In [None]:
# BMI at end of pregnancy versus resident experience, for neuraxial catheters.
# (pandas, matplotlib and seaborn are already imported in the first cell.)

# Cross-tabulation: one row per distinct BMI value, one column per resident-experience level
crosstab_data = pd.crosstab(neuraxial_catheter_df['bmi_end_pregnancy_2044'], neuraxial_catheter_df['highly_experienced_resident'])

# Display the cross-tabulation
print("Crosstab of Resident Experience by BMI:")
print(crosstab_data)

# Violin plots drawn from the same cohort as the crosstab above. Plotting the
# unfiltered `df` here would mix in procedures that are not neuraxial catheters.
plt.figure(figsize=(10, 6))
sns.violinplot(x='highly_experienced_resident', y='bmi_end_pregnancy_2044', data=neuraxial_catheter_df)
plt.xlabel('Resident Experience')  # Customize the x-axis label
plt.ylabel('BMI') # Customize the y-axis label
plt.title('Violin Plot of BMI by Resident Experience')
plt.show()

## Delivery Site

In [None]:
# Stacked bars of successful vs. failed catheters for each delivery site

# Rows = delivery site, columns = failed_catheter value
delivery_site_counts = pd.crosstab(
    index=neuraxial_catheter_df['delivery_site'],
    columns=neuraxial_catheter_df['failed_catheter'],
)

# Order the sites by their count in column 0, largest first
delivery_site_counts = delivery_site_counts.sort_values(by=0, ascending=False)

ax = delivery_site_counts.plot.bar(stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Delivery Site')
ax.set_ylabel('Count')
ax.set_title('Histogram of Delivery Site by Success/Failure')
plt.xticks(rotation=45, ha='right')
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Delivery Site by Success/Failure:")
delivery_site_counts

## DPE

In [None]:
# Pie chart of the procedure types within epidural_df.
# value_counts() leaves missing values out by default, so NaN rows are not part of the pie.
dpe_counts = epidural_df['true_procedure_type_incl_dpe'].value_counts()

plt.figure(figsize=(8, 8))
plt.pie(
    dpe_counts,
    labels=dpe_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Fraction of DPE in Epidural Procedures')
plt.show()

In [None]:
# Stacked bars of DPE vs. non-DPE procedures for each delivery site (epidural_df only)

# Rows = delivery site, columns = 1 if the procedure type is 'dpe', otherwise 0
delivery_site_dpe_counts = pd.crosstab(
    index=epidural_df['delivery_site'],
    columns=(epidural_df['true_procedure_type_incl_dpe'] == 'dpe').astype(int),
)

# Order the sites by their count in column 1 (DPE), largest first
delivery_site_dpe_counts = delivery_site_dpe_counts.sort_values(by=1, ascending=False)

ax = delivery_site_dpe_counts.plot.bar(stacked=True, figsize=(10, 6))

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Delivery Site')
ax.set_ylabel('Count')
ax.set_title('Histogram of Delivery Site by DPE')
plt.xticks(rotation=45, ha='right')
ax.legend(['DPE: no', 'DPE: yes'])  # legend entries follow column order 0, 1
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Delivery Site by DPE:")
delivery_site_dpe_counts

In [None]:
# Stacked bars of successful vs. failed catheters by DPE status (epidural_df only)

# Rows = whether the procedure type is 'dpe' (False / True), columns = failed_catheter value
dpe_crosstab = pd.crosstab(
    index=epidural_df['true_procedure_type_incl_dpe'] == 'dpe',
    columns=epidural_df['failed_catheter'],
)

ax = dpe_crosstab.plot.bar(stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('DPE Status')
ax.set_ylabel('Count')
ax.set_title('Histogram of Successful/Failed')
plt.xticks(rotation=45, ha='right')
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()


In [None]:
# Stacked bars of successful vs. failed catheters for every (delivery site, DPE status) pair

# Rows = (delivery site, procedure type is 'dpe'), columns = failed_catheter value
crosstab_df = pd.crosstab(
    index=[
        neuraxial_catheter_df['delivery_site'],
        neuraxial_catheter_df['true_procedure_type_incl_dpe'] == 'dpe',
    ],
    columns=neuraxial_catheter_df['failed_catheter'],
)

ax = crosstab_df.plot.bar(stacked=True, figsize=(12, 6))

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Delivery Site and DPE')
ax.set_ylabel('Count')
ax.set_title('Crosstab Histogram: Failure vs. Delivery Site and DPE')
plt.xticks(rotation=45, ha='right')
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Crosstab Table:")
crosstab_df

## Scoliosis and back problems

In [None]:
# Stacked bars of successful vs. failed catheters by 'has_scoliosis'

# Rows = has_scoliosis value, columns = failed_catheter value
scoliosis_failure_counts = pd.crosstab(
    index=neuraxial_catheter_df['has_scoliosis'],
    columns=neuraxial_catheter_df['failed_catheter'],
)

ax = scoliosis_failure_counts.plot.bar(stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Has Scoliosis')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Scoliosis')
plt.xticks(ticks=[0, 1], labels=['No Scoliosis', 'Scoliosis'], rotation=0, ha='center')
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Failure Rate vs. Scoliosis:")
scoliosis_failure_counts

In [None]:
# Stacked bars of successful vs. failed catheters by 'has_back_problems'

# Rows = has_back_problems value, columns = failed_catheter value
back_problems_failure_counts = pd.crosstab(
    index=neuraxial_catheter_df['has_back_problems'],
    columns=neuraxial_catheter_df['failed_catheter'],
)

ax = back_problems_failure_counts.plot.bar(stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Has Back Problems')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Back Problems')
plt.xticks(ticks=[0, 1], labels=['No Back Problems', 'Back Problems'], rotation=0, ha='center')
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Failure Rate vs. Back Problems:")
back_problems_failure_counts

In [None]:
# Stacked bars of successful vs. failed catheters by 'has_dorsalgia' (labelled as back pain)

# Rows = has_dorsalgia value, columns = failed_catheter value
back_pain_failure_counts = pd.crosstab(
    index=neuraxial_catheter_df['has_dorsalgia'],
    columns=neuraxial_catheter_df['failed_catheter'],
)

ax = back_pain_failure_counts.plot.bar(stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Has Back Pain')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Back Pain')
plt.xticks(ticks=[0, 1], labels=['No Back Pain', 'Back Pain'], rotation=0, ha='center')
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Failure Rate vs. Back Pain:")
back_pain_failure_counts

## Fetal Presentation

In [None]:
# Stacked bars of successful vs. failed catheters by 'fetal_presentation_category_2243'

# Rows = fetal presentation category, columns = failed_catheter value
fetal_presentation_failure_counts = pd.crosstab(
    index=neuraxial_catheter_df['fetal_presentation_category_2243'],
    columns=neuraxial_catheter_df['failed_catheter'],
)

ax = fetal_presentation_failure_counts.plot.bar(stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Fetal Presentation Category')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Fetal Presentation Category')
plt.xticks(rotation=45, ha='right')
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Failure Rate vs. Fetal Presentation Category:")
fetal_presentation_failure_counts

In [None]:
# Stacked bars of successful vs. failed catheters by 'fetal_presentation_position_2247'

# Rows = fetal presentation position, columns = failed_catheter value
fetal_position_failure_counts = pd.crosstab(
    index=neuraxial_catheter_df['fetal_presentation_position_2247'],
    columns=neuraxial_catheter_df['failed_catheter'],
)

ax = fetal_position_failure_counts.plot.bar(stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Fetal Presentation Position')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Fetal Presentation Position')
plt.xticks(rotation=45, ha='right')  # slanted labels stay readable
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Failure Rate vs. Fetal Presentation Position:")
fetal_position_failure_counts

## Race and SES

In [None]:
# Stacked bars of successful vs. failed catheters by 'maternal_race'

# Rows = maternal race, columns = failed_catheter value
race_failure_counts = pd.crosstab(
    index=neuraxial_catheter_df['maternal_race'],
    columns=neuraxial_catheter_df['failed_catheter'],
)

ax = race_failure_counts.plot.bar(stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Label each segment with its share of the bar it belongs to
# (all segments of one stacked bar share the same left edge)
for segment in ax.patches:
    left, bottom = segment.get_xy()
    seg_w, seg_h = segment.get_width(), segment.get_height()
    bar_total = sum(q.get_height() for q in ax.patches if q.get_x() == left)
    ax.annotate(f'{seg_h / bar_total * 100:.1f}%',
                (left + seg_w / 2, bottom + seg_h / 2),
                ha='center', va='center')

ax.set_xlabel('Maternal Race')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Maternal Race')
plt.xticks(rotation=45, ha='right')  # slanted labels stay readable
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Show the counts behind the chart
print("Table of Failure Rate vs. Maternal Race:")
race_failure_counts

In [None]:
# prompt: do the same histogram, but for each of these:
# 32. composite_psychosocial_problems ||| int64
# 33. only_private_insurance ||| int64
# 34. maternal_language_english ||| int64
# 35. marital_status_married_or_partner ||| int64
# 36. country_of_origin_USA ||| int64
# 37. employment_status_fulltime ||| int64
# 38. composite_SES_advantage ||| int64

# Assuming 'neuraxial_catheter_df' is your DataFrame

# Binary psychosocial / socio-economic indicators to compare against catheter failure
columns_to_analyze = [
    'composite_psychosocial_problems',
    'only_private_insurance',
    'maternal_language_english',
    'marital_status_married_or_partner',
    'country_of_origin_USA',
    'employment_status_fulltime',
    'composite_SES_advantage'
]

for column in columns_to_analyze:
    # Outcome counts (columns) for each level of the current indicator (rows)
    failure_counts = pd.crosstab(neuraxial_catheter_df[column], neuraxial_catheter_df['failed_catheter'])

    # Stacked bar chart, one bar per level of the indicator
    ax = failure_counts.plot(kind='bar', stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

    # Total height of each stack, keyed by the bar's left edge
    stack_totals = {}
    for bar in ax.patches:
        stack_totals[bar.get_x()] = stack_totals.get(bar.get_x(), 0) + bar.get_height()

    # Label each segment with its percentage of the stack it belongs to
    for bar in ax.patches:
        share = bar.get_height() / stack_totals[bar.get_x()] * 100
        ax.annotate(
            f'{share:.1f}%',
            (bar.get_x() + bar.get_width() / 2, bar.get_y() + bar.get_height() / 2),
            ha='center', va='center',
        )

    plt.xlabel(column)
    plt.ylabel('Count')
    plt.title(f'Histogram of Failure Rate vs. {column}')

    # Keep the tick labels horizontal and centered under the bars
    plt.xticks(rotation=0, ha='center')

    plt.legend(['Successful', 'Failed'])
    plt.tight_layout()
    plt.show()

    # Print the table inside the loop so each column's counts follow its own
    # header. Previously the table expression sat after the loop, so every
    # header was printed without a table and only the last column's counts
    # were ever displayed.
    print(f"Table of Failure Rate vs. {column}:")
    print(failure_counts)

## Pain

In [None]:
# prompt: do the same histogram but for prior_pain_scores_max

# Assuming 'neuraxial_catheter_df' is your DataFrame

# Outcome counts (columns) for each maximum prior pain score (rows)
prior_pain_failure_counts = pd.crosstab(
    neuraxial_catheter_df['prior_pain_scores_max'],
    neuraxial_catheter_df['failed_catheter'],
)

# Stacked bars, one per pain score
ax = prior_pain_failure_counts.plot(
    kind='bar', stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral']
)

# Height of every full stack, keyed by the bar's left edge
stack_totals = {}
for bar in ax.patches:
    stack_totals[bar.get_x()] = stack_totals.get(bar.get_x(), 0) + bar.get_height()

# Percentage of the stack that each segment represents, drawn at its center
for bar in ax.patches:
    share = bar.get_height() / stack_totals[bar.get_x()] * 100
    ax.annotate(
        f'{share:.1f}%',
        (bar.get_x() + bar.get_width() / 2, bar.get_y() + bar.get_height() / 2),
        ha='center', va='center',
    )

ax.set_xlabel('Prior Pain Scores Max')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Prior Pain Scores Max')
plt.xticks(rotation=45, ha='right')  # slanted labels stay readable
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# The same counts as a table (rendered as the cell's output)
print("Table of Failure Rate vs. Prior Pain Scores Max:")
prior_pain_failure_counts

## Gravidity and Parity

In [None]:
# prompt: do the same histogram but for gravidity_2047 and parity_2048

# Assuming 'neuraxial_catheter_df' is your DataFrame

def _plot_outcome_stack(counts, label):
    """Draw a stacked outcome bar chart for ``counts`` and return its Axes.

    ``counts`` is a crosstab whose rows are the levels of one variable and
    whose columns are the catheter outcomes; ``label`` is used for the x-axis
    label and the title. Every segment is annotated with its percentage of
    the stack it belongs to.
    """
    axes = counts.plot(kind='bar', stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

    # Total height of each stack, keyed by the bar's left edge
    stack_totals = {}
    for bar in axes.patches:
        stack_totals[bar.get_x()] = stack_totals.get(bar.get_x(), 0) + bar.get_height()

    for bar in axes.patches:
        share = bar.get_height() / stack_totals[bar.get_x()] * 100
        axes.annotate(
            f'{share:.1f}%',
            (bar.get_x() + bar.get_width() / 2, bar.get_y() + bar.get_height() / 2),
            ha='center', va='center',
        )

    axes.set_xlabel(label)
    axes.set_ylabel('Count')
    axes.set_title(f'Histogram of Failure Rate vs. {label}')
    plt.xticks(rotation=0)  # horizontal tick labels
    axes.legend(['Successful', 'Failed'])
    plt.tight_layout()
    plt.show()
    return axes


# --- Gravidity: outcome counts per gravidity value, chart, then printed table ---
gravidity_failure_counts = pd.crosstab(
    neuraxial_catheter_df['gravidity_2047'],
    neuraxial_catheter_df['failed_catheter'],
)
ax = _plot_outcome_stack(gravidity_failure_counts, 'Gravidity')

print("Table of Failure Rate vs. Gravidity:")
print(gravidity_failure_counts)


# --- Parity: same chart; its table is rendered as the cell's output ---
parity_failure_counts = pd.crosstab(
    neuraxial_catheter_df['parity_2048'],
    neuraxial_catheter_df['failed_catheter'],
)
ax = _plot_outcome_stack(parity_failure_counts, 'Parity')

print("Table of Failure Rate vs. Parity:")
parity_failure_counts

## Maternal Age

In [None]:
# Keep rows where both maternal age and the outcome are present, then bin age
# into whole years. `.assign` builds the bin column on a new frame instead of
# writing into the result of `dropna`, which avoids pandas'
# SettingWithCopyWarning.
df_plot = (
    neuraxial_catheter_df
    .dropna(subset=['maternal_age_years', 'failed_catheter'])
    .assign(maternal_age_bin=lambda d: (d['maternal_age_years'] // 1).astype(int))
)

# Per-bin mean of failed_catheter (the failure rate for a 0/1 outcome) and its
# standard error of the mean
failure_by_age = df_plot.groupby('maternal_age_bin')['failed_catheter'].agg(['mean', 'sem'])

# Line of per-bin means with a shaded band of +/- 1 SEM
plt.figure(figsize=(10, 6))
plt.plot(failure_by_age.index, failure_by_age['mean'], marker='o')
plt.fill_between(failure_by_age.index,
                 failure_by_age['mean'] - failure_by_age['sem'],
                 failure_by_age['mean'] + failure_by_age['sem'],
                 alpha=0.5)

plt.xlabel('Maternal Age (years, binned by 1)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Maternal Age (binned by 1) with Error Bars')
plt.grid(True)
plt.show()

## BMI / height / weight

In [None]:
# prompt: plot bmi end pregnancy against failure rate using binning as above.

# Keep rows where both end-of-pregnancy BMI and the outcome are present, then
# bin BMI into whole units. `.assign` builds the bin column on a new frame
# instead of writing into the result of `dropna`, which avoids pandas'
# SettingWithCopyWarning.
df_plot = (
    neuraxial_catheter_df
    .dropna(subset=['bmi_end_pregnancy_2044', 'failed_catheter'])
    .assign(bmi_end_pregnancy_bin=lambda d: (d['bmi_end_pregnancy_2044'] // 1).astype(int))
)

# Per-bin mean of failed_catheter (the failure rate for a 0/1 outcome) and its
# standard error of the mean
failure_by_bmi = df_plot.groupby('bmi_end_pregnancy_bin')['failed_catheter'].agg(['mean', 'sem'])

# Line of per-bin means with a shaded band of +/- 1 SEM
plt.figure(figsize=(10, 6))
plt.plot(failure_by_bmi.index, failure_by_bmi['mean'], marker='o')
plt.fill_between(failure_by_bmi.index,
                 failure_by_bmi['mean'] - failure_by_bmi['sem'],
                 failure_by_bmi['mean'] + failure_by_bmi['sem'],
                 alpha=0.5)

plt.xlabel('BMI (kg/m^2) at End of Pregnancy (binned by 1)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. BMI at End of Pregnancy (binned by 1) with Error Bars')
plt.grid(True)
plt.show()

In [None]:
# prompt: # prompt: plot weight end pregnancy against failure rate using binning as above.

# Keep rows where both end-of-pregnancy weight and the outcome are present,
# then floor weight to the start of its 10-unit bin. `.assign` builds the bin
# column on a new frame instead of writing into the result of `dropna`, which
# avoids pandas' SettingWithCopyWarning.
df_plot = (
    neuraxial_catheter_df
    .dropna(subset=['maternal_weight_end_pregnancy_2045', 'failed_catheter'])
    .assign(
        weight_end_pregnancy_bin=lambda d: (d['maternal_weight_end_pregnancy_2045'] // 10).astype(int) * 10
    )
)

# Per-bin mean of failed_catheter (the failure rate for a 0/1 outcome) and its
# standard error of the mean
failure_by_weight = df_plot.groupby('weight_end_pregnancy_bin')['failed_catheter'].agg(['mean', 'sem'])

# Line of per-bin means with a shaded band of +/- 1 SEM
plt.figure(figsize=(10, 6))
plt.plot(failure_by_weight.index, failure_by_weight['mean'], marker='o')
plt.fill_between(failure_by_weight.index,
                 failure_by_weight['mean'] - failure_by_weight['sem'],
                 failure_by_weight['mean'] + failure_by_weight['sem'],
                 alpha=0.5)

plt.xlabel('Maternal Weight (kg) at End of Pregnancy (binned by 10)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Maternal Weight at End of Pregnancy (binned by 10) with Error Bars')
plt.grid(True)
plt.show()

In [109]:
# # prompt: do the same but for height

# # Assuming 'height' is a column in your DataFrame 'df'
# df_plot = neuraxial_catheter_df.dropna(subset=['maternal_height_2046', 'failed_catheter'])

# # Drop heights greater than 250
# df_plot = df_plot[df_plot['maternal_height_2046'] <= 250]

# # Bin the height
# df_plot['height_bin'] = (df_plot['maternal_height_2046'] // 1).astype(int)

# # Group by the binned height and calculate the mean and standard error of the mean of failed_catheter
# failure_by_height = df_plot.groupby('height_bin')['failed_catheter'].agg(['mean', 'sem'])

# # Create the plot
# plt.figure(figsize=(10, 6))
# plt.plot(failure_by_height.index, failure_by_height['mean'], marker='o')
# plt.fill_between(failure_by_height.index,
#                  failure_by_height['mean'] - failure_by_height['sem'],
#                  failure_by_height['mean'] + failure_by_height['sem'],
#                  alpha=0.5) # Add shaded error bars

# plt.xlabel('Height (binned by 1)')
# plt.ylabel('Average Failure Rate')
# plt.title('Failure Rate vs. Height (binned by 1) with Error Bars')
# plt.grid(True)
# plt.show()

## Needle Type

In [None]:
# prompt: do the same histogram but for epidural_needle_type

# Assuming 'neuraxial_catheter_df' is your DataFrame (as defined in the provided code)

# Outcome counts (columns) for each epidural needle type (rows)
needle_type_failure_counts = pd.crosstab(
    neuraxial_catheter_df['epidural_needle_type'],
    neuraxial_catheter_df['failed_catheter'],
)

# Stacked bars, one per needle type
ax = needle_type_failure_counts.plot(
    kind='bar', stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral']
)

# Total height of every stack, keyed by the bar's left edge
stack_totals = {}
for bar in ax.patches:
    stack_totals[bar.get_x()] = stack_totals.get(bar.get_x(), 0) + bar.get_height()

# Annotate every segment with its percentage of the stack
for bar in ax.patches:
    share = bar.get_height() / stack_totals[bar.get_x()] * 100
    ax.annotate(
        f'{share:.1f}%',
        (bar.get_x() + bar.get_width() / 2, bar.get_y() + bar.get_height() / 2),
        ha='center', va='center',
    )

ax.set_xlabel('Epidural Needle Type')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Epidural Needle Type')
plt.xticks(rotation=45, ha='right')  # slanted labels stay readable
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# The same counts as a table (rendered as the cell's output)
print("Table of Failure Rate vs. Epidural Needle Type:")
needle_type_failure_counts

## Paresthesias

In [None]:
# prompt: do the same histogram but for paresthesias_present

# Outcome counts (columns) by whether paresthesias were recorded (rows)
paresthesias_failure_counts = pd.crosstab(
    neuraxial_catheter_df['paresthesias_present'],
    neuraxial_catheter_df['failed_catheter'],
)

# Stacked bars, one per paresthesia level
ax = paresthesias_failure_counts.plot(
    kind='bar', stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral']
)

# Total height of every stack, keyed by the bar's left edge
stack_totals = {}
for bar in ax.patches:
    stack_totals[bar.get_x()] = stack_totals.get(bar.get_x(), 0) + bar.get_height()

# Annotate every segment with its percentage of the stack
for bar in ax.patches:
    share = bar.get_height() / stack_totals[bar.get_x()] * 100
    ax.annotate(
        f'{share:.1f}%',
        (bar.get_x() + bar.get_width() / 2, bar.get_y() + bar.get_height() / 2),
        ha='center', va='center',
    )

ax.set_xlabel('Paresthesias Present')
ax.set_ylabel('Count')
ax.set_title('Histogram of Failure Rate vs. Paresthesias Present')
# Relabel bar positions 0 and 1 (assumes exactly two levels, ordered 0 then 1 -- TODO confirm)
plt.xticks(rotation=0, ha='center', ticks=[0, 1], labels=['No Paresthesias', 'Paresthesias'])
ax.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# The same counts as a table (rendered as the cell's output)
print("Table of Failure Rate vs. Paresthesias Present:")
paresthesias_failure_counts

## Number of Attempts

In [None]:
# prompt: create a histogram of the number of attempts. Only show integers on the x-axis

# How often each attempt count occurs, ordered by attempt number.
# NOTE(review): this reads the full `df`, not `neuraxial_catheter_df` like the
# neighbouring cells -- confirm that including every row is intended.
attempts_counts = df['number_of_neuraxial_attempts'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(attempts_counts.index, attempts_counts.values)
ax.set_xlabel('Number of Attempts')
ax.set_ylabel('Count')
ax.set_title('Histogram of Number of Neuraxial Attempts')

# One tick per integer between the smallest and largest observed attempt count
first_tick = int(attempts_counts.index.min())
last_tick = int(attempts_counts.index.max())
ax.set_xticks(range(first_tick, last_tick + 1))

fig.tight_layout()
plt.show()


## Loss of Resistance Depth

In [None]:
# prompt: create a histogram of loss of resistance depth. Center the bars over the tick marks and make space between the bars. Bins should be every 0.5

# Loss-of-resistance depths with missing values removed
lor_depths = neuraxial_catheter_df['lor_depth'].dropna()

# Bin edges every 0.5, starting at the smallest depth and ending on the first
# edge strictly beyond the largest depth. The histogram's last bin is closed
# on the right, so when the final edge equalled the maximum (as with
# np.arange(min, max + 0.5, 0.5) on grid-aligned data) the deepest readings
# were merged into the preceding bar. This construction also yields a valid
# single bin when every depth is identical.
lor_bin_width = 0.5
lor_bin_count = int(np.floor((lor_depths.max() - lor_depths.min()) / lor_bin_width)) + 1
lor_bin_edges = lor_depths.min() + lor_bin_width * np.arange(lor_bin_count + 1)

# align='left' centers each bar on its bin's left edge; rwidth < 1 leaves gaps between bars
plt.figure(figsize=(8, 6))
plt.hist(lor_depths, bins=lor_bin_edges, rwidth=0.8, align='left')
plt.xlabel('Loss of Resistance Depth')
plt.ylabel('Count')
plt.title('Histogram of Loss of Resistance Depth')
plt.xticks(np.arange(0, lor_depths.max() + 0.5, 1))  # Set x-axis ticks to be at every 1
plt.tight_layout()
plt.show()


In [None]:
# prompt: Plot number of neuraxial attempts vs LOR depth on the x-axis. Add jiggle to both x and y axes

# Rows with both the attempt count and the LOR depth recorded
df_plot = neuraxial_catheter_df.dropna(subset=['number_of_neuraxial_attempts', 'lor_depth'])

# Gaussian jitter (sd 0.1) on both axes so coincident points spread out.
# A dedicated, seeded generator makes the figure identical on every re-run
# without touching NumPy's global random state.
jitter_rng = np.random.default_rng(0)
jiggle_x = jitter_rng.normal(scale=0.1, size=len(df_plot))
jiggle_y = jitter_rng.normal(scale=0.1, size=len(df_plot))

plt.figure(figsize=(10, 6))
plt.scatter(df_plot['lor_depth'] + jiggle_x, df_plot['number_of_neuraxial_attempts'] + jiggle_y, alpha=0.5)
plt.xlabel('LOR Depth')
plt.ylabel('Number of Neuraxial Attempts')
plt.title('Number of Neuraxial Attempts vs. LOR Depth with Jiggle')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Rows with both the LOR depth and the attempt count recorded
df_plot = neuraxial_catheter_df.dropna(subset=['lor_depth', 'number_of_neuraxial_attempts'])

# Attempt counts to show, one panel each (a fixed list, not derived from the data)
attempts = [1, 2, 3, 4]

# 2x2 grid of panels, filled row by row in the order of `attempts`
fig, panels = plt.subplots(2, 2, figsize=(12, 8))

for panel, attempt in zip(panels.ravel(), attempts):
    # LOR depths of the procedures that took exactly this many attempts
    attempt_rows = df_plot.loc[df_plot['number_of_neuraxial_attempts'] == attempt]

    panel.hist(attempt_rows['lor_depth'], bins=20, color='skyblue', edgecolor='black')
    panel.set_title(f'Histogram of LOR Depth for {attempt} Neuraxial Attempt(s)')
    panel.set_xlabel('LOR Depth')
    panel.set_ylabel('Frequency')

fig.tight_layout()
plt.show()


In [None]:
# prompt: do the same but add shaded error bars for +/- standard error of the mean

# Rows with a recorded number of attempts
df_plot = neuraxial_catheter_df.dropna(subset=['number_of_neuraxial_attempts'])

# Per-attempt-count mean of failed_catheter (the failure rate for a 0/1
# outcome) and its standard error of the mean
failure_by_attempts = df_plot.groupby('number_of_neuraxial_attempts')['failed_catheter'].agg(['mean', 'sem'])

# Mean line, shaded +/- 1 SEM band, and capped error bars
plt.figure(figsize=(10, 6))
plt.plot(failure_by_attempts.index, failure_by_attempts['mean'], marker='o')
plt.fill_between(failure_by_attempts.index,
                 failure_by_attempts['mean'] - failure_by_attempts['sem'],
                 failure_by_attempts['mean'] + failure_by_attempts['sem'],
                 alpha=0.2)
# errorbar() draws its own line and markers; pin it to 'C0' so it matches the
# line and band above instead of taking the next color in the cycle.
plt.errorbar(failure_by_attempts.index, failure_by_attempts['mean'], yerr=failure_by_attempts['sem'],
             fmt='o-', capsize=5, elinewidth=1, color='C0')
plt.xlabel('Number of Neuraxial Attempts')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Number of Neuraxial Attempts with Error Bars')
plt.grid(True)
plt.show()

In [None]:
# prompt: Plot lor-depth against bmi

# Rows with both the LOR depth and end-of-pregnancy BMI recorded
df_plot = neuraxial_catheter_df.dropna(subset=['lor_depth', 'bmi_end_pregnancy_2044'])

# BMI on the x-axis, LOR depth on the y-axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df_plot['bmi_end_pregnancy_2044'], df_plot['lor_depth'])
ax.set_xlabel('BMI')
ax.set_ylabel('LOR Depth')
ax.set_title('LOR Depth vs. BMI')
plt.show()

In [118]:
# from scipy.stats import gaussian_kde

# # Extract the data, dropping NaNs
# df_plot = neuraxial_catheter_df.dropna(subset=['lor_depth', 'bmi_end_pregnancy_2044'])
# x = df_plot['bmi_end_pregnancy_2044'].values
# y = df_plot['lor_depth'].values

# # Perform kernel density estimation
# xy = np.vstack([x, y])
# kde = gaussian_kde(xy)

# # Define grid over data range
# xmin, xmax = x.min() - 1, x.max() + 1
# ymin, ymax = y.min() - 1, y.max() + 1
# X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
# positions = np.vstack([X.ravel(), Y.ravel()])
# Z = np.reshape(kde(positions).T, X.shape)

# # Create the contour plot
# plt.figure(figsize=(10, 6))
# plt.contourf(X, Y, Z, levels=15, cmap='viridis')
# plt.colorbar(label='Density')
# plt.xlabel('BMI')
# plt.ylabel('LOR Depth')
# plt.title('Contour Plot of LOR Depth vs. BMI (KDE)')
# plt.show()


In [None]:
# prompt: do the same but for failure vs loss of resistance depth. Bin the depth by units of 1

# Keep rows where both LOR depth and the outcome are present, then floor the
# depth to a whole-unit bin. `.assign` builds the bin column on a new frame
# instead of writing into the result of `dropna`, which avoids pandas'
# SettingWithCopyWarning.
df_plot = (
    neuraxial_catheter_df
    .dropna(subset=['lor_depth', 'failed_catheter'])
    .assign(lor_depth_bin=lambda d: (d['lor_depth'] // 1).astype(int))
)

# Per-bin mean of failed_catheter (the failure rate for a 0/1 outcome)
failure_by_lor_depth = df_plot.groupby('lor_depth_bin')['failed_catheter'].mean()

# Line of per-bin failure rates
plt.figure(figsize=(10, 6))
plt.plot(failure_by_lor_depth.index, failure_by_lor_depth.values, marker='o')
plt.xlabel('Loss of Resistance Depth (binned)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Loss of Resistance Depth (binned by 1)')
plt.xticks(np.arange(0, df_plot['lor_depth'].max() + 0.5, 1))  # Set x-axis ticks to be at every 1
plt.grid(True)
plt.show()

In [None]:
# prompt: Reproduce the same plot, but add shaded error bars for +/- standard error of the mean

# Keep rows where both LOR depth and the outcome are present, then floor the
# depth to a whole-unit bin. `.assign` builds the bin column on a new frame
# instead of writing into the result of `dropna`, which avoids pandas'
# SettingWithCopyWarning.
df_plot = (
    neuraxial_catheter_df
    .dropna(subset=['lor_depth', 'failed_catheter'])
    .assign(lor_depth_bin=lambda d: (d['lor_depth'] // 1).astype(int))
)

# Per-bin mean of failed_catheter (the failure rate for a 0/1 outcome) and its
# standard error of the mean
failure_by_lor_depth = df_plot.groupby('lor_depth_bin')['failed_catheter'].agg(['mean', 'sem'])

# Line of per-bin means with a shaded band of +/- 1 SEM
plt.figure(figsize=(10, 6))
plt.plot(failure_by_lor_depth.index, failure_by_lor_depth['mean'], marker='o')
plt.fill_between(failure_by_lor_depth.index,
                 failure_by_lor_depth['mean'] - failure_by_lor_depth['sem'],
                 failure_by_lor_depth['mean'] + failure_by_lor_depth['sem'],
                 alpha=0.5)

plt.xlabel('Loss of Resistance Depth (binned)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Loss of Resistance Depth (binned by 1) with Error Bars')
plt.xticks(np.arange(0, df_plot['lor_depth'].max() + 0.5, 1))  # Set x-axis ticks to be at every 1
plt.grid(True)
plt.show()

In [None]:

# Rows with LOR depth, BMI and the outcome all recorded
lor_bmi_columns = ['lor_depth', 'bmi_end_pregnancy_2044', 'failed_catheter']
df_plot = neuraxial_catheter_df.dropna(subset=lor_bmi_columns)

# One frame per outcome value
df_0 = df_plot.loc[df_plot['failed_catheter'] == 0]
df_1 = df_plot.loc[df_plot['failed_catheter'] == 1]

fig, ax = plt.subplots(figsize=(10, 6))

# Scatter both outcome groups (BMI on x, LOR depth on y), outcome 0 first
for group_df, group_color, group_label in (
    (df_0, 'blue', 'Failed Catheter = 0'),
    (df_1, 'orange', 'Failed Catheter = 1'),
):
    ax.scatter(
        group_df['bmi_end_pregnancy_2044'],
        group_df['lor_depth'],
        s=10,
        alpha=0.7,
        color=group_color,
        label=group_label,
    )

# --- Degree-1 least-squares fit for failed_catheter = 0 ---
# np.polyfit returns the coefficients highest power first: (slope, intercept)
p0 = np.polyfit(df_0['bmi_end_pregnancy_2044'], df_0['lor_depth'], deg=1)
slope_0, intercept_0 = p0
print(f"For failed_catheter=0, slope = {slope_0:.2f}, intercept = {intercept_0:.2f}")

# Evaluate the fitted line on 100 points spanning this group's BMI range
bmi_0 = df_0['bmi_end_pregnancy_2044']
x_vals_0 = np.linspace(bmi_0.min(), bmi_0.max(), 100)
y_vals_0 = np.polyval(p0, x_vals_0)
ax.plot(x_vals_0, y_vals_0, color='blue', linewidth=2)

# --- Degree-1 least-squares fit for failed_catheter = 1 ---
p1 = np.polyfit(df_1['bmi_end_pregnancy_2044'], df_1['lor_depth'], deg=1)
slope_1, intercept_1 = p1
print(f"For failed_catheter=1, slope = {slope_1:.2f}, intercept = {intercept_1:.2f}")

bmi_1 = df_1['bmi_end_pregnancy_2044']
x_vals_1 = np.linspace(bmi_1.min(), bmi_1.max(), 100)
y_vals_1 = np.polyval(p1, x_vals_1)
ax.plot(x_vals_1, y_vals_1, color='orange', linewidth=2)

# Axis labels, title and a legend built from the scatter labels
ax.set_xlabel('BMI')
ax.set_ylabel('LOR Depth')
ax.set_title('LOR Depth vs. BMI')
ax.legend()

plt.show()

## Gestational Age and Weight

In [None]:
# prompt: do the same but for gestational age

# NOTE(review): the column is named `gestational_age_weeks`, so this cell
# treats its values as weeks. The previous version labelled the axes in days
# and binned by 7 (i.e. 7-week-wide bins if the values really are weeks).
# Confirm the unit against the data dictionary.

# Histogram of gestational age (all rows of `df`)
plt.figure(figsize=(10, 6))
plt.hist(df['gestational_age_weeks'].dropna(), bins=20) # Adjust bins as needed
plt.xlabel('Gestational Age (weeks)')
plt.ylabel('Count')
plt.title('Distribution of Gestational Age')
plt.show()

# Failure rate by completed week of gestation. `.assign` builds the bin column
# on a new frame instead of writing into the result of `dropna`, which avoids
# pandas' SettingWithCopyWarning.
df_plot = (
    neuraxial_catheter_df
    .dropna(subset=['gestational_age_weeks', 'failed_catheter'])
    .assign(gestational_age_bin=lambda d: (d['gestational_age_weeks'] // 1).astype(int))
)
# Per-bin mean of failed_catheter and its standard error of the mean
failure_by_gestational_age = df_plot.groupby('gestational_age_bin')['failed_catheter'].agg(['mean', 'sem'])

# Line of per-bin means with a shaded band of +/- 1 SEM
plt.figure(figsize=(10, 6))
plt.plot(failure_by_gestational_age.index, failure_by_gestational_age['mean'], marker='o')
plt.fill_between(failure_by_gestational_age.index,
                failure_by_gestational_age['mean'] - failure_by_gestational_age['sem'],
                failure_by_gestational_age['mean'] + failure_by_gestational_age['sem'],
                alpha=0.5)
plt.xlabel('Gestational Age (weeks) (binned by 1)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Gestational Age (binned by 1 week) with Error Bars')
plt.grid(True)
plt.show()

In [None]:
# prompt: do the same histogram and binned failure rate but for baby_weight_2196

# Distribution of baby weight among neuraxial catheter procedures
plt.figure(figsize=(10, 6))
plt.hist(neuraxial_catheter_df['baby_weight_2196'].dropna(), bins=20)  # Adjust bins as needed
plt.xlabel('Baby Weight (kg)')
plt.ylabel('Count')
plt.title('Histogram of Baby Weight')
plt.show()

# Keep rows where both baby weight and the outcome are present, then floor the
# weight to the start of its 0.5-wide bin. `.assign` builds the bin column on
# a new frame instead of writing into the result of `dropna`, which avoids
# pandas' SettingWithCopyWarning.
df_plot = (
    neuraxial_catheter_df
    .dropna(subset=['baby_weight_2196', 'failed_catheter'])
    .assign(baby_weight_bin=lambda d: (d['baby_weight_2196'] // 0.5) * 0.5)
)

# Per-bin mean of failed_catheter (the failure rate for a 0/1 outcome) and its
# standard error of the mean
failure_by_baby_weight = df_plot.groupby('baby_weight_bin')['failed_catheter'].agg(['mean', 'sem'])

# Line of per-bin means with a shaded band of +/- 1 SEM
plt.figure(figsize=(10, 6))
plt.plot(failure_by_baby_weight.index, failure_by_baby_weight['mean'], marker='o')
plt.fill_between(failure_by_baby_weight.index,
                 failure_by_baby_weight['mean'] - failure_by_baby_weight['sem'],
                 failure_by_baby_weight['mean'] + failure_by_baby_weight['sem'],
                 alpha=0.5)

plt.xlabel('Baby Weight (kg) (binned by 0.5)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Baby Weight with Error Bars')
plt.grid(True)
plt.show()

In [None]:
# prompt: do the same count histogram but for secs_rom_thru_delivery_2197

# Assuming 'neuraxial_catheter_df' is your DataFrame

# Rows with a recorded ROM-to-delivery interval (column 'rom_thru_delivery_hours')
df_plot = neuraxial_catheter_df.dropna(subset=['rom_thru_delivery_hours'])

# 200 bins over the full data range; the view is then clipped to 0-100 on the x-axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_plot['rom_thru_delivery_hours'], bins=200)
ax.set_xlabel('Hours from ROM to Delivery')
ax.set_xlim(0, 100)
ax.set_ylabel('Count')
ax.set_title('Histogram of Hours from ROM to Delivery')
plt.show()

In [None]:
# prompt: do the same binned plot for rom_thru_delivery_hours

# Assuming 'neuraxial_catheter_df' is your DataFrame

# Keep rows where both the ROM-to-delivery interval and the outcome are
# present, then floor the interval to a whole-hour bin. `.assign` builds the
# bin column on a new frame instead of writing into the result of `dropna`,
# which avoids pandas' SettingWithCopyWarning.
df_plot = (
    neuraxial_catheter_df
    .dropna(subset=['rom_thru_delivery_hours', 'failed_catheter'])
    .assign(rom_thru_delivery_hours_bin=lambda d: (d['rom_thru_delivery_hours'] // 1).astype(int))
)

# Per-bin mean of failed_catheter (the failure rate for a 0/1 outcome) and its
# standard error of the mean
failure_by_rom_delivery = df_plot.groupby('rom_thru_delivery_hours_bin')['failed_catheter'].agg(['mean', 'sem'])

# Line of per-bin means with a shaded band of +/- 1 SEM
plt.figure(figsize=(10, 6))
plt.plot(failure_by_rom_delivery.index, failure_by_rom_delivery['mean'], marker='o')
plt.fill_between(failure_by_rom_delivery.index,
                 failure_by_rom_delivery['mean'] - failure_by_rom_delivery['sem'],
                 failure_by_rom_delivery['mean'] + failure_by_rom_delivery['sem'],
                 alpha=0.5)
plt.xlim(0,100)  # show only the first 100 hours
plt.xlabel('Hours from ROM to Delivery (binned)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Hours from ROM to Delivery (binned by 1) with Error Bars')
plt.grid(True)
plt.show()

## Prior failed catheters

In [None]:
# Number of procedures at each count of prior failed catheters in the same
# encounter (column 'prior_failed_catheters_this_enc'), ordered by that count
prior_failed_catheters_counts = (
    neuraxial_catheter_df['prior_failed_catheters_this_enc']
    .value_counts()
    .sort_index()
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(prior_failed_catheters_counts.index, prior_failed_catheters_counts.values)
ax.set_xlabel('Number of Prior Failed Catheters')
ax.set_ylabel('Count')
ax.set_title('Histogram of Prior Failed Catheters (This Encounter)')

# One tick per integer between the smallest and largest observed count
first_tick = int(prior_failed_catheters_counts.index.min())
last_tick = int(prior_failed_catheters_counts.index.max())
ax.set_xticks(range(first_tick, last_tick + 1))

fig.tight_layout()
plt.show()

In [None]:
# Restrict to failed catheters, then count procedures at each number of prior
# failed catheters in the same encounter, ordered by that number
failed_rows = neuraxial_catheter_df.loc[neuraxial_catheter_df['failed_catheter'] == 1]
prior_failed_catheters_counts = (
    failed_rows['prior_failed_catheters_this_enc']
    .value_counts()
    .sort_index()
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(prior_failed_catheters_counts.index, prior_failed_catheters_counts.values)
ax.set_xlabel('Number of Prior Failed Catheters (This Encounter)')
ax.set_ylabel('Count')
ax.set_title('Histogram of Prior Failed Catheter Among Failed Cases')

# One tick per integer between the smallest and largest observed count
first_tick = int(prior_failed_catheters_counts.index.min())
last_tick = int(prior_failed_catheters_counts.index.max())
ax.set_xticks(range(first_tick, last_tick + 1))

fig.tight_layout()
plt.show()

In [None]:

# Rows with a recorded count of prior failed catheters in this encounter
df_plot = neuraxial_catheter_df.dropna(subset=['prior_failed_catheters_this_enc'])

# Mean of failed_catheter and its standard error for each prior-failure count
failure_by_prior = (
    df_plot
    .groupby('prior_failed_catheters_this_enc')['failed_catheter']
    .agg(['mean', 'sem'])
)

prior_counts = failure_by_prior.index
mean_rate = failure_by_prior['mean']
sem_rate = failure_by_prior['sem']

fig, ax = plt.subplots(figsize=(10, 6))

# Mean line
ax.plot(prior_counts, mean_rate, marker='o', label='Mean Failure Rate')

# Shaded band of +/- 1 SEM around the mean
ax.fill_between(prior_counts, mean_rate - sem_rate, mean_rate + sem_rate,
                alpha=0.2, label='+/- SEM')

# Capped error bars in the same color as the mean line
ax.errorbar(prior_counts, mean_rate, yerr=sem_rate,
            fmt='o-', capsize=5, elinewidth=1, color='C0')

ax.set_xlabel('Number of Prior Failed Catheters (This Encounter)')
ax.set_ylabel('Average Failure Rate')
ax.set_title('Failure Rate vs. Prior Failed Catheters (with SEM)')
ax.grid(True)
ax.legend()
plt.show()

## Placement to Delivery Time

In [None]:
# Keep rows where both the placement-to-delivery time and the outcome are
# present, then floor the time to a whole-hour bin. `.assign` builds the bin
# column on a new frame instead of writing into the result of `dropna`, which
# avoids pandas' SettingWithCopyWarning.
df_plot = (
    neuraxial_catheter_df
    .dropna(subset=['placement_to_delivery_hours', 'failed_catheter'])
    .assign(placement_to_delivery_bin=lambda d: (d['placement_to_delivery_hours'] // 1).astype(int))
)

# Per-bin mean of failed_catheter (the failure rate for a 0/1 outcome) and its
# standard error of the mean
failure_by_placement_time = df_plot.groupby('placement_to_delivery_bin')['failed_catheter'].agg(['mean', 'sem'])

# Line of per-bin means with a shaded band of +/- 1 SEM
plt.figure(figsize=(10, 6))
plt.plot(failure_by_placement_time.index, failure_by_placement_time['mean'], marker='o')
plt.fill_between(failure_by_placement_time.index,
                 failure_by_placement_time['mean'] - failure_by_placement_time['sem'],
                 failure_by_placement_time['mean'] + failure_by_placement_time['sem'],
                 alpha=0.5)

plt.xlabel('Placement to Delivery Time (hours, binned by 1)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Placement to Delivery Time (binned by 1) with Error Bars')
plt.grid(True)
plt.show()

# Statistical Analysis

## Some individually interesting regressions

In [None]:
# Rows with both the LOR depth and the attempt count recorded
ols_columns = ['lor_depth', 'number_of_neuraxial_attempts']
df_corr = neuraxial_catheter_df.dropna(subset=ols_columns)

# Ordinary least squares: attempt count regressed on LOR depth
ols_formula = 'number_of_neuraxial_attempts ~ lor_depth'
model = smf.ols(ols_formula, data=df_corr).fit()

# Full regression table
print(model.summary())


In [None]:
# Chi-squared test of independence between DPE (vs. the other procedure type
# in `epidural_df`) and catheter failure
from scipy.stats import chi2_contingency

# 1 where the procedure is a DPE, 0 otherwise
is_dpe = (epidural_df['true_procedure_type_incl_dpe'] == 'dpe').astype(int)
dpe_crosstab = pd.crosstab(is_dpe, epidural_df['failed_catheter'])

# Unpack into named variables rather than `_`: assigning to `_` in a notebook
# overwrites IPython's "last output" variable.
chi2, p, dof, expected = chi2_contingency(dpe_crosstab)

# Row-wise percentages: outcome distribution within each DPE group
print(dpe_crosstab.div(dpe_crosstab.sum(axis=1), axis=0) * 100)
print("Chi-squared statistic:", chi2)
print("P-value:", p)

In [None]:
# prompt: Do univariate logistic regression separately using number of attempts and loss of resistance depth to predict failure

import statsmodels.api as sm
import statsmodels.formula.api as smf

# --- Univariate logistic regression: failure ~ number of attempts ---
# Use only rows where both the predictor and the outcome are recorded
attempts_columns = ['number_of_neuraxial_attempts', 'failed_catheter']
df_logreg_attempts = neuraxial_catheter_df.dropna(subset=attempts_columns)
model_attempts = smf.logit(
    'failed_catheter ~ number_of_neuraxial_attempts',
    data=df_logreg_attempts,
).fit()
print(model_attempts.summary())


# --- Univariate logistic regression: failure ~ loss-of-resistance depth ---
# Use only rows where both the predictor and the outcome are recorded
lor_columns = ['lor_depth', 'failed_catheter']
df_logreg_lor = neuraxial_catheter_df.dropna(subset=lor_columns)
model_lor = smf.logit(
    'failed_catheter ~ lor_depth',
    data=df_logreg_lor,
).fit()
print(model_lor.summary())


In [None]:
# prompt: Now do multivariate analysis using the same two predictors

# Rows with both predictors and the outcome recorded
multi_columns = ['number_of_neuraxial_attempts', 'lor_depth', 'failed_catheter']
df_logreg_multi = neuraxial_catheter_df.dropna(subset=multi_columns)

# Logistic regression of failure on attempt count and LOR depth together
multi_formula = 'failed_catheter ~ number_of_neuraxial_attempts + lor_depth'
model_multi = smf.logit(multi_formula, data=df_logreg_multi).fit()

# Full regression table
print(model_multi.summary())


In [None]:
# Rows with both the prior-failure count and the outcome recorded
prior_failed_columns = ['prior_failed_catheters_this_enc', 'failed_catheter']
df_logreg_prior_failed = neuraxial_catheter_df.dropna(subset=prior_failed_columns)

# Univariate logistic regression: failure ~ prior failed catheters this encounter.
# NOTE(review): the result is stored in `model_attempts`, the same name the
# attempts model uses in an earlier cell, so it replaces that model in the
# kernel. Consider a dedicated name once no later cell relies on this one.
prior_failed_formula = 'failed_catheter ~ prior_failed_catheters_this_enc'
model_attempts = smf.logit(prior_failed_formula, data=df_logreg_prior_failed).fit()

# Full regression table
print(model_attempts.summary())

## All univariate regressions

In [135]:

def all_regressions_each_dummy(df, outcome_col='failed_catheter'):
    """
    Fit a separate univariate logistic regression for each column of `df`.

    For every column other than `outcome_col` (and the two ID columns
    'anes_procedure_encounter_id_2273' and 'unique_pt_id'), the model
    `outcome_col ~ column` is fitted on the rows where both are non-null.
    Numeric columns enter the formula directly; non-numeric columns are
    wrapped in `C(...)`, so each non-reference level yields its own parameter.

    Columns are skipped when they are datetime-typed, have fewer than 2
    distinct non-null values, have fewer than 5 usable rows, or when the fit
    raises (a message is printed in that case).

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the outcome column and candidate predictors.
    outcome_col : str, default 'failed_catheter'
        Name of the binary outcome column.

    Returns
    -------
    pandas.DataFrame
        One row per fitted non-intercept parameter, with columns
        ['column', 'param_name', 'coef', 'pval'], sorted by ascending p-value.
        If nothing could be fitted, an empty frame with those same columns is
        returned so downstream column access still works.

    Notes
    -----
    This function only fits and tabulates; it does not plot anything.
    """
    result_columns = ['column', 'param_name', 'coef', 'pval']
    id_columns = ("anes_procedure_encounter_id_2273", "unique_pt_id")

    results = []

    for col in df.columns:
        # Skip the outcome itself and pure identifier columns
        if col == outcome_col or col in id_columns:
            continue

        # Skip datetime or other unsupported types
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            continue

        # Subset to non-null rows in outcome & predictor
        temp_df = df[[outcome_col, col]].dropna()

        # Skip if not enough variation or too few rows to fit
        if temp_df[col].nunique() < 2 or temp_df[col].count() < 5:
            continue

        # Build formula; wrap in C() if the predictor is not numeric
        if pd.api.types.is_numeric_dtype(temp_df[col]):
            formula = f"{outcome_col} ~ {col}"
        else:
            formula = f"{outcome_col} ~ C({col})"

        # Fit the logistic model; a failed fit skips this column only
        try:
            model = smf.logit(formula, data=temp_df).fit(disp=False)
        except Exception as e:
            print(f"Skipping column '{col}' due to fitting error: {e}")
            continue

        # Record every parameter except the intercept. For categorical
        # predictors param_name looks like 'C(col)[T.level]'; the raw name is
        # stored and can be parsed by the caller.
        for param_name in model.params.index:
            if param_name == 'Intercept':
                continue

            results.append({
                'column': col,
                'param_name': param_name,
                'coef': model.params.loc[param_name],
                'pval': model.pvalues.loc[param_name]
            })

    if not results:
        print("No valid predictors found.")
        # Keep the schema so callers can still index the expected columns.
        return pd.DataFrame(columns=result_columns)

    # Most significant parameters first
    return pd.DataFrame(results, columns=result_columns).sort_values(by='pval')


# Run every univariate regression against the neuraxial-catheter cohort.
# Result columns: [column, param_name, coef, pval]; a categorical predictor
# contributes one row per non-reference level.
results_df = all_regressions_each_dummy(
    neuraxial_catheter_df,
    outcome_col='failed_catheter',
)

In [136]:

import re

def parse_param_name(param_name):
    """
    Extract the level name from a statsmodels categorical parameter label.

    A label of the form 'C(col)[T.value]' yields 'value'. Labels without a
    '[T....]' part (e.g. a plain numeric predictor name) yield ''.
    """
    # Greedy '.*' means the LAST '[T.' in the label is the one captured.
    found = re.match(r'.*\[T\.(.+)\]', param_name)
    return found.group(1) if found else ''


In [137]:
# Level name for categorical parameters; '' for parameters without a '[T....]' part.
results_df['category_variable'] = [
    parse_param_name(label) for label in results_df['param_name']
]

In [None]:
results_df

In [None]:
results_df.shape

In [None]:
# Shape of the subset of parameters with p < 0.05 / 59.
# NOTE(review): 59 looks like a Bonferroni divisor (number of comparisons) —
# TODO confirm it still matches the number of tests actually run.
below_threshold = results_df['pval'] < 0.05 / 59
results_df[below_threshold].shape

In [None]:
# Helper used to strip digits from the graph annotations
def remove_nums(string):
    """Return `string` with every digit character dropped."""
    return ''.join(ch for ch in string if not ch.isdigit())


# Volcano-style plot: regression coefficient (x) against -log10(p-value) (y)
fig, ax = plt.subplots(figsize=(8, 6))

offset = 1e-300  # so we don't take log10(0)

# Only parameters with p < 0.9 are drawn; select them once and reuse.
plotted_params = results_df[results_df['pval'] < 0.9]
x_vals = plotted_params['coef']
y_vals = -np.log10(plotted_params['pval'] + offset)

sc = ax.scatter(x_vals, y_vals, color='blue')

# Label every point as '<column>__<level>' with digits stripped
for coef, pval, column, level in zip(
    plotted_params['coef'],
    plotted_params['pval'],
    plotted_params['column'],
    plotted_params['category_variable'],
):
    ax.text(
        coef,
        -np.log10(pval + offset),
        remove_nums(str(column + '__' + str(level))),
        fontsize=8,
        ha='left',
        va='bottom'
    )

# Horizontal reference at the nominal p = 0.05 level
ax.axhline(-np.log10(0.05), color='red', linestyle='--', label='p=0.05')

ax.set_xlabel('Coefficient')
ax.set_ylabel('-log10(p-value)')
ax.set_title(f'Logistic Regressions for Catheter_Failure ~ Each Predictor (All Dummies)')
ax.legend()

plt.tight_layout()
plt.show()


# Correlation Matrix 3

In [None]:
# Correlation heatmap over all predictors, with categoricals expanded to dummies.
# The two identifier columns carry no signal and are excluded up front.
corr_input = neuraxial_catheter_df.drop(
    columns=['anes_procedure_encounter_id_2273', 'unique_pt_id']
)

# Categorical columns are those with dtype 'object' or 'category'
categorical_cols = corr_input.select_dtypes(include=['object', 'category']).columns

# Expand every categorical into one dummy per level (no reference level dropped)
neuraxial_catheter_dummies = pd.get_dummies(
    corr_input,
    columns=categorical_cols,
    drop_first=False,
)

# DataFrame.corr() defaults to Pearson correlation
correlation_matrix = neuraxial_catheter_dummies.corr()

plt.figure(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=False, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Matrix for neuraxial_catheter_df (with dummies)")
plt.show()


# Delete interesting collinear variables

In [143]:
# Columns removed because they overlap with another retained variable; the
# trailing comment on each entry names the variable it is correlated with.
overlap_columns = ['current_anesthesiologist_catheter_count', # correlated with categorical experience variables
                             'current_resident_catheter_count', # correlated with categorical experience variables
                             'moderately_experienced_anesthesiologist', # correlated with categorical experience variables
                             'gravidity_2047', # correlated with parity
                             'maternal_weight_end_pregnancy_2045', # correlated with BMI end pregnancy
                              "only_private_insurance", # correlated with composite_SES_advantage
                            "maternal_language_english", # correlated with composite_SES_advantage
                            "marital_status_married_or_partner", # correlated with composite_SES_advantage
                            "country_of_origin_USA", # correlated with composite_SES_advantage
                            "employment_status_fulltime", # correlated with composite_SES_advantage
                               'epidural_needle_type', # correlated with delivery location
                               'maternal_ethnicity', # correlated with race
                              "delivery_site", # correlated with delivery_site_bwh,
                              "fetal_presentation_position_2247", # correlated with position_posterior_or_transverse
                              "fetal_presentation_category_2243" # correlated with presentation_cephalic
                    ]

# Both frames are rebound to the reduced versions. There is no errors='ignore'
# here, so re-running this cell after the columns are gone raises KeyError.
neuraxial_catheter_df = neuraxial_catheter_df.drop(columns=overlap_columns,axis=1)
epidural_df = epidural_df.drop(columns=overlap_columns,axis=1)

# Correlation Matrix 4

In [None]:
# Correlation heatmap recomputed on the current neuraxial_catheter_df.
# The two identifier columns carry no signal and are excluded up front.
corr_input = neuraxial_catheter_df.drop(
    columns=['anes_procedure_encounter_id_2273', 'unique_pt_id']
)

# Categorical columns are those with dtype 'object' or 'category'
categorical_cols = corr_input.select_dtypes(include=['object', 'category']).columns

# Expand every categorical into one dummy per level (no reference level dropped)
neuraxial_catheter_dummies = pd.get_dummies(
    corr_input,
    columns=categorical_cols,
    drop_first=False,
)

# DataFrame.corr() defaults to Pearson correlation
correlation_matrix = neuraxial_catheter_dummies.corr()

plt.figure(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=False, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Matrix for neuraxial_catheter_df (with dummies)")
plt.show()


# Delete non-predictive features (from neuraxial_catheter_df, not epidural_df)

In [145]:
# Features removed from neuraxial_catheter_df only (epidural_df keeps them).
non_predictive_columns = [
    'maternal_race',
    'has_scoliosis',
    'composite_SES_advantage',
]
# errors='ignore' makes the cell safe to re-run after the columns are gone.
neuraxial_catheter_df = neuraxial_catheter_df.drop(
    columns=non_predictive_columns,
    errors='ignore',
)

# Delete unknowable columns to prevent data leakage

In [146]:
# Columns dropped from both frames to prevent data leakage (see heading above).
data_leakage_columns = [
    'rom_thru_delivery_hours',
    'placement_to_delivery_hours',
]

# errors='ignore' makes the cell safe to re-run after the columns are gone.
neuraxial_catheter_df = neuraxial_catheter_df.drop(columns=data_leakage_columns, errors='ignore')
epidural_df = epidural_df.drop(columns=data_leakage_columns, errors='ignore')

# Center numerical values on their median

In [147]:
# Shift each listed column so its median is 0 (no rescaling). Each frame is
# centred on its OWN median, so the two cohorts use different offsets.
median_centered_columns = [
    'bmi_end_pregnancy_2044',
    'baby_weight_2196',
    'gestational_age_weeks',
    'maternal_age_years',
]
for cohort_df in (neuraxial_catheter_df, epidural_df):
    for col in median_centered_columns:
        cohort_df[col] = cohort_df[col] - cohort_df[col].median()

In [None]:
neuraxial_catheter_df['gestational_age_weeks'].describe()

# Logistic Regression Model

### Random data for model comparison

In [148]:
# Null-model dataset: same features as the real cohort, but the outcome is
# replaced by Bernoulli noise drawn at the observed failure rate.
test_dataset = neuraxial_catheter_df.copy()
failure_rate = test_dataset['failed_catheter'].mean()

# Seeded generator so the random baseline is identical on every
# Restart-and-Run-All (42 matches the random_state used for the splits below).
random_label_rng = np.random.default_rng(42)
test_dataset['failed_catheter'] = random_label_rng.binomial(n=1, p=failure_rate, size=len(test_dataset))

In [None]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report

# Baseline run on the random-label dataset built in the previous cell
data = test_dataset

# Drop columns with more than 80% missing values
# (dropna(thresh=...) keeps columns with at least 20% non-null entries)
threshold = len(data) * 0.2
data_cleaned = data.dropna(thresh=threshold, axis=1)

# Drop rows where target variable is missing
data_cleaned = data_cleaned.dropna(subset=["failed_catheter"])

# Separate features and target variable
X = data_cleaned.drop(columns=["failed_catheter"], errors='ignore')
y = data_cleaned["failed_catheter"]

##############################################################################
# 1. Extract the group labels and remove the ID columns from the features
##############################################################################
groups = X['unique_pt_id']  # Save group labels
# Remove BOTH identifier columns. Previously only `unique_pt_id` was dropped,
# which left `anes_procedure_encounter_id_2273` in as a feature, so the random
# baseline was not trained on the same feature set as the real-data models.
X = X.drop(columns=["anes_procedure_encounter_id_2273", "unique_pt_id"])

##############################################################################
# 2. Split using GroupShuffleSplit instead of train_test_split
#    (all rows of one patient land on the same side of the split)
##############################################################################
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Identify numeric and categorical columns
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'bool']).columns.tolist()

# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Preprocess the data (fit on train only, then apply to test)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Train logistic regression with class weights
logistic_model = LogisticRegression(max_iter=1000, solver='liblinear', class_weight='balanced', n_jobs=1)
logistic_model.fit(X_train_preprocessed, y_train)

# Make predictions
y_pred = logistic_model.predict(X_test_preprocessed)
y_pred_prob = logistic_model.predict_proba(X_test_preprocessed)[:, 1]

# Evaluate the model
evaluation_metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
    "roc_auc": roc_auc_score(y_test, y_pred_prob),
    "classification_report": classification_report(y_test, y_pred)
}

# Print evaluation metrics
print("TEST RANDOM Model Evaluation:")
for metric, value in evaluation_metrics.items():
    if metric == "classification_report":
        print("\nTEST RANDOM Classification Report:\n", value)
    else:
        print(f"{metric.capitalize()}: {value:.4f}")


### Real LOGIT regression

In [150]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report

import statsmodels.api as sm

# 1. Load and clean data
data = neuraxial_catheter_df.copy()

# Drop columns with more than 80% missing values
# (dropna(thresh=...) keeps columns with at least 20% non-null entries)
threshold = len(data) * 0.2
data_cleaned = data.dropna(thresh=threshold, axis=1)
data_cleaned = data_cleaned.dropna(subset=["failed_catheter"])

X = data_cleaned.drop(columns=["failed_catheter"], errors='ignore')
y = data_cleaned["failed_catheter"]

# 2. Group-aware split (all rows of one patient land on the same side)
groups = X['unique_pt_id']
X = X.drop(columns=["anes_procedure_encounter_id_2273","unique_pt_id"])

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, X_test = X.iloc[train_idx].copy(), X.iloc[test_idx].copy()
y_train, y_test = y.iloc[train_idx].copy(), y.iloc[test_idx].copy()

# 3. Identify numeric vs. categorical
numeric_features = X_train.select_dtypes(include=["float64", "int64"]).columns.tolist()
categorical_features = X_train.select_dtypes(include=["object", "bool"]).columns.tolist()

# 4. Impute numeric (statistics learned on train only)
num_imputer = SimpleImputer(strategy='median')
X_train[numeric_features] = num_imputer.fit_transform(X_train[numeric_features])
X_test[numeric_features] = num_imputer.transform(X_test[numeric_features])

# 5. Scale numeric
# NOTE(review): after this step every numeric predictor is a z-score, so the
# fitted coefficients / odds ratios are per 1 SD of the predictor, not per
# natural unit — keep that in mind when labelling results downstream.
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# 6. Impute categorical
cat_imputer = SimpleImputer(strategy='most_frequent')
X_train[categorical_features] = cat_imputer.fit_transform(X_train[categorical_features])
X_test[categorical_features] = cat_imputer.transform(X_test[categorical_features])

# 7. One-hot encode categorical
# Pin each categorical's level set to the levels seen in TRAIN before encoding.
# Encoding the two splits independently with drop_first=True lets the test
# split drop a different reference level whenever train's first level is
# absent from test, which silently mis-encodes those rows. With shared
# categories both splits drop the same level and emit the same columns;
# test-only levels become all-zero rows (same as the previous align step).
for col in categorical_features:
    train_levels = pd.Categorical(X_train[col]).categories
    X_train[col] = pd.Categorical(X_train[col], categories=train_levels)
    X_test[col] = pd.Categorical(X_test[col], categories=train_levels)

X_train = pd.get_dummies(X_train, columns=categorical_features, drop_first=True,dtype=int).astype(float)
X_test = pd.get_dummies(X_test, columns=categorical_features, drop_first=True,dtype=int).astype(float)

# Safety net: force the test columns to match train exactly
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

# 8. Fit logistic regression with Statsmodels
X_train_const = sm.add_constant(X_train)
logit_model = sm.Logit(y_train, X_train_const)
result = logit_model.fit(disp=0)

# 9. Predict
X_test_const = sm.add_constant(X_test, has_constant='add')
y_pred_prob = result.predict(X_test_const)

# 10. Evaluate at decision thresholds 0.00, 0.05, ..., 1.00
# ROC AUC depends only on the predicted probabilities, not on the threshold,
# so it is computed once and repeated in every row.
test_roc_auc = roc_auc_score(y_test, y_pred_prob)

evaluation_metrics_by_threshold = []

for i in range(0, 21):
    prediction_threshold = i / 20
    y_pred = (y_pred_prob >= prediction_threshold).astype(int)

    evaluation_metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        # precision is undefined when nothing is predicted positive -> NaN
        "precision": precision_score(y_test, y_pred,zero_division=np.nan),
        "recall": recall_score(y_test, y_pred),
        "roc_auc": test_roc_auc,
    }
    evaluation_metrics_by_threshold.append(evaluation_metrics)

# alpha=0.001 => the summary's interval columns are the 99.9% CI bounds
result_summ = result.summary(alpha = 0.001)


In [None]:
evaluation_metrics_by_threshold = pd.DataFrame(evaluation_metrics_by_threshold)
evaluation_metrics_by_threshold

In [152]:
logit_result_table = result_summ.tables[0].data

In [153]:
# Turn the coefficient table of the statsmodels summary (tables[1]) into a
# DataFrame indexed by predictor name.
logit_predictor_table = pd.DataFrame(result_summ.tables[1].data)
# 1. Set the first row as the new column headers.
logit_predictor_table.columns = logit_predictor_table.iloc[0]

# 2. Remove the first row (since it's now serving as header).
logit_predictor_table = logit_predictor_table[1:]

# 3. Set the first column (after the column headers update) as the index.
# Here, `logit_predictor_table.columns[0]` is the name of the first column.
logit_predictor_table = logit_predictor_table.set_index(logit_predictor_table.columns[0])

# 4. Sort by the 'P>|z|' column in ascending order.
# NOTE(review): the cells are presumably still strings at this point (the next
# cell casts them with astype(float)), so this is a lexicographic sort —
# TODO confirm it orders the p-values as intended.
logit_predictor_table = logit_predictor_table.sort_values('P>|z|')



In [None]:
# Calculate the odds ratio (OR) and its 99.9% confidence interval by
# exponentiating the coefficient and the interval bounds. The bound columns
# are named '[0.0005' and '0.9995]' because the summary was built with alpha = 0.001.
logit_predictor_table['OR'] = np.exp(logit_predictor_table['coef'].astype(float))
logit_predictor_table['OR_lower'] = np.exp(logit_predictor_table['[0.0005'].astype(float))
logit_predictor_table['OR_upper'] = np.exp(logit_predictor_table['0.9995]'].astype(float))

# Create the 'OR (99.9% CI)' column, formatted as "OR (lower - upper)"
logit_predictor_table['OR (99.9% CI)'] = logit_predictor_table.apply(
    lambda row: f"{row['OR']:.2f} ({row['OR_lower']:.2f} - {row['OR_upper']:.2f})", axis=1)

logit_predictor_table

In [None]:
logit_predictor_table.shape

In [None]:
logit_predictor_table.index

In [157]:
# Row labels of logit_predictor_table, split into two groups that are plotted
# as separate forest-plot panels. Every name must exist in the table's index,
# because the next cell selects them with .loc.
patient_factors = ['delivery_site_bwh',
       'parity_2048', 'gestational_age_weeks',
       'presentation_cephalic',
       'has_dorsalgia',
       'maternal_age_years', 'has_back_problems',
       'prior_failed_catheters_prev_enc',
       'baby_weight_2196',
       'composite_psychosocial_problems', 'prior_failed_catheters_this_enc',
       'position_posterior_or_transverse', 'prior_pain_scores_max',
       'bmi_end_pregnancy_2044', 'labor_induction']

# Names containing a level suffix (e.g. '..._dpe_cse', '..._resident_yes') are
# dummy columns produced by one-hot encoding in the model cell.
procedural_factors = ['true_procedure_type_incl_dpe_cse', 
       'true_procedure_type_incl_dpe_dpe',
       'highly_experienced_anesthesiologist_none', 'lor_depth', 'number_of_neuraxial_attempts',
       'paresthesias_present',
       'highly_experienced_resident_no',
       'highly_experienced_anesthesiologist_yes',
       'highly_experienced_resident_yes',
       'true_procedure_type_incl_dpe_intrathecal']

In [166]:
# Split the predictor table into the two factor groups (copies, so renaming
# below does not touch logit_predictor_table).
patient_df = logit_predictor_table.loc[patient_factors].copy()
procedural_df = logit_predictor_table.loc[procedural_factors].copy()

# Human-readable display labels for the forest plot, keyed by model term name.
# NOTE(review): numeric predictors were standardized with StandardScaler before
# the statsmodels fit, so the fitted odds ratios appear to be per 1 SD rather
# than per the natural unit stated in the "(per ...)" labels below — confirm
# and either relabel or back-transform. Numeric 0/1 indicator columns would be
# affected in the same way for the "(vs ...)" labels.
rename_map = {
    'gestational_age_weeks': 'Gestational Age (per week)',
    'delivery_site_bwh': 'Delivery at flagship obstetric teaching hospital (vs other)',
    'baby_weight_2196': 'Baby Weight (per kg)',
    'bmi_end_pregnancy_2044': 'BMI (per kg/m^2)',
    'parity_2048': 'Parity (per birth)',
    'has_dorsalgia': 'Back pain (vs none)',
    'has_back_problems': 'Scoliosis or other back problems (vs none)',
    'prior_pain_scores_max': 'Max pain score prior to placement (per unit 0-10)',
    'composite_psychosocial_problems': 'Psychosocial risk factors (vs none)',
    'prior_failed_catheters_this_enc': 'Prior failed catheters in this encounter (per failure)',
    'prior_failed_catheters_prev_enc': 'Prior failed catheters in prior encounters (per failure)',
    'maternal_age_years': 'Maternal age (per year)',
    'labor_induction': 'Induced labor (vs not)',
    'position_posterior_or_transverse': 'Posterior or transverse fetal position (vs other)',
    'presentation_cephalic': 'Cephalic fetal presentation (vs other)',
    # procedural factors below
    'lor_depth': 'Depth to loss of resistance (per cm)',
    'highly_experienced_anesthesiologist_yes': 'Highly experienced attending anesthesiologist (vs less experienced)',
    'highly_experienced_anesthesiologist_none': 'No attending anesthesiologist (vs less experienced attending)',
    'highly_experienced_resident_yes': 'Highly experienced resident (vs no resident)',
    'highly_experienced_resident_no': 'Less experienced resident (vs no resident)',
    'paresthesias_present': 'Paresthesias present during placement (vs none)',
    'number_of_neuraxial_attempts': 'Number of placement attempts (per attempt)',
    'true_procedure_type_incl_dpe_intrathecal': 'Intrathecal catheter (vs conventional epidural)',
    'true_procedure_type_incl_dpe_dpe': 'Dural puncture epidural (vs conventional epidural)',
    'true_procedure_type_incl_dpe_cse': 'Combined spinal-epidural (vs conventional epidural)'
}

# Index entries without a key in rename_map are left unchanged by rename().
patient_df = patient_df.rename(index=rename_map)
procedural_df = procedural_df.rename(index=rename_map)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

def plot_forest_colored_with_markers(
    ax, 
    df, 
    title='Forest Plot', 
    x_label='Odds Ratio (99.9% CI)',
    x_min=0.5, 
    x_max=1.5
):
    """
    Draw a forest plot of odds ratios on the Matplotlib axes `ax`.

    Parameters
    ----------
    ax : Matplotlib axes to draw on.
    df : DataFrame whose index holds the factor names and which has the
        columns 'OR', 'OR_lower' and 'OR_upper'. Rows are drawn in ascending
        order of 'OR', smallest at the top.
    title, x_label : axes title and x-axis label.
    x_min, x_max : x-axis limits.

    Drawing rules
    -------------
    - Colour per factor: red when the whole CI is above 1, blue when the whole
      CI is below 1, black otherwise. The same colour is used for the error
      bar, the numeric label and the factor name.
    - The centre dot is omitted when the OR itself lies outside [x_min, x_max].
    - A '<' is drawn at x_min and/or a '>' at x_max when the OR or its CI
      extends past that limit.
    - The text "OR X.XX (L.LL - U.UU)" is written slightly above each row, at
      the fixed x position 1.08 (not at the point itself).
    - Factor names are written to the left of the axes at x_min - 0.05; the
      regular y tick labels are blanked.
    """

    ordered = df.sort_values('OR')
    row_positions = np.arange(len(ordered))

    # Ticks mark each row, but their labels are blank: the coloured factor
    # names are written by hand further down.
    ax.set_yticks(row_positions)
    ax.set_yticklabels([""] * len(ordered))

    label_offset = 0.2  # vertical gap between a row and its numeric label
    row_colors = []

    for y_pos, or_val, ci_low, ci_high in zip(
        row_positions, ordered['OR'], ordered['OR_lower'], ordered['OR_upper']
    ):
        # Significance colour from the position of the CI relative to OR = 1
        if ci_low > 1:
            row_color = 'red'
        elif ci_high < 1:
            row_color = 'blue'
        else:
            row_color = 'black'
        row_colors.append(row_color)

        # Does the point estimate fall outside the visible x range?
        center_past_left = or_val < x_min
        center_past_right = or_val > x_max

        # Error bar spans the full CI; the marker is suppressed when the
        # centre would sit outside the visible range.
        ax.errorbar(
            or_val,
            y_pos,
            xerr=[[or_val - ci_low], [ci_high - or_val]],
            fmt='none' if (center_past_left or center_past_right) else 'o',
            color=row_color,
            ecolor=row_color,
            capsize=4
        )

        # Boundary arrows where the estimate or its CI is cut off by the limits
        if center_past_left or (ci_low < x_min):
            ax.text(
                x_min, y_pos, '<',
                va='center', ha='right', color=row_color, fontsize=14
            )
        if center_past_right or (ci_high > x_max):
            ax.text(
                x_max, y_pos, '>',
                va='center', ha='left', color=row_color, fontsize=14
            )

        # Numeric label; the y axis is inverted later, so subtracting the
        # offset moves the text visually upwards.
        ax.text(
            1.08, y_pos - label_offset,
            f"OR {or_val:.2f} ({ci_low:.2f} - {ci_high:.2f})",
            va='bottom',
            ha='center',
            color=row_color,
            fontsize=10
        )

    # Factor names, coloured like their rows, just left of the plotting area
    for y_pos, factor_name, row_color in zip(row_positions, ordered.index, row_colors):
        ax.text(
            x_min - 0.05, y_pos,
            factor_name,
            va='center', ha='right',
            color=row_color,
            fontsize=12
        )

    # Reference line at OR = 1 (no effect)
    ax.axvline(x=1, color='gray', linestyle='--')

    # First row (smallest OR) goes at the top
    ax.invert_yaxis()

    ax.set_xlim([x_min, x_max])

    ax.set_xlabel(x_label, fontsize=14)
    ax.set_title(title, fontsize=16)


# ==========================
# Two-panel forest plot: patient factors (left), procedural factors (right)
# ==========================

# patient_df and procedural_df each need the columns
# ['OR', 'OR_lower', 'OR_upper'] and factor names as their index.

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 8))

# Same axis label and x range on both panels so they are directly comparable
for panel_ax, panel_df, panel_title in (
    (ax1, patient_df, 'Patient Factors'),
    (ax2, procedural_df, 'Procedural Factors'),
):
    plot_forest_colored_with_markers(
        ax=panel_ax,
        df=panel_df,
        title=panel_title,
        x_label='Odds Ratio (99.9% CI)',
        x_min=0.5,
        x_max=1.5
    )

# Proxy artists for a figure-level legend explaining the colour coding
protect_marker = mlines.Line2D(
    [], [], color='blue', marker='o', linestyle='None',
    label='Significant protective factor'
)
ns_marker = mlines.Line2D(
    [], [], color='black', marker='o', linestyle='None',
    label='Not significant'
)
risk_marker = mlines.Line2D(
    [], [], color='red', marker='o', linestyle='None',
    label='Significant risk factor'
)

fig.legend(
    handles=[protect_marker, ns_marker, risk_marker],
    loc='upper center',
    bbox_to_anchor=(0.5, 1.05),
    ncol=3,
    fontsize=12
)

plt.tight_layout()
plt.show()


## SKLearn Logistic Regression

In [None]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report

# Use the real cohort (not a copy; nothing below writes into `data` itself)
data = neuraxial_catheter_df

# Drop columns with more than 80% missing values
# (dropna(thresh=...) keeps columns with at least 20% non-null entries)
threshold = len(data) * 0.2
data_cleaned = data.dropna(thresh=threshold, axis=1)

# Drop rows where target variable is missing
data_cleaned = data_cleaned.dropna(subset=["failed_catheter"])

# Separate features and target variable
X = data_cleaned.drop(columns=["failed_catheter"], errors='ignore')
y = data_cleaned["failed_catheter"]


## Drop delivery_site
# X = X.drop(columns=["delivery_site"],errors='ignore')

##############################################################################
# 1. Extract the group labels and remove the ID columns from the features
##############################################################################
groups = X['unique_pt_id']  # Save group labels
# Neither identifier is used as a feature: drop the encounter ID and the patient ID
X = X.drop(columns=["anes_procedure_encounter_id_2273","unique_pt_id"])  

##############################################################################
# 2. Split using GroupShuffleSplit instead of train_test_split
#    (all rows of one patient land on the same side of the split)
##############################################################################
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Identify numeric and categorical columns
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'bool']).columns.tolist()

# Create preprocessing pipelines
# Numeric: median imputation, then standardization
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical: most-frequent imputation, then one-hot encoding; levels unseen
# during fit are ignored at transform time (handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Preprocess the data (fit on train only, then apply to test)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Train logistic regression with class weights
logistic_model = LogisticRegression(max_iter=1000, solver='liblinear', class_weight='balanced', n_jobs=1)
logistic_model.fit(X_train_preprocessed, y_train)

# Make predictions: hard labels and the predicted probability of class 1
y_pred = logistic_model.predict(X_test_preprocessed)
y_pred_prob = logistic_model.predict_proba(X_test_preprocessed)[:, 1]

# Evaluate the model on the held-out test split
evaluation_metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
    "roc_auc": roc_auc_score(y_test, y_pred_prob),
    "classification_report": classification_report(y_test, y_pred)
}

# Print evaluation metrics
print("Model Evaluation:")
for metric, value in evaluation_metrics.items():
    if metric == "classification_report":
        print("\nClassification Report:\n", value)
    else:
        print(f"{metric.capitalize()}: {value:.4f}")


In [None]:
# Feature names as emitted by the fitted ColumnTransformer
feature_names = preprocessor.get_feature_names_out()

# For a binary problem .coef_ has shape (1, n_features); take the single row
coefficients = logistic_model.coef_[0]

# (name, coefficient) pairs, in transformer output order
coef_pairs = list(zip(feature_names, coefficients))

# Listing 1: largest absolute coefficient first
print("Coefficients for each feature (sorted by absolute magnitude):")
by_magnitude = sorted(coef_pairs, key=lambda pair: abs(pair[1]), reverse=True)
for name, coef in by_magnitude:
    print(f"  {name}: {coef:.4f}")

# Listing 2: alphabetical by feature name
print('---------------------------------------------')
print("Coefficients for each feature (sorted alphabetically):")
alphabetical = sorted(coef_pairs)
for name, coef in alphabetical:
    print(f"  {name}: {coef:.4f}")

# Model intercept
print(f"Intercept: {logistic_model.intercept_[0]:.4f}")


# XGBoost

In [None]:
##############################################################################
# Imports
##############################################################################
from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report

# Import XGBoost
from xgboost import XGBClassifier

##############################################################################
# 0. Load and prepare the dataset
##############################################################################
data = neuraxial_catheter_df

# A column is kept only if at least 20% of its rows are non-missing
# (i.e. columns with more than 80% missing values are dropped); afterwards,
# rows without a recorded target are removed.
threshold = len(data) * 0.2
data_cleaned = (
    data
    .dropna(thresh=threshold, axis=1)
    .dropna(subset=["failed_catheter"])
)

# Target vector and feature frame
y = data_cleaned["failed_catheter"]
X = data_cleaned.drop(columns=["failed_catheter"], errors='ignore')

# # Drop delivery_site
# X = X.drop(columns=["delivery_site"],errors='ignore')

##############################################################################
# 1. Extract the group labels and remove the ID columns from the features
##############################################################################
groups = X['unique_pt_id']  # patient ID, used only to group the split
X = X.drop(columns=["anes_procedure_encounter_id_2273", "unique_pt_id"])

##############################################################################
# 2. Grouped train/test split (rows sharing a unique_pt_id stay on one side)
##############################################################################
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

##############################################################################
# 3. Identify numeric and categorical columns (by dtype of the training frame)
##############################################################################
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'bool']).columns.tolist()

##############################################################################
# 4. Preprocessing: median-impute + scale numerics,
#    mode-impute + one-hot encode categoricals
##############################################################################
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

##############################################################################
# 5. Fit the preprocessor on the training rows only, then apply to both splits
##############################################################################
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

##############################################################################
# 6. Train the XGBoost classifier
##############################################################################
# scale_pos_weight is set to (#negatives / #positives) in the training split.
n_positive = y_train.sum()
xgb_model = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=(y_train.shape[0] - n_positive) / n_positive,
)
xgb_model.fit(X_train_preprocessed, y_train)

##############################################################################
# 7. Predict labels and positive-class probabilities on the held-out split
##############################################################################
y_pred = xgb_model.predict(X_test_preprocessed)
y_pred_prob = xgb_model.predict_proba(X_test_preprocessed)[:, 1]

##############################################################################
# 8. Evaluate the model
##############################################################################
evaluation_metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
    "roc_auc": roc_auc_score(y_test, y_pred_prob),
    "classification_report": classification_report(y_test, y_pred),
}

##############################################################################
# 9. Print the evaluation metrics
##############################################################################
print("Model Evaluation:")
for metric, value in evaluation_metrics.items():
    if metric != "classification_report":
        # Scalar scores are printed to four decimals
        print(f"{metric.capitalize()}: {value:.4f}")
        continue
    # The report is a preformatted multi-line string
    print("\nClassification Report:\n", value)


# Propensity Scores

## Propensity Scoring for DPE

In [None]:
# Propensity-score analysis: DPE (treatment) vs. all other neuraxial catheters.
# Pipeline: impute -> one-hot encode -> scale -> logistic propensity model ->
# 1-nearest-neighbour matching on the propensity score -> logistic outcome
# model (failed_catheter ~ dpe) on the matched sample.
import pandas as pd
import numpy as np

# For logistic regression and nearest neighbor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# For imputation and scaling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# For statistical inference (CIs, p-values)
import statsmodels.api as sm

# ------------------------------------------------------------------------------
# 1. Copy the original dataframe and build the treatment indicator
# ------------------------------------------------------------------------------
# Columns for the treatment and outcome
treatment_col = 'dpe'
outcome_col   = 'failed_catheter'

# Identifier columns are not covariates. Left in place they would be fed to the
# propensity model (as one dummy column per distinct ID if stored as strings).
# The other modelling cells in this notebook drop them as well.
id_cols = ['anes_procedure_encounter_id_2273', 'unique_pt_id']

df = neuraxial_catheter_df.copy()
df[treatment_col] = (df['true_procedure_type_incl_dpe'] == 'dpe').astype(int)
df = df.drop(columns=['true_procedure_type_incl_dpe'])
df = df.drop(columns=id_cols, errors='ignore')

# Rows without a recorded outcome cannot be used by the outcome model
df = df.dropna(subset=[outcome_col])

# ------------------------------------------------------------------------------
# 2. Identify numeric vs. categorical columns (excluding treatment & outcome)
# ------------------------------------------------------------------------------
# The treatment and outcome columns are numeric but must not be imputed/scaled.
numeric_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in [treatment_col, outcome_col]
]
categorical_cols = [
    col for col in df.columns
    if col not in numeric_cols and col not in [treatment_col, outcome_col]
]

# ------------------------------------------------------------------------------
# 3. Impute missing data
#    - Median for numeric
#    - Most frequent for categorical
# ------------------------------------------------------------------------------
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit/transform numeric columns
# NOTE(review): assumes no numeric column is entirely missing; SimpleImputer
# would drop such a column and the column labels would no longer line up.
df_num = pd.DataFrame(
    num_imputer.fit_transform(df[numeric_cols]),
    columns=numeric_cols
)

# ------------------------------------------------------------------------------
# 4. Impute and one-hot encode (dummy) the categorical columns
# ------------------------------------------------------------------------------
if categorical_cols:
    df_cat = pd.DataFrame(
        cat_imputer.fit_transform(df[categorical_cols]),
        columns=categorical_cols
    )
    # dtype=float keeps the design matrix purely numeric when combined with
    # the float numeric columns below.
    df_cat_encoded = pd.get_dummies(df_cat, drop_first=True, dtype=float)
else:
    # No categorical covariates: contribute zero columns, same number of rows
    df_cat = pd.DataFrame(index=range(len(df)))
    df_cat_encoded = df_cat.copy()

# ------------------------------------------------------------------------------
# 5. Combine imputed numeric + encoded categorical with original treatment/outcome
# ------------------------------------------------------------------------------
# Indices are reset so the three frames align positionally.
df_imputed = pd.concat(
    [
        df[[treatment_col, outcome_col]].reset_index(drop=True),
        df_num.reset_index(drop=True),
        df_cat_encoded.reset_index(drop=True)
    ],
    axis=1
)

# ------------------------------------------------------------------------------
# 6. Standardize numeric features (optional but often recommended)
# ------------------------------------------------------------------------------
scaler = StandardScaler()
df_num_scaled = pd.DataFrame(
    scaler.fit_transform(df_imputed[numeric_cols]),
    columns=numeric_cols
)

# Now replace the unscaled numeric columns in df_imputed
for col in numeric_cols:
    df_imputed[col] = df_num_scaled[col]

# ------------------------------------------------------------------------------
# 7. Fit the propensity model (LogisticRegression) on all columns except
#    the treatment and outcome columns.
# ------------------------------------------------------------------------------
feature_cols = [c for c in df_imputed.columns if c not in [treatment_col, outcome_col]]

X = df_imputed[feature_cols].values  # all imputed & encoded features
y = df_imputed[treatment_col].values # the treatment indicator (dpe)

propensity_model = LogisticRegression(solver='lbfgs', max_iter=1000)
propensity_model.fit(X, y)

# Estimated probability of treatment (dpe = 1) for every row
propensity_scores = propensity_model.predict_proba(X)[:, 1]
df_imputed['propensity_score'] = propensity_scores

# ------------------------------------------------------------------------------
# 8. Separate treated vs. control and do nearest-neighbor matching
# ------------------------------------------------------------------------------
treated = df_imputed[df_imputed[treatment_col] == 1].copy()
control = df_imputed[df_imputed[treatment_col] == 0].copy()

treated_scores = treated[['propensity_score']].values
control_scores = control[['propensity_score']].values

nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
nn.fit(control_scores)

# For each treated row, the position (within `control`) of the closest control
distances, indices = nn.kneighbors(treated_scores)
distances = distances.flatten()
indices = indices.flatten()

# NOTE(review): matching is with replacement and without a caliper - the same
# control row can be selected for several treated rows. Confirm this is intended.
matched_treated = treated.copy()
matched_control = control.iloc[indices].copy()

# Combine matched sample
matched_data = pd.concat([matched_treated, matched_control], axis=0).reset_index(drop=True)

# ------------------------------------------------------------------------------
# 9. Fit an outcome model on the matched sample
#    We'll use statsmodels for confidence intervals and p-values.
# ------------------------------------------------------------------------------
matched_data['intercept'] = 1.0

# Outcome model uses only the treatment indicator (and an intercept)
X_outcome = matched_data[['intercept', treatment_col]]
y_outcome = matched_data[outcome_col]

logit_sm = sm.Logit(y_outcome, X_outcome)
result_sm = logit_sm.fit(disp=0)  # disp=0 hides optimization output

print(result_sm.summary())

# Extract OR & 95% CI by exponentiating the log-odds estimates
params = result_sm.params
conf = result_sm.conf_int()
odds_ratios = np.exp(params)
conf_odds = np.exp(conf)

print("\nOdds Ratios:\n", odds_ratios)
print("\n95% Confidence Intervals:\n", conf_odds)


## Propensity Scoring for CSE

In [None]:
# Propensity-score analysis: CSE (treatment) vs. all other neuraxial catheters.
# Pipeline: impute -> one-hot encode -> scale -> logistic propensity model ->
# 1-nearest-neighbour matching on the propensity score -> logistic outcome
# model (failed_catheter ~ cse) on the matched sample.
import pandas as pd
import numpy as np

# For logistic regression and nearest neighbor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# For imputation and scaling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# For statistical inference (CIs, p-values)
import statsmodels.api as sm

# ------------------------------------------------------------------------------
# 1. Copy the original dataframe and build the treatment indicator
# ------------------------------------------------------------------------------
# Columns for the treatment and outcome
treatment_col = 'cse'
outcome_col   = 'failed_catheter'

# Identifier columns are not covariates. Left in place they would be fed to the
# propensity model (as one dummy column per distinct ID if stored as strings).
# The other modelling cells in this notebook drop them as well.
id_cols = ['anes_procedure_encounter_id_2273', 'unique_pt_id']

df = neuraxial_catheter_df.copy()
df[treatment_col] = (df['true_procedure_type_incl_dpe'] == 'cse').astype(int)
df = df.drop(columns=['true_procedure_type_incl_dpe'])
df = df.drop(columns=id_cols, errors='ignore')

# Rows without a recorded outcome cannot be used by the outcome model
df = df.dropna(subset=[outcome_col])

# ------------------------------------------------------------------------------
# 2. Identify numeric vs. categorical columns (excluding treatment & outcome)
# ------------------------------------------------------------------------------
# The treatment and outcome columns are numeric but must not be imputed/scaled.
numeric_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in [treatment_col, outcome_col]
]
categorical_cols = [
    col for col in df.columns
    if col not in numeric_cols and col not in [treatment_col, outcome_col]
]

# ------------------------------------------------------------------------------
# 3. Impute missing data
#    - Median for numeric
#    - Most frequent for categorical
# ------------------------------------------------------------------------------
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit/transform numeric columns
# NOTE(review): assumes no numeric column is entirely missing; SimpleImputer
# would drop such a column and the column labels would no longer line up.
df_num = pd.DataFrame(
    num_imputer.fit_transform(df[numeric_cols]),
    columns=numeric_cols
)

# ------------------------------------------------------------------------------
# 4. Impute and one-hot encode (dummy) the categorical columns
# ------------------------------------------------------------------------------
if categorical_cols:
    df_cat = pd.DataFrame(
        cat_imputer.fit_transform(df[categorical_cols]),
        columns=categorical_cols
    )
    # dtype=float keeps the design matrix purely numeric when combined with
    # the float numeric columns below.
    df_cat_encoded = pd.get_dummies(df_cat, drop_first=True, dtype=float)
else:
    # No categorical covariates: contribute zero columns, same number of rows
    df_cat = pd.DataFrame(index=range(len(df)))
    df_cat_encoded = df_cat.copy()

# ------------------------------------------------------------------------------
# 5. Combine imputed numeric + encoded categorical with original treatment/outcome
# ------------------------------------------------------------------------------
# Indices are reset so the three frames align positionally.
df_imputed = pd.concat(
    [
        df[[treatment_col, outcome_col]].reset_index(drop=True),
        df_num.reset_index(drop=True),
        df_cat_encoded.reset_index(drop=True)
    ],
    axis=1
)

# ------------------------------------------------------------------------------
# 6. Standardize numeric features (optional but often recommended)
# ------------------------------------------------------------------------------
scaler = StandardScaler()
df_num_scaled = pd.DataFrame(
    scaler.fit_transform(df_imputed[numeric_cols]),
    columns=numeric_cols
)

# Now replace the unscaled numeric columns in df_imputed
for col in numeric_cols:
    df_imputed[col] = df_num_scaled[col]

# ------------------------------------------------------------------------------
# 7. Fit the propensity model (LogisticRegression) on all columns except
#    the treatment and outcome columns.
# ------------------------------------------------------------------------------
feature_cols = [c for c in df_imputed.columns if c not in [treatment_col, outcome_col]]

X = df_imputed[feature_cols].values  # all imputed & encoded features
y = df_imputed[treatment_col].values # the treatment indicator (cse)

propensity_model = LogisticRegression(solver='lbfgs', max_iter=1000)
propensity_model.fit(X, y)

# Estimated probability of treatment (cse = 1) for every row
propensity_scores = propensity_model.predict_proba(X)[:, 1]
df_imputed['propensity_score'] = propensity_scores

# ------------------------------------------------------------------------------
# 8. Separate treated vs. control and do nearest-neighbor matching
# ------------------------------------------------------------------------------
treated = df_imputed[df_imputed[treatment_col] == 1].copy()
control = df_imputed[df_imputed[treatment_col] == 0].copy()

treated_scores = treated[['propensity_score']].values
control_scores = control[['propensity_score']].values

nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
nn.fit(control_scores)

# For each treated row, the position (within `control`) of the closest control
distances, indices = nn.kneighbors(treated_scores)
distances = distances.flatten()
indices = indices.flatten()

# NOTE(review): matching is with replacement and without a caliper - the same
# control row can be selected for several treated rows. Confirm this is intended.
matched_treated = treated.copy()
matched_control = control.iloc[indices].copy()

# Combine matched sample
matched_data = pd.concat([matched_treated, matched_control], axis=0).reset_index(drop=True)

# ------------------------------------------------------------------------------
# 9. Fit an outcome model on the matched sample
#    We'll use statsmodels for confidence intervals and p-values.
# ------------------------------------------------------------------------------
matched_data['intercept'] = 1.0

# Outcome model uses only the treatment indicator (and an intercept)
X_outcome = matched_data[['intercept', treatment_col]]
y_outcome = matched_data[outcome_col]

logit_sm = sm.Logit(y_outcome, X_outcome)
result_sm = logit_sm.fit(disp=0)  # disp=0 hides optimization output

print(result_sm.summary())

# Extract OR & 95% CI by exponentiating the log-odds estimates
params = result_sm.params
conf = result_sm.conf_int()
odds_ratios = np.exp(params)
conf_odds = np.exp(conf)

print("\nOdds Ratios:\n", odds_ratios)
print("\n95% Confidence Intervals:\n", conf_odds)

# Remove the working frame from the notebook namespace
del df

## Propensity Scoring for DPE vs CSE

In [None]:
# Propensity-score analysis restricted to CSE and DPE procedures:
# CSE is the treatment (cse_not_dpe = 1), DPE is the control.
# Pipeline: impute -> one-hot encode -> scale -> logistic propensity model ->
# 1-nearest-neighbour matching on the propensity score -> logistic outcome
# model (failed_catheter ~ cse_not_dpe) on the matched sample.
import pandas as pd
import numpy as np

# For logistic regression and nearest neighbor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# For imputation and scaling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# For statistical inference (CIs, p-values)
import statsmodels.api as sm

# ------------------------------------------------------------------------------
# 1. Copy the original dataframe, keep CSE/DPE rows, build the treatment indicator
# ------------------------------------------------------------------------------
# Columns for the treatment and outcome
treatment_col = 'cse_not_dpe'
outcome_col   = 'failed_catheter'

# Identifier columns are not covariates. Left in place they would be fed to the
# propensity model (as one dummy column per distinct ID if stored as strings).
# The other modelling cells in this notebook drop them as well.
id_cols = ['anes_procedure_encounter_id_2273', 'unique_pt_id']

df = neuraxial_catheter_df.copy()
# .copy() after filtering so the column assignment below writes to an
# independent frame rather than a slice (avoids SettingWithCopyWarning).
df = df[df['true_procedure_type_incl_dpe'].isin(['cse', 'dpe'])].copy()
df[treatment_col] = (df['true_procedure_type_incl_dpe'] == 'cse').astype(int)
df = df.drop(columns=['true_procedure_type_incl_dpe'])
df = df.drop(columns=id_cols, errors='ignore')

# Rows without a recorded outcome cannot be used by the outcome model
df = df.dropna(subset=[outcome_col])

# ------------------------------------------------------------------------------
# 2. Identify numeric vs. categorical columns (excluding treatment & outcome)
# ------------------------------------------------------------------------------
# The treatment and outcome columns are numeric but must not be imputed/scaled.
numeric_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in [treatment_col, outcome_col]
]
categorical_cols = [
    col for col in df.columns
    if col not in numeric_cols and col not in [treatment_col, outcome_col]
]

# ------------------------------------------------------------------------------
# 3. Impute missing data
#    - Median for numeric
#    - Most frequent for categorical
# ------------------------------------------------------------------------------
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit/transform numeric columns
# NOTE(review): assumes no numeric column is entirely missing within the
# CSE/DPE subset; SimpleImputer would drop such a column and the column
# labels would no longer line up.
df_num = pd.DataFrame(
    num_imputer.fit_transform(df[numeric_cols]),
    columns=numeric_cols
)

# ------------------------------------------------------------------------------
# 4. Impute and one-hot encode (dummy) the categorical columns
# ------------------------------------------------------------------------------
if categorical_cols:
    df_cat = pd.DataFrame(
        cat_imputer.fit_transform(df[categorical_cols]),
        columns=categorical_cols
    )
    # dtype=float keeps the design matrix purely numeric when combined with
    # the float numeric columns below.
    df_cat_encoded = pd.get_dummies(df_cat, drop_first=True, dtype=float)
else:
    # No categorical covariates: contribute zero columns, same number of rows
    df_cat = pd.DataFrame(index=range(len(df)))
    df_cat_encoded = df_cat.copy()

# ------------------------------------------------------------------------------
# 5. Combine imputed numeric + encoded categorical with original treatment/outcome
# ------------------------------------------------------------------------------
# Indices are reset so the three frames align positionally.
df_imputed = pd.concat(
    [
        df[[treatment_col, outcome_col]].reset_index(drop=True),
        df_num.reset_index(drop=True),
        df_cat_encoded.reset_index(drop=True)
    ],
    axis=1
)

# ------------------------------------------------------------------------------
# 6. Standardize numeric features (optional but often recommended)
# ------------------------------------------------------------------------------
scaler = StandardScaler()
df_num_scaled = pd.DataFrame(
    scaler.fit_transform(df_imputed[numeric_cols]),
    columns=numeric_cols
)

# Now replace the unscaled numeric columns in df_imputed
for col in numeric_cols:
    df_imputed[col] = df_num_scaled[col]

# ------------------------------------------------------------------------------
# 7. Fit the propensity model (LogisticRegression) on all columns except
#    the treatment and outcome columns.
# ------------------------------------------------------------------------------
feature_cols = [c for c in df_imputed.columns if c not in [treatment_col, outcome_col]]

X = df_imputed[feature_cols].values  # all imputed & encoded features
y = df_imputed[treatment_col].values # the treatment indicator (cse_not_dpe)

propensity_model = LogisticRegression(solver='lbfgs', max_iter=1000)
propensity_model.fit(X, y)

# Estimated probability of treatment (cse_not_dpe = 1) for every row
propensity_scores = propensity_model.predict_proba(X)[:, 1]
df_imputed['propensity_score'] = propensity_scores

# ------------------------------------------------------------------------------
# 8. Separate treated vs. control and do nearest-neighbor matching
# ------------------------------------------------------------------------------
treated = df_imputed[df_imputed[treatment_col] == 1].copy()
control = df_imputed[df_imputed[treatment_col] == 0].copy()

treated_scores = treated[['propensity_score']].values
control_scores = control[['propensity_score']].values

nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
nn.fit(control_scores)

# For each treated row, the position (within `control`) of the closest control
distances, indices = nn.kneighbors(treated_scores)
distances = distances.flatten()
indices = indices.flatten()

# NOTE(review): matching is with replacement and without a caliper - the same
# control row can be selected for several treated rows. Confirm this is intended.
matched_treated = treated.copy()
matched_control = control.iloc[indices].copy()

# Combine matched sample
matched_data = pd.concat([matched_treated, matched_control], axis=0).reset_index(drop=True)

# ------------------------------------------------------------------------------
# 9. Fit an outcome model on the matched sample
#    We'll use statsmodels for confidence intervals and p-values.
# ------------------------------------------------------------------------------
matched_data['intercept'] = 1.0

# Outcome model uses only the treatment indicator (and an intercept)
X_outcome = matched_data[['intercept', treatment_col]]
y_outcome = matched_data[outcome_col]

logit_sm = sm.Logit(y_outcome, X_outcome)
result_sm = logit_sm.fit(disp=0)  # disp=0 hides optimization output

print(result_sm.summary())

# Extract OR & 95% CI by exponentiating the log-odds estimates
params = result_sm.params
conf = result_sm.conf_int()
odds_ratios = np.exp(params)
conf_odds = np.exp(conf)

print("\nOdds Ratios:\n", odds_ratios)
print("\n95% Confidence Intervals:\n", conf_odds)

# Remove the working frame from the notebook namespace
del df

# Next Steps

XGBoost - hyperparameter tuning with Optuna

Shapley values for interpretability

Eliminate features that are both poorly predictive and have lots of missing data

Abstract repeated code (e.g., the three near-identical propensity-score cells) into functions and move them into separate files

Fewer features will improve interpretability