In [5]:
import numpy as np
import pandas as pd

# Load Data

In [6]:
df = pd.read_csv('minimal_merlin_data.csv') 

  df = pd.read_csv('minimal_merlin_data.csv')


In [21]:
df = df.replace({True: 1, False: 0})

  df = df.replace({True: 1, False: 0})


In [22]:
# Filter the DataFrame to include only neuraxial catheter (ie, epidural + CSE + intrathecal) or epidural-only catheter procedures
neuraxial_catheter_df = df[df['is_neuraxial_catheter'] == 1]
epidural_df = df[(df['true_procedure_type'] == 'epidural')]

# Describe Dataframe

There are 158364 total rows, of which 22218 have NaN true_procedure_type.

Every row receives a value for all Boolean variables: thus if no value is present, they become False. Furthermore, NaN procedures become False is_neuraxial_catheter and failed_catheter.

is_neuraxial_catheter includes epidurals + CSEs + intrathecals

failed_catheter is applied to BOTH neuraxial_catheters (which may be coded True or False for failure) and also to all procedures that are not neuraxial_catheters (will always be coded False).

In [23]:
df.shape

(158364, 35)

In [24]:
def describe_dataframe(df):
    """
    For each column in df:
      - If dtype is object or int64 or bool, list each unique value and its counts.
      - If dtype is float64, display min, Q1, median, Q3, and max.
      - Otherwise, handle accordingly (datetime, etc.).
    """
    for col in df.columns:
        col_type = df[col].dtype

        print(f"Column: {col}")
        print(f"  Data Type: {col_type}")

        if col_type == 'object' or col_type == 'int64' or col_type == 'bool':
            # Show unique values and their counts
            value_counts = df[col].value_counts(dropna=False)
            print("  Value counts:")
            for val, count in value_counts.items():
                print(f"    {val}: {count}")

        elif col_type == 'float64':
            # Show min, Q1 (25%), median (50%), Q3 (75%), and max
            desc = df[col].describe(percentiles=[0.25, 0.5, 0.75])
            na_count = df[col].isna().sum()
            print("  Summary stats:")
            print(f"    NaN:    {na_count}")
            print(f"    Min:    {desc['min']}")
            print(f"    Q1:     {desc['25%']}")
            print(f"    Median: {desc['50%']}")
            print(f"    Q3:     {desc['75%']}")
            print(f"    Max:    {desc['max']}")

        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            # Example handling for datetime columns
            print("  (Datetime column – no numeric summary or value counts shown.)")

        else:
            # Handle any other data types as needed
            print("  (No specific handling implemented for this data type.)")

        print("-" * 50)

describe_dataframe(df)


Column: gestational_age_2052
  Data Type: float64
  Summary stats:
    NaN:    2339
    Min:    87.0
    Q1:     267.0
    Median: 275.0
    Q3:     281.0
    Max:    308.0
--------------------------------------------------
Column: delivery_site_2188
  Data Type: object
  Value counts:
    bwh: 64334
    mgh: 35716
    nwh: 35053
    slm: 9973
    wdh: 7380
    cdh: 3901
    nch: 1002
    mvh: 982
    mgb: 14
    nsc: 9
--------------------------------------------------
Column: baby_weight_2196
  Data Type: float64
  Summary stats:
    NaN:    3251
    Min:    0.0
    Q1:     2.9766996
    Median: 3.3299346192
    Q3:     3.6551036136
    Max:    7.1698771032
--------------------------------------------------
Column: rom_thru_delivery_hours
  Data Type: float64
  Summary stats:
    NaN:    20054
    Min:    0.0
    Q1:     1.45
    Median: 5.45
    Q3:     12.066666666666666
    Max:    711.8166666666667
--------------------------------------------------
Column: fetal_presentation_cate

In [25]:
def describe_as_tables(df):
    # Separate columns by dtype
    categorical_cols = []
    numeric_cols = []

    for col in df.columns:
        if df[col].dtype == 'object' or df[col].dtype == 'int64' or df[col].dtype == 'bool':
            categorical_cols.append(col)
        elif df[col].dtype == 'float64':
            numeric_cols.append(col)
        else:
            # skip or handle datetime, etc. if desired
            pass

    # --- Build table for categorical variables ---
    cat_data = {}
    for col in categorical_cols:
        # Get value counts (including NaN as a separate category)
        vc = df[col].value_counts(dropna=False)
        # Convert value counts to a dict, or a formatted string
        vc_str = ", ".join(f"{val}: {count}" for val, count in vc.items())
        cat_data[col] = {
            'value_counts': vc_str
        }
    cat_df = pd.DataFrame(cat_data).T  # Transpose so rows = columns, col = 'value_counts'

    # --- Build table for numeric variables ---
    num_data = {}
    for col in numeric_cols:
        desc = df[col].describe(percentiles=[0.25, 0.5, 0.75])
        na_count = df[col].isna().sum()
        num_data[col] = {
            'count': desc['count'],
            'count_nan': na_count,
            'min': desc['min'],
            'Q1': desc['25%'],
            'median': desc['50%'],
            'Q3': desc['75%'],
            'max': desc['max']
        }
    num_df = pd.DataFrame(num_data).T  # Transpose so rows = columns

    return cat_df, num_df

cat_table, num_table = describe_as_tables(df)


In [26]:
cat_table

Unnamed: 0,value_counts
delivery_site_2188,"bwh: 64334, mgh: 35716, nwh: 35053, slm: 9973,..."
fetal_presentation_category_2243,"cephalic: 134205, nan: 12656, breech: 7827, co..."
fetal_presentation_subcategory_2244,"vertex: 134037, nan: 20328, compound: 2990, tr..."
fetal_presentation_position_2247,"nan: 80824, anterior: 67514, posterior: 7255, ..."
true_procedure_type,"epidural: 93221, spinal: 25186, nan: 22402, cs..."
is_neuraxial_catheter,"1: 107523, 0: 50841"
failed_catheter,"0: 152023, 1: 6341"
dpe,"0: 142193, 1: 16171"
highly_experienced_anesthesiologist,"no: 63149, yes: 47811, none: 47404"
highly_experienced_resident,"none: 81897, no: 47821, yes: 28646"


In [27]:
num_table

Unnamed: 0,count,count_nan,min,Q1,median,Q3,max
gestational_age_2052,156025.0,2339.0,87.0,267.0,275.0,281.0,308.0
baby_weight_2196,155113.0,3251.0,0.0,2.9767,3.329935,3.655104,7.169877
rom_thru_delivery_hours,138310.0,20054.0,0.0,1.45,5.45,12.066667,711.816667
bmi_end_pregnancy_2044,150134.0,8230.0,5.8,27.0,30.1,34.2,69.9
maternal_weight_end_pregnancy_2045,152039.0,6325.0,0.089018,70.76041,79.786898,91.126707,218.997798
bmi_before_pregnancy_2161,109588.0,48776.0,6.66,22.1,25.0,29.3,67.7
gravidity_2047,107712.0,50652.0,0.0,1.0,2.0,3.0,18.0
parity_2048,94664.0,63700.0,0.0,0.0,1.0,1.0,12.0
lor_depth,105277.0,53087.0,0.0,4.8,5.0,6.0,18.0
current_resident_catheter_count,76467.0,81897.0,0.0,15.0,36.0,67.0,332.0


# Data Visualization

## Procedure Types

In [None]:
# prompt: make a histogram of procedure note types using different colors

# Assuming 'procedure_type' column exists in your DataFrame 'df'
procedure_type_counts = df['true_procedure_type'].value_counts()

plt.figure(figsize=(6, 6))
plt.bar(procedure_type_counts.index, procedure_type_counts.values, color=['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange'])
plt.xlabel('Procedure Type')
plt.ylabel('Count')
plt.title('Histogram of Procedure Note Types')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Histogram of successes/failures

# Group by procedure type and whether it has subsequent anesthesia
procedure_counts = pd.crosstab(neuraxial_catheter_df['true_procedure_type'], neuraxial_catheter_df['failed_catheter'])

# Sort the bars in descending order based on the total count of each procedure type
procedure_counts = procedure_counts.sort_values(by=False, ascending=False)

# Create a stacked bar chart
ax = procedure_counts.plot(kind='bar', stacked=True, figsize=(6
, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
  width = p.get_width()
  height = p.get_height()
  x, y = p.get_xy()
  ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Procedure Type')
plt.ylabel('Count')
plt.title('Histogram of Successful/Failed')
plt.xticks(rotation=45, ha='right')
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()


In [None]:
# Display the table with the same information
print("Table of Neuraxial Catheter Procedures by Success/Failure:")
print(procedure_counts)


## Anesthesiologist Experience

In [None]:
# prompt: Create a similar histogram for failure rate vs highly experienced anesthesiologist

# Group by 'highly_experienced_anesthesiologist' and 'failed_catheter'
experience_failure_counts = pd.crosstab(neuraxial_catheter_df['highly_experienced_anesthesiologist'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = experience_failure_counts.plot(kind='bar', stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Anesthesiologist Experience')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Anesthesiologist Experience')
plt.xticks(rotation=0, ha='center', ticks=[0,1,2], labels=['No Anesthesiologist','Not Highly Experienced', 'Highly Experienced'])
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Anesthesiologist Experience:")
experience_failure_counts

In [None]:
# prompt: create a similar histogram for failure rate vs moderately experienced anesthesiologist

# Group by 'moderately_experienced_anesthesiologist' and 'failed_catheter'
experience_failure_counts = pd.crosstab(neuraxial_catheter_df['moderately_experienced_anesthesiologist'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = experience_failure_counts.plot(kind='bar', stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Anesthesiologist Experience')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Moderately Experienced Anesthesiologist')
plt.xticks(rotation=0, ha='center', ticks=[0,1,2], labels=['No Anesthesiologist','Not Moderately Experienced', 'Moderately Experienced'])
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Moderately Experienced Anesthesiologist:")
experience_failure_counts

In [None]:
# prompt: Create a similar histogram for failure rate vs highly experienced resident

# Group by 'highly_experienced_resident' and 'failed_catheter'
experience_failure_counts = pd.crosstab(neuraxial_catheter_df['highly_experienced_resident'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = experience_failure_counts.plot(kind='bar', stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Resident Experience')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Resident Experience')
plt.xticks(rotation=0, ha='center', ticks=[0,1,2], labels=['No Resident','Not Highly Experienced', 'Highly Experienced'])
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Resident Experience:")
experience_failure_counts

In [None]:
# prompt: Create a similar histogram but look at all combinations of resident and anesthesiologist experience. Make the x-axis labels vertical.

# Group by 'highly_experienced_anesthesiologist', 'highly_experienced_resident', and 'failed_catheter'
experience_failure_counts = pd.crosstab([neuraxial_catheter_df['highly_experienced_anesthesiologist'], neuraxial_catheter_df['highly_experienced_resident']], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = experience_failure_counts.plot(kind='bar', stacked=True, figsize=(8, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Anesthesiologist and Resident Experience')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Anesthesiologist and Resident Experience')


# Customize x-axis labels
import itertools
anesth_levels = ["Anes=None", "Anes=Not Exp", "Anes=Exp"]
resident_levels = ["Res=None", "Res=Not Exp", "Res=Exp"]
labels = list(itertools.product(anesth_levels, resident_levels))
plt.xticks(rotation=90, ha='center', ticks=range(len(labels)), labels=labels)

plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Anesthesiologist and Resident Experience:")
experience_failure_counts

In [None]:
# prompt: crosstab resident experience by BMI and make violin plots

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'df' is your DataFrame (as defined in the provided code)
# and it contains columns 'bmi_end_pregnancy_2044' and 'resident_experience' (or a similar column)

# Create the cross-tabulation
crosstab_data = pd.crosstab(neuraxial_catheter_df['bmi_end_pregnancy_2044'], neuraxial_catheter_df['highly_experienced_resident'])

# Display the cross-tabulation
print("Crosstab of Resident Experience by BMI:")
print(crosstab_data)

# Create violin plots
plt.figure(figsize=(10, 6))
sns.violinplot(x='highly_experienced_resident', y='bmi_end_pregnancy_2044', data=df)
plt.xlabel('Resident Experience')  # Customize the x-axis label
plt.ylabel('BMI') # Customize the y-axis label
plt.title('Violin Plot of BMI by Resident Experience')
plt.show()

## Delivery Site

In [None]:
# prompt: create a similar histogram of delivery_site_2188 using crosstab

# Create a crosstab for 'delivery_site_2188' and visualize it as a histogram
delivery_site_counts = pd.crosstab(neuraxial_catheter_df['delivery_site_2188'], neuraxial_catheter_df['failed_catheter'])

# Sort the bars in descending order based on the total count of each delivery site
delivery_site_counts = delivery_site_counts.sort_values(by=False, ascending=False)

# Create a stacked bar chart
ax = delivery_site_counts.plot(kind='bar', stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Delivery Site')
plt.ylabel('Count')
plt.title('Histogram of Delivery Site by Success/Failure')
plt.xticks(rotation=45, ha='right')
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Delivery Site by Success/Failure:")
delivery_site_counts

## DPE

In [None]:
# prompt: # prompt: create a pie chart of the fraction of DPE in epidural_df

# Count DPE values, treating NaN and '' as "no"
dpe_counts = epidural_df['dpe'].value_counts()

# Create the pie chart
plt.figure(figsize=(8, 8))
plt.pie(dpe_counts, labels=dpe_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Fraction of DPE in Epidural Procedures')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()

In [None]:
# prompt: reproduce the above histogram using crosstab on delivery_site_2188 and dpe

# Assuming 'df' is your DataFrame (as defined in the provided code)

# Create a crosstab for 'delivery_site_2188' and 'dpe' and visualize it as a histogram
delivery_site_dpe_counts = pd.crosstab(epidural_df['delivery_site_2188'], epidural_df['dpe'])

# Sort the bars in descending order based on the total count of each delivery site
delivery_site_dpe_counts = delivery_site_dpe_counts.sort_values(by=True, ascending=False) # Sort by 'no'

# Create a stacked bar chart
ax = delivery_site_dpe_counts.plot(kind='bar', stacked=True, figsize=(10, 6))

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Delivery Site')
plt.ylabel('Count')
plt.title('Histogram of Delivery Site by DPE')
plt.xticks(rotation=45, ha='right')
plt.legend(['DPE: no', 'DPE: yes']) # Update legend labels
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Delivery Site by DPE:")
delivery_site_dpe_counts

In [None]:
# Histogram of successes/failures by DPE status

# Group by procedure type and whether it has subsequent anesthesia
dpe_crosstab = pd.crosstab(epidural_df['dpe'], epidural_df['failed_catheter'])

# Create a stacked bar chart
ax = dpe_crosstab.plot(kind='bar', stacked=True, figsize=(6
, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
  width = p.get_width()
  height = p.get_height()
  x, y = p.get_xy()
  ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('DPE Status')
plt.ylabel('Count')
plt.title('Histogram of Successful/Failed')
plt.xticks(rotation=45, ha='right')
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()


In [None]:
# prompt: do a crosstab histogram of failure versus delivery_site_2188 and dpe

# Assuming 'df' is your DataFrame (as defined in the provided code)

# Create a crosstab for 'delivery_site_2188', 'dpe', and 'failed_catheter'
crosstab_df = pd.crosstab([df['delivery_site_2188'], df['dpe']], df['failed_catheter'])

# Create a stacked bar chart
ax = crosstab_df.plot(kind='bar', stacked=True, figsize=(12, 6))

# Annotate the bars with percentages
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')


plt.xlabel('Delivery Site and DPE')
plt.ylabel('Count')
plt.title('Crosstab Histogram: Failure vs. Delivery Site and DPE')
plt.xticks(rotation=45, ha='right')
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the crosstab table
print("Crosstab Table:")
crosstab_df

## Scoliosis and back problems

In [None]:
# prompt: create a histogram of the crosstab of has_scoliosis vs failure_rate

# Assuming 'neuraxial_catheter_df' is your DataFrame (as defined in the provided code)

# Group by 'has_scoliosis' and 'failed_catheter'
scoliosis_failure_counts = pd.crosstab(neuraxial_catheter_df['has_scoliosis'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = scoliosis_failure_counts.plot(kind='bar', stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Has Scoliosis')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Scoliosis')
plt.xticks(rotation=0, ha='center', ticks=[0, 1], labels=['No Scoliosis', 'Scoliosis'])
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Scoliosis:")
scoliosis_failure_counts

In [None]:
# prompt: do the same but for has_back_problems

# Group by 'has_back_problems' and 'failed_catheter'
back_problems_failure_counts = pd.crosstab(neuraxial_catheter_df['has_back_problems'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = back_problems_failure_counts.plot(kind='bar', stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Has Back Problems')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Back Problems')
plt.xticks(rotation=0, ha='center', ticks=[0, 1], labels=['No Back Problems', 'Back Problems'])
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Back Problems:")
back_problems_failure_counts

In [None]:
# prompt: do the same but for has_dorsalgia

# Group by 'has_dorsalgia' and 'failed_catheter'
back_pain_failure_counts = pd.crosstab(neuraxial_catheter_df['has_dorsalgia'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = back_pain_failure_counts.plot(kind='bar', stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Has Back Pain')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Back Pain')
plt.xticks(rotation=0, ha='center', ticks=[0, 1], labels=['No Back Pain', 'Back Pain'])
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Back Pain:")
back_pain_failure_counts

## Fetal Presentation

In [None]:
# prompt: do the same histogram, but for fetal_presentation_category vs failure

# Group by 'fetal_presentation_category_2243' and 'failed_catheter'
fetal_presentation_failure_counts = pd.crosstab(neuraxial_catheter_df['fetal_presentation_category_2243'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = fetal_presentation_failure_counts.plot(kind='bar', stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Fetal Presentation Category')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Fetal Presentation Category')
plt.xticks(rotation=45, ha='right')
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Fetal Presentation Category:")
fetal_presentation_failure_counts

In [None]:
# prompt: do the same histogram, but for fetal_presentation_position vs failure

# Assuming 'neuraxial_catheter_df' is your DataFrame (as defined in the provided code)

# Group by 'fetal_presentation_position_2247' and 'failed_catheter'
fetal_position_failure_counts = pd.crosstab(neuraxial_catheter_df['fetal_presentation_position_2247'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = fetal_position_failure_counts.plot(kind='bar', stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Fetal Presentation Position')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Fetal Presentation Position')
plt.xticks(rotation=45, ha='right') # Rotate x-axis labels for better readability
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Fetal Presentation Position:")
fetal_position_failure_counts

## Race and SES

In [None]:
# prompt: do the same histogram, but for maternal_race vs failure

# Group by 'maternal_race' and 'failed_catheter'
race_failure_counts = pd.crosstab(neuraxial_catheter_df['maternal_race'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = race_failure_counts.plot(kind='bar', stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Maternal Race')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Maternal Race')
plt.xticks(rotation=45, ha='right') # Rotate x-axis labels for better readability
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Maternal Race:")
race_failure_counts

In [None]:
# prompt: do the same histogram, but for each of these: 32. composite_psychosocial_problems ||| int64
# 33. any_public_insurance ||| int64
# 34. maternal_language_english ||| int64
# 35. marital_status_married_or_partner ||| int64
# 36. country_of_origin_USA ||| int64
# 37. employment_status_fulltime ||| int64

# Assuming 'neuraxial_catheter_df' is your DataFrame

columns_to_analyze = [
    'composite_psychosocial_problems',
    'any_public_insurance',
    'maternal_language_english',
    'marital_status_married_or_partner',
    'country_of_origin_USA',
    'employment_status_fulltime'
]

for column in columns_to_analyze:
  # Group by the current column and 'failed_catheter'
  failure_counts = pd.crosstab(neuraxial_catheter_df[column], neuraxial_catheter_df['failed_catheter'])

  # Create a stacked bar chart
  ax = failure_counts.plot(kind='bar', stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

  # Add percentages within each bar
  for p in ax.patches:
      width = p.get_width()
      height = p.get_height()
      x, y = p.get_xy()
      ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

  plt.xlabel(column)
  plt.ylabel('Count')
  plt.title(f'Histogram of Failure Rate vs. {column}')

  # Customize x-axis ticks and labels (adjust as needed for each column)
  plt.xticks(rotation=0, ha='center')

  plt.legend(['Successful', 'Failed'])
  plt.tight_layout()
  plt.show()

  # Display the table with the same information
  print(f"Table of Failure Rate vs. {column}:")
failure_counts

## Pain

In [None]:
# prompt: do the same histogram but for prior_pain_scores_max

# Assuming 'neuraxial_catheter_df' is your DataFrame

# Group by 'prior_pain_scores_max' and 'failed_catheter'
prior_pain_failure_counts = pd.crosstab(neuraxial_catheter_df['prior_pain_scores_max'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = prior_pain_failure_counts.plot(kind='bar', stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Prior Pain Scores Max')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Prior Pain Scores Max')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels if needed
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Prior Pain Scores Max:")
prior_pain_failure_counts

## Gravidity and Parity

In [None]:
# prompt: do the same histogram but for gravidity_2047 and parity_2048

# Assuming 'neuraxial_catheter_df' is your DataFrame

# Group by 'gravidity_2047' and 'failed_catheter'
gravidity_failure_counts = pd.crosstab(neuraxial_catheter_df['gravidity_2047'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = gravidity_failure_counts.plot(kind='bar', stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Gravidity')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Gravidity')
plt.xticks(rotation=0)  # Adjust rotation if needed
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table
print("Table of Failure Rate vs. Gravidity:")
print(gravidity_failure_counts)


# Group by 'parity_2048' and 'failed_catheter'
parity_failure_counts = pd.crosstab(neuraxial_catheter_df['parity_2048'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = parity_failure_counts.plot(kind='bar', stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Parity')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Parity')
plt.xticks(rotation=0)  # Adjust rotation if needed
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table
print("Table of Failure Rate vs. Parity:")
parity_failure_counts

## Needle Type

In [None]:
# prompt: do the same histogram but for epidural_needle_type

# Assuming 'epidural_df' is your DataFrame (as defined in the provided code)

# Group by 'epidural_needle_type' and 'failed_catheter'
needle_type_failure_counts = pd.crosstab(epidural_df['epidural_needle_type'], epidural_df['failed_catheter'])

# Create a stacked bar chart
ax = needle_type_failure_counts.plot(kind='bar', stacked=True, figsize=(10, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Epidural Needle Type')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Epidural Needle Type')
plt.xticks(rotation=45, ha='right') # Rotate x-axis labels for better readability
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Epidural Needle Type:")
needle_type_failure_counts

## Paresthesias

In [None]:
# prompt: do the same histogram but for paresthesias_present

# Group by 'paresthesias_present' and 'failed_catheter'
paresthesias_failure_counts = pd.crosstab(neuraxial_catheter_df['paresthesias_present'], neuraxial_catheter_df['failed_catheter'])

# Create a stacked bar chart
ax = paresthesias_failure_counts.plot(kind='bar', stacked=True, figsize=(6, 6), color=['skyblue', 'lightcoral'])

# Add percentages within each bar
for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height/sum([p.get_height() for p in ax.patches if p.get_x() == x]) * 100:.1f}%', (x + width/2, y + height/2), ha='center', va='center')

plt.xlabel('Paresthesias Present')
plt.ylabel('Count')
plt.title('Histogram of Failure Rate vs. Paresthesias Present')
plt.xticks(rotation=0, ha='center', ticks=[0, 1], labels=['No Paresthesias', 'Paresthesias'])
plt.legend(['Successful', 'Failed'])
plt.tight_layout()
plt.show()

# Display the table with the same information
print("Table of Failure Rate vs. Paresthesias Present:")
paresthesias_failure_counts

## Number of Attempts

In [None]:
# prompt: create a histogram of the number of attempts. Only show integers on the x-axis

# Assuming 'number_of_neuraxial_attempts' is a column in your DataFrame 'df'
attempts_counts = df['number_of_neuraxial_attempts'].value_counts().sort_index()

plt.figure(figsize=(8, 6))
plt.bar(attempts_counts.index, attempts_counts.values)
plt.xlabel('Number of Attempts')
plt.ylabel('Count')
plt.title('Histogram of Number of Neuraxial Attempts')
plt.xticks(range(int(attempts_counts.index.min()), int(attempts_counts.index.max()) + 1))  # Show only integer ticks on x-axis
plt.tight_layout()
plt.show()


## Loss of Resistance Depth

In [None]:
# prompt: create a histogram of loss of resistance depth. Center the bars over the tick marks and make space between the bars. Bins should be every 0.5

# Assuming 'lor_depth' is a column in your DataFrame 'df'
lor_depths = neuraxial_catheter_df['lor_depth'].dropna()  # Remove NaN values

# Create the histogram with centered bars and spacing
plt.figure(figsize=(8, 6))
plt.hist(lor_depths, bins=np.arange(lor_depths.min(), lor_depths.max() + 0.5, 0.5), rwidth=0.8, align='left')
plt.xlabel('Loss of Resistance Depth')
plt.ylabel('Count')
plt.title('Histogram of Loss of Resistance Depth')
plt.xticks(np.arange(0, lor_depths.max() + 0.5, 1))  # Set x-axis ticks to be at every 1
plt.tight_layout()
plt.show()


In [None]:
# prompt: Plot number of neuraxial attempts vs LOR depth on the x-axis. Add jiggle to both x and y axes

df_plot = neuraxial_catheter_df.dropna(subset=['number_of_neuraxial_attempts', 'lor_depth'])

# Add random jiggle to both x and y axes
jiggle_x = np.random.normal(scale = 0.1, size=len(df_plot))
jiggle_y = np.random.normal(scale = 0.1, size=len(df_plot))

plt.figure(figsize=(10, 6))
plt.scatter(df_plot['lor_depth'] + jiggle_x, df_plot['number_of_neuraxial_attempts'] + jiggle_y, alpha=0.5)
plt.xlabel('LOR Depth')
plt.ylabel('Number of Neuraxial Attempts')
plt.title('Number of Neuraxial Attempts vs. LOR Depth with Jiggle')
plt.show()


In [None]:
# prompt: do the same but add shaded error bars for +/- standard error of the mean

# Assuming 'number_of_neuraxial_attempts' and 'failed_catheter' are columns in your DataFrame 'df'
df_plot = neuraxial_catheter_df.dropna(subset=['number_of_neuraxial_attempts'])

# Group by number of attempts and calculate the mean and standard error of the mean of failed_catheter
failure_by_attempts = df_plot.groupby('number_of_neuraxial_attempts')['failed_catheter'].agg(['mean', 'sem'])

# Create the plot with error bars
plt.figure(figsize=(10, 6))
plt.plot(failure_by_attempts.index, failure_by_attempts['mean'], marker='o')
plt.fill_between(failure_by_attempts.index,
                 failure_by_attempts['mean'] - failure_by_attempts['sem'],
                 failure_by_attempts['mean'] + failure_by_attempts['sem'],
                 alpha=0.2) # Add shaded error bars
plt.errorbar(failure_by_attempts.index, failure_by_attempts['mean'], yerr=failure_by_attempts['sem'], fmt='o-', capsize=5, elinewidth=1)  # Added error bars
plt.xlabel('Number of Neuraxial Attempts')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Number of Neuraxial Attempts with Error Bars')
plt.xticks(np.arange(0, failure_by_attempts['number_of_neuraxial_attempts'].max() + 0.5, 1))  # Set x-axis ticks to be at every 0.5
plt.grid(True)
plt.show()

In [None]:
# prompt: Plot lor-depth against bmi

# Assuming 'lor_depth' and 'bmi_end_pregnancy_2044' are columns in your DataFrame 'df'
df_plot = neuraxial_catheter_df.dropna(subset=['lor_depth', 'bmi_end_pregnancy_2044'])

plt.figure(figsize=(10, 6))
plt.scatter(df_plot['bmi_end_pregnancy_2044'], df_plot['lor_depth'])
plt.xlabel('BMI')
plt.ylabel('LOR Depth')
plt.title('LOR Depth vs. BMI')
plt.show()

In [None]:
from scipy.stats import gaussian_kde

# Extract the data, dropping NaNs
df_plot = neuraxial_catheter_df.dropna(subset=['lor_depth', 'bmi_end_pregnancy_2044'])
x = df_plot['bmi_end_pregnancy_2044'].values
y = df_plot['lor_depth'].values

# Perform kernel density estimation
xy = np.vstack([x, y])
kde = gaussian_kde(xy)

# Define grid over data range
xmin, xmax = x.min() - 1, x.max() + 1
ymin, ymax = y.min() - 1, y.max() + 1
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
Z = np.reshape(kde(positions).T, X.shape)

# Create the contour plot
plt.figure(figsize=(10, 6))
plt.contourf(X, Y, Z, levels=15, cmap='viridis')
plt.colorbar(label='Density')
plt.xlabel('BMI')
plt.ylabel('LOR Depth')
plt.title('Contour Plot of LOR Depth vs. BMI (KDE)')
plt.show()


In [None]:
# prompt: do the same but for failure vs loss of resistance depth. Bin the depth by units of 1

# Assuming 'lor_depth' and 'failed_catheter' are columns in your DataFrame 'df'
df_plot = neuraxial_catheter_df.dropna(subset=['lor_depth', 'failed_catheter'])

# Bin the LOR depth
df_plot['lor_depth_bin'] = (df_plot['lor_depth'] // 1).astype(int)

# Group by the binned LOR depth and calculate the mean of failed_catheter
failure_by_lor_depth = df_plot.groupby('lor_depth_bin')['failed_catheter'].mean()

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(failure_by_lor_depth.index, failure_by_lor_depth.values, marker='o')
plt.xlabel('Loss of Resistance Depth (binned)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Loss of Resistance Depth (binned by 1)')
plt.xticks(np.arange(0, df_plot['lor_depth'].max() + 0.5, 1))  # Set x-axis ticks to be at every 1
plt.grid(True)
plt.show()

In [None]:
# prompt: Reproduce the same plot, but add shaded error bars for +/- standard error of the mean

# Assuming 'lor_depth' and 'failed_catheter' are columns in your DataFrame 'df'
df_plot = neuraxial_catheter_df.dropna(subset=['lor_depth', 'failed_catheter'])

# Bin the LOR depth
df_plot['lor_depth_bin'] = (df_plot['lor_depth'] // 1).astype(int)

# Group by the binned LOR depth and calculate the mean and standard error of the mean of failed_catheter
failure_by_lor_depth = df_plot.groupby('lor_depth_bin')['failed_catheter'].agg(['mean', 'sem'])

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(failure_by_lor_depth.index, failure_by_lor_depth['mean'], marker='o')
plt.fill_between(failure_by_lor_depth.index,
                 failure_by_lor_depth['mean'] - failure_by_lor_depth['sem'],
                 failure_by_lor_depth['mean'] + failure_by_lor_depth['sem'],
                 alpha=0.5) # Add shaded error bars

plt.xlabel('Loss of Resistance Depth (binned)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Loss of Resistance Depth (binned by 1) with Error Bars')
plt.xticks(np.arange(0, df_plot['lor_depth'].max() + 0.5, 1))  # Set x-axis ticks to be at every 1
plt.grid(True)
plt.show()

## BMI / height / weight

In [None]:
# prompt: plot bmi end pregnancy against failure rate using binning as above.

# Assuming 'bmi_end_pregnancy' and 'failed_catheter' are columns in your DataFrame 'df'
df_plot = neuraxial_catheter_df.dropna(subset=['bmi_end_pregnancy_2044', 'failed_catheter'])

# Bin the bmi_end_pregnancy
df_plot['bmi_end_pregnancy_bin'] = (df_plot['bmi_end_pregnancy_2044'] // 1).astype(int)

# Group by the binned bmi_end_pregnancy and calculate the mean and standard error of the mean of failed_catheter
failure_by_bmi = df_plot.groupby('bmi_end_pregnancy_bin')['failed_catheter'].agg(['mean', 'sem'])

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(failure_by_bmi.index, failure_by_bmi['mean'], marker='o')
plt.fill_between(failure_by_bmi.index,
                 failure_by_bmi['mean'] - failure_by_bmi['sem'],
                 failure_by_bmi['mean'] + failure_by_bmi['sem'],
                 alpha=0.5) # Add shaded error bars

plt.xlabel('BMI (kg/m^2) at End of Pregnancy (binned by 1)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. BMI at End of Pregnancy (binned by 1) with Error Bars')
plt.grid(True)
plt.show()

In [None]:
# prompt: # prompt: plot weight end pregnancy against failure rate using binning as above.

# Assuming 'maternal_weight_end_pregnancy_2045' and 'failed_catheter' are columns in your DataFrame 'df'
df_plot = neuraxial_catheter_df.dropna(subset=['maternal_weight_end_pregnancy_2045', 'failed_catheter'])

# Bin the maternal weight at the end of pregnancy
df_plot['weight_end_pregnancy_bin'] = (df_plot['maternal_weight_end_pregnancy_2045'] // 10).astype(int) * 10

# Group by the binned weight and calculate the mean and standard error of the mean of failed_catheter
failure_by_weight = df_plot.groupby('weight_end_pregnancy_bin')['failed_catheter'].agg(['mean', 'sem'])

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(failure_by_weight.index, failure_by_weight['mean'], marker='o')
plt.fill_between(failure_by_weight.index,
                 failure_by_weight['mean'] - failure_by_weight['sem'],
                 failure_by_weight['mean'] + failure_by_weight['sem'],
                 alpha=0.5)  # Add shaded error bars

plt.xlabel('Maternal Weight (kg) at End of Pregnancy (binned by 10)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Maternal Weight at End of Pregnancy (binned by 10) with Error Bars')
plt.grid(True)
plt.show()

In [None]:
# prompt: do the same but for height

# Assuming 'height' is a column in your DataFrame 'df'
df_plot = neuraxial_catheter_df.dropna(subset=['maternal_height_2046', 'failed_catheter'])

# Drop heights greater than 250
df_plot = df_plot[df_plot['maternal_height_2046'] <= 250]

# Bin the height
df_plot['height_bin'] = (df_plot['maternal_height_2046'] // 1).astype(int)

# Group by the binned height and calculate the mean and standard error of the mean of failed_catheter
failure_by_height = df_plot.groupby('height_bin')['failed_catheter'].agg(['mean', 'sem'])

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(failure_by_height.index, failure_by_height['mean'], marker='o')
plt.fill_between(failure_by_height.index,
                 failure_by_height['mean'] - failure_by_height['sem'],
                 failure_by_height['mean'] + failure_by_height['sem'],
                 alpha=0.5) # Add shaded error bars

plt.xlabel('Height (binned by 1)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Height (binned by 1) with Error Bars')
plt.grid(True)
plt.show()

## Gestational Age and Weight

In [None]:
# prompt: do the same but for gestational age

# Histogram of gestational age
plt.figure(figsize=(10, 6))
plt.hist(df['gestational_age_2052'].dropna(), bins=20) # Adjust bins as needed
plt.xlabel('Gestational Age (days)')
plt.ylabel('Count')
plt.title('Distribution of Gestational Age')
plt.show()

# Analyze gestational age in relation to failed catheter
df_plot = neuraxial_catheter_df.dropna(subset=['gestational_age_2052', 'failed_catheter'])
df_plot['gestational_age_bin'] = (df_plot['gestational_age_2052'] // 7).astype(int) * 7
failure_by_gestational_age = df_plot.groupby('gestational_age_bin')['failed_catheter'].agg(['mean', 'sem'])

plt.figure(figsize=(10, 6))
plt.plot(failure_by_gestational_age.index, failure_by_gestational_age['mean'], marker='o')
plt.fill_between(failure_by_gestational_age.index,
                failure_by_gestational_age['mean'] - failure_by_gestational_age['sem'],
                failure_by_gestational_age['mean'] + failure_by_gestational_age['sem'],
                alpha=0.5)
plt.xlabel('Gestational Age (days) (binned by 7)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Gestational Age (binned by 7) with Error Bars')
plt.grid(True)
plt.show()

In [None]:
# prompt: do the same histogram and binned failure rate but for baby_weight_2196

# Assuming 'baby_weight_2196' is a column in your DataFrame 'df' or 'neuraxial_catheter_df'
plt.figure(figsize=(10, 6))
plt.hist(neuraxial_catheter_df['baby_weight_2196'].dropna(), bins=20)  # Adjust bins as needed
plt.xlabel('Baby Weight (kg)')
plt.ylabel('Count')
plt.title('Histogram of Baby Weight')
plt.show()

# Assuming 'neuraxial_catheter_df' is your DataFrame

df_plot = neuraxial_catheter_df.dropna(subset=['baby_weight_2196', 'failed_catheter'])

# Bin the baby weight
df_plot['baby_weight_bin'] = (df_plot['baby_weight_2196'] // 0.5) * 0.5

# Group by the binned baby weight and calculate the mean and standard error of the mean of failed_catheter
failure_by_baby_weight = df_plot.groupby('baby_weight_bin')['failed_catheter'].agg(['mean', 'sem'])

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(failure_by_baby_weight.index, failure_by_baby_weight['mean'], marker='o')
plt.fill_between(failure_by_baby_weight.index,
                 failure_by_baby_weight['mean'] - failure_by_baby_weight['sem'],
                 failure_by_baby_weight['mean'] + failure_by_baby_weight['sem'],
                 alpha=0.5)  # Add shaded error bars

plt.xlabel('Baby Weight (kg) (binned by 0.5)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Baby Weight with Error Bars')
plt.grid(True)
plt.show()

In [None]:
# prompt: do the same count histogram but for secs_rom_thru_delivery_2197

# Assuming 'neuraxial_catheter_df' is your DataFrame

# Drop NaN values in 'secs_rom_thru_delivery_2197'
df_plot = neuraxial_catheter_df.dropna(subset=['rom_thru_delivery_hours'])

# Create the histogram
plt.figure(figsize=(10, 6))
plt.hist(df_plot['rom_thru_delivery_hours'], bins=200)  # Adjust bins as needed
plt.xlabel('Hours from ROM to Delivery')
plt.xlim(0,100)
plt.ylabel('Count')
plt.title('Histogram of Hours from ROM to Delivery')
plt.show()

In [None]:
# prompt: do the same binned plot for rom_thru_delivery_hours

# Assuming 'neuraxial_catheter_df' is your DataFrame

df_plot = neuraxial_catheter_df.dropna(subset=['rom_thru_delivery_hours', 'failed_catheter'])

# Bin the rom_thru_delivery_hours
df_plot['rom_thru_delivery_hours_bin'] = (df_plot['rom_thru_delivery_hours'] // 1).astype(int)

# Group by the binned rom_thru_delivery_hours and calculate the mean and standard error of the mean of failed_catheter
failure_by_rom_delivery = df_plot.groupby('rom_thru_delivery_hours_bin')['failed_catheter'].agg(['mean', 'sem'])

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(failure_by_rom_delivery.index, failure_by_rom_delivery['mean'], marker='o')
plt.fill_between(failure_by_rom_delivery.index,
                 failure_by_rom_delivery['mean'] - failure_by_rom_delivery['sem'],
                 failure_by_rom_delivery['mean'] + failure_by_rom_delivery['sem'],
                 alpha=0.5)  # Add shaded error bars
plt.xlim(0,100)
plt.xlabel('Hours from ROM to Delivery (binned)')
plt.ylabel('Average Failure Rate')
plt.title('Failure Rate vs. Hours from ROM to Delivery (binned by 1) with Error Bars')
plt.grid(True)
plt.show()

# Statistical Analysis

In [None]:
import statsmodels.formula.api as smf

df_corr = neuraxial_catheter_df.dropna(subset=['lor_depth', 'number_of_neuraxial_attempts'])

# Fit the model using the formula
model = smf.ols('number_of_neuraxial_attempts ~ lor_depth', data=df_corr).fit()

# Print the summary of the regression results
print(model.summary())


In [None]:
# For categorical variables like DPE and failed_catheter
from scipy.stats import chi2_contingency

dpe_crosstab = pd.crosstab(epidural_df['DPE'], epidural_df['failed_catheter'])
chi2, p, _, _ = chi2_contingency(dpe_crosstab)

print(dpe_crosstab.div(dpe_crosstab.sum(axis=1), axis=0) * 100)
print("Chi-squared statistic:", chi2)
print("P-value:", p)

In [28]:
# prompt: Do univariate logistic regression separately using number of attempts and loss of resistance depth to predict failure

import statsmodels.api as sm
import statsmodels.formula.api as smf

# # Prepare the data for logistic regression with number of attempts as the predictor
# df_logreg_attempts = neuraxial_catheter_df.dropna(subset=['number_of_neuraxial_attempts', 'failed_catheter'])
# # Fit the logistic regression model
# model_attempts = smf.logit('failed_catheter ~ number_of_neuraxial_attempts', data=df_logreg_attempts).fit()

# # Print the summary of the model
# print(model_attempts.summary())


# Prepare the data for logistic regression with loss of resistance depth as the predictor
df_logreg_lor = neuraxial_catheter_df.dropna(subset=['lor_depth', 'failed_catheter'])
# Fit the logistic regression model
model_lor = smf.logit('failed_catheter ~ lor_depth', data=df_logreg_lor).fit()

# Print the summary of the model
print(model_lor.summary())


Optimization terminated successfully.
         Current function value: 0.219833
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:        failed_catheter   No. Observations:               103125
Model:                          Logit   Df Residuals:                   103123
Method:                           MLE   Df Model:                            1
Date:                Fri, 10 Jan 2025   Pseudo R-squ.:                0.004623
Time:                        16:22:31   Log-Likelihood:                -22670.
converged:                       True   LL-Null:                       -22776.
Covariance Type:            nonrobust   LLR p-value:                 1.032e-47
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -3.6014      0.057    -63.100      0.000      -3.713      -3.490
lor_depth      0.1439      0.

In [None]:
# prompt: Now do multivariate analysis using the same two predictors

# Prepare the data for logistic regression with both predictors
df_logreg_multi = neuraxial_catheter_df.dropna(subset=['number_of_neuraxial_attempts', 'LOR_depth', 'failed_catheter'])

# Fit the logistic regression model with both predictors
model_multi = smf.logit('failed_catheter ~ number_of_neuraxial_attempts + LOR_depth', data=df_logreg_multi).fit()

# Print the summary of the model
print(model_multi.summary())


In [17]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

def all_regressions_single_plot(df, outcome_col='failed_catheter'):
    """
    1) For each column in df (except the outcome_col and unsupported dtypes),
       fit a logistic regression of outcome_col ~ predictor.
    2) Extract the predictor's coefficient and p-value.
    3) Combine into a single DataFrame and plot x=coefficient, y=-log10(p-value).
    """
    
    # Safety check: ensure the outcome is binary (0/1)
    # If it's not, you may need:
    # df[outcome_col] = df[outcome_col].map({False:0, True:1})

    results = []

    for col in df.columns:
        # Skip the outcome column itself
        if col == outcome_col:
            continue
        
        # Skip datetime or other unsupported dtypes
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            continue
        
        # Drop rows that are NaN in either outcome or predictor (to avoid fitting errors)
        temp_df = df[[outcome_col, col]].dropna()

        # If the column is all one value or empty, skip
        if temp_df[col].nunique() < 2:
            continue

        # Build the formula
        # For categorical columns, wrap in C()
        if pd.api.types.is_numeric_dtype(temp_df[col]):
            formula = f"{outcome_col} ~ {col}"
        else:
            formula = f"{outcome_col} ~ C({col})"

        # Fit the logistic regression
        try:
            model = smf.logit(formula, data=temp_df).fit(disp=False)
        except Exception as e:
            print(f"Skipping column {col} due to fitting error: {e}")
            continue
        
        # model.params and model.pvalues each contain:
        #   Index 0 = Intercept
        #   Index 1,2,... = Coefficients for the variable(s)
        # For numeric or 2-level categorical, there's only 1 predictor coefficient.
        # For multi-level categorical, there will be multiple dummy variables.
        
        # We'll capture only the *first* coefficient for the predictor (index 1).
        if len(model.params) < 2:
            # Means no separate predictor param was generated
            continue
        
        # The predictor is at index 1 in model.params
        coef = model.params[1]
        pval = model.pvalues[1]
        
        results.append({
            'column': col,
            'coef': coef,
            'pval': pval
        })

    # Create a results DataFrame
    results_df = pd.DataFrame(results).sort_values('pval')

    # Plot: x=coefficient, y=-log10(p-value)
    fig, ax = plt.subplots(figsize=(8, 6))
    
    # We add a small offset so we don't try to take log10 of 0
    # In case any p-values are extremely small or exactly 0
    offset = 1e-300
    x_vals = results_df['coef']
    y_vals = -np.log10(results_df['pval'] + offset)
    
    sc = ax.scatter(x_vals, y_vals, color='blue')
    
    # Annotate each point with the column name
    for i, row in results_df.iterrows():
        ax.text(row['coef'], -np.log10(row['pval'] + offset), row['column'],
                fontsize=8, ha='left', va='bottom')

    ax.axhline(-np.log10(0.05), color='red', linestyle='--', label='p=0.05')
    ax.set_xlabel('Coefficient')
    ax.set_ylabel('-log10(p-value)')
    ax.set_title(f'Logistic Regressions for {outcome_col} ~ Each Predictor')
    ax.legend()
    plt.tight_layout()
    plt.show()
    
    return results_df

# Usage:
results_df = all_regressions_single_plot(neuraxial_catheter_df, 'failed_catheter')
# results_df will have columns [column, coef, pval].
# The plot shows each predictor's slope vs. its -log10(p-value).


Skipping column gestational_age_2052 due to fitting error: endog has evaluated to an array with multiple columns that has shape (107512, 2). This occurs when the variable converted to endog is non-numeric (e.g., bool or str).
Skipping column delivery_site_2188 due to fitting error: endog has evaluated to an array with multiple columns that has shape (107523, 2). This occurs when the variable converted to endog is non-numeric (e.g., bool or str).
Skipping column baby_weight_2196 due to fitting error: endog has evaluated to an array with multiple columns that has shape (106766, 2). This occurs when the variable converted to endog is non-numeric (e.g., bool or str).
Skipping column rom_thru_delivery_hours due to fitting error: endog has evaluated to an array with multiple columns that has shape (101989, 2). This occurs when the variable converted to endog is non-numeric (e.g., bool or str).
Skipping column fetal_presentation_category_2243 due to fitting error: endog has evaluated to an ar

KeyError: 'pval'

# Logistic Regression Model

In [None]:
# Filter the DataFrame to include only neuraxial catheter (ie, epidural + CSE + intrathecal) or epidural-only catheter procedures
neuraxial_catheter_df = df[df['is_neuraxial_catheter'] == 1]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report

# Load the dataset
data = neuraxial_catheter_df

# Drop columns with more than 80% missing values
threshold = len(data) * 0.5
data_cleaned = data.dropna(thresh=threshold, axis=1)

# Drop rows where target variable is missing
data_cleaned = data_cleaned.dropna(subset=["failed_catheter"])

# Separate features and target variable
X = data_cleaned.drop(columns=["failed_catheter", "best_timestamp"], errors='ignore')
y = data_cleaned["failed_catheter"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numeric and categorical columns
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'bool']).columns.tolist()

# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                           ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Train logistic regression with class weights
logistic_model = LogisticRegression(max_iter=1000, solver='liblinear', class_weight='balanced', n_jobs=1)
logistic_model.fit(X_train_preprocessed, y_train)

# Make predictions
y_pred = logistic_model.predict(X_test_preprocessed)
y_pred_prob = logistic_model.predict_proba(X_test_preprocessed)[:, 1]

# Evaluate the model
evaluation_metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
    "roc_auc": roc_auc_score(y_test, y_pred_prob),
    "classification_report": classification_report(y_test, y_pred)
}

# Print evaluation metrics
print("Model Evaluation:")
for metric, value in evaluation_metrics.items():
    if metric == "classification_report":
        print("\nClassification Report:\n", value)
    else:
        print(f"{metric.capitalize()}: {value:.4f}")
