# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import modules.data_analysis_utils as dau
import modules.data_visualization_utils as dvu
from importlib import reload

# Folder (with trailing path separator) that holds the project data files.
# The value is machine-specific: the commented-out line is the same folder under
# a different Windows user profile. File names are appended to this string with `+`.
# my_computer_fpath = "C:\\Users\\dfber\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"
my_computer_fpath = "C:\\Users\\User\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"

# Load data

In [None]:
# Read the processed/imputed dataset from the machine-specific data folder.
merlin_csv_path = my_computer_fpath + 'processed_and_imputed_merlin_data.csv'
df = pd.read_csv(merlin_csv_path)

## Prepend '#' for better dummies

In [None]:
# Re-import the analysis helper module so edits made to it after the kernel
# started are picked up without restarting.
reload(dau)

In [None]:
# Summary of the procedure-type column (shown as the cell output).
df['true_procedure_type_incl_dpe'].describe()

In [None]:
# Category chosen as the default level for each categorical column; passed to
# dau.prepend_char in a later cell.
# NOTE(review): presumably these become the reference levels once dummies are
# created -- confirm against dau.prepend_char.
chosen_default_categories = dict(
    true_procedure_type_incl_dpe='epidural',
    anesthesiologist_experience_category='moderate',
    resident_experience_category='no_resident',
)

In [None]:
# Apply the '#' prefixing helper using the chosen default categories; the two
# identifier columns are passed as columns for the helper to ignore.
prefix_ignored_cols = ['anes_procedure_encounter_id_2273', 'unique_pt_id']
df = dau.prepend_char(
    df,
    '#',
    chosen_default_categories=chosen_default_categories,
    cols_to_ignore=prefix_ignored_cols,
)

In [None]:
# Work on an independent copy so later cells can rebind or modify `df`
# without touching the analysis dataframe.
neuraxial_catheter_df = df.copy()

# Statistical Analysis

## Some individually interesting regressions

In [None]:
# Ordinary least squares: number of neuraxial attempts regressed on
# loss-of-resistance depth, using only rows where both values are present.
ols_required_cols = ['lor_depth', 'number_of_neuraxial_attempts']
df_corr = neuraxial_catheter_df.dropna(subset=ols_required_cols)

ols_spec = smf.ols('number_of_neuraxial_attempts ~ lor_depth', data=df_corr)
model = ols_spec.fit()

# Regression table
print(model.summary())


In [None]:
# prompt: Do univariate logistic regression separately using number of attempts and loss of resistance depth to predict failure

import statsmodels.api as sm
import statsmodels.formula.api as smf

# --- Predictor 1: number of neuraxial attempts --------------------------------
# Rows missing either the predictor or the outcome are dropped before fitting.
attempts_required_cols = ['number_of_neuraxial_attempts', 'failed_catheter']
df_logreg_attempts = neuraxial_catheter_df.dropna(subset=attempts_required_cols)
attempts_logit = smf.logit('failed_catheter ~ number_of_neuraxial_attempts', data=df_logreg_attempts)
model_attempts = attempts_logit.fit()
print(model_attempts.summary())

# --- Predictor 2: loss-of-resistance depth ------------------------------------
# Same recipe, with its own complete-case subset.
lor_required_cols = ['lor_depth', 'failed_catheter']
df_logreg_lor = neuraxial_catheter_df.dropna(subset=lor_required_cols)
lor_logit = smf.logit('failed_catheter ~ lor_depth', data=df_logreg_lor)
model_lor = lor_logit.fit()
print(model_lor.summary())


In [None]:
# prompt: Now do multivariate analysis using the same two predictors

# Prepare the data for logistic regression with both predictors
# Logistic regression of catheter failure on both predictors at once.
# Complete cases only: a row is kept when both predictors and the outcome are present.
multi_required_cols = ['number_of_neuraxial_attempts', 'lor_depth', 'failed_catheter']
df_logreg_multi = neuraxial_catheter_df.dropna(subset=multi_required_cols)

multi_logit = smf.logit(
    'failed_catheter ~ number_of_neuraxial_attempts + lor_depth',
    data=df_logreg_multi,
)
model_multi = multi_logit.fit()

# Regression table
print(model_multi.summary())


In [None]:
# Univariate logistic regression of catheter failure on
# prior_failed_catheters_this_enc.
# Rows missing either the predictor or the outcome are dropped before fitting.
df_logreg_prior_failed = neuraxial_catheter_df.dropna(subset=['prior_failed_catheters_this_enc', 'failed_catheter'])

# Stored under its own name: this fit previously reused `model_attempts`, which
# silently replaced the number-of-attempts model fitted earlier in the notebook.
model_prior_failed = smf.logit('failed_catheter ~ prior_failed_catheters_this_enc', data=df_logreg_prior_failed).fit()

# Print the summary of the model
print(model_prior_failed.summary())

## All univariate regressions

In [None]:
# Run the helper that regresses 'failed_catheter' on each predictor in turn.
results_df = dau.all_regressions_each_dummy(neuraxial_catheter_df, 'failed_catheter')
# This returns a DataFrame with columns: [column, param_name, coef, pval].
# Each level of a categorical predictor will appear as a separate row.

In [None]:
# Derive a 'category_variable' label for every row by passing its param_name
# through dau.parse_param_name.
results_df['category_variable'] = results_df['param_name'].apply(dau.parse_param_name)

In [None]:
# Display the univariate results restricted to the columns of interest.
results_df[['column','category_variable','coef','pval']]

In [None]:
# Bonferroni correction: split the 0.05 family-wise error rate evenly across
# every row (one per fitted term) of the univariate results table.
total_num_variables = len(results_df)
alpha = 0.05 / total_num_variables

# Count the rows whose p-value clears the corrected threshold.
passes_threshold = results_df['pval'] < alpha
significant_variables = int(passes_threshold.sum())

text = (
    f'Of {total_num_variables} total variables, {significant_variables} were significant at a \n'
    f'Bonferroni-corrected alpha = {alpha}'
)
dau.show_text(text)


In [None]:
# Pick up any edits made to the helper module since it was last (re)loaded.
reload(dau)

In [None]:
# Plot coefficient vs. p-value for every univariate result except the rows
# belonging to 'placement_to_delivery_hours'.
keep_row = results_df['column'] != 'placement_to_delivery_hours'
dau.plot_coef_vs_pval(results_df.loc[keep_row, :])

# Center numerical values (subtract the median)

In [None]:
# Shift each listed continuous column so its median becomes zero.
# The dataframe is modified in place.
median_centered_cols = (
    'bmi_end_pregnancy_2044',
    'baby_weight_2196',
    'gestational_age_weeks',
    'maternal_age_years',
)
for col in median_centered_cols:
    column_median = neuraxial_catheter_df[col].median()
    neuraxial_catheter_df[col] = neuraxial_catheter_df[col] - column_median

In [None]:
# Summary statistics of gestational_age_weeks, one of the columns that the
# preceding cell median-centres.
neuraxial_catheter_df['gestational_age_weeks'].describe()

# Reduce collinear variables

In [None]:
# Correlation heatmap of the full analysis dataframe, with the two identifier
# columns dropped and the preset related-variable groups outlined.
dvu.plot_correlation_heatmap_with_related_groups(
    neuraxial_catheter_df,
    drop_columns=['anes_procedure_encounter_id_2273', 'unique_pt_id'],
    additional_groups='preset',
    draw_group_boxes=True,
    draw_group_lines=True,
)

In [None]:
# Build the reduced dataframe used by the models below.
# NOTE(review): presumably dau.reduce_cols drops or merges collinear columns,
# per the section heading -- confirm in the helper.
neuraxial_catheter_df_reduced = dau.reduce_cols(neuraxial_catheter_df)

In [None]:
# Same heatmap as before, now on the reduced dataframe, to check what
# correlation structure remains after column reduction.
dvu.plot_correlation_heatmap_with_related_groups(
    neuraxial_catheter_df_reduced,
    drop_columns=['anes_procedure_encounter_id_2273', 'unique_pt_id'],
    additional_groups='preset',
    draw_group_boxes=True,
    draw_group_lines=True,
)

# Logistic Regression Model

## Random data for model comparison

In [None]:
# Null-model dataset: identical predictors, but the outcome is replaced by
# independent Bernoulli draws at the observed failure rate, so it carries no
# real signal.
# NOTE(review): the draw uses NumPy's global RNG without a seed, so this
# baseline differs on every run.
test_dataset = neuraxial_catheter_df_reduced.copy()
n_rows = len(test_dataset)
failure_rate = test_dataset['failed_catheter'].mean()
random_outcome = np.random.binomial(n=1, p=failure_rate, size=n_rows)
test_dataset['failed_catheter'] = random_outcome

In [None]:
# Fit the logistic-regression helper on the random-outcome dataset to get a
# baseline for comparison; its return value is not kept here.
X_train, X_test, y_train, y_test = dau.preprocess_data(data=test_dataset)
dau.do_logistic_regression(X_train, X_test, y_train, y_test)

## Real LOGIT regression

In [None]:
# Preprocess a copy of the reduced dataframe with the statsmodels-specific helper.
X_train, X_test, y_train, y_test = dau.preprocess_data_for_statsmodels(data=neuraxial_catheter_df_reduced.copy())
# The general preprocess_data function garbles the variable names, and with them
# result_summ, so this custom variant is used instead. It relies on
# pd.get_dummies rather than OneHotEncoder.
# The exact reason the general function breaks the names has not been investigated.
y_pred_prob, result_summ, evaluation_metrics, evaluation_metrics_by_threshold = dau.do_logistic_regression_with_statsmodels(X_train, X_test, y_train, y_test)

In [None]:
# Convert the per-threshold metrics to a DataFrame (rebinding the same name)
# and display it as the cell output.
evaluation_metrics_by_threshold = pd.DataFrame(evaluation_metrics_by_threshold)
evaluation_metrics_by_threshold

In [None]:
# ROC curve for the held-out predictions, passing along the 'roc_auc' value
# from the evaluation metrics.
dvu.plot_roc_curve(y_test,y_pred_prob, evaluation_metrics['roc_auc'])

In [None]:
# Pick up any edits made to the helper module since it was last (re)loaded.
reload(dau)

In [None]:
# Split the fitted model's summary into two tables via the project helper.
logit_result_table, logit_predictor_table = dau.prepare_logit_tables(result_summ)

In [None]:
# Display the first table returned by dau.prepare_logit_tables.
logit_result_table

In [None]:
# Display the second table returned by dau.prepare_logit_tables.
logit_predictor_table

In [None]:
# Separate the predictor table into patient and procedural factors (the helper
# also renames them) for plotting.
patient_df, procedural_df = dau.divide_and_rename_patient_and_procedural_factors(logit_predictor_table)

In [None]:
# Draw forest plots from the patient-factor and procedural-factor tables.
dvu.show_forest_plots(patient_df, procedural_df)

## SKLearn Logistic Regression

In [None]:
# scikit-learn logistic regression on the real (reduced) data, followed by a
# printout of the fitted coefficients next to their feature names.
(X_train, X_test, y_train, y_test) = dau.preprocess_data(
    data=neuraxial_catheter_df_reduced.copy()
)
logistic_model = dau.do_logistic_regression(X_train, X_test, y_train, y_test)

# coef_[0] is the first (here: only) row of the coefficient matrix.
dau.print_sklearn_coefficients(
    feature_names=X_train.columns,
    coefficients=logistic_model.coef_[0],
)

# Propensity Scores

## Propensity Scoring for DPE

In [None]:
import pandas as pd
import numpy as np

# For logistic regression and nearest neighbor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# For imputation and scaling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# For statistical inference (CIs, p-values)
import statsmodels.api as sm

# Propensity-score matched comparison of catheter failure for DPE.
# Treated rows are those labelled 'dpe'; every other row is a potential control.

# ------------------------------------------------------------------------------
# 1. Copy the analysis dataframe and derive the binary treatment indicator
# ------------------------------------------------------------------------------
df = neuraxial_catheter_df.copy()
df['dpe'] = (df['true_procedure_type_incl_dpe'] == 'dpe').astype(int)
df.drop(columns=['true_procedure_type_incl_dpe'], inplace=True)

# Columns for the treatment and outcome
treatment_col = 'dpe'
outcome_col   = 'failed_catheter'

# Fail early with a readable message; otherwise a missing group only shows up
# later as a less specific error from the model fit.
if df[treatment_col].nunique() < 2:
    raise ValueError(
        f"'{treatment_col}' needs both treated and control rows; check the "
        "labels in 'true_procedure_type_incl_dpe'."
    )

# Row identifiers are not confounders and must not enter the propensity model.
# The same two columns are excluded from the '#' prefixing and from the
# correlation heatmaps elsewhere in this notebook.
id_cols = ['anes_procedure_encounter_id_2273', 'unique_pt_id']
non_covariate_cols = [treatment_col, outcome_col] + id_cols

# ------------------------------------------------------------------------------
# 2. Identify numeric vs. categorical covariates
#    (treatment, outcome and identifier columns are never covariates)
# ------------------------------------------------------------------------------
numeric_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in non_covariate_cols
]
categorical_cols = [
    col for col in df.columns
    if col not in numeric_cols and col not in non_covariate_cols
]

# ------------------------------------------------------------------------------
# 3. Impute missing data
#    - Median for numeric
#    - Most frequent for categorical
# ------------------------------------------------------------------------------
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit/transform numeric columns
df_num = pd.DataFrame(
    num_imputer.fit_transform(df[numeric_cols]),
    columns=numeric_cols
)

# Fit/transform categorical columns
df_cat = pd.DataFrame(
    cat_imputer.fit_transform(df[categorical_cols]),
    columns=categorical_cols
)

# ------------------------------------------------------------------------------
# 4. One-hot encode (dummy) the categorical columns
# ------------------------------------------------------------------------------
df_cat_encoded = pd.get_dummies(df_cat, drop_first=True)

# ------------------------------------------------------------------------------
# 5. Combine imputed numeric + encoded categorical with original treatment/outcome
# ------------------------------------------------------------------------------
# The imputers return plain arrays, so every piece is re-indexed 0..n-1 to make
# the column-wise concat line up row by row.
df_imputed = pd.concat(
    [
        df[[treatment_col, outcome_col]].reset_index(drop=True),
        df_num.reset_index(drop=True),
        df_cat_encoded.reset_index(drop=True)
    ],
    axis=1
)

# ------------------------------------------------------------------------------
# 6. Standardize the numeric covariates (zero mean, unit variance)
# ------------------------------------------------------------------------------
scaler = StandardScaler()
df_num_scaled = pd.DataFrame(
    scaler.fit_transform(df_imputed[numeric_cols]),
    columns=numeric_cols
)

# Now replace the unscaled numeric columns in df_imputed
for col in numeric_cols:
    df_imputed[col] = df_num_scaled[col]

# ------------------------------------------------------------------------------
# 7. Fit the propensity model (LogisticRegression) on all columns except
#    the treatment and outcome columns.
# ------------------------------------------------------------------------------
feature_cols = [c for c in df_imputed.columns if c not in [treatment_col, outcome_col]]

X = df_imputed[feature_cols].values  # all imputed & encoded covariates
y = df_imputed[treatment_col].values # the treatment indicator (dpe)

propensity_model = LogisticRegression(solver='lbfgs', max_iter=1000)
propensity_model.fit(X, y)

# Estimated probability of receiving the treatment (dpe == 1)
propensity_scores = propensity_model.predict_proba(X)[:, 1]
df_imputed['propensity_score'] = propensity_scores

# ------------------------------------------------------------------------------
# 8. Separate treated vs. control and do nearest-neighbor matching
# ------------------------------------------------------------------------------
treated = df_imputed[df_imputed[treatment_col] == 1].copy()
control = df_imputed[df_imputed[treatment_col] == 0].copy()

treated_scores = treated[['propensity_score']].values
control_scores = control[['propensity_score']].values

nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
nn.fit(control_scores)

# One nearest control per treated row. There is no caliper, and the same control
# can be selected for several treated rows (matching with replacement).
distances, indices = nn.kneighbors(treated_scores)
distances = distances.flatten()
indices = indices.flatten()

matched_treated = treated.copy()
matched_control = control.iloc[indices].copy()

# Combine matched sample
matched_data = pd.concat([matched_treated, matched_control], axis=0).reset_index(drop=True)

# ------------------------------------------------------------------------------
# 9. Fit an outcome model on the matched sample
#    We'll use statsmodels for confidence intervals and p-values.
# ------------------------------------------------------------------------------
matched_data['intercept'] = 1.0

# Only the treatment indicator (and intercept) enter the outcome model.
# NOTE(review): the standard errors treat matched rows as independent, although
# a control may appear more than once.
X_outcome = matched_data[['intercept', treatment_col]]
y_outcome = matched_data[outcome_col]

logit_sm = sm.Logit(y_outcome, X_outcome)
result_sm = logit_sm.fit(disp=0)  # disp=0 hides optimization output

print(result_sm.summary())

# Extract OR & 95% CI
params = result_sm.params
conf = result_sm.conf_int()
odds_ratios = np.exp(params)
conf_odds = np.exp(conf)

print("\nOdds Ratios:\n", odds_ratios)
print("\n95% Confidence Intervals:\n", conf_odds)


## Propensity Scoring for CSE

In [None]:
import pandas as pd
import numpy as np

# For logistic regression and nearest neighbor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# For imputation and scaling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# For statistical inference (CIs, p-values)
import statsmodels.api as sm

# Propensity-score matched comparison of catheter failure for CSE.
# Treated rows are those labelled 'cse'; every other row is a potential control.

# ------------------------------------------------------------------------------
# 1. Copy the analysis dataframe and derive the binary treatment indicator
# ------------------------------------------------------------------------------
df = neuraxial_catheter_df.copy()
df['cse'] = (df['true_procedure_type_incl_dpe'] == 'cse').astype(int)
df.drop(columns=['true_procedure_type_incl_dpe'], inplace=True)

# Columns for the treatment and outcome
treatment_col = 'cse'
outcome_col   = 'failed_catheter'

# Fail early with a readable message; otherwise a missing group only shows up
# later as a less specific error from the model fit.
if df[treatment_col].nunique() < 2:
    raise ValueError(
        f"'{treatment_col}' needs both treated and control rows; check the "
        "labels in 'true_procedure_type_incl_dpe'."
    )

# Row identifiers are not confounders and must not enter the propensity model.
# The same two columns are excluded from the '#' prefixing and from the
# correlation heatmaps elsewhere in this notebook.
id_cols = ['anes_procedure_encounter_id_2273', 'unique_pt_id']
non_covariate_cols = [treatment_col, outcome_col] + id_cols

# ------------------------------------------------------------------------------
# 2. Identify numeric vs. categorical covariates
#    (treatment, outcome and identifier columns are never covariates)
# ------------------------------------------------------------------------------
numeric_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in non_covariate_cols
]
categorical_cols = [
    col for col in df.columns
    if col not in numeric_cols and col not in non_covariate_cols
]

# ------------------------------------------------------------------------------
# 3. Impute missing data
#    - Median for numeric
#    - Most frequent for categorical
# ------------------------------------------------------------------------------
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit/transform numeric columns
df_num = pd.DataFrame(
    num_imputer.fit_transform(df[numeric_cols]),
    columns=numeric_cols
)

# Fit/transform categorical columns
df_cat = pd.DataFrame(
    cat_imputer.fit_transform(df[categorical_cols]),
    columns=categorical_cols
)

# ------------------------------------------------------------------------------
# 4. One-hot encode (dummy) the categorical columns
# ------------------------------------------------------------------------------
df_cat_encoded = pd.get_dummies(df_cat, drop_first=True)

# ------------------------------------------------------------------------------
# 5. Combine imputed numeric + encoded categorical with original treatment/outcome
# ------------------------------------------------------------------------------
# The imputers return plain arrays, so every piece is re-indexed 0..n-1 to make
# the column-wise concat line up row by row.
df_imputed = pd.concat(
    [
        df[[treatment_col, outcome_col]].reset_index(drop=True),
        df_num.reset_index(drop=True),
        df_cat_encoded.reset_index(drop=True)
    ],
    axis=1
)

# ------------------------------------------------------------------------------
# 6. Standardize the numeric covariates (zero mean, unit variance)
# ------------------------------------------------------------------------------
scaler = StandardScaler()
df_num_scaled = pd.DataFrame(
    scaler.fit_transform(df_imputed[numeric_cols]),
    columns=numeric_cols
)

# Now replace the unscaled numeric columns in df_imputed
for col in numeric_cols:
    df_imputed[col] = df_num_scaled[col]

# ------------------------------------------------------------------------------
# 7. Fit the propensity model (LogisticRegression) on all columns except
#    the treatment and outcome columns.
# ------------------------------------------------------------------------------
feature_cols = [c for c in df_imputed.columns if c not in [treatment_col, outcome_col]]

X = df_imputed[feature_cols].values  # all imputed & encoded covariates
y = df_imputed[treatment_col].values # the treatment indicator (cse)

propensity_model = LogisticRegression(solver='lbfgs', max_iter=1000)
propensity_model.fit(X, y)

# Estimated probability of receiving the treatment (cse == 1)
propensity_scores = propensity_model.predict_proba(X)[:, 1]
df_imputed['propensity_score'] = propensity_scores

# ------------------------------------------------------------------------------
# 8. Separate treated vs. control and do nearest-neighbor matching
# ------------------------------------------------------------------------------
treated = df_imputed[df_imputed[treatment_col] == 1].copy()
control = df_imputed[df_imputed[treatment_col] == 0].copy()

treated_scores = treated[['propensity_score']].values
control_scores = control[['propensity_score']].values

nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
nn.fit(control_scores)

# One nearest control per treated row. There is no caliper, and the same control
# can be selected for several treated rows (matching with replacement).
distances, indices = nn.kneighbors(treated_scores)
distances = distances.flatten()
indices = indices.flatten()

matched_treated = treated.copy()
matched_control = control.iloc[indices].copy()

# Combine matched sample
matched_data = pd.concat([matched_treated, matched_control], axis=0).reset_index(drop=True)

# ------------------------------------------------------------------------------
# 9. Fit an outcome model on the matched sample
#    We'll use statsmodels for confidence intervals and p-values.
# ------------------------------------------------------------------------------
matched_data['intercept'] = 1.0

# Only the treatment indicator (and intercept) enter the outcome model.
# NOTE(review): the standard errors treat matched rows as independent, although
# a control may appear more than once.
X_outcome = matched_data[['intercept', treatment_col]]
y_outcome = matched_data[outcome_col]

logit_sm = sm.Logit(y_outcome, X_outcome)
result_sm = logit_sm.fit(disp=0)  # disp=0 hides optimization output

print(result_sm.summary())

# Extract OR & 95% CI
params = result_sm.params
conf = result_sm.conf_int()
odds_ratios = np.exp(params)
conf_odds = np.exp(conf)

print("\nOdds Ratios:\n", odds_ratios)
print("\n95% Confidence Intervals:\n", conf_odds)

# Drop the scratch dataframe so later cells cannot pick it up by accident.
del df

## Propensity scoring for DPE vs CSE

In [None]:
import pandas as pd
import numpy as np

# For logistic regression and nearest neighbor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# For imputation and scaling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# For statistical inference (CIs, p-values)
import statsmodels.api as sm

# Propensity-score matched head-to-head comparison of catheter failure:
# CSE (treated, indicator == 1) versus DPE (control, indicator == 0).
# Rows of any other procedure type are left out.

# ------------------------------------------------------------------------------
# 1. Keep only CSE/DPE rows and derive the binary treatment indicator
# ------------------------------------------------------------------------------
# .copy() makes the filtered frame independent, so the column assignment and the
# in-place drop below act on a real dataframe rather than on a slice of the
# original (which triggers pandas' SettingWithCopyWarning).
is_cse_or_dpe = neuraxial_catheter_df['true_procedure_type_incl_dpe'].isin(['cse', 'dpe'])
df = neuraxial_catheter_df[is_cse_or_dpe].copy()
df['cse_not_dpe'] = (df['true_procedure_type_incl_dpe'] == 'cse').astype(int)
df.drop(columns=['true_procedure_type_incl_dpe'], inplace=True)

# Columns for the treatment and outcome
treatment_col = 'cse_not_dpe'
outcome_col   = 'failed_catheter'

# Fail early with a readable message; otherwise a missing group only shows up
# later as a less specific error from the imputers or the model fit.
if df[treatment_col].nunique() < 2:
    raise ValueError(
        f"'{treatment_col}' needs both treated and control rows; check the "
        "labels in 'true_procedure_type_incl_dpe'."
    )

# Row identifiers are not confounders and must not enter the propensity model.
# The same two columns are excluded from the '#' prefixing and from the
# correlation heatmaps elsewhere in this notebook.
id_cols = ['anes_procedure_encounter_id_2273', 'unique_pt_id']
non_covariate_cols = [treatment_col, outcome_col] + id_cols

# ------------------------------------------------------------------------------
# 2. Identify numeric vs. categorical covariates
#    (treatment, outcome and identifier columns are never covariates)
# ------------------------------------------------------------------------------
numeric_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in non_covariate_cols
]
categorical_cols = [
    col for col in df.columns
    if col not in numeric_cols and col not in non_covariate_cols
]

# ------------------------------------------------------------------------------
# 3. Impute missing data
#    - Median for numeric
#    - Most frequent for categorical
# ------------------------------------------------------------------------------
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit/transform numeric columns
df_num = pd.DataFrame(
    num_imputer.fit_transform(df[numeric_cols]),
    columns=numeric_cols
)

# Fit/transform categorical columns
df_cat = pd.DataFrame(
    cat_imputer.fit_transform(df[categorical_cols]),
    columns=categorical_cols
)

# ------------------------------------------------------------------------------
# 4. One-hot encode (dummy) the categorical columns
# ------------------------------------------------------------------------------
df_cat_encoded = pd.get_dummies(df_cat, drop_first=True)

# ------------------------------------------------------------------------------
# 5. Combine imputed numeric + encoded categorical with original treatment/outcome
# ------------------------------------------------------------------------------
# The filtered frame keeps its original row labels while the imputers return
# plain arrays, so every piece is re-indexed 0..n-1 to make the column-wise
# concat line up row by row.
df_imputed = pd.concat(
    [
        df[[treatment_col, outcome_col]].reset_index(drop=True),
        df_num.reset_index(drop=True),
        df_cat_encoded.reset_index(drop=True)
    ],
    axis=1
)

# ------------------------------------------------------------------------------
# 6. Standardize the numeric covariates (zero mean, unit variance)
# ------------------------------------------------------------------------------
scaler = StandardScaler()
df_num_scaled = pd.DataFrame(
    scaler.fit_transform(df_imputed[numeric_cols]),
    columns=numeric_cols
)

# Now replace the unscaled numeric columns in df_imputed
for col in numeric_cols:
    df_imputed[col] = df_num_scaled[col]

# ------------------------------------------------------------------------------
# 7. Fit the propensity model (LogisticRegression) on all columns except
#    the treatment and outcome columns.
# ------------------------------------------------------------------------------
feature_cols = [c for c in df_imputed.columns if c not in [treatment_col, outcome_col]]

X = df_imputed[feature_cols].values  # all imputed & encoded covariates
y = df_imputed[treatment_col].values # the treatment indicator (1 = cse, 0 = dpe)

propensity_model = LogisticRegression(solver='lbfgs', max_iter=1000)
propensity_model.fit(X, y)

# Estimated probability of receiving CSE rather than DPE (cse_not_dpe == 1)
propensity_scores = propensity_model.predict_proba(X)[:, 1]
df_imputed['propensity_score'] = propensity_scores

# ------------------------------------------------------------------------------
# 8. Separate treated vs. control and do nearest-neighbor matching
# ------------------------------------------------------------------------------
treated = df_imputed[df_imputed[treatment_col] == 1].copy()
control = df_imputed[df_imputed[treatment_col] == 0].copy()

treated_scores = treated[['propensity_score']].values
control_scores = control[['propensity_score']].values

nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
nn.fit(control_scores)

# One nearest control per treated row. There is no caliper, and the same control
# can be selected for several treated rows (matching with replacement).
distances, indices = nn.kneighbors(treated_scores)
distances = distances.flatten()
indices = indices.flatten()

matched_treated = treated.copy()
matched_control = control.iloc[indices].copy()

# Combine matched sample
matched_data = pd.concat([matched_treated, matched_control], axis=0).reset_index(drop=True)

# ------------------------------------------------------------------------------
# 9. Fit an outcome model on the matched sample
#    We'll use statsmodels for confidence intervals and p-values.
# ------------------------------------------------------------------------------
matched_data['intercept'] = 1.0

# Only the treatment indicator (and intercept) enter the outcome model.
# NOTE(review): the standard errors treat matched rows as independent, although
# a control may appear more than once.
X_outcome = matched_data[['intercept', treatment_col]]
y_outcome = matched_data[outcome_col]

logit_sm = sm.Logit(y_outcome, X_outcome)
result_sm = logit_sm.fit(disp=0)  # disp=0 hides optimization output

print(result_sm.summary())

# Extract OR & 95% CI
params = result_sm.params
conf = result_sm.conf_int()
odds_ratios = np.exp(params)
conf_odds = np.exp(conf)

print("\nOdds Ratios:\n", odds_ratios)
print("\n95% Confidence Intervals:\n", conf_odds)

# Drop the scratch dataframe so later cells cannot pick it up by accident.
del df