In [1]:
import numpy as np
import pandas as pd
import os
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.sql.functions import col, count, mean, sum, avg, stddev, min, max, lit
from pyspark.ml.stat import Summarizer
from pyspark.ml.classification import LogisticRegression

In [2]:
# Result files are written to a "data" folder under the current working directory.
working_dir = os.getcwd()
path_to_data = os.path.join(working_dir, 'data')
path_to_data

'/home/mu_research2/work/University of Missouri/DRIVERS/data'

In [3]:
# Switch the session to the OMOP database, then list its tables as a pandas frame.
spark.sql("use real_world_data_ed_omop_nov_2024")
tables_in_db = spark.sql("show tables")
tables_in_db.toPandas()

Unnamed: 0,database,tableName,isTemporary
0,real_world_data_ed_omop_nov_2024,care_site,False
1,real_world_data_ed_omop_nov_2024,cdm_source,False
2,real_world_data_ed_omop_nov_2024,cohort,False
3,real_world_data_ed_omop_nov_2024,cohort_definition,False
4,real_world_data_ed_omop_nov_2024,comorb_pre_delivery_wide,False
5,real_world_data_ed_omop_nov_2024,concept,False
6,real_world_data_ed_omop_nov_2024,concept_ancestor,False
7,real_world_data_ed_omop_nov_2024,concept_class,False
8,real_world_data_ed_omop_nov_2024,concept_relationship,False
9,real_world_data_ed_omop_nov_2024,concept_synonym,False


In [4]:
# Refresh the catalog entry before reading so the query does not use stale table metadata.
spark.catalog.refreshTable("delivery_elig_init_smm")

# Pull the whole table and cache it; it is reused by the encoding and modelling cells.
delivery_query = '''
    select * from delivery_elig_init_smm
'''
delivery_elig_init_smm = spark.sql(delivery_query).cache()

# Peek at one row to confirm the load.
delivery_elig_init_smm.first()

Row(person_id=15933, event_id=1, event_start_date=datetime.date(2019, 6, 8), event_end_date=datetime.date(2019, 6, 9), delivery_type='vaginalDelivery', los=1, visit_occurrence_id=2972117388989, year_of_birth=1977, age_at_event=42, race_source_value='White', ethnicity_source_value='Hispanic or Latino', location_id=23, care_site_id=100, care_site_source_value='1', bed_size='200-299', speciality='Acute Care Hospital', segment='Hospital', zip_code='8', zip_code_region='W', death_date=None, death_ind=0, delivery_idx=1, agegrp_at_event='agegrp6', los2up_ind=0, los3up_ind=0, los4up_ind=0, los5up_ind=0, los6up_ind=0, los7up_ind=0, censor_since_index=29426, death_since_index=29426, SMMANY_ind=1, SMMANY_since_index=228, AMI_since_index=29426, AMI_ind=0, ANE_since_index=29426, ANE_ind=0, ARF_since_index=29426, ARF_ind=0, ARDS_since_index=29426, ARDS_ind=0, AFE_since_index=29426, AFE_ind=0, CAVF_since_index=29426, CAVF_ind=0, COCR_since_index=29426, COCR_ind=0, DIC_since_index=29426, DIC_ind=0, EC

In [7]:
# one hot encoding
def ohe_with_map(
    df,              # spark dataframe
    cat_cols         # list of categorical columns for ohe
):
    """Index and one-hot encode categorical columns, returning the index->label maps.

    For every column ``c`` in ``cat_cols`` two columns are appended to ``df``:
    ``c + "_index"`` (StringIndexer output, labels ordered by ascending frequency)
    and ``c + "_ohe"`` (OneHotEncoder output built from the index column).

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input frame containing every column named in ``cat_cols``.
    cat_cols : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    (DataFrame, dict)
        The transformed frame and a dict mapping each input column name to
        ``{index: label}`` as assigned by its fitted StringIndexer.
    """
    # Fit every indexer on the incoming frame first (before any columns are added).
    # The loop variable is deliberately not called `col`, which would shadow the
    # `col` function imported from pyspark.sql.functions.
    indexers = [
        StringIndexer(
            inputCol=name,
            outputCol=name + "_index",
            stringOrderType="frequencyAsc"
        ).fit(df)
        for name in cat_cols
    ]

    # Apply the indexers to the DataFrame and collect the encoding map
    index_maps = {}
    for indexer in indexers:
        df = indexer.transform(df)
        index_maps[indexer.getInputCol()] = dict(enumerate(indexer.labels))

    # One-hot encode the indexed columns
    for name in cat_cols:
        encoder = OneHotEncoder(inputCol=name + "_index", outputCol=name + "_ohe")
        # Spark 3.x turned OneHotEncoder into an Estimator (fit -> model -> transform);
        # the Spark 2.x class is a plain Transformer without `fit`. Support both.
        if hasattr(encoder, "fit"):
            encoder = encoder.fit(df)
        df = encoder.transform(df)

    return df, index_maps

In [8]:
# Categorical columns to index and one-hot encode.
ohe_cat_cols = [
    "race_source_value",
    "ethnicity_source_value",
    "agegrp_at_event",
    "bed_size",
    "segment",
    "speciality",
    "zip_code",
    "zip_code_region",
    "delivery_type",
]

# ohe_with_map returns (encoded frame, {column: {index: label}}); keep the pair
# because later cells read the map via delivery_elig_init_smm_ohe[1].
delivery_elig_init_smm_ohe = ohe_with_map(delivery_elig_init_smm, cat_cols=ohe_cat_cols)
delivery_elig_init_smm2 = delivery_elig_init_smm_ohe[0]
delivery_elig_init_smm2.first()

Row(person_id=15933, event_id=1, event_start_date=datetime.date(2019, 6, 8), event_end_date=datetime.date(2019, 6, 9), delivery_type='vaginalDelivery', los=1, visit_occurrence_id=2972117388989, year_of_birth=1977, age_at_event=42, race_source_value='White', ethnicity_source_value='Hispanic or Latino', location_id=23, care_site_id=100, care_site_source_value='1', bed_size='200-299', speciality='Acute Care Hospital', segment='Hospital', zip_code='8', zip_code_region='W', death_date=None, death_ind=0, delivery_idx=1, agegrp_at_event='agegrp6', los2up_ind=0, los3up_ind=0, los4up_ind=0, los5up_ind=0, los6up_ind=0, los7up_ind=0, censor_since_index=29426, death_since_index=29426, SMMANY_ind=1, SMMANY_since_index=228, AMI_since_index=29426, AMI_ind=0, ANE_since_index=29426, ANE_ind=0, ARF_since_index=29426, ARF_ind=0, ARDS_since_index=29426, ARDS_ind=0, AFE_since_index=29426, AFE_ind=0, CAVF_since_index=29426, CAVF_ind=0, COCR_since_index=29426, COCR_ind=0, DIC_since_index=29426, DIC_ind=0, EC

In [9]:
def bootstrap_lasso_logistic(
    df, 
    cov_lst,
    label_col,
    sample_frac = 0.9,
    n_iterations = 10, 
    alpha = 0.05,
    reg_param = 0.0,
    seed = None
):
    """Fit an L1 logistic regression on repeated subsamples and summarise the coefficients.

    Each iteration draws a subsample of ``df`` WITHOUT replacement
    (``fraction=sample_frac``), assembles ``cov_lst`` into a feature vector,
    fits a logistic regression with ``elasticNetParam=1.0`` and records the
    coefficient vector. Percentiles across iterations give a median and an
    interval for every coefficient.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Data containing ``cov_lst`` and ``label_col``.
    cov_lst : list of str
        Input columns passed to VectorAssembler (numeric or vector columns).
    label_col : str
        Name of the label column.
    sample_frac : float
        Fraction of rows drawn in each iteration.
    n_iterations : int
        Number of subsample/fit rounds; must be at least 1.
    alpha : float
        Interval level; bounds are the ``alpha/2`` and ``1 - alpha/2`` percentiles.
    reg_param : float
        Regularisation strength passed as ``regParam``. The default 0.0 keeps the
        previous behaviour, which applies NO penalty: ``elasticNetParam`` only
        sets the L1/L2 mix and has no effect while ``regParam`` is 0. Pass a
        positive value to obtain an actual lasso fit.
    seed : int or None
        When given, iteration ``i`` samples with ``seed + i`` so the run is
        reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (feature_name, coef_meds, lower_bounds, upper_bounds)
        ``feature_name`` is the assembler's input column list (``cov_lst``);
        the other three are arrays with one entry per fitted coefficient.

    Raises
    ------
    ValueError
        If ``n_iterations`` is smaller than 1.
    """
    import warnings

    if n_iterations < 1:
        raise ValueError("n_iterations must be >= 1")
    if reg_param == 0:
        warnings.warn(
            "reg_param is 0: no L1 penalty is applied, so the fit is an "
            "unpenalised logistic regression; pass reg_param > 0 for lasso."
        )

    # The assembler and estimator do not depend on the subsample, so build them once.
    assembler = VectorAssembler(inputCols=cov_lst, outputCol="features")
    lasso_logistic = LogisticRegression(
        featuresCol="features",
        labelCol=label_col,
        elasticNetParam=1.0,
        regParam=reg_param
    )

    coefs = []
    for i in range(n_iterations):
        # Draw a random subsample without replacement
        iter_seed = None if seed is None else seed + i
        subsampled_df = df.sample(withReplacement=False, fraction=sample_frac, seed=iter_seed)

        # train logistic model on the subsample
        train_data = assembler.transform(subsampled_df)
        model = lasso_logistic.fit(train_data)

        # collect coef as a plain numpy array so the percentile maths is explicit
        coefs.append(model.coefficients.toArray())

    coefs = np.vstack(coefs)

    # Calculate the lower and upper percentiles for the confidence interval
    coef_meds = np.percentile(coefs, 50, axis=0)
    lower_bounds = np.percentile(coefs, 100 * alpha / 2, axis=0)
    upper_bounds = np.percentile(coefs, 100 * (1 - alpha / 2), axis=0)
    feature_name = assembler.getInputCols()
    
    return feature_name, coef_meds, lower_bounds, upper_bounds

In [25]:
# multiple lasso logistic regression with basic var, smmany
# (zip_code is intentionally left out of both lists below)
cov_lst = [
    'race_source_value',
    'ethnicity_source_value',
    'agegrp_at_event',
    'los7up_ind',
    'delivery_type',
    'bed_size',
    'speciality',
    'segment',
    'zip_code_region',
]

# Expand each encoded covariate into one name per coefficient: "<col>_<index>"
# for every index in the encoding map except the last; other columns keep their name.
ohe_map = delivery_elig_init_smm_ohe[1]
cov_lst_ext = []
for v in cov_lst:
    levels = ohe_map.get(v)
    if levels is None:
        cov_lst_ext.append(v)
        continue
    cov_lst_ext += [f"{v}_{k}" for k in list(levels)[:-1]]

feature_name, coef_meds, lower_bounds, upper_bounds = bootstrap_lasso_logistic(
    delivery_elig_init_smm2,
    cov_lst=[
        'race_source_value_ohe',
        'ethnicity_source_value_ohe',
        'agegrp_at_event_ohe',
        'los7up_ind',
        'delivery_type_ohe',
        'bed_size_ohe',
        'speciality_ohe',
        'segment_ohe',
        'zip_code_region_ohe',
    ],
    label_col='SMMANY_ind',
    sample_frac=0.9,
    n_iterations=20,
    alpha=0.05,
)

# One row per coefficient: median and percentile bounds across the fits.
res_mlasso = pd.DataFrame(
    dict(feature=cov_lst_ext, coef=coef_meds, lower=lower_bounds, upper=upper_bounds)
)
out_csv = os.path.join(path_to_data, 'multilasso_smmany_filter_init.csv')
res_mlasso.to_csv(out_csv, index=False)

In [26]:
# multiple lasso logistic regression with basic var, smmany with 90 percentile correction
# NOTE: the coefficient labels come from `cov_lst_ext`, which this cell does not
# build itself; it must already hold the basic-variable names.
# (zip_code_ohe is intentionally left out)
feature_name, coef_meds, lower_bounds, upper_bounds = bootstrap_lasso_logistic(
    delivery_elig_init_smm2,
    cov_lst=[
        'race_source_value_ohe',
        'ethnicity_source_value_ohe',
        'agegrp_at_event_ohe',
        'los7up_ind',
        'delivery_type_ohe',
        'bed_size_ohe',
        'speciality_ohe',
        'segment_ohe',
        'zip_code_region_ohe',
    ],
    label_col='SMMANY90PCT_ind',
    sample_frac=0.9,
    n_iterations=20,
    alpha=0.05,
)

# One row per coefficient: median and percentile bounds across the fits.
res_mlasso = pd.DataFrame(
    dict(feature=cov_lst_ext, coef=coef_meds, lower=lower_bounds, upper=upper_bounds)
)
out_csv = os.path.join(path_to_data, 'multilasso_smmany90pct_filter_init.csv')
res_mlasso.to_csv(out_csv, index=False)

In [27]:
# multiple lasso logistic regression with all var, smmany
# Base (un-encoded) covariate names. 'segment' must be the base name here: the
# encoding map is keyed by base names, so 'segment_ohe' would not be expanded
# into per-level coefficient labels. zip_code is intentionally left out.
cov_lst = [
    'race_source_value',
    'ethnicity_source_value',
    'agegrp_at_event',
    'los7up_ind',
    'delivery_type',
    'bed_size',
    'speciality',
    'segment',
    'zip_code_region',
    'hist_PHT_ind',
    'hist_PEC_ind',
    'hist_GHT_ind',
    'hist_PAS_ind',
    'hist_CKD_ind',
    'hist_HD_ind',
    'hist_HIV_ind',
    'hist_PA_ind',
    'hist_CD_ind',
    'hist_ANEM_ind',
    'hist_MULTIPG_ind',
    'hist_PP_ind',
    'hist_NMD_ind',
    'hist_ASTH_ind',
    'hist_AID_ind',
    'hist_UF_ind',
    'hist_GID_ind',
    'hist_CH_ind',
    'hist_PD_ind',
    'hist_DM_ind',
    'hist_THYRO_ind',
    'hist_CSEC_ind',
    'hist_GDM_ind',
    'hist_IHD_ind',
    'hist_ARRY_ind',
    'hist_CHF_ind',
    'hist_CHD_ind',
    'hist_RA_ind',
    'hist_PVD_ind',
    'hist_EPI_ind',
    'hist_CVD_ind',
    'hist_SCD_ind',
    'hist_SUD_ind',
    'hist_AUD_ind',
    'hist_COVID_ind',
]
ohe_map = delivery_elig_init_smm_ohe[1]

# Model input columns are derived from cov_lst so the two lists cannot drift apart:
# encoded covariates use their "<col>_ohe" vector column, the rest are used as-is.
model_cols = [v + "_ohe" if v in ohe_map else v for v in cov_lst]

# Per-coefficient labels: "<col>_<index>" for every index in the encoding map
# except the last; non-encoded columns contribute a single label.
cov_lst_ext = []
for v in cov_lst:
    if v in ohe_map:
        cov_lst_ext.extend(
            [v + f"_{k}" for k in list(ohe_map[v].keys())[:-1]]
        )
    else:
        cov_lst_ext.append(v)

feature_name, coef_meds, lower_bounds, upper_bounds = bootstrap_lasso_logistic(
    delivery_elig_init_smm2,
    cov_lst = model_cols,
    label_col = 'SMMANY_ind',
    sample_frac = 0.9,
    n_iterations = 20,
    alpha = 0.05
)

# Fail loudly if the labels and the fitted coefficients do not line up.
assert len(cov_lst_ext) == len(coef_meds), (
    f"{len(cov_lst_ext)} feature labels for {len(coef_meds)} coefficients"
)

res_mlasso = pd.DataFrame({
    'feature': cov_lst_ext,
    'coef': coef_meds,
    'lower': lower_bounds,
    'upper': upper_bounds
})
res_mlasso.to_csv(os.path.join(path_to_data,'multilasso_smmany_filter_init2.csv'), index=False)

In [16]:
# multiple lasso logistic regression with all var, smmany with 90 percentile correction
# The coefficient labels are built inside this cell rather than taken from a
# `cov_lst_ext` left behind by whichever cell ran last, so the cell gives the
# same labels regardless of execution order. zip_code is intentionally left out.
cov_lst = [
    'race_source_value',
    'ethnicity_source_value',
    'agegrp_at_event',
    'los7up_ind',
    'delivery_type',
    'bed_size',
    'speciality',
    'segment',
    'zip_code_region',
    'hist_PHT_ind',
    'hist_PEC_ind',
    'hist_GHT_ind',
    'hist_PAS_ind',
    'hist_CKD_ind',
    'hist_HD_ind',
    'hist_HIV_ind',
    'hist_PA_ind',
    'hist_CD_ind',
    'hist_ANEM_ind',
    'hist_MULTIPG_ind',
    'hist_PP_ind',
    'hist_NMD_ind',
    'hist_ASTH_ind',
    'hist_AID_ind',
    'hist_UF_ind',
    'hist_GID_ind',
    'hist_CH_ind',
    'hist_PD_ind',
    'hist_DM_ind',
    'hist_THYRO_ind',
    'hist_CSEC_ind',
    'hist_GDM_ind',
    'hist_IHD_ind',
    'hist_ARRY_ind',
    'hist_CHF_ind',
    'hist_CHD_ind',
    'hist_RA_ind',
    'hist_PVD_ind',
    'hist_EPI_ind',
    'hist_CVD_ind',
    'hist_SCD_ind',
    'hist_SUD_ind',
    'hist_AUD_ind',
    'hist_COVID_ind',
]
ohe_map = delivery_elig_init_smm_ohe[1]

# Encoded covariates enter the model through their "<col>_ohe" vector column.
model_cols = [v + "_ohe" if v in ohe_map else v for v in cov_lst]

# Per-coefficient labels: "<col>_<index>" for every index in the encoding map
# except the last; non-encoded columns contribute a single label.
cov_lst_ext = []
for v in cov_lst:
    if v in ohe_map:
        cov_lst_ext.extend(
            [v + f"_{k}" for k in list(ohe_map[v].keys())[:-1]]
        )
    else:
        cov_lst_ext.append(v)

feature_name, coef_meds, lower_bounds, upper_bounds = bootstrap_lasso_logistic(
    delivery_elig_init_smm2,
    cov_lst = model_cols,
    label_col = 'SMMANY90PCT_ind',
    sample_frac = 0.9,
    n_iterations = 20,
    alpha = 0.05
)

# Fail loudly if the labels and the fitted coefficients do not line up.
assert len(cov_lst_ext) == len(coef_meds), (
    f"{len(cov_lst_ext)} feature labels for {len(coef_meds)} coefficients"
)

res_mlasso = pd.DataFrame({
    'feature': cov_lst_ext,
    'coef': coef_meds,
    'lower': lower_bounds,
    'upper': upper_bounds
})
res_mlasso.to_csv(os.path.join(path_to_data,'multilasso_smmany90pct_filter_init2.csv'), index=False)