In [1]:
import numpy as np
import pandas as pd
import os
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.sql.functions import col, count, mean, sum, avg, stddev, min, max, lit
from pyspark.ml.stat import Summarizer
from pyspark.ml.classification import LogisticRegression

In [2]:
# Output directory for the CSV files written by the cells below: the `data`
# folder directly under the current working directory.
_cwd = os.getcwd()
path_to_data = os.path.join(_cwd, 'data')
path_to_data

'/home/mu_research2/work/University of Missouri/DRIVERS/data'

In [3]:
# Switch the session's current database so that the unqualified table names
# used in later cells resolve against it, then list its tables for display.
# `spark` is not created in this notebook; it is expected to be provided by
# the kernel.
spark.sql("use real_world_data_ed_omop_nov_2024")
_tables = spark.sql("show tables")
_tables.toPandas()

Unnamed: 0,database,tableName,isTemporary
0,real_world_data_ed_omop_nov_2024,care_site,False
1,real_world_data_ed_omop_nov_2024,cdm_source,False
2,real_world_data_ed_omop_nov_2024,cohort,False
3,real_world_data_ed_omop_nov_2024,cohort_definition,False
4,real_world_data_ed_omop_nov_2024,comorb_pre_delivery_wide,False
5,real_world_data_ed_omop_nov_2024,concept,False
6,real_world_data_ed_omop_nov_2024,concept_ancestor,False
7,real_world_data_ed_omop_nov_2024,concept_class,False
8,real_world_data_ed_omop_nov_2024,concept_relationship,False
9,real_world_data_ed_omop_nov_2024,concept_synonym,False


In [4]:
# Refresh Spark's cached metadata for the table before reading it.
spark.catalog.refreshTable("delivery_elig_init_smm")
_init_query = '''
    select * from delivery_elig_init_smm
'''
# Mark the frame for caching; it is reused by many summary cells below.
delivery_elig_init_smm = spark.sql(_init_query).cache()
# Fetch one row as a quick check that the table loads and to show its columns.
delivery_elig_init_smm.first()

Row(person_id=15933, event_id=1, event_start_date=datetime.date(2019, 6, 8), event_end_date=datetime.date(2019, 6, 9), delivery_type='vaginalDelivery', los=1, visit_occurrence_id=2972117388989, year_of_birth=1977, age_at_event=42, race_source_value='White', ethnicity_source_value='Hispanic or Latino', location_id=23, care_site_id=100, care_site_source_value='1', bed_size='200-299', speciality='Acute Care Hospital', segment='Hospital', zip_code='8', zip_code_region='W', death_date=None, death_ind=0, delivery_idx=1, agegrp_at_event='agegrp6', los2up_ind=0, los3up_ind=0, los4up_ind=0, los5up_ind=0, los6up_ind=0, los7up_ind=0, censor_since_index=29426, death_since_index=29426, SMMANY_ind=1, SMMANY_since_index=228, AMI_since_index=29426, AMI_ind=0, ANE_since_index=29426, ANE_ind=0, ARF_since_index=29426, ARF_ind=0, ARDS_since_index=29426, ARDS_ind=0, AFE_since_index=29426, AFE_ind=0, CAVF_since_index=29426, CAVF_ind=0, COCR_since_index=29426, COCR_ind=0, DIC_since_index=29426, DIC_ind=0, EC

In [5]:
# Refresh Spark's cached metadata for the table before reading it.
spark.catalog.refreshTable("delivery_elig_smm")
_all_query = '''
    select * from delivery_elig_smm
'''
# Mark the frame for caching; it is scanned several times by the next cell.
delivery_elig_smm = spark.sql(_all_query).cache()
# Fetch one row as a quick check that the table loads and to show its columns.
delivery_elig_smm.first()

Row(person_id=4473, event_id=1, event_start_date=datetime.date(2020, 8, 30), event_end_date=datetime.date(2020, 9, 1), delivery_type='vaginalDelivery', los=2, visit_occurrence_id=3109556421910, year_of_birth=1993, age_at_event=27, race_source_value='American Indian or Alaska Native', ethnicity_source_value='Non-Hispanic', location_id=23, care_site_id=100, care_site_source_value='1', bed_size='200-299', speciality='Acute Care Hospital', segment='Hospital', zip_code='8', zip_code_region='W', death_date=None, death_ind=0, delivery_idx=1, agegrp_at_event='agegrp3', los2up_ind=1, los3up_ind=0, los4up_ind=0, los5up_ind=0, los6up_ind=0, los7up_ind=0, censor_since_index=1519, death_since_index=1519, SMMANY_ind=0, SMMANY_since_index=1519, AMI_since_index=1519, AMI_ind=0, ANE_since_index=1519, ANE_ind=0, ARF_since_index=1519, ARF_ind=0, ARDS_since_index=1519, ARDS_ind=0, AFE_since_index=1519, AFE_ind=0, CAVF_since_index=1519, CAVF_ind=0, COCR_since_index=1519, COCR_ind=0, DIC_since_index=1519, D

In [6]:
# One-row denominator summary of the delivery tables, saved to denom_summ.csv.
# Each value below triggers its own Spark action, in the same order as the
# columns of the resulting frame.
_n_delivery = delivery_elig_smm.count()
_n_person = delivery_elig_smm.select('person_id').distinct().count()
_n_init_delivery = delivery_elig_init_smm.count()
_dt_first = delivery_elig_smm.agg({"event_start_date": "min"}).collect()[0][0]
_dt_last = delivery_elig_smm.agg({"event_start_date": "max"}).collect()[0][0]
# approxQuantile returns a list (one value per requested probability). It is
# deliberately kept as a list: it is the only non-scalar value in the dict,
# which is what lets pandas build a one-row frame without an explicit index.
_los_p90 = delivery_elig_smm.approxQuantile('los',[0.9],0.01)

denom_df = pd.DataFrame(
    {
        'N_delivery': _n_delivery,
        'N_person': _n_person,
        'N_init_delivery': _n_init_delivery,
        'Dt_first': _dt_first,
        'Dt_last': _dt_last,
        'LOS_90PCT': _los_p90
    }
)
_denom_path = os.path.join(path_to_data,'denom_summ.csv')
denom_df.to_csv(_denom_path,index=False)

In [7]:
# one hot encoding
def ohe_with_map(
    df,              # spark dataframe
    cat_cols         # list of categorical columns for ohe
):
    """String-index and one-hot encode categorical columns of a Spark DataFrame.

    For every column ``c`` in ``cat_cols`` two columns are appended to ``df``:
    ``c + "_index"`` (StringIndexer output) and ``c + "_ohe"`` (OneHotEncoder
    output).

    Indices are assigned by ascending frequency, so index 0 is the least
    frequent label and the highest index is the most frequent one. With the
    encoder's default settings the last (highest) index is dropped, i.e. it is
    encoded as the all-zero vector and acts as the reference level.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input frame containing every column named in ``cat_cols``.
    cat_cols : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    (pyspark.sql.DataFrame, dict)
        The transformed frame and a mapping
        ``{column name: {index: original label}}``.
    """
    # Fit all indexers on the input frame first (each one only reads its own
    # source column), then apply them one after another.
    indexers = [
        StringIndexer(
            inputCol=name,
            outputCol=name + "_index",
            stringOrderType="frequencyAsc"
        ).fit(df)
        for name in cat_cols
    ]

    # Apply the indexers to the DataFrame and collect the encoding map.
    # The map is keyed by the requested column name instead of asking the
    # fitted model for its input column, so it does not depend on the model
    # class exposing that getter.
    index_maps = {}
    for name, indexer in zip(cat_cols, indexers):
        df = indexer.transform(df)
        index_maps[name] = dict(enumerate(indexer.labels))

    # One-hot encode the indexed columns.
    for name in cat_cols:
        encoder = OneHotEncoder(inputCol=name + "_index", outputCol=name + "_ohe")
        # In Spark 3.x OneHotEncoder is an Estimator and must be fitted before
        # it can transform; in Spark 2.x it is a plain Transformer with no
        # `fit` method. Support both.
        if hasattr(encoder, "fit"):
            encoder = encoder.fit(df)
        df = encoder.transform(df)

    return df, index_maps

In [8]:
# Categorical columns of the delivery table to index and one-hot encode.
_ohe_input_cols = [
    "race_source_value",
    "ethnicity_source_value",
    "agegrp_at_event",
    "bed_size",
    "segment",
    "speciality",
    "zip_code",
    "delivery_type",
    "zip_code_region",
]
# ohe_with_map returns a (frame, index map) pair; the pair itself is kept
# because the next cell displays its second element.
delivery_elig_init_smm_ohe = ohe_with_map(delivery_elig_init_smm, cat_cols=_ohe_input_cols)
# Encoded frame used by all summary and modelling cells below.
delivery_elig_init_smm2 = delivery_elig_init_smm_ohe[0]
delivery_elig_init_smm2.first()

Row(person_id=15933, event_id=1, event_start_date=datetime.date(2019, 6, 8), event_end_date=datetime.date(2019, 6, 9), delivery_type='vaginalDelivery', los=1, visit_occurrence_id=2972117388989, year_of_birth=1977, age_at_event=42, race_source_value='White', ethnicity_source_value='Hispanic or Latino', location_id=23, care_site_id=100, care_site_source_value='1', bed_size='200-299', speciality='Acute Care Hospital', segment='Hospital', zip_code='8', zip_code_region='W', death_date=None, death_ind=0, delivery_idx=1, agegrp_at_event='agegrp6', los2up_ind=0, los3up_ind=0, los4up_ind=0, los5up_ind=0, los6up_ind=0, los7up_ind=0, censor_since_index=29426, death_since_index=29426, SMMANY_ind=1, SMMANY_since_index=228, AMI_since_index=29426, AMI_ind=0, ANE_since_index=29426, ANE_ind=0, ARF_since_index=29426, ARF_ind=0, ARDS_since_index=29426, ARDS_ind=0, AFE_since_index=29426, AFE_ind=0, CAVF_since_index=29426, CAVF_ind=0, COCR_since_index=29426, COCR_ind=0, DIC_since_index=29426, DIC_ind=0, EC

In [9]:
# Show the {column: {index: label}} map returned by ohe_with_map, i.e. which
# original label each StringIndexer index stands for.
delivery_elig_init_smm_ohe[1]

{'race_source_value': {0: 'Native Hawaiian or Other Pacific Islander',
  1: 'American Indian or Alaska Native',
  2: 'Unknown',
  3: 'Asian',
  4: 'Black or African American',
  5: 'Other',
  6: 'White'},
 'ethnicity_source_value': {0: 'Unknown',
  1: 'Hispanic or Latino',
  2: 'Non-Hispanic'},
 'agegrp_at_event': {0: 'agegrp1',
  1: 'agegrp6',
  2: 'agegrp5',
  3: 'agegrp2',
  4: 'agegrp3',
  5: 'agegrp4'},
 'bed_size': {0: 'NI',
  1: '200-299',
  2: '<100',
  3: '100-199',
  4: '300-499',
  5: '500-999',
  6: '>=1000'},
 'segment': {0: 'Hospital', 1: 'Health System'},
 'speciality': {0: 'Childrens',
  1: 'Critical Access Hospital',
  2: 'Academic',
  3: 'Acute Care Hospital',
  4: 'Health System'},
 'zip_code': {0: '4',
  1: '5',
  2: '2',
  3: '7',
  4: '0',
  5: '1',
  6: '6',
  7: '3',
  8: '8',
  9: '9'},
 'delivery_type': {0: 'cSection', 1: 'vaginalDelivery'},
 'zip_code_region': {0: 'MW', 1: 'NE', 2: 'S', 3: 'W'}}

In [11]:
# Codes that appear both as `hist_<code>_ind` and `hist_<code>_since_index`
# columns (presumably pre-delivery condition history flags and their timing
# -- confirm against the table definition). Order matters: it fixes the order
# of the generated column lists below.
_hist_codes = [
    'PHT', 'PEC', 'GHT', 'PAS', 'CKD', 'HD', 'HIV', 'PA', 'CD', 'ANEM',
    'MULTIPG', 'PP', 'NMD', 'ASTH', 'AID', 'UF', 'GID', 'CH', 'PD', 'DM',
    'THYRO', 'CSEC', 'GDM', 'IHD', 'ARRY', 'CHF', 'CHD', 'RA', 'PVD', 'EPI',
    'CVD', 'SCD', 'SUD', 'AUD', 'COVID',
]

# Categorical covariates: one-hot vectors, length-of-stay threshold flags
# (los2up_ind ... los7up_ind) and the history indicator flags.
cov_cat_lst = (
    [
        'race_source_value_ohe',
        'ethnicity_source_value_ohe',
        'bed_size_ohe',
        'speciality_ohe',
        'segment_ohe',
        'zip_code_ohe',
        'zip_code_region_ohe',
        'agegrp_at_event_ohe',
        'delivery_type_ohe',
    ]
    + [f'los{days}up_ind' for days in range(2, 8)]
    + [f'hist_{code}_ind' for code in _hist_codes]
)

# Numeric covariates: age, length of stay and the history timing columns.
cov_num_lst = ['age_at_event', 'los'] + [
    f'hist_{code}_since_index' for code in _hist_codes
]

cov_lst = cov_cat_lst + cov_num_lst

# Outcome indicators. Inline numbers are carried over from the original
# annotations (presumably event counts / rates -- confirm).
out_lst = [
    'death_ind',        # 961
    'SMMANY_ind',       # 30267, 1.3%
    "SMMANY90PCT_ind",
]
# Individual outcome indicators that were listed but disabled in the original
# cell (with their annotations where present):
#   AMI_ind, ANE_ind, ARF_ind, ARDS_ind, AFE_ind, CAVF_ind, COCR_ind (386),
#   DIC_ind, ECL_ind, HF_ind, PCD_ind, PE_ind, SAC_ind, SEP_ind, SSH_ind,
#   SCC_ind, ATE_ind, BPT_ind (26734, 1.1%), HYS_ind (3188, 0.1%),
#   TT_ind (119), VEN_ind (1946)

# `*_since_index` columns paired with the outcomes above.
since_index_lst = [
    'death_since_index',
    'SMMANY_since_index',
    'SMMANY90PCT_since_index',
]

In [12]:
def summ_gen(
    df, 
    cols,
    cat_switch = False,
    outcome = None
):
    """Summarise columns of a Spark DataFrame into one pandas DataFrame.

    One Spark job is run per entry of ``cols`` and the per-column results are
    stacked row-wise.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to summarise.
    cols : list of str
        Columns to summarise.
    cat_switch : bool, default False
        If True, treat each column as categorical and report one row per
        level with ``count`` and ``prop``; the level is returned in a column
        named ``cat``. If False, treat each column as numeric and report
        ``mean``, ``stddev``, ``min`` and ``max``.
    outcome : str, optional
        If given, summaries are additionally stratified by this column.

    Returns
    -------
    pandas.DataFrame
        Stacked summaries with a ``var`` column naming the summarised column.
        Empty if ``cols`` is empty.

    Notes
    -----
    * ``prop`` is always relative to the total number of rows in ``df``, also
      when ``outcome`` is given (it is not a within-stratum proportion).
    * ``count``, ``mean``, ``stddev``, ``min``, ``max`` and ``lit`` are the
      ``pyspark.sql.functions`` imported at the top of the notebook, not the
      Python builtins of the same name.
    """
    n_total = df.count()
    # Optional stratification column(s); an empty groupBy() is a global
    # aggregation, equivalent to calling df.agg(...) directly.
    strata = [outcome] if outcome else []
    # Count rows rather than non-null values of the grouped column:
    # count(<column>) skips NULLs, which made the NULL level of a categorical
    # column show up with count 0 and prop 0.
    n_rows = count(lit(1))

    res_dfs = []
    for name in cols:
        if cat_switch:
            summary = (
                df.groupBy(name, *strata)
                .agg(
                    n_rows.alias("count"),
                    (n_rows / n_total).alias("prop")
                )
                .withColumn("var", lit(name))
                .withColumnRenamed(name, "cat")
            )
        else:
            # `var` is emitted first so the output column order stays
            # (outcome,) var, mean, stddev, min, max.
            summary = df.groupBy(*strata).agg(
                lit(name).alias("var"),
                mean(name).alias("mean"),
                stddev(name).alias("stddev"),
                min(name).alias("min"),
                max(name).alias("max")
            )

        res_dfs.append(summary.toPandas())

    # pd.concat raises on an empty list; return an empty frame instead.
    if not res_dfs:
        return pd.DataFrame()

    # Concatenate the Pandas DataFrames into a single DataFrame
    return pd.concat(res_dfs, ignore_index=True)

In [13]:
# Overall (unstratified) numeric summary of covariates, outcome flags and
# time-since-index columns.
res_init_num = summ_gen(
    df=delivery_elig_init_smm2,
    cols=cov_num_lst + out_lst + since_index_lst,
)
_out_path = os.path.join(path_to_data,'summ_init_num.csv')
res_init_num.to_csv(_out_path,index=False)

In [14]:
# Numeric summary stratified by the any-SMM outcome flag.
# NOTE(review): the outcome is spelled "SMMANY_IND" while the table column is
# `SMMANY_ind`; this relies on Spark resolving column names
# case-insensitively -- confirm the session setting.
res_init_num = summ_gen(
    df=delivery_elig_init_smm2,
    cols=cov_num_lst + out_lst + since_index_lst,
    outcome="SMMANY_IND",
)
_out_path = os.path.join(path_to_data,'summ_num_init_smm.csv')
res_init_num.to_csv(_out_path,index=False)

In [15]:
# Numeric summary stratified by the SMMANY90PCT outcome flag.
# NOTE(review): the outcome is spelled "SMMANY90PCT_IND" while the outcome
# list uses `SMMANY90PCT_ind`; this relies on Spark resolving column names
# case-insensitively -- confirm the session setting.
res_init_num = summ_gen(
    df=delivery_elig_init_smm2,
    cols=cov_num_lst + out_lst + since_index_lst,
    outcome="SMMANY90PCT_IND",
)
_out_path = os.path.join(path_to_data,'summ_num_init_smm90pct.csv')
res_init_num.to_csv(_out_path,index=False)

In [16]:
# Numeric summary stratified by the death outcome flag.
res_init_num = summ_gen(
    df=delivery_elig_init_smm2,
    cols=cov_num_lst + out_lst + since_index_lst,
    outcome="death_ind",
)
_out_path = os.path.join(path_to_data,'summ_num_init_dth.csv')
res_init_num.to_csv(_out_path,index=False)

In [25]:
# Binary indicator covariates: the length-of-stay threshold flags
# (los2up_ind ... los7up_ind) followed by the `hist_<code>_ind` flags.
# The order of the codes fixes the order of the generated column names.
cov_cat_ind_lst = [f'los{days}up_ind' for days in range(2, 8)] + [
    f'hist_{code}_ind'
    for code in (
        'PHT', 'PEC', 'GHT', 'PAS', 'CKD', 'HD', 'HIV', 'PA', 'CD', 'ANEM',
        'MULTIPG', 'PP', 'NMD', 'ASTH', 'AID', 'UF', 'GID', 'CH', 'PD', 'DM',
        'THYRO', 'CSEC', 'GDM', 'IHD', 'ARRY', 'CHF', 'CHD', 'RA', 'PVD', 'EPI',
        'CVD', 'SCD', 'SUD', 'AUD', 'COVID',
    )
]

In [24]:
# Raw (not one-hot encoded) categorical columns, used for frequency tables.
_raw_cat_cols = [
    'race_source_value',
    'ethnicity_source_value',
    'bed_size',
    'speciality',
    'segment',
    'zip_code',
    'zip_code_region',
    'agegrp_at_event',
    'delivery_type',
]
# NOTE: this rebinds `cov_cat_lst`, which an earlier cell defined with the
# `_ohe` column names; the frequency-table cells that follow rely on this
# raw-column version.
cov_cat_lst = _raw_cat_cols + cov_cat_ind_lst

# Overall frequency table of every categorical covariate and outcome flag.
res_init_cat = summ_gen(
    df=delivery_elig_init_smm2,
    cols=cov_cat_lst + out_lst,
    cat_switch=True,
)
_out_path = os.path.join(path_to_data,'summ_init_cat.csv')
res_init_cat.to_csv(_out_path,index=False)

In [18]:
# Frequency table of the categorical covariates, stratified by the any-SMM
# outcome flag.
# NOTE(review): the outcome is spelled "SMMANY_IND" while the table column is
# `SMMANY_ind`; this relies on Spark resolving column names
# case-insensitively -- confirm the session setting.
res_init_cat = summ_gen(
    df=delivery_elig_init_smm2,
    cols=cov_cat_lst,
    cat_switch=True,
    outcome="SMMANY_IND",
)
_out_path = os.path.join(path_to_data,'summ_init_cat_smm.csv')
res_init_cat.to_csv(_out_path,index=False)

In [19]:
# Frequency table of the categorical covariates, stratified by the
# SMMANY90PCT outcome flag.
# NOTE(review): the outcome is spelled "SMMANY90PCT_IND" while the outcome
# list uses `SMMANY90PCT_ind`; this relies on Spark resolving column names
# case-insensitively -- confirm the session setting.
res_init_cat = summ_gen(
    df=delivery_elig_init_smm2,
    cols=cov_cat_lst,
    cat_switch=True,
    outcome="SMMANY90PCT_IND",
)
_out_path = os.path.join(path_to_data,'summ_init_cat_smm90pct.csv')
res_init_cat.to_csv(_out_path,index=False)

In [20]:
# Frequency table of the categorical covariates, stratified by the death
# outcome flag.
res_init_cat = summ_gen(
    df=delivery_elig_init_smm2,
    cols=cov_cat_lst,
    cat_switch=True,
    outcome="death_ind",
)
_out_path = os.path.join(path_to_data,'summ_init_cat_dth.csv')
res_init_cat.to_csv(_out_path,index=False)

In [21]:
def univar_analysis(
    df,              # spark dataframe
    covariate_cols,  # list of covariates (assume ohe already applied)
    outcome_cols,    # list of outcomes
    outcome_types,   # list of outcome types
    verbose = True   # report progress
):
    """Fit one single-covariate GLM per (outcome, covariate) pair.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame holding every outcome and covariate column.
    covariate_cols : list of str
        Covariate columns; each may be numeric or a vector column (e.g. a
        one-hot encoded column), in which case one coefficient per vector
        slot is reported.
    outcome_cols : list of str
        Outcome (label) columns.
    outcome_types : list of str
        One type code per outcome, in the same order: ``"bin"`` (binomial),
        ``"con"`` (gaussian), ``"dis"`` (poisson), ``"pos"`` (gamma) or
        ``"mix"`` (tweedie).
    verbose : bool, default True
        Print a line after each fitted model.

    Returns
    -------
    dict
        ``{"<outcome>_<covariate>_<i>": {...}}`` with keys ``outcome``,
        ``var``, ``encoded`` (coefficient position ``i``), ``coef``,
        ``odds_ratio`` (``exp(coef)``), ``conf_lower`` / ``conf_upper``
        (``exp(coef -/+ 1.96 * SE)``) and ``pval``.

    Raises
    ------
    IndexError
        If fewer outcome types than outcomes are supplied.
    KeyError
        If an outcome type code is not one of the codes listed above.

    Notes
    -----
    ``exp(coef)`` is an odds ratio only for binomial outcomes (logit link).
    For the other families the family's default link is used and the
    ``odds_ratio`` / ``conf_*`` fields are simply the exponentiated
    coefficient and interval bounds.
    """
    # global glm family mapping based on outcome types
    family_map = {
        "bin": "binomial",
        "con": "gaussian",
        "dis": "poisson",
        "pos": "gamma",
        "mix": "tweedie"
    }

    # Validate before fitting anything, so a bad type list fails immediately
    # instead of after a long run of model fits. Exception types match what
    # the original positional lookup / dict lookup raised.
    if len(outcome_types) < len(outcome_cols):
        raise IndexError(
            "outcome_types has fewer entries than outcome_cols: "
            f"{len(outcome_types)} < {len(outcome_cols)}"
        )
    families = [family_map[outcome_types[idx]] for idx in range(len(outcome_cols))]

    nan = float("nan")
    odds_ratios = {}
    for outcome, family in zip(outcome_cols, families):
        glr_params = {
            "family": family,
            "featuresCol": "features",
            "labelCol": outcome
        }
        # A logit link is only valid for the binomial family. For the other
        # families leave `link` unset so Spark uses the family's default link
        # instead of rejecting the family/link combination.
        if family == "binomial":
            glr_params["link"] = "logit"

        for covariate in covariate_cols:
            # Fit univariate glm
            vector_assembler = VectorAssembler(inputCols=[covariate], outputCol="features")
            df_assembled = vector_assembler.transform(df)
            model = GeneralizedLinearRegression(**glr_params).fit(df_assembled)
            summary = model.summary

            # Standard errors and p-values are best-effort: Spark raises when
            # it cannot compute them for a fitted model. The error surfaces
            # as a Spark/Py4J exception rather than AttributeError, so a
            # hasattr() probe cannot detect it. Record NaN in that case (a
            # standard error of 0 would fake a zero-width interval).
            n_coef = len(model.coefficients)
            try:
                std_errors = list(summary.coefficientStandardErrors)
                p_values = list(summary.pValues)
            except Exception as err:
                std_errors = [nan] * n_coef
                p_values = [nan] * n_coef
                if verbose:
                    print(
                        f"no standard errors/p-values:outcome={outcome};"
                        f"covariate={covariate} ({type(err).__name__})"
                    )

            # Extract coefficients and calculate odds ratios. When an
            # intercept is fitted, its standard error / p-value come after
            # the feature entries, so position i lines up with coefficient i.
            for i, coef in enumerate(model.coefficients):
                coefficient_standard_error = std_errors[i]

                # gather results
                odds_ratios[f"{outcome}_{covariate}_{i}"]={
                    'outcome': outcome,
                    'var': covariate,
                    "encoded": i,
                    "coef": coef,
                    "odds_ratio": np.exp(coef),
                    "conf_lower": np.exp(coef - 1.96 * coefficient_standard_error),
                    "conf_upper": np.exp(coef + 1.96 * coefficient_standard_error),
                    "pval": p_values[i]
                }

            # report progress
            if verbose:
                print(f"processed:outcome={outcome};covariate={covariate} \n")

    return odds_ratios

In [26]:
# Covariates for the univariate screening models.
# NOTE: this rebinds `cov_cat_lst`, `cov_num_lst`, `cov_lst` and `out_lst`
# defined in earlier cells; here the categorical list uses the one-hot
# encoded (`_ohe`) columns again.
_ohe_cols = [
    'race_source_value_ohe',
    'ethnicity_source_value_ohe',
    'bed_size_ohe',
    'speciality_ohe',
    'segment_ohe',
    'zip_code_ohe',
    'zip_code_region_ohe',
    'agegrp_at_event_ohe',
    'delivery_type_ohe',
]
cov_cat_lst = _ohe_cols + cov_cat_ind_lst
cov_num_lst = ['age_at_event', 'los']

cov_lst = cov_cat_lst + cov_num_lst

# Outcome indicators. Inline numbers are carried over from the original
# annotations (presumably event counts / rates -- confirm).
out_lst = [
    'death_ind',        # 961
    'SMMANY_ind',       # 30267, 1.3%
    "SMMANY90PCT_ind",
]
# Every outcome is modelled as binary ('bin').
type_lst = ['bin' for _ in out_lst]

In [None]:
# Run the univariate screen for every (outcome, covariate) pair and save the
# results, one row per fitted coefficient.
res = univar_analysis(
    delivery_elig_init_smm2,
    cov_lst,
    out_lst,
    type_lst,
)
_univar_rows = res.values()
res_df = pd.json_normalize(_univar_rows)
_out_path = os.path.join(path_to_data,'univar_filter_init.csv')
res_df.to_csv(_out_path, index=False)

processed:outcome=death_ind;covariate=race_source_value_ohe 

processed:outcome=death_ind;covariate=ethnicity_source_value_ohe 

processed:outcome=death_ind;covariate=bed_size_ohe 





processed:outcome=death_ind;covariate=speciality_ohe 

processed:outcome=death_ind;covariate=segment_ohe 

processed:outcome=death_ind;covariate=zip_code_ohe 

processed:outcome=death_ind;covariate=zip_code_region_ohe 

processed:outcome=death_ind;covariate=agegrp_at_event_ohe 

processed:outcome=death_ind;covariate=delivery_type_ohe 

processed:outcome=death_ind;covariate=los2up_ind 

processed:outcome=death_ind;covariate=los3up_ind 

processed:outcome=death_ind;covariate=los4up_ind 

processed:outcome=death_ind;covariate=los5up_ind 

processed:outcome=death_ind;covariate=los6up_ind 

processed:outcome=death_ind;covariate=los7up_ind 

