In [None]:
import numpy as np
import pandas as pd
import json
import urllib.request as urlreq
import os
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.sql.functions import col, count, mean, sum, avg, stddev, min, max, lit
from pyspark.ml.stat import Summarizer
from pyspark.ml.classification import LogisticRegression

In [None]:
# Absolute path of a `data` folder under the current working directory.
# The path is only built here; nothing is created on disk.
path_to_data = os.path.join(os.getcwd(),'data')
# Bare expression so the notebook echoes the resolved path.
path_to_data

In [None]:
# Make this database the session default, so the unqualified table names used in the
# queries below (concept, observation, procedure_occurrence, ...) resolve against it.
# NOTE(review): `spark` is not created in any cell visible here - presumably the
# platform injects the session; confirm before running on a fresh kernel.
spark.sql("use real_world_data_ed_omop_dec_2023")

In [None]:
def json_to_qry(url_to_json):
    """Build one ``concept`` lookup query per (code group, vocabulary) of a value set.

    The JSON document is read as a mapping ``{group_key: {...}}`` where each
    group holds a ``'long'`` description plus one entry per vocabulary, and each
    vocabulary entry is a dict of code lists:

    * vocabularies whose key contains ``'icd'`` but not ``'pcs'`` are matched on
      code prefixes taken from ``'lev0'`` (part before the first dot),
      ``'lev1'`` (first 5 characters) and ``'lev2'`` (first 6 characters);
    * every other vocabulary is matched on its ``'exact'`` codes, extended by
      each inclusive integer ``'range'`` written as ``'<start>-<end>'``. Range
      bounds are parsed as integers, so zero padding is not preserved.

    Parameters
    ----------
    url_to_json : str
        Location of the value set; anything ``urllib.request.urlopen`` accepts.

    Returns
    -------
    list of str
        One SELECT statement per vocabulary entry that has at least one code,
        each returning CD_GRP, CD_GRP_LONG and the matching ``concept`` columns.
        Callers glue them together with ``' union all '``.

    Raises
    ------
    KeyError
        If a group with at least one usable vocabulary entry has no ``'long'``.
    """
    # `with` closes the response even when reading or parsing fails.
    with urlreq.urlopen(url_to_json) as json_url:
        json_file = json.loads(json_url.read())

    def add_quote(lst):
        """Render every item as a single-quoted Spark SQL string literal."""
        # Backslash-escape backslashes and quotes so that a value such as
        # "women's" cannot terminate the literal early.
        return ["'" + str(x).replace("\\", "\\\\").replace("'", "\\'") + "'" for x in lst]

    def in_list(expr, codes):
        """Return the predicate ``<expr> in ('a','b',...)``."""
        return expr + ' in (' + ','.join(add_quote(codes)) + ')'

    qry_lst = []
    for k, v in json_file.items():
        for cd, sig in v.items():
            if cd == 'long':
                continue

            if 'icd' in cd and 'pcs' not in cd:
                # Prefix matching; a missing or empty level simply contributes nothing.
                levels = (
                    ("substring_index(concept_code,'.',1)", sig.get('lev0')),
                    ("substring(concept_code,1,5)", sig.get('lev1')),
                    ("substring(concept_code,1,6)", sig.get('lev2')),
                )
                predicates = [in_list(expr, codes) for expr, codes in levels if codes]
            else:
                # Exact matching; 'exact' may be absent when only ranges are given.
                exact = list(sig.get('exact') or [])
                for x in sig.get('range') or []:
                    bounds = x.split('-')
                    exact.extend(str(y) for y in range(int(bounds[0]), int(bounds[1]) + 1))
                predicates = [in_list('concept_code', exact)] if exact else []

            if not predicates:
                # Nothing to match: an empty `( )` / `in ()` is not valid SQL and
                # would make the whole `union all` fail, while a query with no
                # codes could not return rows anyway.
                continue

            grp, grp_long, vocab = add_quote([k, v['long'], cd.upper()])
            qry_lst.append('''
                select ''' + grp + ''' as CD_GRP,
                       ''' + grp_long + ''' as CD_GRP_LONG,
                       concept_id,concept_name,concept_code,vocabulary_id,domain_id
                from concept
                where vocabulary_id = ''' + vocab + ''' and
                (
                     ''' + ' or '.join(predicates) + '''
                )
            ''')

    return qry_lst

In [None]:
# load delivery code list and save as a temp view
# load delivery code list and save as a temp view
qry_lst = json_to_qry('https://raw.githubusercontent.com/RWD2E/phecdm/main/res/valueset_curated/vs-mmm-delivery.json')

# Stack the per-vocabulary concept lookups into a single query. The result is cached
# and registered as a view because the next cell joins it three times.
delivery_omop_meta = spark.sql(' union all '.join(qry_lst)).cache()
delivery_omop_meta.createOrReplaceTempView("delivery_omop_meta")
# Fetch one row as a quick check that the lookup is not empty.
delivery_omop_meta.first()

In [None]:
# Collect every coded delivery event as
# (person, visit, event date, delivery group, source vocabulary), stacking three tables:
#   - observation          joined on observation_concept_id, DRG concepts only
#   - procedure_occurrence joined on procedure_concept_id, CPT4 / HCPCS / ICD9PROC / ICD10PCS concepts
#   - condition_occurrence joined on condition_concept_id, ICD9CM / ICD10CM concepts
# NOTE(review): the comorbidity cell further down joins ICD concepts on
# condition_source_concept_id rather than condition_concept_id, and the per-source
# count printed below lists only CPT4 and ICD10PCS - confirm that the diagnosis
# branch here is joined on the intended column.
delivery_init = spark.sql('''
    select obs.person_id,
           obs.visit_occurrence_id, 
           obs.observation_date as event_date, 
           m.CD_GRP as delivery_type,
           m.vocabulary_id as event_source
    from observation obs
    join delivery_omop_meta m
    on obs.observation_concept_id = m.concept_id
    where upper(m.vocabulary_id) in ('DRG') 
    union all
    select px.person_id,
           px.visit_occurrence_id, 
           px.procedure_date as event_date,
           m.CD_GRP as delivery_type,
           m.vocabulary_id as event_source
    from procedure_occurrence px
    join delivery_omop_meta m
    on px.procedure_concept_id = m.concept_id
    where upper(m.vocabulary_id) in ('CPT4','HCPCS','ICD9PROC','ICD10PCS')
    union all
    select dx.person_id,
           dx.visit_occurrence_id, 
           dx.condition_start_date as event_date,
           m.CD_GRP as delivery_type,
           m.vocabulary_id as event_source
    from condition_occurrence dx
    join delivery_omop_meta m
    on dx.condition_concept_id = m.concept_id
    where upper(m.vocabulary_id) in ('ICD9CM','ICD10CM')
''').cache()
delivery_init.createOrReplaceTempView("delivery_init")
# delivery_init.write.saveAsTable("delivery_init")
delivery_init.first()

In [None]:
# Distinct persons with at least one delivery event, per source vocabulary.
spark.sql('''
    select event_source, count(distinct person_id)
    from delivery_init
    group by event_source 
''').toPandas()

	event_source	count(DISTINCT person_id)
0	CPT4	369241
1	ICD10PCS	1318294

In [None]:
# Distinct persons with at least one delivery event, per delivery code group
# (the CD_GRP key of the value set).
spark.sql('''
    select delivery_type, count(distinct person_id)
    from delivery_init
    group by delivery_type 
''').toPandas()

	delivery_type	count(DISTINCT person_id)
0	d_v	954939
1	d_c	582741
2	d_e	7196

In [None]:
# Visits that carry at least one delivery code and are either typed by concept
# (9201 / 9203, labelled IP / ED in the query) or by one of the listed
# visit_source_value strings. `select distinct` collapses the repeated codes of a
# visit into one row holding the visit dates and care site.
delivery_ip = spark.sql('''
    select distinct
           a.person_id, 
           a.visit_occurrence_id,
           v.visit_start_date,
           v.visit_end_date,
           v.care_site_id
    from delivery_init a 
    join visit_occurrence v 
    on a.person_id = v.person_id and 
       a.visit_occurrence_id = v.visit_occurrence_id
    where v.visit_concept_id in (
            9201, -- IP
            9203 -- ED
         ) or 
         v.visit_source_value in (
        'I|2.16.840.1.113883.12.4|Inpatient',
        'P|2.16.840.1.113883.12.4|Preadmit',
        'E|2.16.840.1.113883.12.4|Emergency',
        'B|2.16.840.1.113883.12.4|Obstetrics'
        )
''').cache()
delivery_ip.createOrReplaceTempView("delivery_ip")
delivery_ip.first()

In [None]:
# Summarise, for every visit in delivery_ip, the delivery codes dated from 3 days
# before visit start to 3 days after visit end:
#   cd_filter : codes inside that window, numbered by event_date ascending (rn_asc)
#               and descending (rn_desc) within (person, visit, source vocabulary);
#               ties on event_date are numbered in no defined order
#   f_pvt     : first event date per source  -> F_DRG_DT / F_CPT_DT / F_ICD_DT
#   l_pvt     : last event date per source   -> L_DRG_DT / L_CPT_DT / L_ICD_DT
#   dtype_pvt : delivery_type of the last-dated code per source -> DTYPE_DRG / DTYPE_CPT / DTYPE_ICD
# Only the DRG, CPT4 and ICD10PCS sources are pivoted; any other vocabulary present in
# delivery_init gets no column. The final left joins keep visits that have no code in
# the window, with every pivoted column NULL.
delivery_consolidate = spark.sql('''
    with cd_filter as (
        select v.person_id, 
               v.visit_occurrence_id,
               v.care_site_id,
               v.visit_start_date,
               v.visit_end_date,
               a.delivery_type,
               a.event_source,
               a.event_date,
               row_number() over (partition by v.person_id, v.visit_occurrence_id, a.event_source order by a.event_date) as rn_asc,
               row_number() over (partition by v.person_id, v.visit_occurrence_id, a.event_source order by a.event_date desc) as rn_desc
        from delivery_ip v
        join delivery_init a 
        on v.person_id = a.person_id and 
           v.visit_occurrence_id = a.visit_occurrence_id
        where a.event_date between date_sub(v.visit_start_date,3) and date_add(v.visit_end_date,3)
    ), f_pvt as (
        select * 
        from (
            select person_id, visit_occurrence_id,
                   event_source, event_date
            from cd_filter
            where rn_asc = 1       
        )
        pivot (
            min(event_date) for event_source in (
                'DRG' as F_DRG_DT,'CPT4' as F_CPT_DT,'ICD10PCS' as F_ICD_DT
            )
        )
    ), l_pvt as (
        select * 
        from (
            select person_id, visit_occurrence_id,
                   event_source, event_date
            from cd_filter
            where rn_desc = 1       
        )
        pivot (
            max(event_date) for event_source in (
                'DRG' as L_DRG_DT,'CPT4' as L_CPT_DT,'ICD10PCS' as L_ICD_DT
            )
        )
    ), dtype_pvt as (
        select * 
        from (
            select person_id, visit_occurrence_id,
                   event_source, delivery_type
            from cd_filter
            where rn_desc = 1 
        )
        pivot (
            max(delivery_type) for event_source in (
                'DRG' as DTYPE_DRG,'CPT4' as DTYPE_CPT,'ICD10PCS' as DTYPE_ICD
            )
        )
    )
    select a.person_id, 
           a.visit_occurrence_id,
           a.visit_start_date,
           a.visit_end_date,
           a.care_site_id,
           f.F_DRG_DT,l.L_DRG_DT,d.DTYPE_DRG,
           f.F_CPT_DT,l.L_CPT_DT,d.DTYPE_CPT,
           f.F_ICD_DT,l.L_ICD_DT,d.DTYPE_ICD
    from delivery_ip a 
    left join f_pvt f on a.person_id = f.person_id and a.visit_occurrence_id = f.visit_occurrence_id
    left join l_pvt l on a.person_id = l.person_id and a.visit_occurrence_id = l.visit_occurrence_id
    left join dtype_pvt d on a.person_id = d.person_id and a.visit_occurrence_id = d.visit_occurrence_id
''').cache()
delivery_consolidate.createOrReplaceTempView("delivery_consolidate")
# delivery_consolidate.write.mode('overwrite').saveAsTable("delivery_consolidate")
delivery_consolidate.first()

In [None]:
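# NOTE: stale output that was pasted into an input cell; it duplicates the
# delivery_elig.first() result shown after the next cell. Kept as a comment so that
# Restart & Run All does not stop here with a NameError on `Row` / `datetime`.
# Row(person_id=2, event_id=1, visit_occurrence_id=16982301016989, care_site_source_value='1', delivery_type='d_c', event_start_date=datetime.date(2019, 5, 2), event_end_date=datetime.date(2019, 5, 5))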

In [None]:
# Group delivery visits into delivery episodes and keep one row per episode:
#   date_consolid : per visit, one delivery_type (DRG, then ICD, then CPT) and one
#                   start / end date picked with coalesce
#   visit_diffs   : start date of the person's previous visit ('1899-12-31' for the first)
#   visit_session : flags a new episode when the start is more than 211 days after
#                   the previous start
#   sessions      : running sum of the flags per person = event_id
#   session_order : numbers the visits of an episode by start date and takes the
#                   latest end date of the episode as event_end_date
# The final select keeps the earliest-starting visit of each episode (rn = 1) and
# inner-joins care_site, so a visit whose care_site_id has no match is dropped.
# NOTE(review): in both date coalesce lists the visit date comes before the ICD / CPT
# date, so those two are only reached when the visit date is NULL - confirm intended.
delivery_elig = spark.sql('''
    with date_consolid as (
        select distinct 
               person_id,
               visit_occurrence_id,
               care_site_id,
               coalesce(DTYPE_DRG,DTYPE_ICD,DTYPE_CPT) as delivery_type,
               coalesce(F_DRG_DT,visit_start_date,F_ICD_DT,F_CPT_DT) as event_start_dt,
               coalesce(L_DRG_DT,visit_end_date,L_ICD_DT,L_CPT_DT) as event_end_dt
        from delivery_consolidate    
    ), visit_diffs as (
        select a.*, 
               lag(a.event_start_dt, 1, '1899-12-31') OVER (PARTITION BY person_id ORDER BY event_start_dt) AS last_event_start_dt
        from date_consolid a 
    ), visit_session as (
        select b.*, 
               case 
                   when datediff(b.event_start_dt,b.last_event_start_dt) > 211 then 1
                   else 0 
               end as new_session_flag
        from visit_diffs b
    ), sessions as (
        select d.*, 
               sum(d.new_session_flag) over (PARTITION BY d.person_id ORDER BY d.event_start_dt) as event_id
        from visit_session d
    ), session_order as (
        select e.*, 
               row_number() over (partition by e.person_id, e.event_id order by e.event_start_dt) as rn,
               max(e.event_end_dt) over (partition by e.person_id, e.event_id) as event_end_date
    from sessions e
    )
    select s.person_id, 
           s.event_id, 
           s.visit_occurrence_id,
           cs.care_site_source_value,
           s.delivery_type,
           s.event_start_dt as event_start_date,
           s.event_end_date
    from session_order s 
    join care_site cs on s.care_site_id = cs.care_site_id
    where s.rn = 1
    order by s.person_id, s.event_id
''').cache()
# delivery_elig.createOrReplaceTempView("delivery_elig")
# Persisted as a table (replacing any previous version) instead of a temp view.
delivery_elig.write.mode('overwrite').saveAsTable("delivery_elig")
delivery_elig.first()

Row(person_id=2, event_id=1, visit_occurrence_id=16982301016989, care_site_source_value='1', delivery_type='d_c', event_start_date=datetime.date(2019, 5, 2), event_end_date=datetime.date(2019, 5, 5))

In [None]:
# Attach person-level and site-level attributes to every delivery episode:
#   los          : days between event_start_date and event_end_date (0 when the difference is NULL)
#   age_at_event : year(event_start_date) - year_of_birth; only ages 10 to 55 are kept
#   bed_size ... : from tenant_attributes matched on tenant = care_site_source_value,
#                  bed_size falls back to 'NI' when missing
#   death_ind    : 1 when the death table holds a death_date for the person
#   delivery_idx : rank of the episode within the person by start date (1 = first kept episode)
delivery_elig_tbl1 = spark.sql('''
    select d.person_id,
           d.event_id,
           d.event_start_date, 
           d.event_end_date,
           d.delivery_type,
           coalesce(datediff(d.event_end_date,d.event_start_date),0) as los, 
           d.visit_occurrence_id,
           p.year_of_birth,
           year(d.event_start_date) - p.year_of_birth as age_at_event,
           --p.month_of_birth,
           --p.day_of_birth,
           p.race_source_value,
           p.ethnicity_source_value,
           p.location_id,
           p.care_site_id,
           d.care_site_source_value,
           coalesce(tnt.bed_size,'NI') as bed_size,
           tnt.speciality,
           tnt.segment,
           tnt.zip_code,
           dth.death_date,
           case when dth.death_date is not null then 1 else 0 end as death_ind,
           row_number() over (partition by d.person_id order by d.event_start_date) as delivery_idx
    from delivery_elig d
    join person p on d.person_id = p.person_id
    left join tenant_attributes tnt on d.care_site_source_value = tnt.tenant
    left join death dth on d.person_id = dth.person_id 
    where year(d.event_start_date) - p.year_of_birth between 10 and 55
''').cache()
delivery_elig_tbl1.createOrReplaceTempView("delivery_elig_tbl1")
delivery_elig_tbl1.first()

Row(person_id=11567, event_id=1, event_start_date=datetime.date(2024, 3, 4), event_end_date=datetime.date(2024, 3, 5), delivery_type='d_v', los=1, visit_occurrence_id=17454747617679, year_of_birth=1990, age_at_event=34, race_source_value='American Indian or Alaska Native', ethnicity_source_value='Non-Hispanic', location_id=23, care_site_id=101, care_site_source_value='1', bed_size='200-299', speciality='Acute Care Hospital', segment='Hospital', zip_code='8', death_date=None, death_ind=0, delivery_idx=1)

In [None]:
# load SMM code list and get omop concept_id
# Rebinds qry_lst (previously the delivery queries) to the SMM value-set queries.
qry_lst = json_to_qry('https://raw.githubusercontent.com/RWD2E/phecdm/main/res/valueset_curated/vs-mmm-smm.json')

# One lookup of SMM concepts across all vocabularies, cached and exposed as a view.
smm_omop_meta = spark.sql(' union all '.join(qry_lst)).cache()
smm_omop_meta.createOrReplaceTempView("smm_omop_meta")
smm_omop_meta.first()

Row(CD_GRP='ami', CD_GRP_LONG='acute myocardial infarction', concept_id=44832372, concept_name='Acute myocardial infarction', concept_code='410', vocabulary_id='ICD9CM', domain_id='Condition')

In [None]:
# Collect every coded SMM event as (person, visit, event date, SMM group):
#   - procedure_occurrence joined on procedure_concept_id for CPT4 / HCPCS / ICD9PROC / ICD10PCS
#   - condition_occurrence joined on condition_source_concept_id for ICD9CM / ICD10CM
# The diagnosis branch matches on the *source* concept, like the comorbidity lookup
# in this notebook: the value set lists ICD9CM / ICD10CM concept ids, and those are
# recorded as the source concept of a condition row, not as its condition_concept_id.
smm_init = spark.sql('''
    select px.person_id,
           px.visit_occurrence_id, 
           px.procedure_date as event_date,
           m.CD_GRP as SMM_GRP
    from procedure_occurrence px
    join smm_omop_meta m
    on px.procedure_concept_id = m.concept_id
    where upper(m.vocabulary_id) in ('CPT4','HCPCS','ICD9PROC','ICD10PCS')
    union all
    select person_id,
           visit_occurrence_id, 
           condition_start_date as event_date,
           m.CD_GRP as SMM_GRP
    from condition_occurrence dx
    join smm_omop_meta m
    on dx.condition_source_concept_id = m.concept_id
    where upper(m.vocabulary_id) in ('ICD9CM','ICD10CM')
''').cache()
smm_init.createOrReplaceTempView("smm_init")
smm_init.first()

Row(person_id=89592, visit_occurrence_id=17858474179383, event_date=datetime.date(2022, 11, 18), SMM_GRP='hys')

In [None]:
# SMM events dated from the delivery start date up to 365 days after it, one row per
# (event, episode). The join is on person only, so an event is attached to every
# episode of that person whose 0-365 day window contains it.
smm_post_delivery = spark.sql('''
    select a.person_id, b.event_id,
           a.SMM_GRP,
           b.event_start_date,a.event_date,
           datediff(a.event_date,b.event_start_date) AS days_since_index,
           b.delivery_idx
    from smm_init a 
    join delivery_elig_tbl1 b 
    on a.person_id = b.person_id
    where datediff(a.event_date,b.event_start_date) between 0 and 365
''').cache()
smm_post_delivery.createOrReplaceTempView("smm_post_delivery")
smm_post_delivery.first()

Row(person_id=29089, event_id=1, SMM_GRP='bpt', event_start_date=datetime.date(2021, 9, 3), event_date=datetime.date(2021, 9, 3), days_since_index=0, delivery_idx=1)

In [None]:
# Long-to-wide: one row per (person, episode) and one <GRP>_since_index column per
# SMM group holding the smallest days_since_index, i.e. the first occurrence inside
# the window. The column is NULL when the group never occurred for that episode;
# groups not listed in the pivot get no column.
smm_post_delivery_wide = spark.sql('''
    select *
    from (
        select person_id, event_id, SMM_GRP,days_since_index
        from smm_post_delivery
     )
    pivot 
    (
        min(days_since_index) for SMM_GRP in (
            'ami' as AMI_since_index,
            'ane' as ANE_since_index,
            'arf' as ARF_since_index,
            'ards' as ARDS_since_index,
            'afe' as AFE_since_index,
            'cavf' as CAVF_since_index,
            'cocr' as COCR_since_index,
            'dic' as DIC_since_index,
            'ecl' as ECL_since_index,
            'hf' as HF_since_index,
            'pcd' as PCD_since_index,
            'pe' as PE_since_index,
            'sac' as SAC_since_index,
            'sep' as SEP_since_index,
            'ssh' as SSH_since_index,
            'scc' as SCC_since_index,
            'ate' as ATE_since_index,
            'bpt' as BPT_since_index,
            'hys' as HYS_since_index,
            'tt' as TT_since_index,
            'ven' as VEN_since_index
        )
    )
''').cache()
smm_post_delivery_wide.createOrReplaceTempView("smm_post_delivery_wide")
smm_post_delivery_wide.first()

Row(person_id=8591935406, event_id=1, AMI_since_index=None, ANE_since_index=None, ARF_since_index=None, ARDS_since_index=None, AFE_since_index=None, CAVF_since_index=None, COCR_since_index=None, DIC_since_index=None, ECL_since_index=None, HF_since_index=None, PCD_since_index=None, PE_since_index=None, SAC_since_index=None, SEP_since_index=None, SSH_since_index=None, SCC_since_index=None, ATE_since_index=None, BPT_since_index=1, HYS_since_index=None, TT_since_index=None, VEN_since_index=None)

In [None]:
# Censoring date per episode: the maximum, over all visits of the person, of
# coalesce(death_date, visit_end_date, visit_start_date) - the death date when one is
# recorded, otherwise the latest visit end (or start) date on file.
# censor_since_index is that date expressed in days after the episode start.
delivery_elig_censor = spark.sql('''
    select a.person_id, a.event_id,
           max(coalesce(a.death_date,v.visit_end_date,v.visit_start_date)) as censor_date,
           datediff(max(coalesce(a.death_date,v.visit_end_date,v.visit_start_date)),a.event_start_date) as censor_since_index
    from delivery_elig_tbl1 a 
    join visit_occurrence v
    on a.person_id = v.person_id
    group by a.person_id,a.event_id,a.event_start_date
''').cache()
delivery_elig_censor.createOrReplaceTempView("delivery_elig_censor")
delivery_elig_censor.first()

Row(person_id=11567, censor_date=datetime.date(2024, 8, 23), censor_since_index=172)

In [None]:
# load comorb code list and get omop concept_id
# Queries for the comorbidity value set, one per (group, vocabulary).
qry2_lst = json_to_qry('https://raw.githubusercontent.com/RWD2E/phecdm/main/res/valueset_curated/vs-comorb.json')

# One lookup of comorbidity concepts, cached and exposed as a view.
comorb_omop_meta = spark.sql(' union all '.join(qry2_lst)).cache()
comorb_omop_meta.createOrReplaceTempView("comorb_omop_meta")
comorb_omop_meta.first()

In [None]:
# Diagnosis records whose source concept (condition_source_concept_id) is one of the
# ICD9CM / ICD10CM comorbidity concepts, as (person, visit, start date, comorbidity group).
comorb_init = spark.sql('''
    select person_id,
           visit_occurrence_id, 
           condition_start_date as event_date,
           m.CD_GRP as COMORB_GRP
    from condition_occurrence dx
    join comorb_omop_meta m
    on dx.condition_source_concept_id = m.concept_id
    where upper(m.vocabulary_id) in ('ICD9CM','ICD10CM')
''').cache()
comorb_init.createOrReplaceTempView("comorb_init")
comorb_init.first()

Row(person_id=438087336549, visit_occurrence_id=16303696311690, event_date=datetime.date(2019, 12, 4), COMORB_GRP='copd')

In [None]:
# Comorbidity diagnoses dated strictly before the delivery start, one row per
# (diagnosis, episode); days_since_index is therefore always negative. There is no
# lower bound, so the whole recorded history before the delivery counts.
comorb_pre_delivery = spark.sql('''
    select a.person_id, b.event_id,
           a.COMORB_GRP,
           b.event_start_date,a.event_date,
           datediff(a.event_date,b.event_start_date) AS days_since_index,
           b.delivery_idx
    from comorb_init a 
    join delivery_elig_tbl1 b 
    on a.person_id = b.person_id
    where datediff(a.event_date,b.event_start_date) < 0
''').cache()
comorb_pre_delivery.createOrReplaceTempView("comorb_pre_delivery")
comorb_pre_delivery.first()

Row(person_id=11567, event_id=1, COMORB_GRP='covid', event_start_date=datetime.date(2024, 3, 4), event_date=datetime.date(2022, 8, 22), days_since_index=-560, delivery_idx=1)

In [None]:
# Long-to-wide: one row per (person, episode) and one hist_<GRP>_since_index column
# per comorbidity group. Every input value is negative, so min() keeps the diagnosis
# furthest back in time; the column is NULL when the group was never recorded.
comorb_pre_delivery_wide = spark.sql('''
    select *
    from (
        select person_id, event_id, COMORB_GRP,days_since_index
        from comorb_pre_delivery
     )
    pivot 
    (
        min(days_since_index) for COMORB_GRP in (
            'hep' as hist_HEP_since_index,
            'ihd' as hist_IHD_since_index,
            'ast' as hist_AST_since_index,
            'liv' as hist_LIV_since_index,
            'afib' as hist_AFIB_since_index,
            'str' as hist_STR_since_index,
            'ckd' as hist_CKD_since_index,
            'copd' as hist_COPD_since_index,
            'htn' as hist_HTN_since_index,
            'hf' as hist_HF_since_index,
            'aids' as hist_AIDS_since_index,
            'hld' as hist_HLD_since_index,
            'pvd' as hist_PVD_since_index,
            'ra' as hist_RA_since_index,
            'ad' as hist_AD_since_index,
            'dm' as hist_DM_since_index,
            'covid' as hist_COVID_since_index,
            'sub' as hist_SUB_since_index,
            'alc' as hist_ALC_since_index
        )
    )
''').cache()
comorb_pre_delivery_wide.createOrReplaceTempView("comorb_pre_delivery_wide")
comorb_pre_delivery_wide.first()

Row(person_id=11567, event_id=1, hist_HEP_since_index=None, hist_IHD_since_index=None, hist_AST_since_index=-351, hist_LIV_since_index=None, hist_AFIB_since_index=None, hist_STR_since_index=None, hist_CKD_since_index=None, hist_COPD_since_index=None, hist_HTN_since_index=None, hist_HF_since_index=None, hist_AIDS_since_index=None, hist_HLD_since_index=None, hist_PVD_since_index=None, hist_RA_since_index=None, hist_AD_since_index=None, hist_DM_since_index=None, hist_COVID_since_index=-845, hist_SUB_since_index=None, hist_ALC_since_index=None)

In [None]:
# Assemble the analysis table, one row per delivery episode, with:
#   - agegrp_at_event (agegrp1..agegrp6) and los2up..los7up threshold flags
#   - censor_since_index, and death_since_index which falls back to the censoring
#     time when there is no death date
#   - for "any SMM" (smm_any) and for each SMM group: <GRP>_ind (1 if it occurred) and
#     <GRP>_since_index (days to first occurrence, else the censoring time)
#   - for each comorbidity: hist_<GRP>_ind and hist_<GRP>_since_index (NULL when absent)
# The final `where cs.censor_since_index > 0` also removes episodes without a row in
# delivery_elig_censor, because a NULL never satisfies the comparison.
delivery_elig_smm = spark.sql('''
    with smm_any as (
        select person_id, event_id, 1 as SMMANY_ind,
               min(days_since_index) as SMMANY_since_index
        from smm_post_delivery
        group by person_id, event_id
    )
    select e.*,
           case when e.age_at_event >=10 and e.age_at_event <20 then 'agegrp1'
                when e.age_at_event >=20 and e.age_at_event <25 then 'agegrp2'
                when e.age_at_event >=25 and e.age_at_event <30 then 'agegrp3'
                when e.age_at_event >=30 and e.age_at_event <35 then 'agegrp4'
                when e.age_at_event >=35 and e.age_at_event <40 then 'agegrp5'
                else 'agegrp6' 
           end as agegrp_at_event,
           case when los >= 2 then 1 else 0 end as los2up_ind,
           case when los >= 3 then 1 else 0 end as los3up_ind,
           case when los >= 4 then 1 else 0 end as los4up_ind,
           case when los >= 5 then 1 else 0 end as los5up_ind,
           case when los >= 6 then 1 else 0 end as los6up_ind,
           case when los >= 7 then 1 else 0 end as los7up_ind,
           cs.censor_since_index,
           coalesce(datediff(e.death_date,e.event_start_date),cs.censor_since_index) as death_since_index,
           coalesce(a.SMMANY_ind,0) as SMMANY_ind,
           coalesce(a.SMMANY_since_index,cs.censor_since_index) as SMMANY_since_index,
           coalesce(s.AMI_since_index,cs.censor_since_index) as AMI_since_index,
           IF(s.AMI_since_index IS NOT NULL, 1, 0) AMI_ind,
           coalesce(s.ANE_since_index,cs.censor_since_index) as ANE_since_index,
           IF(s.ANE_since_index IS NOT NULL, 1, 0) ANE_ind,
           coalesce(s.ARF_since_index,cs.censor_since_index) as ARF_since_index,
           IF(s.ARF_since_index IS NOT NULL, 1, 0) ARF_ind,
           coalesce(s.ARDS_since_index,cs.censor_since_index) as ARDS_since_index,
           IF(s.ARDS_since_index IS NOT NULL, 1, 0) ARDS_ind,
           coalesce(s.AFE_since_index,cs.censor_since_index) as AFE_since_index,
           IF(s.AFE_since_index IS NOT NULL, 1, 0) AFE_ind,
           coalesce(s.CAVF_since_index,cs.censor_since_index) as CAVF_since_index,
           IF(s.CAVF_since_index IS NOT NULL, 1, 0) CAVF_ind,
           coalesce(s.COCR_since_index,cs.censor_since_index) as COCR_since_index,
           IF(s.COCR_since_index IS NOT NULL, 1, 0) COCR_ind,
           coalesce(s.DIC_since_index,cs.censor_since_index) as DIC_since_index,
           IF(s.DIC_since_index IS NOT NULL, 1, 0) DIC_ind,
           coalesce(s.ECL_since_index,cs.censor_since_index) as ECL_since_index,
           IF(s.ECL_since_index IS NOT NULL, 1, 0) ECL_ind,
           coalesce(s.HF_since_index,cs.censor_since_index) as HF_since_index,
           IF(s.HF_since_index IS NOT NULL, 1, 0) HF_ind,
           coalesce(s.PCD_since_index,cs.censor_since_index) as PCD_since_index,
           IF(s.PCD_since_index IS NOT NULL, 1, 0) PCD_ind,
           coalesce(s.PE_since_index,cs.censor_since_index) as PE_since_index,
           IF(s.PE_since_index IS NOT NULL, 1, 0) PE_ind,
           coalesce(s.SAC_since_index,cs.censor_since_index) as SAC_since_index,
           IF(s.SAC_since_index IS NOT NULL, 1, 0) SAC_ind,
           coalesce(s.SEP_since_index,cs.censor_since_index) as SEP_since_index,
           IF(s.SEP_since_index IS NOT NULL, 1, 0) SEP_ind,
           coalesce(s.SSH_since_index,cs.censor_since_index) as SSH_since_index,
           IF(s.SSH_since_index IS NOT NULL, 1, 0) SSH_ind,
           coalesce(s.SCC_since_index,cs.censor_since_index) as SCC_since_index,
           IF(s.SCC_since_index IS NOT NULL, 1, 0) SCC_ind,
           coalesce(s.ATE_since_index,cs.censor_since_index) as ATE_since_index,
           IF(s.ATE_since_index IS NOT NULL, 1, 0) ATE_ind,
           coalesce(s.BPT_since_index,cs.censor_since_index) as BPT_since_index,
           IF(s.BPT_since_index IS NOT NULL, 1, 0) BPT_ind,
           coalesce(s.HYS_since_index,cs.censor_since_index) as HYS_since_index,
           IF(s.HYS_since_index IS NOT NULL, 1, 0) HYS_ind,
           coalesce(s.TT_since_index,cs.censor_since_index) as TT_since_index,
           IF(s.TT_since_index IS NOT NULL, 1, 0) TT_ind,
           coalesce(s.VEN_since_index,cs.censor_since_index) as VEN_since_index,
           IF(s.VEN_since_index IS NOT NULL, 1, 0) VEN_ind,
           cmb.hist_HEP_since_index,
           IF(cmb.hist_HEP_since_index IS NOT NULL, 1, 0) hist_HEP_ind,
           cmb.hist_IHD_since_index,
           IF(cmb.hist_IHD_since_index IS NOT NULL, 1, 0) hist_IHD_ind,
           cmb.hist_AST_since_index,
           IF(cmb.hist_AST_since_index IS NOT NULL, 1, 0) hist_AST_ind,
           cmb.hist_LIV_since_index,
           IF(cmb.hist_LIV_since_index IS NOT NULL, 1, 0) hist_LIV_ind,
           cmb.hist_AFIB_since_index,
           IF(cmb.hist_AFIB_since_index IS NOT NULL, 1, 0) hist_AFIB_ind,
           cmb.hist_STR_since_index,
           IF(cmb.hist_STR_since_index IS NOT NULL, 1, 0) hist_STR_ind,
           cmb.hist_CKD_since_index,
           IF(cmb.hist_CKD_since_index IS NOT NULL, 1, 0) hist_CKD_ind,
           cmb.hist_COPD_since_index,
           IF(cmb.hist_COPD_since_index IS NOT NULL, 1, 0) hist_COPD_ind,
           cmb.hist_HTN_since_index,
           IF(cmb.hist_HTN_since_index IS NOT NULL, 1, 0) hist_HTN_ind,
           cmb.hist_HF_since_index,
           IF(cmb.hist_HF_since_index IS NOT NULL, 1, 0) hist_HF_ind,
           cmb.hist_AIDS_since_index,
           IF(cmb.hist_AIDS_since_index IS NOT NULL, 1, 0) hist_AIDS_ind,
           cmb.hist_HLD_since_index,
           IF(cmb.hist_HLD_since_index IS NOT NULL, 1, 0) hist_HLD_ind,
           cmb.hist_PVD_since_index,
           IF(cmb.hist_PVD_since_index IS NOT NULL, 1, 0) hist_PVD_ind,
           cmb.hist_RA_since_index,
           IF(cmb.hist_RA_since_index IS NOT NULL, 1, 0) hist_RA_ind,
           cmb.hist_AD_since_index,
           IF(cmb.hist_AD_since_index IS NOT NULL, 1, 0) hist_AD_ind,
           cmb.hist_DM_since_index,
           IF(cmb.hist_DM_since_index IS NOT NULL, 1, 0) hist_DM_ind,
           cmb.hist_COVID_since_index,
           IF(cmb.hist_COVID_since_index IS NOT NULL, 1, 0) hist_COVID_ind,
           cmb.hist_SUB_since_index,
           IF(cmb.hist_SUB_since_index IS NOT NULL, 1, 0) hist_SUB_ind,
           cmb.hist_ALC_since_index,
           IF(cmb.hist_ALC_since_index IS NOT NULL, 1, 0) hist_ALC_ind
    from delivery_elig_tbl1 e
    left join smm_any a 
    on e.person_id = a.person_id and e.event_id = a.event_id
    left join smm_post_delivery_wide s 
    on e.person_id = s.person_id and e.event_id = s.event_id
    left join delivery_elig_censor cs
    on e.person_id = cs.person_id and e.event_id = cs.event_id
    left join comorb_pre_delivery_wide cmb 
    on e.person_id = cmb.person_id and e.event_id = cmb.event_id 
    where cs.censor_since_index > 0  
''').cache()
# delivery_elig_smm.createOrReplaceTempView("delivery_elig_smm")
# Persisted as a table (replacing any previous version); the next cell reads it back by name.
delivery_elig_smm.write.mode('overwrite').saveAsTable("delivery_elig_smm")
delivery_elig_smm.first()

Row(person_id=11567, event_id=1, event_start_date=datetime.date(2024, 3, 4), event_end_date=datetime.date(2024, 3, 5), delivery_type='d_v', los=1, visit_occurrence_id=17454747617679, year_of_birth=1990, age_at_event=34, race_source_value='American Indian or Alaska Native', ethnicity_source_value='Non-Hispanic', location_id=23, care_site_id=101, care_site_source_value='1', bed_size='200-299', speciality='Acute Care Hospital', segment='Hospital', zip_code='8', death_date=None, death_ind=0, delivery_idx=1, agegrp_at_event='agegrp5', los2up_ind=0, los3up_ind=0, los4up_ind=0, los5up_ind=0, los6up_ind=0, los7up_ind=0, censor_since_index=172, death_since_index=172, SMMANY_ind=0, SMMANY_since_index=172, AMI_since_index=172, AMI_ind=0, ANE_since_index=172, ANE_ind=0, ARF_since_index=172, ARF_ind=0, ARDS_since_index=172, ARDS_ind=0, AFE_since_index=172, AFE_ind=0, CAVF_since_index=172, CAVF_ind=0, COCR_since_index=172, COCR_ind=0, DIC_since_index=172, DIC_ind=0, ECL_since_index=172, ECL_ind=0, HF_since_index=172, HF_ind=0, PCD_since_index=172, PCD_ind=0, PE_since_index=172, PE_ind=0, SAC_since_index=172, SAC_ind=0, SEP_since_index=172, SEP_ind=0, SSH_since_index=172, SSH_ind=0, SCC_since_index=172, SCC_ind=0, ATE_since_index=172, ATE_ind=0, BPT_since_index=172, BPT_ind=0, HYS_since_index=172, HYS_ind=0, TT_since_index=172, TT_ind=0, VEN_since_index=172, VEN_ind=0, hist_HEP_since_index=None, hist_HEP_ind=0, hist_IHD_since_index=None, hist_IHD_ind=0, hist_AST_since_index=-351, hist_AST_ind=1, hist_LIV_since_index=None, hist_LIV_ind=0, hist_AFIB_since_index=None, hist_AFIB_ind=0, hist_STR_since_index=None, hist_STR_ind=0, hist_CKD_since_index=None, hist_CKD_ind=0, hist_COPD_since_index=None, hist_COPD_ind=0, hist_HTN_since_index=None, hist_HTN_ind=0, hist_HF_since_index=None, hist_HF_ind=0, hist_AIDS_since_index=None, hist_AIDS_ind=0, hist_HLD_since_index=None, hist_HLD_ind=0, hist_PVD_since_index=None, hist_PVD_ind=0, hist_RA_since_index=None, hist_RA_ind=0, 
hist_AD_since_index=None, hist_AD_ind=0, hist_DM_since_index=None, hist_DM_ind=0, hist_COVID_since_index=-845, hist_COVID_ind=1, hist_SUB_since_index=None, hist_SUB_ind=0, hist_ALC_since_index=None, hist_ALC_ind=0)

In [None]:
# Final cohort: each person's first delivery (delivery_idx = 1) of type d_v or d_c,
# plus a stricter outcome that requires any SMM AND a length of stay at or above the
# 90th percentile:
#   SMMANY90PCT_ind         : 1 when both conditions hold, else 0
#   SMMANY90PCT_since_index : time to first SMM when the indicator is 1, else the censoring time
# The percentile is taken over every row of delivery_elig_smm, i.e. before the
# delivery_idx / delivery_type filter of the outer query is applied.
delivery_elig_init_smm = spark.sql('''
    with cte as (
        select percentile(los,0.9) as los_90pct
        from delivery_elig_smm
    )
    select a.*, 
           case when a.SMMANY_ind = 1 and a.los>=cte.los_90pct then 1 else 0 end as SMMANY90PCT_ind,
           case when a.SMMANY_ind = 1 and a.los>=cte.los_90pct then SMMANY_since_index else censor_since_index end as SMMANY90PCT_since_index
    from delivery_elig_smm a
    cross join cte
    where a.delivery_idx = 1 and 
          a.delivery_type in ('d_v','d_c')
''').cache()
# delivery_elig_init_smm.createOrReplaceTempView("delivery_elig_init_smm")
delivery_elig_init_smm.write.mode('overwrite').saveAsTable("delivery_elig_init_smm")
delivery_elig_init_smm.first()

Row(person_id=1047, event_id=1, event_start_date=datetime.date(2022, 6, 5), event_end_date=datetime.date(2022, 6, 8), delivery_type='d_v', los=3, visit_occurrence_id=17463337533175, year_of_birth=1996, age_at_event=26, race_source_value='White', ethnicity_source_value='Non-Hispanic', location_id=23, care_site_id=101, care_site_source_value='1', bed_size='200-299', speciality='Acute Care Hospital', segment='Hospital', zip_code='8', death_date=None, death_ind=0, delivery_idx=1, agegrp_at_event='agegrp4', los2up_ind=1, los3up_ind=1, los4up_ind=0, los5up_ind=0, los6up_ind=0, los7up_ind=0, censor_since_index=51, death_since_index=51, SMMANY_ind=0, SMMANY_since_index=51, AMI_since_index=51, AMI_ind=0, ANE_since_index=51, ANE_ind=0, ARF_since_index=51, ARF_ind=0, ARDS_since_index=51, ARDS_ind=0, AFE_since_index=51, AFE_ind=0, CAVF_since_index=51, CAVF_ind=0, COCR_since_index=51, COCR_ind=0, DIC_since_index=51, DIC_ind=0, ECL_since_index=51, ECL_ind=0, HF_since_index=51, HF_ind=0, PCD_since_index=51, PCD_ind=0, PE_since_index=51, PE_ind=0, SAC_since_index=51, SAC_ind=0, SEP_since_index=51, SEP_ind=0, SSH_since_index=51, SSH_ind=0, SCC_since_index=51, SCC_ind=0, ATE_since_index=51, ATE_ind=0, BPT_since_index=51, BPT_ind=0, HYS_since_index=51, HYS_ind=0, TT_since_index=51, TT_ind=0, VEN_since_index=51, VEN_ind=0, hist_HEP_since_index=None, hist_HEP_ind=0, hist_IHD_since_index=None, hist_IHD_ind=0, hist_AST_since_index=None, hist_AST_ind=0, hist_LIV_since_index=None, hist_LIV_ind=0, hist_AFIB_since_index=None, hist_AFIB_ind=0, hist_STR_since_index=None, hist_STR_ind=0, hist_CKD_since_index=None, hist_CKD_ind=0, hist_COPD_since_index=None, hist_COPD_ind=0, hist_HTN_since_index=-24, hist_HTN_ind=1, hist_HF_since_index=None, hist_HF_ind=0, hist_AIDS_since_index=None, hist_AIDS_ind=0, hist_HLD_since_index=None, hist_HLD_ind=0, hist_PVD_since_index=None, hist_PVD_ind=0, hist_RA_since_index=None, hist_RA_ind=0, hist_AD_since_index=None, hist_AD_ind=0, hist_DM_since_index=None, 
hist_DM_ind=0, hist_COVID_since_index=None, hist_COVID_ind=0, hist_SUB_since_index=None, hist_SUB_ind=0, hist_ALC_since_index=None, hist_ALC_ind=0, SMMANY90PCT_ind=0, SMMANY90PCT_since_index=51)

In [None]:
# Sanity check on the saved cohort: distinct persons versus total rows. The two
# numbers are equal exactly when the table holds one row per person.
spark.sql('''
    select count(distinct person_id), count(*)
    from delivery_elig_init_smm
''').toPandas()