In [3]:
# How many 'gold label' patients do we have across the ED & hospital?

%load_ext autoreload 
%autoreload 2
import pandas as pd
import os

from mimic_helper_fs import get_ids_with_icd_codes, get_ids_with_kws
from mimic_helper_fs import get_coocurring_symptoms_codes, get_coocurring_symptoms_kws

from ipv_codes import NHAS_IPV_CODES, OREGON_IPV_CODES, USED_IPV_CODES, ICD10_IPV_CODES
from ipv_codes import GOLD_STANDARD_IPV_CODES_1, GOLD_STANDARD_IPV_CODES_2, GOLD_STANDARD_IPV_CODES_3, GOLD_STANDARD_IPV_CODES_4
from ipv_codes import KW_SETS, CODE_SETS
from ipv_codes import T74_CODES, T76_CODES, Y_CODES
pd.set_option('max_rows', 500)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 80)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [4]:
mimic_iv_ed_data_path = '~/physionet.org/files/mimic-iv-ed/1.0/ed/'
mimic_iv_data_path = '~/physionet.org/files/mimiciv/1.0/'
hosp_data_path = mimic_iv_data_path + 'hosp/'

english_names = pd.read_csv(hosp_data_path + 'd_icd_diagnoses.csv.gz')
ed_diagnoses = pd.read_csv(mimic_iv_ed_data_path + 'diagnosis.csv.gz')
hosp_diagnoses = pd.read_csv(hosp_data_path + 'diagnoses_icd.csv.gz')

all_hosp_subject_ids = list(hosp_diagnoses['subject_id'])
all_ed_subject_ids = list(ed_diagnoses['subject_id'])
all_subject_ids = list(set(all_ed_subject_ids + all_hosp_subject_ids))

# Get label frequencies for a specific gold standard code set

In [5]:
import numpy as np

id_type = 'subject_id'
for GOLD_STANDARD_IPV_CODES in [GOLD_STANDARD_IPV_CODES_1, GOLD_STANDARD_IPV_CODES_2, 
                                GOLD_STANDARD_IPV_CODES_3, GOLD_STANDARD_IPV_CODES_4]:
    ed_ipv_ids =  get_ids_with_icd_codes(ed_diagnoses, id_type, GOLD_STANDARD_IPV_CODES)
    hosp_ipv_ids = get_ids_with_icd_codes(hosp_diagnoses, id_type, GOLD_STANDARD_IPV_CODES)

    ipv_patients = pd.DataFrame(all_subject_ids, columns=['subject_id'])
    ipv_patients['ed'] = ipv_patients['subject_id'].isin(all_ed_subject_ids)
    ipv_patients['hosp'] = ipv_patients['subject_id'].isin(all_hosp_subject_ids)
    ipv_patients['ed_and_hosp'] = ipv_patients['ed'] * ipv_patients['hosp']
    ipv_patients['ed_or_hosp'] = ipv_patients['ed'] | ipv_patients['hosp']

    ipv_patients['ed_ipv'] = ipv_patients['subject_id'].isin(list(ed_ipv_ids))
    ipv_patients['hosp_ipv'] = ipv_patients['subject_id'].isin(list(hosp_ipv_ids))
    ipv_patients['ipv'] = ipv_patients['ed_ipv'] | ipv_patients['hosp_ipv']

#     print("# of Unique Patients: ", len(ipv_patients))
#     print("# of Unique Patients in ED: ", ipv_patients['ed'].value_counts()[1])
#     print("# of Unique Patients in Hosp: ", ipv_patients['hosp'].value_counts()[1])
#     print("# of Unique Patients in Both: ", ipv_patients['ed_and_hosp'].value_counts()[1])


    print("# of IPV Patients in ED: ", ipv_patients['ed_ipv'].value_counts()[1])
    print("# of IPV Patients in Hosp: ", ipv_patients['hosp_ipv'].value_counts()[1])
    print("# of IPV Patients in Either: ", ipv_patients['ipv'].value_counts()[1])
    
    print("\np(y) in ED: ", ipv_patients['ed_ipv'].value_counts()[1]/ipv_patients['ed'].value_counts()[1])
    print("p(y) in Hosp: ", ipv_patients['hosp_ipv'].value_counts()[1]/ipv_patients['ed'].value_counts()[1])
    print("p(y) in Either: ", ipv_patients['ipv'].value_counts()[1]/ipv_patients['ed_or_hosp'].value_counts()[1])
    print('-----\n')
    # 20 patients just went to the ED, and not the hospital

# of IPV Patients in ED:  85
# of IPV Patients in Hosp:  208
# of IPV Patients in Either:  257

p(y) in ED:  0.0003926695524029067
p(y) in Hosp:  0.0009608854929388775
p(y) in Either:  0.00075091088119538
-----

# of IPV Patients in ED:  92
# of IPV Patients in Hosp:  245
# of IPV Patients in Either:  298

p(y) in ED:  0.0004250070449537343
p(y) in Hosp:  0.0011318122392789663
p(y) in Either:  0.0008707060023199348
-----

# of IPV Patients in ED:  92
# of IPV Patients in Hosp:  247
# of IPV Patients in Either:  300

p(y) in ED:  0.0004250070449537343
p(y) in Hosp:  0.001141051522864917
p(y) in Either:  0.000876549666765035
-----

# of IPV Patients in ED:  92
# of IPV Patients in Hosp:  264
# of IPV Patients in Either:  317

p(y) in ED:  0.0004250070449537343
p(y) in Hosp:  0.0012195854333454984
p(y) in Either:  0.000926220814548387
-----



In [6]:
import numpy as np
print("# of Unique Patients: ", len(ipv_patients))
print("# of Unique Patients in ED: ", ipv_patients['ed'].value_counts()[1])
print("# of Unique Patients in Hosp: ", ipv_patients['hosp'].value_counts()[1])
print("# of Unique Patients in Both: ", ipv_patients['ed_and_hosp'].value_counts()[1])


# of Unique Patients:  342251
# of Unique Patients in ED:  216467
# of Unique Patients in Hosp:  255106
# of Unique Patients in Both:  129322


In [7]:
print("# of IPV Patients in ED: ", ipv_patients['ed_ipv'].value_counts()[1])
print("# of IPV Patients in Hosp: ", ipv_patients['hosp_ipv'].value_counts()[1])
print("# of IPV Patients in Either: ", ipv_patients['ipv'].value_counts()[1])
# 20 patients just went to the ED, and not the hospital

# of IPV Patients in ED:  92
# of IPV Patients in Hosp:  264
# of IPV Patients in Either:  317


In [8]:
# Corresponds to a p_y of 156/342251 or
#156/342251
253/342251

0.0007392235523051796