In [2]:
%load_ext autoreload 
%autoreload 2
import pandas as pd
from tqdm.notebook import tqdm
import os
import json
import pickle

from mimic_helper_fs import get_icd_code_long_title
from mimic_helper_fs import get_icd_codes_with_prefix
from mimic_helper_fs import get_ids_with_icd_codes, get_ids_with_kws
from mimic_helper_fs import get_coocurring_symptoms_codes, get_coocurring_symptoms_kws
from mimic_paths import ed_path, hosp_path, admissions_path, patients_path

from ipv_codes import SUSPICIOUS_SYMPTOMS_ICD_CODES_PREFIXES,SUSPICIOUS_SYMPTOMS_ICD_CODES

pd.set_option('max_rows', 500)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 80)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [3]:
english_names = pd.read_csv(hosp_path + 'd_icd_diagnoses.csv.gz')
ed_diagnoses = pd.read_csv(ed_path + 'diagnosis.csv.gz')
hosp_diagnoses = pd.read_csv(hosp_path + 'diagnoses_icd.csv.gz')

all_hosp_subject_ids = list(hosp_diagnoses['subject_id'])
all_ed_subject_ids = list(ed_diagnoses['subject_id'])
all_subject_ids = list(set(all_ed_subject_ids + all_hosp_subject_ids))
diagnoses = hosp_diagnoses

In [11]:
# Filter for patients admitted to the hospital after the ED; these are the patients
# we have demographic information for. 
ed_admitted_patients = list(admissions[admissions['admission_location'] == 'EMERGENCY ROOM']['hadm_id'])

admissions = pd.read_csv(admissions_path)
patients = pd.read_csv(patients_path)

sid_gender_map = dict(zip(patients.subject_id, patients.gender))
sid_age_map = dict(zip(patients.subject_id, patients.anchor_age))
sid_ethnicity_map = dict(zip(admissions.subject_id, admissions.ethnicity))
sid_marital_status_map = dict(zip(admissions.subject_id, admissions.marital_status))
sid_insurance_map = dict(zip(admissions.subject_id, admissions.insurance))



In [12]:
diagnoses['anchor_age'] = diagnoses['subject_id'].map(sid_age_map)
diagnoses['gender'] = diagnoses['subject_id'].map(sid_gender_map)
diagnoses['ethnicity'] = diagnoses['subject_id'].map(sid_ethnicity_map)
diagnoses['marital_status'] = diagnoses['subject_id'].map(sid_marital_status_map)
diagnoses['insurance'] = diagnoses['subject_id'].map(sid_insurance_map)

In [13]:
# Filter out men and children
# diagnoses = diagnoses[diagnoses['gender'] == 'F']
diagnoses = diagnoses[diagnoses['anchor_age'] > 18]
# diagnoses = diagnoses[diagnoses['hadm_id'].isin(ed_admitted_patients)]

In [14]:
# Number of subjects
n_admissions = len(set(diagnoses['hadm_id']))
print("# of Admissions: ", n_admissions)

# of Admissions:  457552


# Print counts and prevalence of *any* symptom

In [15]:
diagnoses = pd.merge(diagnoses, english_names, on=['icd_code', 'icd_version'])

In [16]:
diagnosis_counts = pd.DataFrame(diagnoses['icd_code'].value_counts().reset_index())
diagnosis_counts = diagnosis_counts.rename(columns={'icd_code': 'counts', 'index': 'icd_code'})
diagnosis_counts['prevalence'] = diagnosis_counts['counts']/n_admissions

In [17]:
code_name_map = dict(zip(english_names.icd_code, english_names.long_title))
diagnosis_counts['long_title'] = diagnosis_counts['icd_code'].map(code_name_map)

In [18]:
most_popular_codes_50 = list(diagnosis_counts.head(50)['icd_code'])
with open('./most_popular_codes_50.ob', 'wb') as fp:
    pickle.dump(most_popular_codes_50, fp)
diagnosis_counts.head(50)

Unnamed: 0,icd_code,counts,prevalence,long_title
0,4019,104051,0.227408,Unspecified essential hypertension
1,2724,68207,0.149069,Other and unspecified hyperlipidemia
2,I10,54687,0.119521,Essential (primary) hypertension
3,E785,51091,0.111662,"Hyperlipidemia, unspecified"
4,53081,49462,0.108101,Esophageal reflux
5,25000,43702,0.095513,"Diabetes mellitus without mention of complication, type II or unspecified ty..."
6,Z87891,40782,0.089131,Personal history of nicotine dependence
7,42731,37335,0.081597,Atrial fibrillation
8,4280,36875,0.080592,"Congestive heart failure, unspecified"
9,311,36706,0.080223,"Depressive disorder, not elsewhere classified"


In [13]:
# As you increase the code set, what % of the admissions do they describe?

# Print counts and prevalence of *suspicious* symptoms

In [14]:
# NOTE: Each count reflects any code with this *prefix*. This means that the count 
# can include initial encounter, sequela, etc.
id_type = 'hadm_id'
prevalence_dicts = []
for code in tqdm(SUSPICIOUS_SYMPTOMS_ICD_CODES):
    try:
        descr = get_icd_code_long_title(english_names, code)
        n_patients = len(get_ids_with_icd_codes(diagnoses, id_type, [code]))
        p_y = n_patients/len(set(diagnoses[id_type]))
        prevalence_dicts.append({'code': code, 'long_title': descr, 'prevalence': p_y,
                                 'count': n_patients})
    except:
        print(code)
prevalence_df = pd.DataFrame(prevalence_dicts)
prevalence_df.sort_values('prevalence', ascending=False)



  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,code,long_title,prevalence,count
4,920,"Contusion of face, scalp, and neck except eye(s)",0.004524,562
0,95901,"Head injury, unspecified",0.003147,391
1,7842,"Swelling, mass, or lump in head and neck",0.001883,234
3,9100,"Abrasion or friction burn of face, neck, and scalp except eye, without menti...",0.001409,175
34,S0990XA,"Unspecified injury of head, initial encounter",0.000732,91
31,S0003XA,"Contusion of scalp, initial encounter",0.000427,53
46,S0011XA,"Contusion of right eyelid and periocular area, initial encounter",0.000105,13
49,S0012XA,"Contusion of left eyelid and periocular area, initial encounter",9.7e-05,12
6,9947,Asphyxiation and strangulation,7.2e-05,9
28,S0001XA,"Abrasion of scalp, initial encounter",7.2e-05,9


In [15]:
# % of patients with suspicious codes
suspicious_hadm_ids = get_ids_with_icd_codes(diagnoses, id_type, SUSPICIOUS_SYMPTOMS_ICD_CODES)
n_with_suspicious_codes = len(suspicious_hadm_ids)
n_subjects = len(all_subject_ids)

print("% Suspicious patients: ", n_with_suspicious_codes/n_subjects)
print("# Suspicious patients: ", n_with_suspicious_codes)

% Suspicious patients:  0.004435341313831077
# Suspicious patients:  1518


In [16]:
all_codes = []
for code in SUSPICIOUS_SYMPTOMS_ICD_CODES_PREFIXES:
    
    codes = get_icd_codes_with_prefix(english_names, code)
    all_codes.extend(codes)
SUSPICIOUS_SYMPTOMS_ICD_CODES = all_codes
json.dumps(all_codes)

'["95901", "7842", "9108", "9100", "920", "E963", "9947", "S100XXA", "S100XXD", "S100XXS", "S1080XA", "S1080XD", "S1080XS", "S1081XA", "S1081XD", "S1081XS", "S1090XA", "S1090XD", "S1090XS", "S1091XA", "S1091XD", "S1091XS", "S1093XA", "S1093XD", "S1093XS", "S0000XA", "S0000XD", "S0000XS", "S0001XA", "S0001XD", "S0001XS", "S0003XA", "S0003XD", "S0003XS", "S0990XA", "S0990XD", "S0990XS", "S0991XA", "S0991XD", "S0991XS", "S0993XA", "S0993XD", "S0993XS", "S0010XA", "S0010XD", "S0010XS", "S0011XA", "S0011XD", "S0011XS", "S0012XA", "S0012XD", "S0012XS", "S00211", "S00211A", "S00211D", "S00211S", "S00212", "S00212A", "S00212D", "S00212S", "S00219", "S00219A", "S00219D", "S00219S", "S00401", "S00401A", "S00401D", "S00401S", "S00402", "S00402A", "S00402D", "S00402S", "S00411", "S00411A", "S00411D", "S00411S", "S00412", "S00412A", "S00412D", "S00412S", "S00431", "S00431A", "S00431D", "S00431S", "S00432", "S00432A", "S00432D", "S00432S", "S00511", "S00511A", "S00511D", "S00511S", "S00512", "S00512

In [17]:
print("# of ICD Codes total: ", len(set(english_names['icd_code'])))
print("# of ICD Codes in hospital: ", len(set(diagnoses['icd_code'])))

code_counts =  diagnoses.value_counts("icd_code")
code_counts = code_counts.reset_index()
print("# of ICD Codes appearing more than 10 times: ", len(code_counts[code_counts[0] >= 10]))

# of ICD Codes total:  109282
# of ICD Codes in hospital:  15699
# of ICD Codes appearing more than 10 times:  5544


In [18]:
valid_codes = list(code_counts[code_counts[0] >= 10]['icd_code']) 
valid_codes = sorted(list(set(valid_codes)))

In [19]:
with open('./valid_codes.ob', 'wb') as fp:
    pickle.dump(valid_codes, fp)

In [20]:
valid_code_str = json.dumps(valid_codes)

# Group-specific numbers

In [121]:
# How many suspicious patients in each ethnic group? 
# How many supsicious patients in each insurance group?
# How many suspicious patients in each age group?
set(diagnoses['ethnicity'])

{'AMERICAN INDIAN/ALASKA NATIVE',
 'ASIAN',
 'BLACK/AFRICAN AMERICAN',
 'HISPANIC/LATINO',
 'OTHER',
 'UNABLE TO OBTAIN',
 'UNKNOWN',
 'WHITE'}

In [133]:
married = diagnoses[diagnoses['marital_status'] == 'MARRIED']
single = diagnoses[diagnoses['marital_status'] == 'SINGLE']

subsets = [(married, 'Married'), (single, 'Single')]
for group, group_name in subsets:
    n_suspicious_hadm_ids = len(get_ids_with_icd_codes(group, id_type, SUSPICIOUS_SYMPTOMS_ICD_CODES))
    print("Group Name: ", group_name, "\t# Suspicious: ", n_suspicious_hadm_ids)

Group Name:  Married 	# Suspicious:  333
Group Name:  Single 	# Suspicious:  671


In [134]:
black = diagnoses[diagnoses['ethnicity'] == 'BLACK/AFRICAN AMERICAN']
white = diagnoses[diagnoses['ethnicity'] == 'WHITE']

subsets = [(black, 'Black'), (white, 'White')]
for group, group_name in subsets:
    n_suspicious_hadm_ids = len(get_ids_with_icd_codes(group, id_type, SUSPICIOUS_SYMPTOMS_ICD_CODES))
    print("Group Name: ", group_name, "\t# Suspicious: ", n_suspicious_hadm_ids)

Group Name:  Black 	# Suspicious:  232
Group Name:  White 	# Suspicious:  1055


In [135]:
married = diagnoses[diagnoses['marital_status'] == 'MARRIED']
single = diagnoses[diagnoses['marital_status'] == 'SINGLE']

subsets = [(married, 'Married'), (single, 'Single')]
for group, group_name in subsets:
    n_suspicious_hadm_ids = len(get_ids_with_icd_codes(group, id_type, ['920']))
    print("Group Name: ", group_name, "\t# Suspicious: ", n_suspicious_hadm_ids)

Group Name:  Married 	# Suspicious:  122
Group Name:  Single 	# Suspicious:  201


In [136]:
black = diagnoses[diagnoses['ethnicity'] == 'BLACK/AFRICAN AMERICAN']
white = diagnoses[diagnoses['ethnicity'] == 'WHITE']

subsets = [(black, 'Black'), (white, 'White')]
for group, group_name in subsets:
    n_suspicious_hadm_ids = len(get_ids_with_icd_codes(group, id_type, ['920']))
    print("Group Name: ", group_name, "\t# Suspicious: ", n_suspicious_hadm_ids)

Group Name:  Black 	# Suspicious:  53
Group Name:  White 	# Suspicious:  419
