In [4]:
%load_ext autoreload 
%autoreload 2
import numpy as np
import pandas as pd

from scipy.sparse import coo_matrix
from scipy.sparse import save_npz

from mimic_helper_fs import get_icd_code_long_title
from mimic_helper_fs import get_ids_with_icd_codes, get_ids_with_kws
from mimic_paths import english_names_path, hosp_diagnoses_path, ed_diagnoses_path
from mimic_paths import admissions_path, patients_path


# Seed NumPy's legacy global RNG so any random draws below are reproducible.
np.random.seed(42)
# Use fully-qualified option names: the bare pattern 'max_rows' matches more than
# one registered option in recent pandas (e.g. 'display.max_rows' and
# 'styler.render.max_rows'), which makes set_option raise OptionError.
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 80)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Code to load each table

In [8]:
# Load each raw table into its own DataFrame (table documentation linked per line).
english_names = pd.read_csv(english_names_path)    # https://mimic.mit.edu/docs/iv/modules/hosp/d_icd_diagnoses/
hosp_diagnoses = pd.read_csv(hosp_diagnoses_path)  # https://mimic.mit.edu/docs/iv/modules/hosp/diagnoses_icd/
ed_diagnoses = pd.read_csv(ed_diagnoses_path)      # https://mimic.mit.edu/docs/iv/modules/ed/diagnosis/
admissions = pd.read_csv(admissions_path)          # https://mimic.mit.edu/docs/iv/modules/core/admissions/
patients = pd.read_csv(patients_path)              # https://mimic.mit.edu/docs/iv/modules/core/patients/

# Only diagnoses made in the hospital are considered: demographics for patients
# seen in the ED but not admitted to the hospital aren't available yet (but soon will be!).
# The 'triage' table, under the ED module, does contain oxygen readings:
# https://mimic.mit.edu/docs/iv/modules/ed/triage/
#
# Take a copy rather than an alias, so that columns added to `diagnoses` in later
# cells do not also appear on the raw `hosp_diagnoses` table.
diagnoses = hosp_diagnoses.copy()

## Filter for women, >= 18 years old

In [9]:
# Lookup dictionaries keyed by subject_id (patient) or hadm_id (hospital stay).
sid_gender_map = dict(zip(patients['subject_id'], patients['gender']))
sid_age_map = dict(zip(patients['subject_id'], patients['anchor_age']))
# When a subject_id occurs on several admissions rows, dict() keeps the last value seen.
sid_ethnicity_map = dict(zip(admissions['subject_id'], admissions['ethnicity']))
hadm_id_to_subject_id = dict(zip(diagnoses['hadm_id'], diagnoses['subject_id']))

# Attach the demographics to every diagnosis row by looking up its subject_id;
# subject_ids missing from a lookup come back as NaN from Series.map.
for column_name, lookup in (
    ('anchor_age', sid_age_map),
    ('gender', sid_gender_map),
    ('ethnicity', sid_ethnicity_map),
):
    diagnoses[column_name] = diagnoses['subject_id'].map(lookup)

In [10]:
# You may want to filter out rare codes,
# to reduce the dimensionality of the one-hot encoding:
code_occurrence_thresh = 10
# One row per ICD code with its number of occurrences across all diagnosis rows.
code_counts = diagnoses.value_counts("icd_code")
code_counts = code_counts.reset_index()
# The count column is named 0 in pandas 1.x but 'count' in pandas >= 2.0, so it is
# selected by position (it is always the last column) instead of by name.
is_frequent = code_counts.iloc[:, -1] >= code_occurrence_thresh
code_list = sorted(set(code_counts.loc[is_frequent, 'icd_code']))
# The filter is inclusive (>=), so the message says so.
print("# of ICD codes that appear >= " + str(code_occurrence_thresh) + " times: ", len(code_list))

# of ICD codes that appear > 10 times:  11974


In [6]:
# Keep diagnosis rows for female patients with anchor_age of 18 or more whose ICD
# code is in the frequent-code list. One combined mask selects exactly the rows the
# three successive filters would; .copy() makes the result an independent frame so
# that adding columns to it later does not raise SettingWithCopyWarning.
keep_rows = (
    (diagnoses['anchor_age'] > 17)
    & (diagnoses['gender'] == 'F')
    & diagnoses['icd_code'].isin(code_list)
)
diagnoses = diagnoses[keep_rows].copy()

# Sorted unique stays / codes; all_subject_ids is aligned with all_hadm_ids.
all_hadm_ids = sorted(set(diagnoses['hadm_id']))
all_icd_codes = sorted(set(diagnoses['icd_code']))
all_subject_ids = [hadm_id_to_subject_id[hadm_id] for hadm_id in all_hadm_ids]

# Position of each code / stay in its sorted list.
code_to_index = {c: i for i, c in enumerate(all_icd_codes)}
hadm_id_to_index = {hadm_id: i for i, hadm_id in enumerate(all_hadm_ids)}

print("# of Patients: ", len(set(diagnoses['subject_id'])))
print("# of Individual Stays: ", len(all_hadm_ids))
print("# of Unique ICD Codes: ", len(all_icd_codes))

KeyError: 'anchor_age'

## Most common ICD-9 / ICD-10 codes among women, >= 18 years old

## One-hot encoding patient visits

In [None]:
# Integer position of each row's ICD code and hospital stay, taken from the
# code_to_index / hadm_id_to_index lookups.
diagnoses['icd_code_idx'] = diagnoses['icd_code'].map(code_to_index)
diagnoses['hadm_id_idx'] = diagnoses['hadm_id'].map(hadm_id_to_index)

# Sparse indicator columns, one per ICD code, with hadm_id placed alongside as the
# first column so each row can be traced back to its stay.
one_hot = pd.get_dummies(data=diagnoses['icd_code'], sparse=True)
hadm_one_hot = pd.concat(
    [diagnoses['hadm_id'], one_hot],
    axis=1,
)

In [None]:
# Assemble the one-hot features in coordinate (COO) format, which is helpful since
# the matrix is very sparse: one entry of 1.0 per diagnosis row at (stay, code).
row_coords = diagnoses['hadm_id_idx'].to_numpy()
col_coords = diagnoses['icd_code_idx'].to_numpy()
vals = np.ones(len(col_coords))

# Matrix extent implied by the largest index on each axis.
n_rows = row_coords.max() + 1
n_cols = col_coords.max() + 1

# Dummy feature for the intercept: a column of ones appended after the last code
# column (column index n_cols), with one entry for every row.
intercept_row_coords = np.arange(n_rows)
intercept_col_coords = [n_cols] * int(n_rows)
intercept_vals = np.ones(n_rows)

# Combine the code features with the intercept feature.
row_coords = np.concatenate((row_coords, intercept_row_coords))
col_coords = np.concatenate((col_coords, intercept_col_coords))
vals = np.concatenate((vals, intercept_vals))

# Create the sparse matrix; its shape is inferred from the largest coordinates.
# NOTE(review): coo_matrix keeps repeated (row, col) pairs as separate entries and
# they are summed on conversion to CSR/CSC -- confirm a stay cannot list the same
# code twice if strictly 0/1 features are expected.
jj = coo_matrix((vals, (row_coords, col_coords)))

## Get the name of a specific ICD code