In [2]:
%load_ext autoreload 
%autoreload 2
import os
import pickle
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.sparse import coo_matrix
from scipy.sparse import save_npz

from mimic_helper_fs import get_ids_with_icd_codes, get_ids_with_kws
from mimic_helper_fs import get_coocurring_symptoms_codes, get_coocurring_symptoms_kws
from mimic_helper_fs import get_icd_code_long_title

from mimic_paths import admissions_path, patients_path
from mimic_paths import english_names_path, hosp_diagnoses_path, ed_diagnoses_path

from ipv_codes import SUSPICIOUS_SYMPTOMS_ICD_CODES

pd.set_option('max_rows', 500)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 80)

In [3]:
english_names = pd.read_csv(english_names_path)
diagnoses = pd.read_csv(hosp_diagnoses_path)
ed_diagnoses = pd.read_csv(ed_diagnoses_path)

admissions = pd.read_csv(admissions_path)
patients = pd.read_csv(patients_path)

# Filter out men, children, and patients *only* in hospital

In [4]:
sid_gender_map = dict(zip(patients.subject_id, patients.gender))
sid_age_map = dict(zip(patients.subject_id, patients.anchor_age))
sid_ethnicity_map = dict(zip(admissions.subject_id, admissions.ethnicity))

In [6]:
diagnoses['anchor_age'] = diagnoses['subject_id'].map(sid_age_map)
diagnoses['gender'] = diagnoses['subject_id'].map(sid_gender_map)
diagnoses['ethnicity'] = diagnoses['subject_id'].map(sid_ethnicity_map)

In [None]:
ed_admitted_patients = list(admissions[admissions['admission_location'] == 'EMERGENCY ROOM']['hadm_id'])

In [7]:
with open('./valid_codes.ob', 'rb') as fp:
    code_list = pickle.load(fp)
print("# of ICD codes that appear > 10 times: ", len(code_list))

5544

In [8]:
diagnoses = diagnoses[diagnoses['gender'] == 'F']
diagnoses = diagnoses[diagnoses['anchor_age'] > 18]
diagnoses = diagnoses[diagnoses['hadm_id'].isin(ed_admitted_patients)]
diagnoses = diagnoses[diagnoses['icd_code'].isin(code_list)]

all_hadm_ids = sorted(list(set(diagnoses['hadm_id'])))
all_icd_codes = sorted(list(set(diagnoses['icd_code'])))

code_to_index = {c: i for i,c in enumerate(all_icd_codes)}
hadm_id_to_index = {hadm_id: i for i, hadm_id in enumerate(all_hadm_ids)}

print("# of Patients: ", len(set(diagnoses['subject_id'])))
print("# of Individual Stays: ", len(set(diagnoses['hadm_id'])))
print("# of Unique ICD Codes: ", len(all_icd_codes))


# of Patients:  62594
# of Individual Stays:  124086
# of Unique ICD Codes:  5544


In [9]:
np.max(list(hadm_id_to_index.values())), len(list(code_to_index.values()))

(124085, 5544)

# Create one-hot encoded features

In [10]:
one_hot = pd.get_dummies(diagnoses['icd_code'], sparse=True)
hadm_one_hot = pd.concat([diagnoses['hadm_id'], one_hot], axis=1)

In [11]:
diagnoses['icd_code_idx'] = diagnoses['icd_code'].map(code_to_index)
diagnoses['hadm_id_idx'] = diagnoses['hadm_id'].map(hadm_id_to_index)

In [12]:
# Write out one-hot features in coordinate format (helpful since matrix is very sparse)
row_coords = np.array(diagnoses['hadm_id_idx'])
col_coords = np.array(diagnoses['icd_code_idx'])
vals = np.ones(len(col_coords))

n_rows = np.max(row_coords) + 1
n_cols = np.max(col_coords) + 1

# Dummy feature for intercept
intercept_row_coords = np.array(list(range(n_rows)))
intercept_col_coords = [n_cols for i in range(n_rows)]
intercept_vals = np.ones(n_rows)

# Combine features & dummy feature for intercept
row_coords = np.concatenate([row_coords, intercept_row_coords])
col_coords = np.concatenate([col_coords, intercept_col_coords])
vals = np.concatenate([vals, intercept_vals])

# Create sparse matrix
jj = coo_matrix((vals, (row_coords, col_coords)))

In [15]:
len(SUSPICIOUS_SYMPTOMS_ICD_CODES)

100

In [13]:
sus_icd_code_idxs = []
for c in SUSPICIOUS_SYMPTOMS_ICD_CODES:
    if c in code_to_index: 
        sus_icd_code_idxs.append(code_to_index[c])

print(sus_icd_code_idxs)
# for c in ['920']:
#     if c in code_to_index: 
#         sus_icd_code_idxs.append(code_to_index[c])
#     else:
#         print("Code is not in code to index??", c)

classifier_weights = np.zeros(len(all_icd_codes) + 1)
classifier_weights[sus_icd_code_idxs] = 4
classifier_weights = np.expand_dims(classifier_weights, 1)
classifier_weights[-1] = -3
# Why are there no ICD-10 codes in here?

[2614, 2177, 2557, 2570, 4625, 4656, 4626, 4627]


In [14]:
# Count number of suspicious patients 
kk = jj.dot(classifier_weights)
min_symptoms_val = np.min(kk)
max_symptoms_val = np.max(kk)
r = (kk > min_symptoms_val).astype(int)
print("Range of # of symptoms: ", max_symptoms_val, min_symptoms_val)
print("# Suspicious: ", len(np.where(kk > min_symptoms_val)[0]))
print("# Patients with 0 Suspicious Symptoms: ", len(np.where(kk == min_symptoms_val)[0]))
print("# Patients with 1 Suspicious Symptoms: ", len(np.where(kk == min_symptoms_val +4)[0]))
print("# Patients with 2 Suspicious Symptoms: ", len(np.where(kk == min_symptoms_val +8)[0]))
print("# Patients with 3 Suspicious Symptoms: ", len(np.where(kk == min_symptoms_val +16)[0]))

print("# Total: ", len(kk))

# Transform into p(y) given fixed decision rule
p_y = 1/(1 + np.exp(- kk))
print("Positive probabilities: ", sorted(list(set(np.squeeze(p_y)))))


Range of # of symptoms:  9.0 -3.0
# Suspicious:  1470
# Patients with 0 Suspicious Symptoms:  122616
# Patients with 1 Suspicious Symptoms:  1412
# Patients with 2 Suspicious Symptoms:  55
# Patients with 3 Suspicious Symptoms:  0
# Total:  124086
Positive probabilities:  [0.04742587317756678, 0.7310585786300049, 0.9933071490757153, 0.9998766054240137]


In [14]:
# Generate 5 splits of data, regenerating y each time based on p_y
data_dir = "./data/semisynthetic/ipv/"
n_splits = 5

for split_num in range(n_splits):
    y = (np.random.random(p_y.shape) < p_y).astype(int)
    #y = y * r # if r = 0, then y = 0 (no suspicious symptoms --> negative)
    print(np.sum(y))
    split_dir = data_dir + str(split_num) + '/'
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
    
    np.savetxt(split_dir + 'feat_names', all_icd_codes, fmt="%s")
    np.savetxt(split_dir + 'row_names', all_hadm_ids)
    np.savetxt(split_dir + 'suspicious_labels', r)
    np.savetxt(split_dir + 'positive_labels', y)
    np.savetxt(split_dir + 'p_y', p_y)
    np.savetxt(split_dir + 'true_clf_weights', classifier_weights)
    save_npz(split_dir + 'vals.npz', jj)

6849
6955
6856
6828
6855


# Generate real labels

In [12]:
# HAHA 