In [1]:
%load_ext autoreload 
%autoreload 2
import os
import pickle
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from scipy.sparse import coo_matrix
from scipy.sparse import save_npz

from mimic_helper_fs import get_ids_with_icd_codes, get_ids_with_kws
from mimic_helper_fs import get_icd_code_long_title
from mimic_helper_fs import get_coocurring_symptoms_codes, get_coocurring_symptoms_kws
from mimic_paths import ed_path, hosp_path, admissions_path, patients_path
from mimic_paths import ed_diagnoses_path, hosp_diagnoses_path, english_names_path

from ipv_codes import SUSPICIOUS_SYMPTOMS_ICD_CODES

# Notebook-wide pandas display settings.
# Use fully qualified option names: the abbreviated 'max_rows' pattern matches
# more than one option in recent pandas releases and raises OptionError.
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 80)

## Load data

In [2]:
# Read the three source tables in one pass; the order of the path tuple must
# match the order of the names being unpacked.
english_names, diagnoses, ed_diagnoses = (
    pd.read_csv(csv_path)
    for csv_path in (english_names_path, hosp_diagnoses_path, ed_diagnoses_path)
)

## Filter out appropriate patients.

This depends upon the disease. For IPV, we filter out men and children because 1) IPV is poorly understood in men, and it's likely that the way it manifests differs by gender, and 2) the line between child abuse and domestic violence for minors is blurry and might require a different approach. 

In [3]:
# Load the pickled list of valid codes and display how many there are.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
valid_codes_file = './valid_codes.ob'
with open(valid_codes_file, 'rb') as codes_handle:
    code_list = pickle.load(codes_handle)
len(code_list)

5544

In [5]:
# Load the admissions and patients tables from the paths imported at the top
# of the notebook. The previous version built the paths from
# `mimic_iv_data_path`, which is never defined or imported here and raises
# NameError on a fresh kernel.
# NOTE(review): assumes admissions_path / patients_path in mimic_paths point at
# core/admissions.csv.gz and core/patients.csv.gz - confirm.
admissions = pd.read_csv(admissions_path)
patients = pd.read_csv(patients_path)

# subject_id -> demographic lookups used to annotate the diagnoses table.
sid_gender_map = dict(zip(patients.subject_id, patients.gender))
sid_age_map = dict(zip(patients.subject_id, patients.anchor_age))
# A subject with several admissions keeps the ethnicity of the last row seen,
# because later keys overwrite earlier ones when building the dict.
sid_ethnicity_map = dict(zip(admissions.subject_id, admissions.ethnicity))

In [6]:
# hadm_ids of every admission whose admission_location is the emergency room.
# (Despite the name, these are stay ids, not patient ids.)
came_through_ed = admissions['admission_location'] == 'EMERGENCY ROOM'
ed_admitted_patients = list(admissions.loc[came_through_ed, 'hadm_id'])

In [7]:
# Attach per-subject demographics to every diagnosis row via the
# subject_id lookups built above.
subject_ids = diagnoses['subject_id']
diagnoses = diagnoses.assign(
    anchor_age=subject_ids.map(sid_age_map),
    gender=subject_ids.map(sid_gender_map),
    ethnicity=subject_ids.map(sid_ethnicity_map),
)

In [8]:
# Restrict the cohort to female patients older than 18 whose stay was admitted
# through the emergency room.
# NOTE(review): `> 18` also drops patients whose anchor_age is exactly 18 -
# confirm whether `>= 18` was intended.
diagnoses = diagnoses[diagnoses['gender'] == 'F']
diagnoses = diagnoses[diagnoses['anchor_age'] > 18]
# .copy() detaches the filtered frame so that columns added later are written
# to an independent frame rather than to a slice of the original.
diagnoses = diagnoses[diagnoses['hadm_id'].isin(ed_admitted_patients)].copy()

# Sorted so that the row/column ordering of the feature matrix is deterministic.
all_hadm_ids = sorted(list(set(diagnoses['hadm_id'])))
all_icd_codes = sorted(list(set(diagnoses['icd_code'])))

# Position of each ICD code (column) and each stay (row) in the feature matrix.
code_to_index = {c: i for i,c in enumerate(all_icd_codes)}
hadm_id_to_index = {hadm_id: i for i, hadm_id in enumerate(all_hadm_ids)}

print("# of Patients: ", len(set(diagnoses['subject_id'])))
print("# of Individual Stays: ", len(set(diagnoses['hadm_id'])))
print("# of Unique ICD Codes: ", len(all_icd_codes))

# Indices are 0-based, so the largest index is one less than the number of
# stays. The previous check compared it to the count itself and always failed.
assert np.max(list(hadm_id_to_index.values())) == len(hadm_id_to_index) - 1

# of Patients:  62712
# of Individual Stays:  124239
# of Unique ICD Codes:  15699


# Identify relevant codes

In [10]:
# Label set for this run: the ICD codes produced by the Suspicious Symptoms
# notebook, i.e. every code that refers to a head, neck, or face injury.
suspicious_symptoms_ICD_codes = SUSPICIOUS_SYMPTOMS_ICD_CODES
# Short disease tag; used below to name the output directory.
disease_prefix = 'ipv'

# Create one-hot encoded features

In [11]:
# One indicator column per ICD code, one row per diagnosis record, stored with
# sparse columns; the hadm_id column is then placed alongside the indicators.
# NOTE(review): neither `one_hot` nor `hadm_one_hot` is referenced by any later
# cell visible in this notebook - the saved feature matrix is built from
# coordinate arrays instead. Confirm whether this cell is still needed.
one_hot = pd.get_dummies(diagnoses['icd_code'], sparse=True)
hadm_one_hot = pd.concat([diagnoses['hadm_id'], one_hot], axis=1)

In [12]:
# Translate each diagnosis row into matrix coordinates: the column index of
# its ICD code and the row index of its hospital stay.
diagnoses = diagnoses.assign(
    icd_code_idx=diagnoses['icd_code'].map(code_to_index),
    hadm_id_idx=diagnoses['hadm_id'].map(hadm_id_to_index),
)

In [13]:
# Build the one-hot feature matrix in coordinate (COO) form, which suits a
# matrix this sparse: one entry per (stay, ICD code) diagnosis record.
row_coords = diagnoses['hadm_id_idx'].to_numpy()
col_coords = diagnoses['icd_code_idx'].to_numpy()
vals = np.ones(len(col_coords))

n_rows = row_coords.max() + 1
n_cols = col_coords.max() + 1

# Intercept: one extra column, placed after all code columns, set to 1 for
# every stay.
intercept_row_coords = np.arange(n_rows)
intercept_col_coords = np.full(n_rows, n_cols)
intercept_vals = np.ones(n_rows)

# Append the intercept entries to the diagnosis entries.
row_coords = np.concatenate([row_coords, intercept_row_coords])
col_coords = np.concatenate([col_coords, intercept_col_coords])
vals = np.concatenate([vals, intercept_vals])

# The matrix shape is inferred from the largest row and column coordinates.
jj = coo_matrix((vals, (row_coords, col_coords)))

In [14]:
# Ideal classifier: weight 1 on every suspicious ICD code that occurs in the
# cohort, weight 0 everywhere else (including the trailing intercept slot).
# Suspicious codes that never occur in the data have no column and are skipped.
sus_icd_code_idxs = [
    code_to_index[code]
    for code in suspicious_symptoms_ICD_codes
    if code in code_to_index
]

n_weights = len(all_icd_codes) + 1  # one per code, plus the intercept
classifier_weights = np.zeros(n_weights)
classifier_weights[sus_icd_code_idxs] = 1
# Column vector, so the product with the feature matrix is (n_stays, 1).
classifier_weights = classifier_weights.reshape(-1, 1)
classifier_weights[-1] = 0

Code is not in code to index:  S100XXD
Code is not in code to index:  S100XXS
Code is not in code to index:  S1080XD
Code is not in code to index:  S1080XS
Code is not in code to index:  S1081XA
Code is not in code to index:  S1081XD
Code is not in code to index:  S1081XS
Code is not in code to index:  S1090XA
Code is not in code to index:  S1090XD
Code is not in code to index:  S1090XS
Code is not in code to index:  S1091XD
Code is not in code to index:  S1091XS
Code is not in code to index:  S1093XD
Code is not in code to index:  S1093XS
Code is not in code to index:  S0000XA
Code is not in code to index:  S0000XD
Code is not in code to index:  S0000XS
Code is not in code to index:  S0001XD
Code is not in code to index:  S0001XS
Code is not in code to index:  S0003XS
Code is not in code to index:  S0990XD
Code is not in code to index:  S0990XS
Code is not in code to index:  S0991XD
Code is not in code to index:  S0991XS
Code is not in code to index:  S0993XD
Code is not in code to in

In [15]:
# Score every stay by the number of suspicious codes it carries, then report
# how the scores are distributed. A stay is labelled positive when its score
# exceeds the smallest score observed. Rows of the matrix are hospital stays,
# so the "# Patients" lines below count stays.
def _n_true(mask):
    """Return how many rows of `mask` are True."""
    return len(np.where(mask)[0])


kk = jj.dot(classifier_weights)
min_symptoms_val, max_symptoms_val = np.min(kk), np.max(kk)
is_suspicious = kk > min_symptoms_val
r = is_suspicious.astype(int)

print("Range of # of symptoms: ", max_symptoms_val, min_symptoms_val)
print("# Positive: ", _n_true(is_suspicious))
print("# Patients with 0 Indicative Symptoms: ", _n_true(kk == min_symptoms_val))
print("# Patients with 1 Indicative Symptoms: ", _n_true(kk == min_symptoms_val + 1))
print("# Patients with 2 Indicative Symptoms: ", _n_true(kk == min_symptoms_val + 2))
print("# Patients with 3 Indicative Symptoms: ", _n_true(kk == min_symptoms_val + 3))

print("# Total: ", len(kk))

Range of # of symptoms:  3.0 0.0
# Positive:  1518
# Patients with 0 Indicative Symptoms:  122721
# Patients with 1 Indicative Symptoms:  1453
# Patients with 2 Indicative Symptoms:  61
# Patients with 3 Indicative Symptoms:  4
# Total:  124239


In [84]:
# For real data, there are no splits for differently generated y, so 
# all data is saved under Split 0. 
data_dir = "./data/real/hospital/" + disease_prefix + '/'
split_num = 0
y = r
# NOTE(review): `p_y` was written out below but never defined anywhere in this
# notebook, so the cell raised NameError on a fresh kernel. Because the labels
# here are deterministic (y = r), it is assumed to be the label itself,
# expressed as a probability - confirm against whatever reads the 'p_y' file.
p_y = y.astype(float)
split_dir = data_dir + str(split_num) + '/'
# exist_ok avoids the separate existence check and the race it implies.
os.makedirs(split_dir, exist_ok=True)

# Feature (column) names, stay (row) ids, labels, weights and the sparse
# feature matrix all land side by side in the split directory.
np.savetxt(split_dir + 'feat_names', all_icd_codes, fmt="%s")
np.savetxt(split_dir + 'row_names', all_hadm_ids)
np.savetxt(split_dir + 'suspicious_labels', r)
np.savetxt(split_dir + 'positive_labels', y)
np.savetxt(split_dir + 'p_y', p_y)
np.savetxt(split_dir + 'true_clf_weights', classifier_weights)
save_npz(split_dir + 'vals.npz', jj)

In [None]:
n_splits = 5
for i in range(n_splits):
    

# we want to get all the subject ids
# split the subject ids into train and test
# get the corresponding hadm_ids for those subject_ids
# write out the data for those