# Readmission prediction on synthetic MIMIC-III with PyHealth

## Package installation

In [1]:
!pip install pyhealth



## Load dataset

Descriptions of the tables used in this notebook:

1. DIAGNOSES_ICD: https://mimic.mit.edu/docs/iii/tables/diagnoses_icd/

Columns: ROW_ID, SUBJECT_ID, HADM_ID, SEQ_NUM, ICD9_CODE

- SUBJECT_ID is unique to a patient, and HADM_ID is unique to a patient's hospital stay.
- SEQ_NUM gives the order in which the ICD diagnoses relate to the patient.
- ICD9_CODE contains the actual code for the diagnosis assigned to the patient in the given row.

Links to:

PATIENTS on SUBJECT_ID
ADMISSIONS on HADM_ID
D_ICD_DIAGNOSES on ICD9_CODE

2. PROCEDURES_ICD: https://mimic.mit.edu/docs/iii/tables/procedures_icd/

ROW_ID, SUBJECT_ID, HADM_ID, SEQ_NUM, ICD9_CODE

ICD9_CODE provides the ICD-9 code for the given procedure

3. PRESCRIPTIONS: https://mimic.mit.edu/docs/iii/tables/prescriptions/

ROW_ID, SUBJECT_ID, HADM_ID, ICUSTAY_ID, DRUG

Links to:

PATIENTS on SUBJECT_ID
ADMISSIONS on HADM_ID
ICUSTAYS on ICUSTAY_ID

In [2]:
from pyhealth.datasets import MIMIC3Dataset

# Build the base (task-agnostic) dataset from the three MIMIC-III event tables
# described above. `root` points at the hosted synthetic MIMIC-III copy.
mimic3_ds = MIMIC3Dataset(
    root="https://storage.googleapis.com/pyhealth/Synthetic_MIMIC-III/",
    tables=["DIAGNOSES_ICD", "PROCEDURES_ICD", "PRESCRIPTIONS"],
)

# stat() prints the summary and also returns it as a string; the trailing ";"
# keeps the notebook from echoing that string a second time as the cell output.
mimic3_ds.stat();


Statistics of base dataset (dev=False):
	- Dataset: MIMIC3Dataset
	- Number of patients: 49993
	- Number of visits: 52769
	- Number of visits per patient: 1.0555
	- Number of events per visit in DIAGNOSES_ICD: 9.1038
	- Number of events per visit in PROCEDURES_ICD: 3.2186
	- Number of events per visit in PRESCRIPTIONS: 25.9600



'\nStatistics of base dataset (dev=False):\n\t- Dataset: MIMIC3Dataset\n\t- Number of patients: 49993\n\t- Number of visits: 52769\n\t- Number of visits per patient: 1.0555\n\t- Number of events per visit in DIAGNOSES_ICD: 9.1038\n\t- Number of events per visit in PROCEDURES_ICD: 3.2186\n\t- Number of events per visit in PRESCRIPTIONS: 25.9600\n'

In [3]:
# Print the nesting of the dataset object (dataset -> patient -> visit -> event)
# so the attribute lookups in the following cells are easier to follow.
mimic3_ds.info()


dataset.patients: patient_id -> <Patient>

<Patient>
    - visits: visit_id -> <Visit> 
    - other patient-level info
    
    <Visit>
        - event_list_dict: table_name -> List[Event]
        - other visit-level info
    
        <Event>
            - code: str
            - other event-level info



In [4]:
# Patients are keyed by their ID as a string; the repr reports the visit count.
mimic3_ds.patients['10092']

Patient 10092 with 2 visits

In [5]:
# Mapping from a visit's integer index to its visit ID for this patient.
mimic3_ds.patients['10092'].index_to_visit_id

{0: '110429', 1: '110430'}

In [7]:
# Fetch one visit of the patient by its visit ID (a string key into `visits`).
mimic3_ds.patients['10092'].visits['110429']

Visit 110429 from patient 10092 with 15 events from tables ['DIAGNOSES_ICD', 'PROCEDURES_ICD', 'PRESCRIPTIONS']

In [8]:
# All events of the visit, grouped by the name of the table they came from.
mimic3_ds.patients['10092'].visits['110429'].event_list_dict

{'DIAGNOSES_ICD': [Event with ICD9CM code 53081 from table DIAGNOSES_ICD,
  Event with ICD9CM code 5715 from table DIAGNOSES_ICD,
  Event with ICD9CM code 5185 from table DIAGNOSES_ICD,
  Event with ICD9CM code 5180 from table DIAGNOSES_ICD,
  Event with ICD9CM code 1623 from table DIAGNOSES_ICD,
  Event with ICD9CM code 5672 from table DIAGNOSES_ICD,
  Event with ICD9CM code 8748 from table DIAGNOSES_ICD,
  Event with ICD9CM code 7812 from table DIAGNOSES_ICD,
  Event with ICD9CM code 3229 from table DIAGNOSES_ICD],
 'PROCEDURES_ICD': [Event with ICD9PROC code 3409 from table PROCEDURES_ICD],
 'PRESCRIPTIONS': [Event with NDC code 00034120081 from table PRESCRIPTIONS,
  Event with NDC code 00054858324 from table PRESCRIPTIONS,
  Event with NDC code 63323026201 from table PRESCRIPTIONS,
  Event with NDC code 00781305714 from table PRESCRIPTIONS,
  Event with NDC code 00310013039 from table PRESCRIPTIONS]}

In [10]:
# First entry in the visit's DIAGNOSES_ICD event list.
mimic3_ds.patients['10092'].visits['110429'].event_list_dict['DIAGNOSES_ICD'][0]

Event with ICD9CM code 53081 from table DIAGNOSES_ICD

In [11]:
# Code string of the first DIAGNOSES_ICD event of the visit.
mimic3_ds.patients['10092'].visits['110429'].event_list_dict['DIAGNOSES_ICD'][0].code

'53081'

In [12]:
# Shortcut: the codes of every event from one table of the visit, as a list of strings.
mimic3_ds.patients['10092'].visits['110429'].get_code_list(table="DIAGNOSES_ICD")

['53081', '5715', '5185', '5180', '1623', '5672', '8748', '7812', '3229']

In [8]:
# Names of the tables that were loaded into the dataset.
mimic3_ds.tables

['DIAGNOSES_ICD', 'PROCEDURES_ICD', 'PRESCRIPTIONS']

In [17]:
# NOTE(review): this cell was executed as In [17], i.e. after the set_task(...)
# cell further down (In [9]); the sample shown carries task fields such as 'label'.
# On a fresh top-to-bottom run `samples` is presumably not populated yet at this
# point — confirm, and consider moving this cell below the task definition.
mimic3_ds.samples[0]

{'visit_id': '110342',
 'patient_id': '10009',
 'conditions': [['5789', '2724', '45620', '78659', '45981', '81601', '5848']],
 'procedures': [['4516']],
 'drugs': [['00008092355',
   '00409176230',
   '61553008348',
   '63323026201',
   '00904053061',
   '00781305714',
   '58177020211',
   '11523726808',
   '00777310533']],
 'label': 0}

## Define healthcare task 

In [9]:
from pyhealth.tasks import readmission_prediction_mimic3_fn

# Apply the readmission-prediction task function to produce labelled samples.
# NOTE(review): this rebinds `mimic3_ds` from the base dataset to the object
# returned by set_task(), so re-running this cell (or the patient/visit cells
# above) without reloading the dataset may not behave the same — confirm.
mimic3_ds = mimic3_ds.set_task(task_fn=readmission_prediction_mimic3_fn)

# stat() prints the summary and also returns it as a string; the trailing ";"
# keeps the notebook from echoing that string a second time as the cell output.
mimic3_ds.stat();

Generating samples for readmission_prediction_mimic3_fn: 100%|███████████████| 49993/49993 [00:00<00:00, 567502.25it/s]

Statistics of sample dataset:
	- Dataset: MIMIC3Dataset
	- Task: readmission_prediction_mimic3_fn
	- Number of samples: 2194
	- Number of patients: 2116
	- Number of visits: 2194
	- Number of visits per patient: 1.0369
	- conditions:
		- Number of conditions per sample: 10.3414
		- Number of unique conditions: 2526
		- Distribution of conditions (Top-10): [('4019', 875), ('41401', 591), ('42731', 540), ('25000', 483), ('4280', 472), ('2724', 324), ('5849', 297), ('53081', 280), ('5990', 264), ('2720', 254)]
	- procedures:
		- Number of procedures per sample: 4.0789
		- Number of unique procedures: 815
		- Distribution of procedures (Top-10): [('3893', 458), ('3961', 419), ('9904', 393), ('8856', 250), ('3615', 234), ('9604', 222), ('966', 213), ('8872', 208), ('9671', 181), ('3722', 177)]
	- drugs:
		- Number of drugs per sample: 35.5228
		- Number of unique drugs: 2424
		- Distribution of drugs (Top-10): [('00338001702', 1023), ('00008084199', 903), ('51079025520', 893), ('00338004938




"Statistics of sample dataset:\n\t- Dataset: MIMIC3Dataset\n\t- Task: readmission_prediction_mimic3_fn\n\t- Number of samples: 2194\n\t- Number of patients: 2116\n\t- Number of visits: 2194\n\t- Number of visits per patient: 1.0369\n\t- conditions:\n\t\t- Number of conditions per sample: 10.3414\n\t\t- Number of unique conditions: 2526\n\t\t- Distribution of conditions (Top-10): [('4019', 875), ('41401', 591), ('42731', 540), ('25000', 483), ('4280', 472), ('2724', 324), ('5849', 297), ('53081', 280), ('5990', 264), ('2720', 254)]\n\t- procedures:\n\t\t- Number of procedures per sample: 4.0789\n\t\t- Number of unique procedures: 815\n\t\t- Distribution of procedures (Top-10): [('3893', 458), ('3961', 419), ('9904', 393), ('8856', 250), ('3615', 234), ('9604', 222), ('966', 213), ('8872', 208), ('9671', 181), ('3722', 177)]\n\t- drugs:\n\t\t- Number of drugs per sample: 35.5228\n\t\t- Number of unique drugs: 2424\n\t\t- Distribution of drugs (Top-10): [('00338001702', 1023), ('000080841

In [10]:
# `split_by_patient` was previously imported twice (from pyhealth.datasets.splitter
# and from pyhealth.datasets); the package-level import alone provides the name.
from pyhealth.datasets import get_dataloader, split_by_patient

# Split by patient into train / validation / test (80% / 10% / 10%).
# A fixed seed makes the partition reproducible across kernel restarts; note that
# training-batch shuffling and model initialisation are still unseeded.
train_dataset, val_dataset, test_dataset = split_by_patient(
    mimic3_ds, [0.8, 0.1, 0.1], seed=42
)

# Create the dataloaders (torch.utils.data.DataLoader objects).
# Only the training loader shuffles its samples.
train_loader = get_dataloader(train_dataset, batch_size=64, shuffle=True)
val_loader = get_dataloader(val_dataset, batch_size=64, shuffle=False)
test_loader = get_dataloader(test_dataset, batch_size=64, shuffle=False)

## Define ML Model 

In [11]:
from pyhealth.models import Transformer

# Binary classifier over the diagnosis ("conditions") and procedure code lists.
# Valid values for `feature_keys` and `label_key` are the keys of a sample dict
# (inspect `mimic3_ds.samples[0]` to see them).
model = Transformer(
    dataset=mimic3_ds,
    mode="binary",
    label_key="label",
    feature_keys=["conditions", "procedures"],
)

## Training

In [12]:
from pyhealth.trainer import Trainer

# Fit the model for three epochs on the training loader, evaluating on the
# validation loader with PR-AUC as the monitored metric.
trainer = Trainer(model=model)
trainer.train(
    epochs=3,
    monitor="pr_auc",
    train_dataloader=train_loader,
    val_dataloader=val_loader,
)

  from tqdm.autonotebook import trange
Transformer(
  (embeddings): ModuleDict(
    (conditions): Embedding(2528, 128, padding_idx=0)
    (procedures): Embedding(817, 128, padding_idx=0)
  )
  (linear_layers): ModuleDict()
  (transformer): ModuleDict(
    (conditions): TransformerLayer(
      (transformer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadedAttention(
            (linear_layers): ModuleList(
              (0-2): 3 x Linear(in_features=128, out_features=128, bias=False)
            )
            (output_linear): Linear(in_features=128, out_features=128, bias=False)
            (attention): Attention()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (feed_forward): PositionwiseFeedForward(
            (w_1): Linear(in_features=128, out_features=512, bias=True)
            (w_2): Linear(in_features=512, out_features=128, bias=True)
            (dropout): Dropout(p=0.5, inplace=False)
            (activation): GELU(appro

## Evaluation

In [13]:
# Option 1: the trainer's built-in evaluation on the test loader.
score = trainer.evaluate(test_loader)
print(score)

# Option 2: run inference first, then score the predictions with pyhealth.metrics.
from pyhealth.metrics.binary import binary_metrics_fn

y_true, y_prob, loss = trainer.inference(test_loader)
binary_metrics_fn(y_true, y_prob, metrics=["pr_auc"])

Evaluation: 100%|███████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 160.08it/s]


{'pr_auc': 0.12489719915468561, 'roc_auc': 0.461139896373057, 'f1': 0.0, 'loss': 0.4234482944011688}


Evaluation: 100%|███████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 137.89it/s]


{'pr_auc': 0.12489719915468561}