In [1]:
import numpy as np 
import pandas as pd 

from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

In [2]:
from pyhealth.datasets import MIMIC3BaseDataset, MIMIC4BaseDataset, eICUBaseDataset, OMOPBaseDataset
base_dataset = MIMIC3BaseDataset(root="/srv/local/data/physionet.org/files/mimiciii/1.4")
# base_dataset = eICUBaseDataset(root="/srv/local/data/physionet.org/files/eicu-crd/2.0")
# base_dataset = MIMIC4BaseDataset(root="/srv/local/data/physionet.org/files/mimiciv/2.0/hosp")
# base_dataset = OMOPBaseDataset(root="/srv/local/data/zw12/pyhealth/raw_data/synpuf1k_omop_cdm_5.2.2")
base_dataset.info()

  from .autonotebook import tqdm as notebook_tqdm



        ----- Output Data Structure -----
        Dataset.patients: [
            {
                patient_id: patient_id, 
                visits: [
                    {
                        visit_id: visit_id, 
                        patient_id: patient_id, 
                        conditions: [List], 
                        procedures: [List],
                        drugs: [List],
                        visit_info: <dict>
                    }
                    ...
                ]                    
            } 
            ...
        ]
        


In [3]:
from pyhealth.tasks import DrugRecDataset
drug_rec_dataset = DrugRecDataset(base_dataset)
drug_rec_dataset.info()


        ----- Output Data Structure -----
        >> drug_rec_dataloader[0]
        >> {
            "conditions": List[tensor],
            "procedures": List[tensor],
            "drugs": List[tensor]
        }
        


In [4]:
for i in range(len(drug_rec_dataset)):
    if len(drug_rec_dataset[i]["conditions"]) == 2:
        print(drug_rec_dataset[i])

{'conditions': tensor([[1171,  148, 2241, 3748, 1563,  361, 2310, 1650,    0,    0,    0,    0,
            0,    0,    0],
        [ 128, 1428,  148, 3094, 3271, 2426, 3459,  475, 1456, 3478,  483, 1668,
         2728, 2437, 2649]]), 'procedures': tensor([[ 744,  849,  682,    0,    0,    0,    0],
        [ 513, 1291, 1143,  576,  746,  386,  895]]), 'drugs': tensor([[1591, 2236, 2130, 1213, 2648,   70, 1957, 2301, 2899, 2088, 1441, 3648,
         2475, 1166, 3654, 3340, 2549, 3179, 3661, 2215, 2861,  782,  731,  606,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0],
        [2130, 2131,  254, 3265,   18, 1236,  275, 2372, 3068, 2856, 1693,  782,
         1464, 3513, 14

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [5]:
voc_size = drug_rec_dataset.voc_size
params = drug_rec_dataset.params

In [6]:
import torch.nn as nn

condition_embedding = nn.Sequential(
    nn.Embedding(voc_size[0], 64, padding_idx=0),
    nn.Dropout(0.5)
)
condition_embedding

Sequential(
  (0): Embedding(4493, 64, padding_idx=0)
  (1): Dropout(p=0.5, inplace=False)
)

In [7]:
procedure_embedding = nn.Sequential(
    nn.Embedding(voc_size[1], 64, padding_idx=0),
    nn.Dropout(0.5)
)
procedure_embedding

Sequential(
  (0): Embedding(1414, 64, padding_idx=0)
  (1): Dropout(p=0.5, inplace=False)
)

In [8]:
from pyhealth.data import split

drug_rec_trainset, drug_rec_valset, drug_rec_testset = split.random_split(drug_rec_dataset, [0.8, 0.1, 0.1])

In [9]:
drug_rec_trainset[4358]['conditions']

tensor([[3628,  148,  617, 3988, 2988, 1190, 4223, 2330, 2485,  459, 3296, 4032,
         1985, 1554, 3963, 3094, 1488, 3459,  205, 3179, 1650],
        [3628,  148, 1662, 1123, 2988, 1381, 1456, 4376, 1903, 3433,  459, 3296,
         3449,  879, 3963, 3233,  398, 3615, 2136, 1039, 3238]])

In [10]:
visit_embs = []

for i in range(len(drug_rec_dataset)):
    # visit embedding
    condition_emb = condition_embedding(drug_rec_dataset[i]['conditions']).sum(dim=1).data
    procedure_emb = condition_embedding(drug_rec_dataset[i]['procedures']).sum(dim=1).data
    visit_embs.append(condition_emb + procedure_emb)
    
visit_embs

[tensor([[ -4.8638,   4.0590,   5.3291,   1.8061,  -9.2246,   6.6510,   3.1418,
           -3.7625,   0.2628,  -2.3229,  -4.6559,  -9.6627,   6.2175,   2.0600,
            4.3492,  -3.0766,  -2.6601,  -3.9188,   1.2290,   9.8523,   0.1761,
            3.2698,  -0.7116,  -9.0266,   4.6811,   1.9788,  -1.7532,  -3.7075,
           -1.9753,   8.2494,  -8.9671,  -1.0683,   1.2374,   0.1504,   0.1533,
            3.8301,  -3.8554,   4.5923,   6.8059,  -3.8302,  -5.3842,   0.9876,
           -0.0354,  -7.7075,   3.6812,  -8.6764,   5.4480,  -0.6260,   7.3448,
            1.6862,   1.8989,  -1.0957,  -3.8282,  -6.2358,   1.8430,   8.3695,
           -2.2862,   3.4119,   0.7125,  -0.5775,   3.2637,   0.3013,   5.7591,
            5.7945],
         [ 22.5216,   6.5870,  10.3424,   5.0454,  -3.4079,   9.6315,   3.8612,
           -3.8916,  -8.2420,  -6.9871,  15.8830,   1.0141,   7.2657,   5.1657,
           -0.2870,   6.8740,  -4.8508,  -5.7205,  -4.0390,  12.3197,   6.4611,
            9.2565,

In [21]:
import torch

x_emb = []
y_emb = []
for patient in range(len(visit_embs)):
    for visit in range(len(visit_embs[patient])):
        x_emb.append(visit_embs[patient][visit].numpy())
        
        #drug multi-hot
        drugs_index = drug_rec_dataset[patient]['drugs'][visit]
        drugs_multihot = torch.zeros(1, voc_size[2])
        drugs_multihot[0][drugs_index] = 1
        y_emb.append(drugs_multihot[0].numpy())
        
X = np.array(x_emb, dtype=float)
y = np.array(y_emb, dtype=int)

X, y

(array([[ -4.86375523,   4.05895567,   5.32908916, ...,   0.3013044 ,
           5.7590723 ,   5.79454756],
        [ 22.52159309,   6.58704185,  10.34244156, ...,  -6.83209372,
          -2.18289566,  -4.25786448],
        [  2.64405084,  -3.91953611,   7.51582527, ...,  -6.68856049,
          -6.96205521, -15.33426571],
        ...,
        [ -9.01044846,   1.96132731,  -8.41982269, ...,  -4.95333767,
          -0.36999536,   0.2255131 ],
        [ -2.47074127,   7.97788095,  -0.64741725, ...,   2.02308345,
          -5.61092901,   5.57371759],
        [ -0.1891402 ,   5.1609292 ,  -9.58019638, ...,  -2.50805616,
          -2.81500387,   8.86745358]]),
 array([[1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 1],
        [1, 0, 0, ..., 0, 0, 1]]))

In [41]:
idx = (int)(len(X) * 0.9)
X_train, X_test = X[:idx], X[idx:]
y_train, y_test = y[:idx], y[idx:]


In [49]:
from sklearn.metrics import log_loss

xgb_estimator = XGBClassifier(objective='binary:logistic',tree_method='gpu_hist')
predictor = MultiOutputClassifier(xgb_estimator)

In [46]:
oof_preds = np.zeros(y_train.shape)
test_preds = np.zeros((X_test.shape[0], y_test.shape[1]))
oof_losses = []
kf = KFold(n_splits=5)
for fn, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print('Starting fold: ', fn)
    X_train_, X_val = X_train[trn_idx], X_train[val_idx]
    y_train_, y_val = y_train[trn_idx], y_train[val_idx]
    
    predictor.fit(X_train_, y_train_)
    val_preds = predictor.predict_proba(X_val) # list of preds per class
    val_preds = np.array(val_preds)[:,:,1].T # take the positive class
    oof_preds[val_idx] = val_preds
    
    loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
    oof_losses.append(loss)
    preds = predictor.predict_proba(X_test)
    preds = np.array(preds)[:,:,1].T # take the positive class
    test_preds += preds / 5
    
print(oof_losses)
print('Mean OOF loss across folds', np.mean(oof_losses))
print('STD OOF loss across folds', np.std(oof_losses))