# Modelling with PyHealth

## Using PyHealth and Try NOTEEVENTS_ICD

In [62]:
import pandas as pd

# carica i dati dai file csv
noteevents = pd.read_csv('data/NOTEEVENTS_ICD.csv')
patients = pd.read_csv('data/PATIENTS.csv')

# seleziona i pazienti presenti in NOTEEVENTS_ICD
selected_patients = patients[patients['SUBJECT_ID'].isin(noteevents['SUBJECT_ID'].unique())]

# salva il risultato in un file csv
selected_patients.to_csv('data/PATIENTS_SEL.csv', index=False)


In [1]:
from pyhealth.datasets import MIMIC3Dataset

dataset = MIMIC3Dataset(
    root='data/',
    tables=["DIAGNOSES_ICD","PROCEDURES_ICD","PRESCRIPTIONS","NOTEEVENTS_ICD"],
    code_mapping={"NDC": ("ATC", {"target_kwargs": {"level": 3}})},
)

In [2]:
dataset.stat()


Statistics of base dataset (dev=False):
	- Dataset: MIMIC3Dataset
	- Number of patients: 1553
	- Number of visits: 5004
	- Number of visits per patient: 3.2222
	- Number of events per visit in DIAGNOSES_ICD: 11.6063
	- Number of events per visit in PROCEDURES_ICD: 3.8767
	- Number of events per visit in PRESCRIPTIONS: 80.1681
	- Number of events per visit in NOTEEVENTS_ICD: 3.5548



'\nStatistics of base dataset (dev=False):\n\t- Dataset: MIMIC3Dataset\n\t- Number of patients: 1553\n\t- Number of visits: 5004\n\t- Number of visits per patient: 3.2222\n\t- Number of events per visit in DIAGNOSES_ICD: 11.6063\n\t- Number of events per visit in PROCEDURES_ICD: 3.8767\n\t- Number of events per visit in PRESCRIPTIONS: 80.1681\n\t- Number of events per visit in NOTEEVENTS_ICD: 3.5548\n'

In [3]:
# get patient dictionary
patient_dict = dataset.patients
print(list(patient_dict.keys()))
print(len(patient_dict))

['10055', '10071', '10077', '10144', '10160', '10174', '10197', '10207', '10254', '10285', '1029', '10302', '10304', '10310', '10328', '10342', '10369', '10414', '10434', '10442', '10478', '10487', '10488', '1050', '10594', '10612', '10618', '10634', '10635', '10637', '10660', '10679', '10687', '1069', '10742', '10774', '10820', '10832', '10859', '10884', '109', '10906', '10932', '10939', '10973', '10976', '10977', '11043', '11102', '11108', '11135', '11165', '11171', '112', '11205', '1122', '11234', '11236', '11242', '11255', '11318', '11342', '11346', '11421', '11442', '11464', '11479', '11588', '11590', '11604', '11623', '11634', '11716', '11763', '11764', '11778', '11818', '11825', '11830', '11838', '11861', '1187', '11876', '11885', '11892', '11897', '11932', '1197', '11970', '12039', '12076', '12110', '12113', '12125', '12149', '12198', '12203', '12217', '1223', '12281', '12344', '124', '12411', '12446', '12467', '12501', '12515', '12521', '12566', '12567', '12690', '12706', '127

In [4]:
# get the "10" patient
patient = patient_dict["25780"]
patient.gender, patient.birth_datetime, patient.ethnicity, patient.death_datetime, patient.visits
print(patient.visits)

OrderedDict([('120990', Visit 120990 from patient 25780 with 18 events from tables ['DIAGNOSES_ICD', 'PRESCRIPTIONS', 'NOTEEVENTS_ICD']), ('199988', Visit 199988 from patient 25780 with 102 events from tables ['DIAGNOSES_ICD', 'PROCEDURES_ICD', 'PRESCRIPTIONS', 'NOTEEVENTS_ICD'])])


In [5]:
# get the visit list of this patient
visit_dict = patient.visits
print (list(visit_dict.keys()))

['120990', '199988']


In [6]:
# get the first visit
visit = visit_dict['120990']
visit.encounter_time, visit.available_tables, visit.num_events, visit.event_list_dict

(datetime.datetime(2169, 5, 6, 19, 55),
 ['DIAGNOSES_ICD', 'PRESCRIPTIONS', 'NOTEEVENTS_ICD'],
 18,
 {'DIAGNOSES_ICD': [Event with ICD9CM code 56962 from table DIAGNOSES_ICD,
   Event with ICD9CM code 2761 from table DIAGNOSES_ICD,
   Event with ICD9CM code 2765 from table DIAGNOSES_ICD,
   Event with ICD9CM code 2762 from table DIAGNOSES_ICD,
   Event with ICD9CM code 5990 from table DIAGNOSES_ICD,
   Event with ICD9CM code 99664 from table DIAGNOSES_ICD,
   Event with ICD9CM code 6000 from table DIAGNOSES_ICD,
   Event with ICD9CM code E8796 from table DIAGNOSES_ICD,
   Event with ICD9CM code 2859 from table DIAGNOSES_ICD],
  'PRESCRIPTIONS': [Event with ATC code B05X from table PRESCRIPTIONS,
   Event with ATC code J01M from table PRESCRIPTIONS,
   Event with ATC code A02B from table PRESCRIPTIONS,
   Event with ATC code B01A from table PRESCRIPTIONS],
  'NOTEEVENTS_ICD': [Event with ICD9CM code V419 from table NOTEEVENTS_ICD,
   Event with ICD9CM code 78703 from table NOTEEVENTS_IC

In [7]:
visit.get_code_list(table='NOTEEVENTS_ICD')


['V419', '78703', '7804', '78650', '78702']

In [8]:
visit.get_code_list('NOTEEVENTS_ICD')

['V419', '78703', '7804', '78650', '78702']

In [9]:
from pyhealth.tasks import drug_recommendation_mimic3_fn
from pyhealth.datasets import split_by_patient, get_dataloader

mimic3sample = dataset.set_task(task_fn=drug_recommendation_mimic3_fn) # use default task
train_ds, val_ds, test_ds = split_by_patient(mimic3sample, [0.6, 0.2, 0.2])


Generating samples for drug_recommendation_mimic3_fn: 100%|██████████| 1553/1553 [00:00<00:00, 13523.83it/s]


In [10]:
mimic3sample.stat()

Statistics of sample dataset:
	- Dataset: MIMIC3Dataset
	- Task: drug_recommendation_mimic3_fn
	- Number of samples: 1962
	- Number of patients: 885
	- Number of visits: 1962
	- Number of visits per patient: 2.2169
	- conditions:
		- Number of conditions per sample: 18.8710
		- Number of unique conditions: 2077
		- Distribution of conditions (Top-10): [('4019', 1211), ('4280', 978), ('41401', 760), ('42731', 744), ('5849', 568), ('25000', 529), ('53081', 451), ('2720', 382), ('2724', 381), ('40391', 377)]
	- procedures:
		- Number of procedures per sample: 6.9490
		- Number of unique procedures: 730
		- Distribution of procedures (Top-10): [('3893', 1001), ('9904', 746), ('9604', 506), ('3995', 487), ('9671', 463), ('966', 457), ('3891', 278), ('9672', 275), ('3961', 258), ('9915', 243)]
	- drugs:
		- Number of drugs per sample: 24.4715
		- Number of unique drugs: 178
		- Distribution of drugs (Top-10): [('A02B', 1756), ('B05X', 1749), ('B01A', 1647), ('N02B', 1643), ('A06A', 1528), ('

"Statistics of sample dataset:\n\t- Dataset: MIMIC3Dataset\n\t- Task: drug_recommendation_mimic3_fn\n\t- Number of samples: 1962\n\t- Number of patients: 885\n\t- Number of visits: 1962\n\t- Number of visits per patient: 2.2169\n\t- conditions:\n\t\t- Number of conditions per sample: 18.8710\n\t\t- Number of unique conditions: 2077\n\t\t- Distribution of conditions (Top-10): [('4019', 1211), ('4280', 978), ('41401', 760), ('42731', 744), ('5849', 568), ('25000', 529), ('53081', 451), ('2720', 382), ('2724', 381), ('40391', 377)]\n\t- procedures:\n\t\t- Number of procedures per sample: 6.9490\n\t\t- Number of unique procedures: 730\n\t\t- Distribution of procedures (Top-10): [('3893', 1001), ('9904', 746), ('9604', 506), ('3995', 487), ('9671', 463), ('966', 457), ('3891', 278), ('9672', 275), ('3961', 258), ('9915', 243)]\n\t- drugs:\n\t\t- Number of drugs per sample: 24.4715\n\t\t- Number of unique drugs: 178\n\t\t- Distribution of drugs (Top-10): [('A02B', 1756), ('B05X', 1749), ('B0

In [11]:
# create dataloaders (torch.data.DataLoader)
train_loader = get_dataloader(train_ds, batch_size=32, shuffle=True)
val_loader = get_dataloader(val_ds, batch_size=32, shuffle=False)
test_loader = get_dataloader(test_ds, batch_size=32, shuffle=False)

In [12]:
len(train_ds), len(val_ds), len(test_ds)

(1193, 381, 388)

## Model to test

Here we can use the models having the same input built in the previous cells. I tried GRASP, SafeDrug, Transformer. The first two are very slow, Transformer is the fastest.

### Transformer

In [43]:
from pyhealth.models import Transformer

model = Transformer(
    dataset=mimic3sample,
    feature_keys=["conditions", "procedures"],
    label_key="drugs",
    mode="multilabel",
)

### SafeDrug

**- SafeDrug senza sintomi**

In [22]:
from pyhealth.models import SafeDrug

model = SafeDrug(
    dataset=mimic3sample
)

**- SafeDrug con sintomi**

In [19]:
from pyhealth.models import SafeDrug_Mod

model = SafeDrug_Mod(
    dataset=mimic3sample
)

### Training

In [23]:
from pyhealth.trainer import Trainer

trainer = Trainer(model=model)
trainer.train(
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    epochs=5,
    optimizer_params = {"lr": 2 * 1e-4},
    monitor="pr_auc_samples",
)

SafeDrug(
  (embeddings): ModuleDict(
    (conditions): Embedding(2079, 128, padding_idx=0)
    (procedures): Embedding(732, 128, padding_idx=0)
  )
  (cond_rnn): GRU(128, 128, batch_first=True)
  (proc_rnn): GRU(128, 128, batch_first=True)
  (query): Sequential(
    (0): ReLU()
    (1): Linear(in_features=256, out_features=128, bias=True)
  )
  (safedrug): SafeDrugLayer(
    (bipartite_transform): Linear(in_features=128, out_features=315, bias=True)
    (bipartite_output): Linear(in_features=315, out_features=178, bias=True)
    (mpnn): MolecularGraphNeuralNetwork(
      (embed_fingerprint): Embedding(226, 128)
      (W_fingerprint): ModuleList(
        (0-1): 2 x Linear(in_features=128, out_features=128, bias=True)
      )
    )
    (mpnn_output): Linear(in_features=178, out_features=178, bias=True)
    (mpnn_layernorm): LayerNorm((178,), eps=1e-05, elementwise_affine=True)
    (test): Linear(in_features=128, out_features=178, bias=True)
    (loss_fn): BCEWithLogitsLoss()
  )
)
Metri

Epoch 0 / 5:   0%|          | 0/38 [00:00<?, ?it/s]

--- Train epoch-0, step-38 ---
loss: 0.5208
ddi_loss: 0.0000


Evaluation: 100%|██████████| 12/12 [00:03<00:00,  3.68it/s]


--- Eval epoch-0, step-38 ---
pr_auc_samples: 0.5522
loss: 0.3982
New best pr_auc_samples score (0.5522) at epoch-0, step-38



Epoch 1 / 5:   0%|          | 0/38 [00:00<?, ?it/s]

--- Train epoch-1, step-76 ---
loss: 0.3322
ddi_loss: 0.0000


Evaluation: 100%|██████████| 12/12 [00:02<00:00,  4.22it/s]

--- Eval epoch-1, step-76 ---
pr_auc_samples: 0.6331
loss: 0.2953
New best pr_auc_samples score (0.6331) at epoch-1, step-76








Epoch 2 / 5:   0%|          | 0/38 [00:00<?, ?it/s]

--- Train epoch-2, step-114 ---
loss: 0.2730
ddi_loss: 0.0000


Evaluation: 100%|██████████| 12/12 [00:02<00:00,  4.29it/s]

--- Eval epoch-2, step-114 ---
pr_auc_samples: 0.6446
loss: 0.2738
New best pr_auc_samples score (0.6446) at epoch-2, step-114








Epoch 3 / 5:   0%|          | 0/38 [00:00<?, ?it/s]

--- Train epoch-3, step-152 ---
loss: 0.2623
ddi_loss: 0.0000


Evaluation: 100%|██████████| 12/12 [00:02<00:00,  4.33it/s]

--- Eval epoch-3, step-152 ---
pr_auc_samples: 0.6439
loss: 0.2732






Epoch 4 / 5:   0%|          | 0/38 [00:00<?, ?it/s]

--- Train epoch-4, step-190 ---
loss: 0.2600
ddi_loss: 0.0000


Evaluation: 100%|██████████| 12/12 [00:02<00:00,  4.30it/s]

--- Eval epoch-4, step-190 ---
pr_auc_samples: 0.6453
loss: 0.2716
New best pr_auc_samples score (0.6453) at epoch-4, step-190





Loaded best model


### Evaluation

In [21]:
### With Symptoms - loss 0.2710 - time: 2m 18s
trainer.evaluate(test_loader)

# method 2
from pyhealth.metrics import multilabel_metrics_fn

y_true, y_prob, loss = trainer.inference(test_loader)
metrics = ["f1_samples", "pr_auc_samples", "jaccard_samples"]
multilabel_metrics_fn(y_true, y_prob, metrics=metrics)

Evaluation: 100%|██████████| 13/13 [00:03<00:00,  4.01it/s]
Evaluation: 100%|██████████| 13/13 [00:02<00:00,  4.33it/s]


{'f1_samples': 0.47480333526127183,
 'pr_auc_samples': 0.6429012943356481,
 'jaccard_samples': 0.317317914970469}

In [24]:
### Without Symptoms - loss 0.2716 - time: 2m 2s
trainer.evaluate(test_loader)

# method 2
from pyhealth.metrics import multilabel_metrics_fn

y_true, y_prob, loss = trainer.inference(test_loader)
metrics = ["f1_samples", "pr_auc_samples", "jaccard_samples"]
multilabel_metrics_fn(y_true, y_prob, metrics=metrics)


Evaluation: 100%|██████████| 13/13 [00:03<00:00,  4.31it/s]
Evaluation: 100%|██████████| 13/13 [00:03<00:00,  4.09it/s]


{'f1_samples': 0.47476754838659313,
 'pr_auc_samples': 0.6422909124891031,
 'jaccard_samples': 0.31729322056397047}