# Modelling with PyHealth

## Using PyHealth and Try NOTEEVENTS_ICD

In [62]:
import pandas as pd

# carica i dati dai file csv
noteevents = pd.read_csv('data/NOTEEVENTS_ICD.csv')
patients = pd.read_csv('data/PATIENTS.csv')

# seleziona i pazienti presenti in NOTEEVENTS_ICD
selected_patients = patients[patients['SUBJECT_ID'].isin(noteevents['SUBJECT_ID'].unique())]

# salva il risultato in un file csv
selected_patients.to_csv('data/PATIENTS_SEL.csv', index=False)


In [1]:
from pyhealth.datasets import MIMIC3Dataset

dataset = MIMIC3Dataset(
    root='data/',
    tables=["DIAGNOSES_ICD","PROCEDURES_ICD","PRESCRIPTIONS","NOTEEVENTS_ICD"],
    code_mapping={"NDC": ("ATC", {"target_kwargs": {"level": 3}})},
)

In [2]:
dataset.stat()


Statistics of base dataset (dev=False):
	- Dataset: MIMIC3Dataset
	- Number of patients: 1553
	- Number of visits: 5004
	- Number of visits per patient: 3.2222
	- Number of events per visit in DIAGNOSES_ICD: 11.6063
	- Number of events per visit in PROCEDURES_ICD: 3.8767
	- Number of events per visit in PRESCRIPTIONS: 80.1681
	- Number of events per visit in NOTEEVENTS_ICD: 3.5548



'\nStatistics of base dataset (dev=False):\n\t- Dataset: MIMIC3Dataset\n\t- Number of patients: 1553\n\t- Number of visits: 5004\n\t- Number of visits per patient: 3.2222\n\t- Number of events per visit in DIAGNOSES_ICD: 11.6063\n\t- Number of events per visit in PROCEDURES_ICD: 3.8767\n\t- Number of events per visit in PRESCRIPTIONS: 80.1681\n\t- Number of events per visit in NOTEEVENTS_ICD: 3.5548\n'

In [3]:
# get patient dictionary
patient_dict = dataset.patients
print(list(patient_dict.keys()))
print(len(patient_dict))

['10055', '10071', '10077', '10144', '10160', '10174', '10197', '10207', '10254', '10285', '1029', '10302', '10304', '10310', '10328', '10342', '10369', '10414', '10434', '10442', '10478', '10487', '10488', '1050', '10594', '10612', '10618', '10634', '10635', '10637', '10660', '10679', '10687', '1069', '10742', '10774', '10820', '10832', '10859', '10884', '109', '10906', '10932', '10939', '10973', '10976', '10977', '11043', '11102', '11108', '11135', '11165', '11171', '112', '11205', '1122', '11234', '11236', '11242', '11255', '11318', '11342', '11346', '11421', '11442', '11464', '11479', '11588', '11590', '11604', '11623', '11634', '11716', '11763', '11764', '11778', '11818', '11825', '11830', '11838', '11861', '1187', '11876', '11885', '11892', '11897', '11932', '1197', '11970', '12039', '12076', '12110', '12113', '12125', '12149', '12198', '12203', '12217', '1223', '12281', '12344', '124', '12411', '12446', '12467', '12501', '12515', '12521', '12566', '12567', '12690', '12706', '127

In [4]:
# get the "10" patient
patient = patient_dict["25780"]
patient.gender, patient.birth_datetime, patient.ethnicity, patient.death_datetime, patient.visits
print(patient.visits)

OrderedDict([('120990', Visit 120990 from patient 25780 with 18 events from tables ['DIAGNOSES_ICD', 'PRESCRIPTIONS', 'NOTEEVENTS_ICD']), ('199988', Visit 199988 from patient 25780 with 102 events from tables ['DIAGNOSES_ICD', 'PROCEDURES_ICD', 'PRESCRIPTIONS', 'NOTEEVENTS_ICD'])])


In [5]:
# get the visit list of this patient
visit_dict = patient.visits
print (list(visit_dict.keys()))

['120990', '199988']


In [6]:
# get the first visit
visit = visit_dict['120990']
visit.encounter_time, visit.available_tables, visit.num_events, visit.event_list_dict

(datetime.datetime(2169, 5, 6, 19, 55),
 ['DIAGNOSES_ICD', 'PRESCRIPTIONS', 'NOTEEVENTS_ICD'],
 18,
 {'DIAGNOSES_ICD': [Event with ICD9CM code 56962 from table DIAGNOSES_ICD,
   Event with ICD9CM code 2761 from table DIAGNOSES_ICD,
   Event with ICD9CM code 2765 from table DIAGNOSES_ICD,
   Event with ICD9CM code 2762 from table DIAGNOSES_ICD,
   Event with ICD9CM code 5990 from table DIAGNOSES_ICD,
   Event with ICD9CM code 99664 from table DIAGNOSES_ICD,
   Event with ICD9CM code 6000 from table DIAGNOSES_ICD,
   Event with ICD9CM code E8796 from table DIAGNOSES_ICD,
   Event with ICD9CM code 2859 from table DIAGNOSES_ICD],
  'PRESCRIPTIONS': [Event with ATC code B05X from table PRESCRIPTIONS,
   Event with ATC code J01M from table PRESCRIPTIONS,
   Event with ATC code A02B from table PRESCRIPTIONS,
   Event with ATC code B01A from table PRESCRIPTIONS],
  'NOTEEVENTS_ICD': [Event with ICD9CM code V419 from table NOTEEVENTS_ICD,
   Event with ICD9CM code 78703 from table NOTEEVENTS_IC

In [7]:
visit.get_code_list(table='NOTEEVENTS_ICD')


['V419', '78703', '7804', '78650', '78702']

In [8]:
visit.get_code_list('NOTEEVENTS_ICD')

['V419', '78703', '7804', '78650', '78702']

In [9]:
from pyhealth.tasks import drug_recommendation_mimic3_fn
from pyhealth.datasets import split_by_patient, get_dataloader

mimic3sample = dataset.set_task(task_fn=drug_recommendation_mimic3_fn) # use default task
train_ds, val_ds, test_ds = split_by_patient(mimic3sample, [0.6, 0.2, 0.2])


Generating samples for drug_recommendation_mimic3_fn: 100%|██████████| 1553/1553 [00:00<00:00, 8757.89it/s]


In [10]:
mimic3sample.stat()

Statistics of sample dataset:
	- Dataset: MIMIC3Dataset
	- Task: drug_recommendation_mimic3_fn
	- Number of samples: 1962
	- Number of patients: 885
	- Number of visits: 1962
	- Number of visits per patient: 2.2169
	- conditions:
		- Number of conditions per sample: 18.8710
		- Number of unique conditions: 2077
		- Distribution of conditions (Top-10): [('4019', 1211), ('4280', 978), ('41401', 760), ('42731', 744), ('5849', 568), ('25000', 529), ('53081', 451), ('2720', 382), ('2724', 381), ('40391', 377)]
	- procedures:
		- Number of procedures per sample: 6.9490
		- Number of unique procedures: 730
		- Distribution of procedures (Top-10): [('3893', 1001), ('9904', 746), ('9604', 506), ('3995', 487), ('9671', 463), ('966', 457), ('3891', 278), ('9672', 275), ('3961', 258), ('9915', 243)]
	- drugs:
		- Number of drugs per sample: 24.4715
		- Number of unique drugs: 178
		- Distribution of drugs (Top-10): [('A02B', 1756), ('B05X', 1749), ('B01A', 1647), ('N02B', 1643), ('A06A', 1528), ('

"Statistics of sample dataset:\n\t- Dataset: MIMIC3Dataset\n\t- Task: drug_recommendation_mimic3_fn\n\t- Number of samples: 1962\n\t- Number of patients: 885\n\t- Number of visits: 1962\n\t- Number of visits per patient: 2.2169\n\t- conditions:\n\t\t- Number of conditions per sample: 18.8710\n\t\t- Number of unique conditions: 2077\n\t\t- Distribution of conditions (Top-10): [('4019', 1211), ('4280', 978), ('41401', 760), ('42731', 744), ('5849', 568), ('25000', 529), ('53081', 451), ('2720', 382), ('2724', 381), ('40391', 377)]\n\t- procedures:\n\t\t- Number of procedures per sample: 6.9490\n\t\t- Number of unique procedures: 730\n\t\t- Distribution of procedures (Top-10): [('3893', 1001), ('9904', 746), ('9604', 506), ('3995', 487), ('9671', 463), ('966', 457), ('3891', 278), ('9672', 275), ('3961', 258), ('9915', 243)]\n\t- drugs:\n\t\t- Number of drugs per sample: 24.4715\n\t\t- Number of unique drugs: 178\n\t\t- Distribution of drugs (Top-10): [('A02B', 1756), ('B05X', 1749), ('B0

In [11]:
# create dataloaders (torch.data.DataLoader)
train_loader = get_dataloader(train_ds, batch_size=32, shuffle=True)
val_loader = get_dataloader(val_ds, batch_size=32, shuffle=False)
test_loader = get_dataloader(test_ds, batch_size=32, shuffle=False)

In [12]:
len(train_ds), len(val_ds), len(test_ds)

(1168, 403, 391)

## Model to test

Here we can use the models having the same input built in the previous cells. I tried GRASP, SafeDrug, Transformer. The first two are very slow, Transformer is the fastest.

### Transformer

In [43]:
from pyhealth.models import Transformer

model = Transformer(
    dataset=mimic3sample,
    feature_keys=["conditions", "procedures"],
    label_key="drugs",
    mode="multilabel",
)

### SafeDrug

**- SafeDrug without symptoms**

In [13]:
from pyhealth.models import SafeDrug

model = SafeDrug(
    dataset=mimic3sample
)

**- SafeDrug with symptoms**

In [13]:
from pyhealth.models import SafeDrug_Mod

model = SafeDrug_Mod(
    dataset=mimic3sample
)

### Training

In [14]:
from pyhealth.trainer import Trainer

trainer = Trainer(model=model)
trainer.train(
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    epochs=5,
    optimizer_params = {"lr": 2 * 1e-4},
    monitor="pr_auc_samples",
)

SafeDrug(
  (embeddings): ModuleDict(
    (conditions): Embedding(2079, 128, padding_idx=0)
    (procedures): Embedding(732, 128, padding_idx=0)
  )
  (cond_rnn): GRU(128, 128, batch_first=True)
  (proc_rnn): GRU(128, 128, batch_first=True)
  (query): Sequential(
    (0): ReLU()
    (1): Linear(in_features=256, out_features=128, bias=True)
  )
  (safedrug): SafeDrugLayer(
    (bipartite_transform): Linear(in_features=128, out_features=315, bias=True)
    (bipartite_output): Linear(in_features=315, out_features=178, bias=True)
    (mpnn): MolecularGraphNeuralNetwork(
      (embed_fingerprint): Embedding(226, 128)
      (W_fingerprint): ModuleList(
        (0-1): 2 x Linear(in_features=128, out_features=128, bias=True)
      )
    )
    (mpnn_output): Linear(in_features=178, out_features=178, bias=True)
    (mpnn_layernorm): LayerNorm((178,), eps=1e-05, elementwise_affine=True)
    (test): Linear(in_features=128, out_features=178, bias=True)
    (loss_fn): BCEWithLogitsLoss()
  )
)
Metri

  from tqdm.autonotebook import trange





Epoch 0 / 5:   0%|          | 0/37 [00:00<?, ?it/s]

--- Train epoch-0, step-37 ---
loss: 0.5289
ddi_loss: 0.0548


Evaluation: 100%|██████████| 13/13 [00:04<00:00,  3.24it/s]

--- Eval epoch-0, step-37 ---
pr_auc_samples: 0.5774
loss: 0.4059
New best pr_auc_samples score (0.5774) at epoch-0, step-37








Epoch 1 / 5:   0%|          | 0/37 [00:00<?, ?it/s]

--- Train epoch-1, step-74 ---
loss: 0.3405
ddi_loss: 0.0292


Evaluation: 100%|██████████| 13/13 [00:03<00:00,  3.39it/s]

--- Eval epoch-1, step-74 ---
pr_auc_samples: 0.6485
loss: 0.2703
New best pr_auc_samples score (0.6485) at epoch-1, step-74








Epoch 2 / 5:   0%|          | 0/37 [00:00<?, ?it/s]

--- Train epoch-2, step-111 ---
loss: 0.2717
ddi_loss: 0.0208


Evaluation: 100%|██████████| 13/13 [00:04<00:00,  3.20it/s]

--- Eval epoch-2, step-111 ---
pr_auc_samples: 0.6574
loss: 0.2643
New best pr_auc_samples score (0.6574) at epoch-2, step-111








Epoch 3 / 5:   0%|          | 0/37 [00:00<?, ?it/s]

--- Train epoch-3, step-148 ---
loss: 0.2640
ddi_loss: 0.0208


Evaluation: 100%|██████████| 13/13 [00:05<00:00,  2.23it/s]


--- Eval epoch-3, step-148 ---
pr_auc_samples: 0.6599
loss: 0.2603
New best pr_auc_samples score (0.6599) at epoch-3, step-148



Epoch 4 / 5:   0%|          | 0/37 [00:00<?, ?it/s]

--- Train epoch-4, step-185 ---
loss: 0.2662
ddi_loss: 0.0210


Evaluation: 100%|██████████| 13/13 [00:03<00:00,  3.47it/s]

--- Eval epoch-4, step-185 ---
pr_auc_samples: 0.6588
loss: 0.2596
Loaded best model





### Evaluation

In [15]:
### With Symptoms - loss 0.2649 - time: 2m 37s
trainer.evaluate(test_loader)

from pyhealth.metrics import multilabel_metrics_fn

y_true, y_prob, loss = trainer.inference(test_loader)
metrics = ["f1_samples", "pr_auc_samples", "jaccard_samples"]
multilabel_metrics_fn(y_true, y_prob, metrics=metrics)

Evaluation: 100%|██████████| 13/13 [00:05<00:00,  2.50it/s]
Evaluation: 100%|██████████| 13/13 [00:05<00:00,  2.57it/s]


{'f1_samples': 0.4733677079900265,
 'pr_auc_samples': 0.6388179397601255,
 'jaccard_samples': 0.31659545475861445}

In [16]:
### Without Symptoms - loss 0.2596 - time: 2m 49s
trainer.evaluate(test_loader)

from pyhealth.metrics import multilabel_metrics_fn

y_true, y_prob, loss = trainer.inference(test_loader)
metrics = ["accuracy", "f1_samples", "pr_auc_samples", "jaccard_samples"]
multilabel_metrics_fn(y_true, y_prob, metrics=metrics)


Evaluation: 100%|██████████| 13/13 [00:03<00:00,  3.35it/s]
Evaluation: 100%|██████████| 13/13 [00:04<00:00,  3.03it/s]


{'accuracy': 0.8911319290784218,
 'f1_samples': 0.47190582468443326,
 'pr_auc_samples': 0.6436704810452132,
 'jaccard_samples': 0.3154415107144935}

In [58]:
import numpy as np

y_prob_round = np.round(y_prob, 0)

# Trova gli indici delle colonne con valori pari a 1 per ogni riga
column_indices = [np.where(row == 1)[0] for row in y_prob_round]

indexes_prob=[]
# Stampa gli indici delle colonne per ogni riga
for i, indices in enumerate(column_indices):
    indexes_prob.append(indices)

indexes_prob

[array([  2,  10,  26,  27,  29,  38,  50,  57, 132, 133, 175, 176],
       dtype=int64),
 array([  2,  10,  26,  27,  29,  38,  50,  57, 132, 133, 175, 176],
       dtype=int64),
 array([  2,  10,  26,  27,  29,  38,  57, 132, 133, 175, 176], dtype=int64),
 array([  2,  10,  26,  27,  29,  38,  50,  57, 132, 133, 175, 176],
       dtype=int64),
 array([  2,  10,  26,  27,  29,  38,  57, 132, 133, 175, 176], dtype=int64),
 array([  2,  10,  26,  27,  29,  38,  50,  57, 132, 133, 175, 176],
       dtype=int64),
 array([  2,  10,  26,  27,  29,  38,  57, 132, 133, 175, 176], dtype=int64),
 array([  2,  10,  26,  27,  29,  38,  50,  57, 132, 133, 175, 176],
       dtype=int64),
 array([  2,  10,  26,  27,  29,  38,  57, 132, 133, 175, 176], dtype=int64),
 array([  2,  10,  26,  27,  29,  38,  57, 132, 133, 175, 176], dtype=int64),
 array([  2,  10,  26,  27,  29,  38,  57, 132, 133, 175, 176], dtype=int64),
 array([  2,  10,  26,  27,  29,  38,  57, 132, 133, 175, 176], dtype=int64),
 arr

In [61]:
import pandas as pd
import numpy as np

# create empty lists to store the recommended drugs, patient ids, and visit ids
rec_drug = []
patient_ids = []
visit_ids = []

# get the list of all drugs in the dataset
list_drugs = mimic3sample.get_all_tokens('drugs')

# iterate over the top indexes for each sample in test_ds
for sample, top in zip(test_ds, indexes_prob):
    
    # append the patient id and visit id to their respective lists
    patient_ids.append(sample['patient_id'])
    visit_ids.append(sample['visit_id'])
    
    # create an empty list to store the recommended drugs for this sample
    sample_rec_drug = []
    
    # iterate over the top indexes for this sample
    for i in top:
        
        # append the drug at the i-th index to the recommended drugs list for this sample
        sample_rec_drug.append(list_drugs[i])
    
    # append the recommended drugs for this sample to the recommended drugs list
    rec_drug.append(sample_rec_drug)

# create a dataframe with the patient ids, visit ids, and recommended drugs
df_rec_drug = pd.DataFrame({'patient_id': patient_ids, 'visit_id': visit_ids, 'rec_drug': rec_drug})
df_rec_drug.to_csv('data/rec_drug.csv', index=False)


In [70]:
# Trova gli indici delle colonne con valori pari a 1 per ogni riga
column_indices = [np.where(row == 1)[0] for row in y_true]

indexes=[]
# Stampa gli indici delle colonne per ogni riga
for i, indices in enumerate(column_indices):
    indexes.append(indices)

indexes

[array([  0,   2,   3,   6,   7,  10,  19,  20,  25,  26,  27,  29,  32,
         33,  38,  40,  41,  50,  57,  63,  81,  97, 103, 108, 130, 132,
        133, 141, 144, 154, 166, 168, 175, 176], dtype=int64),
 array([  1,   2,   3,   4,   6,   7,  13,  19,  20,  25,  26,  27,  29,
         33,  36,  38,  41,  50,  55,  57,  60,  63,  64,  81,  90,  95,
         97, 107, 108, 118, 130, 132, 133, 140, 141, 155, 156, 158, 166,
        168, 175, 176], dtype=int64),
 array([  2,  29,  34,  38,  63,  99, 102, 105, 130, 133, 140, 176],
       dtype=int64),
 array([  0,   2,   3,   6,  10,  19,  20,  25,  26,  27,  35,  38,  41,
         50,  57,  63,  81, 103, 128, 130, 132, 133, 144, 154, 175, 176],
       dtype=int64),
 array([  0,   1,   2,  25,  27,  29,  38,  40,  57,  60,  63,  97, 126,
        130, 133, 140, 141, 154, 174, 175, 176], dtype=int64),
 array([  1,  10,  29,  32,  33,  40,  60,  63,  81,  90,  97, 103, 107,
        122, 128, 132, 133, 139, 174, 175], dtype=int64),
 array([ 

In [72]:
import pandas as pd
import numpy as np

# create empty lists to store the recommended drugs, patient ids, and visit ids
true_drug = []
patient_ids = []
visit_ids = []


# get the list of all drugs in the dataset
list_drugs = mimic3sample.get_all_tokens('drugs')

# iterate over the top indexes for each sample in test_ds
for sample, top in zip(test_ds, indexes):
    
    # append the patient id and visit id to their respective lists
    patient_ids.append(sample['patient_id'])
    visit_ids.append(sample['visit_id'])
    
    # create an empty list to store the recommended drugs for this sample
    sample_true_drug = []
    
    # iterate over the top indexes for this sample
    for i in top:
        
        # append the drug at the i-th index to the recommended drugs list for this sample
        sample_true_drug.append(list_drugs[i])
    
    # append the recommended drugs for this sample to the recommended drugs list
    true_drug.append(sample_true_drug)

# create a dataframe with the patient ids, visit ids, and recommended drugs
df_true_drug = pd.DataFrame({'patient_id': patient_ids, 'visit_id': visit_ids, 'true_drug': true_drug})
df_true_drug.to_csv('data/true_drug.csv', index=False)


In [73]:
from pyhealth.medcode import InnerMap

# initialize an InnerMap
atc = InnerMap.load("ATC")

# select a patient from df_rec_drug
patient_id = 31385
visit_id = 124462
patient_df = df_rec_drug[df_rec_drug['patient_id'].astype(int)==patient_id]
patient_df = patient_df[patient_df['visit_id'].astype(int)==visit_id]

# iterate over the recommended drugs for the selected patient
for drugs in patient_df['rec_drug']:
    # lookup the ATC code for each drug and print it
    for drug in drugs:
        print(f"{drug}: {atc.lookup(drug)}")


A02B: DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHAGEAL REFLUX DISEASE (GORD)
A06A: DRUGS FOR CONSTIPATION
A12B: POTASSIUM SUPPLEMENTS
A12C: OTHER MINERAL SUPPLEMENTS in ATC
B01A: ANTITHROMBOTIC AGENTS
B05X: I.V. SOLUTION ADDITIVES
C03C: HIGH-CEILING DIURETICS
C07A: BETA BLOCKING AGENTS
N02A: OPIOID ANALGESICS
N02B: OTHER ANALGESICS AND ANTIPYRETICS in ATC
V04C: OTHER DIAGNOSTIC AGENTS in ATC
V06D: OTHER NUTRIENTS in ATC
