In this notebook, we load and display data at various processing stages: raw `FakeRecord`, sequential `FakePatient`, vectorized `GenericEventSequence`, and sparse-encoded `tf.SequenceExample`.

In [None]:
#@title Imports
# Protocol buffers must first be compiled using `protoc`: see
# https://developers.google.com/protocol-buffers/docs/pythontutorial for details.
import os

import tensorflow.compat.v1 as tf

from ehr_prediction_modeling.proto import fake_generic_representation_pb2
from ehr_prediction_modeling.proto import fake_patient_pb2
# This import was previously fused onto the end of the comment above, so it
# never executed and `fake_records_pb2` was undefined when reading records.
from ehr_prediction_modeling.proto import fake_records_pb2

# The notebook uses TF1-style graphs and sessions, so v2 behavior is disabled.
tf.disable_v2_behavior()


In [None]:
# Root directory of the fake dataset; every file path used in the cells below
# is built by joining onto it. The value here is a placeholder to be edited.
data_dirpath = "path/to/directory/that/stores/data"  # @param

### Read Records

In [None]:
# Load the serialized `FakeRecords` message and keep its repeated `records` field.
raw_data_path = os.path.join(data_dirpath, "fake_raw_records.pb")

with open(raw_data_path, "rb") as f:
  serialized_records = f.read()

records = fake_records_pb2.FakeRecords.FromString(serialized_records).records


In [None]:
# Report how many raw records were loaded and show the first one.
records_summary = f"There are {len(records)} records in the fake dataset.\nOne example is:\n\n{records[0]}"
print(records_summary)

### Read Patients

In [None]:
# Load the serialized `FakePatients` message and keep its repeated `patients` field.
patient_path = os.path.join(data_dirpath, "fake_patients.pb")

with open(patient_path, "rb") as f:
  serialized_patients = f.read()

patients = fake_patient_pb2.FakePatients.FromString(serialized_patients).patients


In [None]:
# Report how many patients were loaded and show the first one.
patients_summary = f"There are {len(patients)} patients in the fake dataset.\nOne example is:\n\n{patients[0]}"
print(patients_summary)

In [None]:
# Collect the admission payload of every episode whose `episode_type` oneof
# is set to "admission", across all patients.
all_admissions = [
    episode.admission
    for patient in patients
    for episode in patient.episodes
    if episode.WhichOneof("episode_type") == "admission"
]

admissions_summary = f"There are {len(all_admissions)} admissions in the fake dataset.\nOne example is:\n\n{all_admissions[0]}"
print(admissions_summary)

In [None]:
# Flatten the clinical events of every admission episode, across all patients,
# into a single list (patient order, then episode order, then event order).
all_clinical_events = [
    event
    for patient in patients
    for episode in patient.episodes
    if episode.WhichOneof("episode_type") == "admission"
    for event in episode.admission.clinical_events
]

clinical_events_summary = f"There are {len(all_clinical_events)} clinical events in admission in the fake dataset.\nOne example is:\n\n{all_clinical_events[0]}"
print(clinical_events_summary)

### Read GenericEventSequence

In [None]:
# Load the serialized `FakeGenericEventSequences` message and keep its
# repeated `generic_event_sequences` field.
vectorized_path = os.path.join(data_dirpath, "vectorized", "fake_vectorized_samples.pb")

with open(vectorized_path, "rb") as f:
  serialized_sequences = f.read()

event_sequences = fake_generic_representation_pb2.FakeGenericEventSequences.FromString(
    serialized_sequences
).generic_event_sequences


In [None]:
# Report how many event sequences were loaded and show the first one.
event_sequences_summary = f"There are {len(event_sequences)} event sequences in the fake dataset.\nOne example is:\n\n{event_sequences[0]}"
print(event_sequences_summary)

### Read `tf.SequenceExample`

In [None]:
split = "train"  # @param

# TFRecord file for the chosen split: <data_dirpath>/standardize/<split>.tfrecords.
# Path components are passed separately so `os.path.join` picks the separator.
seqex_path = os.path.join(data_dirpath, "standardize", f"{split}.tfrecords")
filenames = [seqex_path]
raw_dataset = tf.data.TFRecordDataset(filenames)

# Build all graph ops here, before any session is opened.
# `tf.data.make_initializable_iterator` and `tf.global_variables_initializer`
# replace the deprecated `Dataset.make_initializable_iterator()` method and
# `tf.initialize_all_variables()`.
iterator = tf.data.make_initializable_iterator(raw_dataset)
init = tf.global_variables_initializer()
# The dataset is not batched, so despite its name `batch` yields one serialized
# record per `sess.run` call.
batch = iterator.get_next()

In [None]:
# Read every serialized record from the TFRecord dataset and parse each one
# into a `tf.train.SequenceExample` proto.
all_seqexs = []
with tf.train.MonitoredTrainingSession() as sess:
  sess.run(iterator.initializer)
  sess.run(init)
  # There is deliberately no explicit exit condition: when the dataset is
  # exhausted, `sess.run(batch)` raises `tf.errors.OutOfRangeError`, which ends
  # the loop. Per the TensorFlow docs, a MonitoredSession used as a context
  # manager suppresses that error on exit, so the cell finishes cleanly.
  while True:
    serialized_seqex = sess.run(batch)
    all_seqexs.append(tf.train.SequenceExample.FromString(serialized_seqex))

In [None]:
# Report how many sequence examples were read for the split and show the first one.
seqexs_summary = f"There are {len(all_seqexs)} sequence examples in the fake dataset for {split} split.\nOne example is:\n\n{all_seqexs[0]}"
print(seqexs_summary)