In [None]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf

def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    Args:
        obj: Any picklable Python object.
        name: Target path without the ``.pkl`` extension (it is appended here).

    The file is opened in binary write mode, so an existing file at that path
    is overwritten. The highest pickle protocol available is used.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load a pickled object from the file ``name + '.pkl'``.

    Args:
        name: Path without the ``.pkl`` extension (it is appended here).

    Returns:
        The unpickled object, or ``False`` when the file does not exist.
        Because ``False`` doubles as the "missing file" sentinel, a pickled
        ``False`` cannot be told apart from a missing file by the caller.

    Any other error (unreadable file, corrupt pickle) propagates.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load files that this project produced itself.
    try:
        with open(name + '.pkl', 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return False

In [None]:
# Mount Google Drive into the Colab runtime so that the /content/drive
# paths used by the load/save cells of this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the pickle used for the smaller amount of data, which still needs to
# be processed and padded.
# NOTE(review): the following cell assigns the same variable from a different
# pickle, so running both in order keeps only that second result - confirm
# which of the two loads is meant to be active.
data_no_observations = load_obj("/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete_0_3_FINAL")

In [None]:
# Alternative load: array only. The file name suggests the sequences in this
# pickle are already padded - TODO confirm.
# This overwrites whatever `data_no_observations` held before.
data_no_observations = load_obj("/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete_0_3_FINAL_PADDED_ARRAY")

In [None]:
# Check that the loaded data looks right by displaying its first two rows.
# NOTE(review): load_obj returns False when the file is missing, in which
# case this line fails with AttributeError instead of a clear message.
data_no_observations.head(2)

Unnamed: 0,test_col,REASONDESCRIPTION
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Natural death with unknown cause


In [None]:
# Keep only the sequence vectors ('test_col'); they are the training input of
# the GRU/LSTM model. The death reasons are used in classification.
data_no_observations = data_no_observations['test_col']

In [None]:
# Unwrap the pandas column into a plain NumPy array (via its backing array).
data_no_observations = data_no_observations.array.to_numpy()

In [None]:
# Check the feature dimension: length of the first vector of the sequence at
# index 1500 (presumably an arbitrary sample - any index would do).
len(data_no_observations[1500][0])

789

In [None]:
# Sequence geometry: time steps per padded sequence, features per time step.
input_seq_len, num_of_features = 30, 789

# Model size.
latent_dim = 256  # Latent dimensionality of the encoding space.

# Training settings.
# NOTE(review): the training cell passes a literal epochs=50 to model.fit
# rather than `epochs` - confirm which value is intended.
epochs = 5  # Number of epochs to train for.
batch_size = 64  # Batch size for training.

In [None]:
# Define the architecture of the GRU sequence autoencoder.
# NOTE(review): `tf` is already imported in the first cell, so the re-import
# below is redundant (harmless); `keras` could move to that import cell too.
import tensorflow as tf
from tensorflow import keras

# Encoder: reads an (input_seq_len, num_of_features) sequence and compresses
# it into a single latent_dim-sized vector.
encoder_inputs = keras.Input(shape=(input_seq_len, num_of_features))
encoder = keras.layers.GRU(latent_dim, return_sequences=False, return_state=True)
# With return_sequences=False and return_state=True the GRU returns its last
# output and its final state; for a GRU these hold the same values.
encoder_outputs, state_h = encoder(encoder_inputs)

encoder_states = [state_h]
# Present the latent vector to the decoder at each of the input_seq_len steps.
repeat_vector = keras.layers.RepeatVector(input_seq_len)(encoder_outputs)

# Decoder: starts from the encoder state and returns the full output sequence.
decoder_gru = keras.layers.GRU(latent_dim, return_sequences=True, return_state=True)
# The decoder's final state is not needed, hence the throwaway `_`.
decoder_outputs, _ = decoder_gru(repeat_vector, initial_state=encoder_states)
# Project every time step back to num_of_features values.
# NOTE(review): softmax makes the outputs of each step sum to 1 while training
# uses MSE against the inputs - confirm this matches how the input vectors
# are encoded (e.g. at most one active feature per step).
decoder_dense = keras.layers.TimeDistributed( keras.layers.Dense(num_of_features, activation="softmax"))
decoder_outputs = decoder_dense(decoder_outputs)

# The model that should learn to replicate its inputs at its outputs.

model = keras.Model([encoder_inputs], decoder_outputs)

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 30, 789)]    0                                            
__________________________________________________________________________________________________
gru (GRU)                       [(None, 256), (None, 804096      input_1[0][0]                    
__________________________________________________________________________________________________
repeat_vector (RepeatVector)    (None, 30, 256)      0           gru[0][0]                        
__________________________________________________________________________________________________
gru_1 (GRU)                     [(None, 30, 256), (N 394752      repeat_vector[0][0]              
                                                                 gru[0][1]                    

In [None]:
# Convert every individual sequence to an ndarray and collect them in one
# outer array.
# NOTE(review): if the sequences still differ in length at this point, NumPy
# cannot build a rectangular array (older versions warn and fall back to an
# object array, newer versions raise) - pad first in that case.
data_no_observations = np.asarray([np.array(xi) for xi in data_no_observations])
# Show the container type and the type of a single sequence.
print(type(data_no_observations))
print(type(data_no_observations[0]))

  return array(a, dtype, copy=False, order=order)


NameError: ignored

In [None]:
# Display the dimensions of the data array.
data_no_observations.shape

(18300, 30, 789)

In [None]:
# The GRU needs sequences of equal length, but patients differ in their number
# of encounters (largest is 378 - pregnant ladies - and lowest is 1). Every
# sequence is therefore brought to input_seq_len steps: shorter ones get zero
# vectors prepended, longer ones lose their earliest steps. Only the first
# 18000 sequences are kept.
# The length was 25 (as some average) originally and was changed to 30 for the
# big data set.
# NOTE(review): dtype='int32' drops any fractional part of the feature values;
# confirm that the features are integer-valued (e.g. 0/1 indicators).
data_no_observations = tf.keras.preprocessing.sequence.pad_sequences(
    np.array(data_no_observations[:18000]),
    maxlen=input_seq_len,
    padding='pre',
    truncating='pre',
    value=0.0,
    dtype='int32',
)

In [None]:
# Compile and train the autoencoder; the input sequences are also the targets.
# Timing note from an earlier configuration: 100 epochs on 20k patients with
# 25 steps each and 773 features per step took ~2.33 hours.
# In earlier revisions the training data was called `z` (now
# `data_no_observations`).
# NOTE(review): `epochs` from the configuration cell is not used here - the
# literal 50 below wins. Confirm which value is intended.
model.compile(
    optimizer="adam", loss=tf.keras.losses.MeanSquaredError(), metrics=["binary_accuracy"]
)
model.fit(
    [data_no_observations],
    data_no_observations,
    batch_size=batch_size,
    epochs=50,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f1338524810>

In [None]:
# Shape of a single padded sequence; expected to be
# (input_seq_len, num_of_features).
data_no_observations[0].shape

(20, 741)

In [None]:
# Check the trained model on a batch that holds only the first sequence.
first_sequence_batch = np.array([data_no_observations[0]])
pred = model.predict([first_sequence_batch])

In [None]:
# Random example: the reconstruction at time step 29 of the predicted
# sequence, followed by its single feature 774.
# For the first (smaller) data set the sixth value from the back was the
# largest one (order of e-01); that observation does not carry over to other
# data sets.
last_step_prediction = pred[0][29]
print(last_step_prediction)
print(last_step_prediction[774])

[2.48844742e-07 2.54318735e-07 1.30279361e-07 1.95978771e-07
 2.02391760e-07 2.74603593e-07 3.74588012e-07 2.50207819e-07
 3.55413619e-07 3.83734516e-07 2.97165997e-07 1.97948054e-07
 1.18265689e-07 2.27467723e-07 3.63826018e-07 3.32251034e-07
 1.69099962e-07 2.60585949e-07 2.03290313e-07 2.15519918e-07
 7.70639986e-07 2.11569585e-07 3.73365765e-07 3.00179266e-07
 2.38354815e-07 2.13611983e-07 3.67485768e-07 4.03380994e-07
 1.93316865e-07 3.81345330e-07 4.45437536e-07 8.01609360e-07
 2.75415829e-07 2.76618266e-07 1.94740167e-07 2.57096758e-07
 1.79828561e-07 2.37624434e-07 2.42180874e-07 3.24918943e-07
 8.55533654e-06 6.58730357e-07 1.80851714e-07 3.58094582e-07
 1.59191387e-07 2.65372790e-07 3.12715969e-07 1.97662349e-08
 1.95875629e-07 1.44452642e-07 5.33575894e-06 8.36690504e-08
 2.70594910e-07 2.30982423e-07 1.37284189e-07 1.81062930e-07
 1.52919228e-07 3.26179759e-07 2.46955807e-07 4.90876459e-07
 2.47942025e-05 1.79868522e-07 2.58038028e-07 2.36644993e-07
 2.08426414e-07 1.345409

In [None]:
# For comparison with the prediction: the input vector at time step 29 of the
# first sequence.
data_no_observations[0][29]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# Save the trained autoencoder to Google Drive (requires the Drive mount).
model.save('/content/drive/MyDrive/SIAP/models/full_gru_autoencoder_older_correct_16k_20_100')



INFO:tensorflow:Assets written to: /content/drive/MyDrive/SIAP/models/full_gru_autoencoder_older_correct_16k_20_100/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/SIAP/models/full_gru_autoencoder_older_correct_16k_20_100/assets


In [None]:
# Code for testing encoder extraction and use.
# model.layers[1] is the encoder GRU (index 0 is the input layer). It was
# created with return_state=True, so its `.output` is a list of two tensors:
# the last output and the final state.
encoder_layer_output = model.layers[1].output
type(encoder_layer_output)

list

In [None]:
# Stand-alone encoder: takes the autoencoder's input and returns the outputs
# of its (already trained) encoder GRU.
encoder_layer_trained = keras.Model(model.inputs, encoder_layer_output)

In [None]:
# Print the layers of the extracted encoder model.
encoder_layer_trained.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 25, 773)]         0         
_________________________________________________________________
gru_5 (GRU)                  [(None, 256), (None, 256) 791808    
Total params: 791,808
Trainable params: 791,808
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The encoder GRU was created with return_sequences=False and
# return_state=True, so it returns [last output, final state]; for a GRU these
# are the same vector. Print both for the first sequence to confirm it.
yhat = encoder_layer_trained.predict([np.array([data_no_observations[0]])])
print(yhat[1][0])
print("------------------------------------------------------------")
print(yhat[0][0])

[-1.06442757e-02  2.02435255e-02 -1.80304479e-02 -4.75575142e-02
  6.55461103e-02 -3.47465500e-02  2.09050090e-03  2.61925980e-02
 -7.84722902e-03  2.11736187e-04  4.30897810e-03  4.88354936e-02
  1.83283128e-02 -3.37496176e-02 -1.11839503e-01 -1.65849030e-02
  7.95850903e-03 -3.05578113e-02 -1.58472732e-02 -6.88725710e-02
 -3.62739190e-02 -7.50635713e-02  1.13790436e-02  6.64694142e-03
 -5.03522381e-02 -1.19717745e-03  4.58245650e-02 -3.51478942e-02
  2.69196462e-02  1.69833731e-02 -1.24788824e-02  1.95391830e-02
  4.49730791e-02 -1.60837919e-02 -8.56579244e-02  9.95591190e-03
 -9.05110240e-02 -1.44101046e-02  2.49181390e-02  6.03699423e-02
  2.94272155e-02  9.81521048e-03 -3.86529462e-03 -1.01325251e-02
  2.77904849e-02 -4.80142720e-02  4.54416461e-02 -3.55549939e-02
  7.29211513e-03 -9.25499946e-03  1.56723689e-02 -1.89871192e-02
 -1.82648432e-02 -1.01485565e-01 -4.27753329e-02  2.05858983e-02
  4.58098538e-02  4.76474017e-02  2.61290073e-02 -7.02462047e-02
 -1.00957807e-02  3.62652