In [None]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow import keras

def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    Args:
        obj: Any picklable object.
        name: Target path without the ``.pkl`` extension.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        # Highest protocol available to the running interpreter.
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load the object pickled in ``name + '.pkl'``.

    Args:
        name: Source path without the ``.pkl`` extension.

    Returns:
        The unpickled object, or ``False`` when the file does not exist.

    Note:
        Unpickling can execute arbitrary code; only load files you trust.
    """
    source_path = name + '.pkl'
    try:
        with open(source_path, 'rb') as handle:
            loaded = pickle.load(handle)
    except FileNotFoundError:
        # A missing file is reported through the sentinel, not an exception.
        return False
    return loaded

In [None]:
# Mount Google Drive (Colab only); the model and data paths used in this
# notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Here we use a pre-trained encoder model to create features for every patient's medical history.
# Those features are then used for the classification task, which is done with an MLP model.

In [None]:
# Load pre-trained autoencoder model. 
# Only its encoder part is reused later in the notebook to turn sequences into vectors.
model = keras.models.load_model('/content/drive/MyDrive/SIAP/models/full_gru_autoencoder_older_correct_18k_30_100')

In [None]:
# Load data.
# NOTE: load_obj returns False (it does not raise) when the file is missing,
# so a wrong path only shows up as an error in a later cell.
data_no_observations = load_obj("/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete_0_3_FINAL_PADDED_ARRAY")

In [None]:
# Load the labels that accompany the inputs; entries without a usable label
# are filtered out in the following cells.
data_labels = load_obj("/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete_0_3_LABELS_18300")

In [None]:
# Rows whose label is missing (NaN/None) must also be removed from the inputs.
# Collect their *positions* with the same missing-value rule that `dropna()`
# applies to the labels, so inputs and labels stay aligned row by row.
# (The previous loop tested "is not a str" and used label-style indexing
# `data_labels[i]`, which can disagree with `dropna()`.)
to_delete = np.flatnonzero(np.asarray(pd.isna(data_labels))).tolist()

In [None]:
# Drop the input rows flagged in to_delete so inputs and labels stay aligned.
# NOTE: this rebinds data_no_observations, so running the cell a second time
# removes the same positions again from the already-filtered array.
data_no_observations = np.delete(data_no_observations, to_delete, axis=0)
len(data_no_observations)

16561

In [None]:
# Remove missing labels and convert them to a NumPy array.
# NOTE: data_labels is rebound to an ndarray here, which has no dropna();
# the cell cannot be re-run without reloading data_labels first.
data_labels = data_labels.dropna()
data_labels = data_labels.to_numpy()
type(data_labels)

numpy.ndarray

In [None]:
# Collect the distinct death reasons; `unique` later becomes labels_unique,
# which the one-hot encoder is fitted on.
df = pd.DataFrame(data_labels, columns = ['REASONDESCRIPTION'])
unique = df['REASONDESCRIPTION'].unique()

In [None]:
# Sanity check: should equal the number of input rows kept after filtering.
len(data_labels)

16561

In [None]:
# Hard-coded list of the 29 death reasons, kept for reference.
# NOTE: a later cell rebinds labels_unique from `unique`, so this literal is
# not what the one-hot encoder is fitted on.
labels_unique = ['Sudden Cardiac Death',
 'Natural death with unknown cause',
 'Myocardial Infarction',
 'COVID-19',
 'Chronic congestive heart failure (disorder)',
 'Malignant neoplasm of breast (disorder)',
 'Concussion injury of brain',
 'Pneumonia',
 'Stroke',
 'Secondary malignant neoplasm of colon',
 'Chronic obstructive bronchitis (disorder)',
 'Pulmonary emphysema (disorder)',
 "Alzheimer's disease (disorder)",
 'Fracture of the vertebral column with spinal cord injury',
 'Non-small cell lung cancer (disorder)',
 'Burn injury(morphologic abnormality)',
 'Small cell carcinoma of lung (disorder)',
 'Neoplasm of prostate',
 'Cardiac Arrest',
 'Malignant tumor of colon',
 'Primary malignant neoplasm of colon',
 'Overlapping malignant neoplasm of colon',
 "Familial Alzheimer's disease of early onset (disorder)",
 'Death due to acute respiratory failure',
 'Meningomyelocele (disorder)',
 'Sepsis of Pseudomonas',
 'Postoperative complication',
 'Death due to sepsis',
 'End stage renal disease (disorder)']
print(labels_unique)

In [None]:
# labels_unique = np.array(labels_unique)

# Column vector of the distinct death reasons found in the data; this is the
# array the one-hot encoder is fitted on.
labels_unique = unique.reshape(-1, 1)
labels_unique.shape

(29, 1)

In [None]:
# Get labels for classification output. 
# Note: used with first, smaller set of data (which was not prepared well)
# NOTE(review): legacy cell — it indexes data_no_observations by column name,
# while earlier cells treat data_no_observations as a NumPy array (np.delete),
# so this presumably fails on a fresh top-to-bottom run. `labels` is not
# referenced by any later visible cell; confirm and remove.
labels = np.array(data_no_observations['REASONDESCRIPTION'].unique())
labels = np.array(labels)
labels = np.delete(labels,3)
labels = labels.reshape(-1, 1)

# labels = np.delete(labels,3)
print(labels)
labels.shape

[['Sudden Cardiac Death']
 ['Natural death with unknown cause']
 ['Myocardial Infarction']
 ['COVID-19']
 ['Chronic congestive heart failure (disorder)']
 ['Malignant neoplasm of breast (disorder)']
 ['Concussion injury of brain']
 ['Pneumonia']
 ['Stroke']
 ['Secondary malignant neoplasm of colon']
 ['Chronic obstructive bronchitis (disorder)']
 ['Pulmonary emphysema (disorder)']
 ["Alzheimer's disease (disorder)"]
 ['Fracture of the vertebral column with spinal cord injury']
 ['Non-small cell lung cancer (disorder)']
 ['Burn injury(morphologic abnormality)']
 ['Small cell carcinoma of lung (disorder)']
 ['Neoplasm of prostate']
 ['Cardiac Arrest']
 ['Malignant tumor of colon']
 ['Primary malignant neoplasm of colon']
 ['Overlapping malignant neoplasm of colon']
 ["Familial Alzheimer's disease of early onset (disorder)"]
 ['Death due to acute respiratory failure']
 ['Meningomyelocele (disorder)']
 ['Sepsis of Pseudomonas']
 ['Postoperative complication']
 ['Death due to sepsis']
 ['En

(29, 1)

In [None]:
# One-hot-encode the labels.
from sklearn.preprocessing import OneHotEncoder

# Fit on the column of distinct death reasons. With the default settings the
# encoder raises on labels it has not seen (handle_unknown='error' in its repr).
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(labels_unique)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [None]:
def encode_labels(r):
    """One-hot encode the 'REASONDESCRIPTION' entry of a single row.

    The value is reshaped into a 2-D column, passed through the fitted
    module-level ``onehot_encoder`` and written back into ``r`` as a dense
    array. The same (mutated) row object is returned.
    """
    reason_column = np.array(r['REASONDESCRIPTION']).reshape(-1, 1)
    r['REASONDESCRIPTION'] = onehot_encoder.transform(reason_column).toarray()
    return r

In [None]:
# Remove all data where either death reason or vector is null (nan). 
# NOTE(review): legacy cell — dropna()/head() are DataFrame methods, while
# earlier cells handle data_no_observations as a NumPy array; presumably this
# only ran against an older DataFrame version of the data. Confirm before
# relying on `data`.
data = data_no_observations.dropna()
data.head()


Unnamed: 0,test_col,REASONDESCRIPTION
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Natural death with unknown cause
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Myocardial Infarction
4,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death


In [None]:
# Reshape the labels into a single column before passing them to the encoder.
data_labels = data_labels.reshape(-1, 1)
data_labels.shape

(16561, 1)

In [None]:
# These are the final labels (13.05.): dense one-hot rows, later split into
# y_train / y_test.
lab = onehot_encoder.transform(data_labels).toarray()

In [None]:
# NOTE(review): the result of apply() is not assigned, yet a later cell reads
# data['REASONDESCRIPTION'] expecting encoded values — this relies on
# encode_labels mutating the rows in place; confirm or assign the result.
data.apply(encode_labels, axis=1)
# data.head(5)

Unnamed: 0,test_col,REASONDESCRIPTION
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...
16834,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,..."
16836,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,..."
16837,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
16839,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [None]:
# Pull the encoded label column out as a NumPy array (one encoding per row).
labels_final = np.array(data['REASONDESCRIPTION'])

In [None]:
# Inspect the first encoded label.
labels_final[0]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [None]:
# Every entry of labels_final is a one-row 2-D array; keep only that row so
# the labels can be stacked into a plain 2-D array afterwards.
result = [row_encoding[0] for row_encoding in labels_final]
print(result)

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,


In [None]:
# Stack the per-row label vectors into a single NumPy array.
result = np.array(result)

In [None]:
# Earlier setup, not needed for pre-padded data
# reasons_of_death = data_no_observations['R_E_A_S_O_N_D_E_S_C_R_I_P_T_I_O_N']
# reasons_of_death.head(2)

# reasons_of_death = reasons_of_death.array
# reasons_of_death = reasons_of_death.to_numpy()
# reasons_of_death = np.asarray([np.array(xz) for xz in reasons_of_death])
# print(reasons_of_death)

# Prepare for x train -> z. Same as preparing data for training GRU autoencoder model. 
# NOTE(review): `z` is not used by any later visible cell (the encoder is fed
# data_no_observations directly), and maxlen=20 differs from the 30 time steps
# shown in the encoder summary — presumably legacy; confirm and remove.
# NOTE(review): dtype='int32' would truncate non-integer feature values — confirm.
data = data['test_col']
data = data.array
data = data.to_numpy()
y = np.asarray([np.array(xi) for xi in data])
x = np.array(y[0:len(result)])
z = tf.keras.preprocessing.sequence.pad_sequences(
    x, maxlen=20, dtype='int32', padding='pre',
    truncating='pre', value=0.0
)

  return array(a, dtype, copy=False, order=order)


In [None]:
# Prepare labels -> for ensemble (one model per class) -> NOT USED HERE. HERE WE USE ONE MULTI_LABEL CLASSIFICATION MODEL. 
# NOTE(review): legacy cell — it indexes data_no_observations by column name
# although earlier cells treat it as a NumPy array, and reasons_of_death is
# not used by any later visible cell.
data_no_observations['IS_SELECTED_DEATH_REASON'] = data_no_observations['R_E_A_S_O_N_D_E_S_C_R_I_P_T_I_O_N'] == 'Chronic congestive heart failure (disorder)'

reasons_of_death = data_no_observations['IS_SELECTED_DEATH_REASON'].to_numpy()
# reasons_of_death = reasons_of_death[7000:14000] # Should show WRONG results for label "1" in classification report
reasons_of_death = reasons_of_death[0:20000] # Should show GOOD results for label "1" in classification report
# Multiplying by 1 turns the boolean flags into 0/1 integers.
reasons_of_death = reasons_of_death * 1

Unnamed: 0_level_0,test_col_,R_E_A_S_O_N_D_E_S_C_R_I_P_T_I_O_N,IS_SELECTED_DEATH_REASON
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00032a55-fb87-c742-ad10-0773a82bb52b,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death,False
00058442-c52b-8e4e-6297-a4063fe79a14,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Natural death with unknown cause,False
000cca33-5892-7015-edb0-e714ac012990,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death,False
000d0b7f-6196-f285-a9cb-4ead2b5e04ea,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Myocardial Infarction,False
000ee730-2474-459a-72ea-f31892298013,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death,False


In [None]:
# Load model for making vectors (take encoder from autoencoder model)
# The encoder reuses the autoencoder's inputs and stops at layers[1], which
# the printed summary shows to be the GRU layer with two (None, 256) outputs.
encoder_layer_trained = keras.Model(model.inputs, model.layers[1].output)
encoder_layer_trained.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 30, 789)]         0         
_________________________________________________________________
gru (GRU)                    [(None, 256), (None, 256) 804096    
Total params: 804,096
Trainable params: 804,096
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Use the encoder to make vectors from the 'raw' data.
# X holds one array per encoder output.
X = encoder_layer_trained.predict([data_no_observations])

In [None]:
# Number of arrays returned by the encoder (one per model output).
len(X)

2

In [None]:
# Hold out 20% of the samples for testing. X[0] is the first encoder output,
# `lab` the one-hot labels; random_state makes the split reproducible.
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X[0], lab, test_size=0.2, random_state = 42)

In [None]:
# Define classification model. 
# Input -> vectors that are outputs of the encoder model.
# Outputs -> labels for one cause of death. An array of 0 & 1.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization
from keras.optimizers import Adam
from keras.utils import to_categorical
import tensorflow as tf

# Notes:
# BatchNormalization recall from 0.5 to 0.8
# tf.keras.losses.CategoricalCrossentropy(from_logits=True) 0.95 recall, loss 0 at start...

def build_model(input_dimension, num_classes=29):
    """Build and compile the MLP that classifies encoder vectors.

    Three Dense -> BatchNormalization -> SELU -> Dropout(0.2) stages
    (1024, 128 and 64 units) are followed by a softmax output layer.

    Args:
        input_dimension: Length of one input feature vector.
        num_classes: Number of output classes. Defaults to 29, the value
            that was previously hard-coded.

    Returns:
        A compiled Sequential model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    model.add(Dense(1024, input_shape=(input_dimension,)))
    model.add(BatchNormalization())
    model.add(Activation('selu'))
    model.add(Dropout(0.2))
    model.add(Dense(128))
    model.add(BatchNormalization())
    model.add(Activation('selu'))
    model.add(Dropout(0.2))
    model.add(Dense(64))
    model.add(BatchNormalization())
    model.add(Activation('selu'))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    # The network already ends in softmax, so the loss must receive
    # probabilities (from_logits=False, the default). The previous
    # BinaryCrossentropy(from_logits=True) treated the softmax probabilities
    # as raw logits. In this notebook the targets are one-hot rows (exactly
    # one class per sample), for which categorical cross-entropy is the
    # matching loss.
    model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), metrics=['accuracy'], optimizer='adam')
    return model

# Size the input layer from the feature dimension of the encoder vectors.
nn_model = build_model(X_train.shape[1])
nn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              263168    
_________________________________________________________________
batch_normalization (BatchNo (None, 1024)              4096      
_________________________________________________________________
activation (Activation)      (None, 1024)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               131200    
_________________________________________________________________
batch_normalization_1 (Batch (None, 128)               512       
_________________________________________________________________
activation_1 (Activation)    (None, 128)               0

In [None]:
num_epochs = 100
num_batch_size = 64
# nn_model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), verbose=1)

# Train with 20% of X_train held out for validation, so X_test stays unseen
# until the evaluation cells.
nn_model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_split=0.2, verbose=1)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fc8aae05290>

In [None]:
# Evaluating the model on the training and testing set
# score[1] is the 'accuracy' metric the model was compiled with; score[0] is the loss.
# NOTE(review): the recorded output shows a large train/test gap
# (87.18% vs 56.84%), which points to overfitting.
score = nn_model.evaluate(X_train, y_train, verbose=0)
print("Training Accuracy: {0:.2%}".format(score[1]))
score = nn_model.evaluate(X_test, y_test, verbose=0)
print("Testing Accuracy: {0:.2%}".format(score[1]))

Training Accuracy: 87.18%
Testing Accuracy: 56.84%


In [None]:
# Manually check random predictions. 
# NOTE: the index is hard-coded, so `random` is imported but never used here.
import random
random_index = 12
# random_index = 11

print("INDEX: ", random_index)
temp = np.asarray(X_test[random_index])
print("SHAPE", temp.shape)

# predict() expects a batch, so the single vector is reshaped to one row.
predict = nn_model.predict(temp.reshape(1, -1))
# Flag every class whose predicted probability exceeds 0.5.
t = predict > 0.5
print("predict", predict)
print("PREDICTED", t)
print("Y_TEST", y_test[random_index])

INDEX:  12
SHAPE (256,)
predict [[2.3227332e-07 6.4076573e-08 4.1467737e-07 5.0995004e-07 6.8089825e-08
  6.8509567e-06 3.3888537e-06 2.0221812e-07 3.4983768e-07 8.0588912e-08
  7.0178606e-07 5.3442659e-09 4.0926429e-07 1.8751276e-08 1.0151355e-07
  9.9988508e-01 9.5882548e-05 6.1024387e-08 3.1546314e-08 1.3883663e-07
  6.6983837e-08 9.4970353e-08 1.7436265e-07 1.2907236e-07 8.5912532e-08
  4.3632068e-07 8.3443538e-08 2.7408175e-06 1.6746534e-06]]
PREDICTED [[False False False False False False False False False False False False
  False False False  True False False False False False False False False
  False False False False False]]
Y_TEST [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0.]


In [None]:
from sklearn.metrics import classification_report
y_pred = nn_model.predict(X_test, batch_size=64, verbose=1)
# Turn probabilities into per-class boolean flags.
# NOTE(review): with a 0.5 threshold, a sample whose highest probability is
# below 0.5 gets no predicted class at all; taking the argmax may be the
# intended rule for one-label-per-sample targets — confirm.
y_pred = (y_pred > 0.5)


# zero_division=0 reports 0 instead of warning for classes without predictions/support.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.50      0.33      0.40         3
           1       0.00      0.00      0.00        34
           2       0.26      0.31      0.28       403
           3       0.00      0.00      0.00        90
           4       0.96      0.94      0.95       184
           5       0.10      0.02      0.03        53
           6       0.11      0.04      0.06        77
           7       0.00      0.00      0.00         7
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00        13
          11       0.00      0.00      0.00        37
          12       0.82      0.70      0.75        33
          13       0.64      0.86      0.73        21
          14       0.50      1.00      0.67         1
          15       0.33      0.20      0.25       302
          16       0.49      0.62      0.54      1035
          17       0.00    