In [None]:
import pandas as pd
import numpy as np

import keras
import keras.backend as K
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import datasets, layers, models, losses, Model, regularizers, optimizers, metrics
from random import randint
from sklearn.model_selection import train_test_split

In [None]:
# Environment check: report the TensorFlow version in use and how many GPU
# devices TensorFlow can see.
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices(device_type='GPU')))

# Helper Functions

In [None]:
# Unreduced Keras cosine-similarity loss. Reduction is disabled here so that
# CosineEmbeddingLoss below receives one value per sample and can apply its
# own reduction.
cosine_similarity_loss = keras.losses.CosineSimilarity(reduction=tf.keras.losses.Reduction.NONE)

class CosineEmbeddingLoss(tf.keras.losses.Loss):
    """Cosine-distance loss: ``1 - cosine_similarity(y_true, y_pred)``.

    ``call`` returns the un-reduced, per-sample distances; the base
    ``tf.keras.losses.Loss`` machinery then applies ``sample_weight`` and the
    ``reduction`` chosen at construction time. With the default reduction the
    resulting value is the batch mean of the distances.

    Args:
        reduction: Reduction applied by the base class to the per-sample
            values returned from ``call``.
        name: Name of the loss instance.
    """

    def __init__(self, reduction=tf.keras.losses.Reduction.AUTO, name="CosineEmbeddingLoss"):
        super().__init__(reduction=reduction, name=name)

    def call(self, y_true, y_pred):
        """Return ``1 - cosine_similarity`` for each sample (no reduction)."""
        # The Keras loss yields the *negative* cosine similarity, so flip the
        # sign back to obtain the similarity itself.
        similarity = -cosine_similarity_loss(y_true, y_pred)
        # Do not reduce here: collapsing to a scalar would make the
        # `reduction` argument and any `sample_weight` ineffective.
        return 1. - similarity

In [None]:
def get_autoencoder_model(data_shape,latent_size = 128, regularizer_term = 10e-4):
    """Build, compile and summarise a stacked dense autoencoder.

    The encoder flattens its input and narrows it through Dense layers of
    1028 -> 512 -> 256 -> ``latent_size`` units; the decoder mirrors that and
    ends in a sigmoid layer reshaped back to the per-sample input shape.
    NOTE(review): 1028 may have been intended as 1024 -- confirm before
    changing, since it alters the architecture.

    Args:
        data_shape: Shape of the full data set, ``(n_samples, *feature_dims)``.
            Only ``data_shape[1:]`` (the per-sample shape) is used.
        latent_size: Number of units in the encoder's output (bottleneck) layer.
        regularizer_term: Factor of the L2 activity regulariser attached to
            every Dense layer. The default ``10e-4`` is equal to ``1e-3``.

    Returns:
        Tuple ``(stacked_autoencoder, stacked_encoder, stacked_decoder)``. The
        autoencoder is compiled with Adam (learning rate 0.001) and
        ``CosineEmbeddingLoss``; the encoder and decoder are the very same
        objects it is composed of, so they share its weights.
    """
    feature_shape = tuple(data_shape[1:])
    # The encoder flattens each sample, so the decoder has to emit one unit per
    # input *element* before reshaping. Using only data_shape[1] would not
    # match the Reshape target for inputs with more than one feature axis.
    n_features = int(np.prod(feature_shape))

    def _dense(units, activation='selu'):
        # Each layer gets its own regulariser instance.
        return layers.Dense(units, activation=activation,
                            activity_regularizer=regularizers.l2(regularizer_term))

    stacked_encoder = models.Sequential([
        layers.Flatten(),
        _dense(1028),
        _dense(512),
        _dense(256),
        _dense(latent_size),
    ])

    stacked_decoder = models.Sequential([
        layers.Flatten(),
        _dense(256),
        _dense(512),
        _dense(1028),
        _dense(n_features, activation='sigmoid'),
        layers.Reshape(feature_shape)
    ])

    stacked_autoencoder = models.Sequential([
        stacked_encoder,
        stacked_decoder
    ])

    # Build with the full per-sample shape (batch dimension left open).
    stacked_autoencoder.build((None,) + feature_shape)
    stacked_autoencoder.compile(optimizers.Adam(learning_rate = 0.001), loss=CosineEmbeddingLoss())
    stacked_autoencoder.summary()

    return stacked_autoencoder, stacked_encoder, stacked_decoder

In [None]:
def get_data(file):
    """Load a CSV into a float DataFrame indexed by its ``MRN`` column.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``. The
            CSV must contain an ``MRN`` column.

    Returns:
        DataFrame indexed by ``MRN`` with every remaining column cast to float.
    """
    return pd.read_csv(file).set_index('MRN').astype('float')

# Train Model

In [None]:
# Run configuration: label used in plot titles, the CSV to read, and the CSV
# the generated embeddings are written to.
title = 'Diagnoses Features'
input_file = 'processed_data/diag_data.csv'
output_file = 'processed_data/4_21_retrain_features_diagnoses_stack_autoencoder.csv'

# Autoencoder hyper-parameters.
latent_size = 128
regularizer_term = 1e-5  # identical value to the 10e-6 spelling

In [None]:
# Load the feature matrix and build the autoencoder. The hyper-parameters come
# from the configuration variables (`latent_size`, `regularizer_term`) rather
# than literals, so changing the configuration cell actually changes the model.
data = get_data(input_file)
model, stacked_encoder, stacked_decoder = get_autoencoder_model(data.shape, latent_size = latent_size,
                                                                    regularizer_term = regularizer_term)

# Train
# The model reconstructs its own input (data -> data). Early stopping watches
# the training loss, since no validation data is passed to `fit`.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20)
history = model.fit(data, data, epochs=300, batch_size=256, callbacks=[callback])

In [None]:
# Plot the per-epoch training loss recorded in `history`.
fig, axs = plt.subplots(figsize=(7, 7))
axs.plot(history.history['loss'])
axs.title.set_text(f'Training Loss For {title}')
axs.set(xlabel='Epochs', ylabel='Loss')
axs.legend(['Train'])

# Generate Features and Save Output

In [None]:
# Predict
# Wrap the encoder from its inputs up to the output of its layer at index 4,
# then run the whole data matrix through it in a single call.
features_extractor_autoencoder = Model(
    inputs=stacked_encoder.inputs,
    outputs=stacked_encoder.get_layer(index=4).output,
)
features = features_extractor_autoencoder(data.values)

# Save
# One 'diag_embed<k>' column per embedding dimension (k starts at 1), with the
# MRN index of `data` re-attached as the first column.
features_df = pd.DataFrame(
    data=features.numpy(),
    columns=['diag_embed' + str(k) for k in range(1, features.shape[1] + 1)],
)
features_df.insert(0, 'MRN', data.index)
features_df.to_csv(output_file, index=False)

In [None]:
# Persist the feature-extractor model (not the full autoencoder) to disk.
features_extractor_autoencoder.save(filepath="models_4_21/diag_autoencoder_model.keras")

In [None]:
# Display the path the feature CSV was written to.
output_file

# Use All of Us Model

In [None]:
# input_file = 'processed_data/diag_data.csv'
# output_file = 'processed_data/features_diagnoses_stack_autoencoder.csv'
# title = 'Diagnoses Features'

# data = get_data(input_file)

# reconstructed_model = keras.models.load_model("models/diag_autoencoder_model_ALL_OF_US.keras")

# features = reconstructed_model(data.values)

# features_df = pd.DataFrame(data = features.numpy(), 
#                     columns = [('diag_embed' + str(i+1)) for i in range(features.shape[1])]) 
# features_df.insert(0, 'MRN', data.index)
# features_df.to_csv(output_file, index = False)