In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn

from tensorflow import keras
from tensorflow.keras import regularizers

from keras.layers import Input, Add, GaussianNoise, MaxPooling1D, BatchNormalization, Dense, Dropout,Reshape,Flatten, Conv1D, AveragePooling1D
from keras.models import Model
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split

# Default figure size for every plot in this notebook, plus the active colour cycle.
mpl.rcParams.update({'figure.figsize': (12, 10)})
colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']

# Report the TensorFlow build and how many GPUs it can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print(tf.__version__)
print("Num GPUs Available: ", len(gpu_devices))

# Load Data

In [None]:
# Which autoencoder embeddings to load: "frozen" or "retrained".
AE_SOURCE = "frozen"

# (medications CSV, diagnoses CSV) for each supported AE_SOURCE value.
_AE_EMBEDDING_PATHS = {
    "frozen": (
        "./processed_data/drugs_stack_autoencoder.csv",
        "./processed_data/features_diagnoses_stack_autoencoder.csv",
    ),
    "retrained": (
        "./processed_data/4_21_retrain_drugs_stack_autoencoder.csv",
        "./processed_data/4_21_retrain_features_diagnoses_stack_autoencoder.csv",
    ),
}

# Fail loudly on a typo instead of silently falling through to the retrained
# embeddings (the output file tagging later in the notebook keys off the same value).
if AE_SOURCE not in _AE_EMBEDDING_PATHS:
    raise ValueError(
        f"AE_SOURCE must be one of {sorted(_AE_EMBEDDING_PATHS)}, got {AE_SOURCE!r}"
    )

meds_path, diags_path = _AE_EMBEDDING_PATHS[AE_SOURCE]

meds = pd.read_csv(meds_path)
diags = pd.read_csv(diags_path)

demo_labs = pd.read_csv("./processed_data/demo_labs_mice_imputed_scaled.csv")
cohort = pd.read_csv("processed_data/cohort.csv")

In [None]:
# Preview the first three rows of demo_labs (read from demo_labs_mice_imputed_scaled.csv).
demo_labs.head(3)

In [None]:
# Preview the first three rows of the medication autoencoder features.
meds.head(3)

In [None]:
# Preview the first three rows of the diagnosis autoencoder features.
diags.head(3)

In [None]:
# Preview the first three rows of the cohort table (source of the 'outcome' label).
cohort.head(3)

# Merge Datasets

In [None]:
# Inner-join every feature table on MRN so only patients present in all
# sources (medications, diagnoses, demographics/labs, cohort label) remain.
meds_diag = pd.merge(meds, diags, on='MRN', how='inner')

meds_diag_demo_labs = pd.merge(meds_diag, demo_labs, on='MRN', how='inner')

# Attach the label, index by MRN, and shuffle the rows.
# The shuffle is seeded: the train/val/test split below selects rows by
# position, so an unseeded shuffle would put different patients in each
# split on every run even though the split itself uses a fixed random_state.
meds_diag_demo_labs_cohort = (
    pd.merge(meds_diag_demo_labs, cohort[['MRN', 'outcome']], on='MRN', how='inner')
    .set_index('MRN')
    .sample(frac=1, random_state=42)
)

In [None]:
# Preview the merged, MRN-indexed, shuffled modeling table.
meds_diag_demo_labs_cohort.head()

In [None]:
# Row count after the inner joins.
len(meds_diag_demo_labs_cohort)

In [None]:
# Row count of the raw cohort table, for comparison with the merged table.
len(cohort)

In [None]:
# Short alias for the merged modeling table (same object, not a copy).
df = meds_diag_demo_labs_cohort

# Class Imbalance

In [None]:
# Count negatives (0) and positives (1). minlength=2 guarantees two bins even
# when no positive exists, so the check below can give a clear error instead
# of an unpacking failure or a silent division by zero in the class weights.
outcome_counts = np.bincount(df['outcome'], minlength=2)
if len(outcome_counts) != 2 or not outcome_counts.all():
    raise ValueError(
        "'outcome' must be binary (0/1) with at least one example of each class; "
        f"got class counts {outcome_counts.tolist()}"
    )
neg, pos = outcome_counts
total = neg + pos
imbalance = pos / total
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100*imbalance))

# Split the table into the label vector and the feature matrix.
df_Y = df['outcome']
df_X = df.drop(columns=['outcome'])

In [None]:
# Weight each class inversely to its frequency.
# Multiplying by total/2 keeps the loss at a similar magnitude:
# the sum of the weights over all examples stays the same.
half_total = total / 2.0
class_weight = {
    0: (1 / neg) * half_total,
    1: (1 / pos) * half_total,
}
weight_for_0 = class_weight[0]
weight_for_1 = class_weight[1]

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

In [None]:
# 70 / 10 / 20 train / validation / test split.
# Both splits are stratified on the outcome so that each subset keeps the
# cohort's positive rate; with an imbalanced label an unstratified split can
# leave the validation or test set with too few positives.

# Step 1: hold out 30% of the rows as a temporary set.
X_train, X_temp, y_train, y_temp = train_test_split(
    df_X, df_Y, test_size=0.30, shuffle=True, random_state=42, stratify=df_Y)

# Step 2: divide the temporary set into validation (1/3) and test (2/3).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=(2. / 3), shuffle=True, random_state=42, stratify=y_temp)


In [None]:
# Report the feature-matrix and label shapes of each split.
for split_name, features, outcomes in (
    ('Train', X_train, y_train),
    ('Val', X_val, y_val),
    ('Test', X_test, y_test),
):
    print(f'Features {split_name} - {features.shape}, Outcomes {split_name} - {outcomes.shape}')

In [None]:
# Persist the held-out test features to the working directory.
# NOTE(review): index=False omits the MRN index, so these rows cannot be joined
# back to the prediction CSVs (which are written with their index) — confirm intentional.
X_test.to_csv('data_test_set.csv', index=False)

# Modeling

## Fine-tuning the model using the autoencoder features

In [None]:
def _predict_and_save(model, X, output_path, split_name):
    """Predict on `X` and write a one-column "Prediction" CSV indexed like `X`."""
    predictions = model.predict(X, batch_size=64)
    df_predictions = pd.DataFrame(predictions, columns=["Prediction"], index=X.index)

    print(f"Saving {split_name} predictions to ", output_path)
    df_predictions.to_csv(output_path)


def train_save_model(X_train, y_train, X_val, y_val, X_test, y_test, outputmodel, outputpredictions, outputvalpredictions, N):
    """Fine-tune the saved base model and write the model and its predictions to disk.

    Loads "models/model_ALL_OF_US.keras", freezes every layer except the last
    `N`, trains with early stopping on validation AUC, prints the test-set
    metrics, and saves test predictions, validation predictions and the
    fine-tuned model.

    Training uses the notebook-level `class_weight` dict, so that cell must
    have been run before calling this function.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to `model.fit`.
    X_val, y_val : validation features and labels (early stopping and
        validation predictions). `X_val` must expose `.index`.
    X_test, y_test : test features and labels (evaluation and test
        predictions). `X_test` must expose `.index`.
    outputmodel : path the fine-tuned model is saved to.
    outputpredictions : CSV path for the test-set predictions.
    outputvalpredictions : CSV path for the validation-set predictions.
    N : number of trailing layers left untouched; all earlier layers are
        frozen. 0 freezes every layer.

    Raises
    ------
    ValueError
        If `N` is negative.
    """
    if N < 0:
        raise ValueError(f"N must be >= 0, got {N}")

    # Create the output folders before training so a missing directory cannot
    # make the run fail only after the (long) fit has finished.
    for path in (outputmodel, outputpredictions, outputvalpredictions):
        out_dir = os.path.dirname(path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Load the original model
    model = keras.models.load_model("models/model_ALL_OF_US.keras")

    # Freeze all layers except for the last N. Clamping at 0 means an N larger
    # than the layer count freezes nothing, and N == 0 freezes everything.
    print("All layers frozen except for last ", N)
    n_frozen = max(len(model.layers) - N, 0)
    for layer in model.layers[:n_frozen]:
        layer.trainable = False

    # Stop once validation AUC has not improved for 10 epochs and roll back to
    # the best weights seen.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_auc',
        verbose=1,
        patience=10,
        mode='max',
        restore_best_weights=True)

    metrics_list = [
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.AUC(name='prc', curve='PR'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
    ]

    # Compile the model (must happen after changing `trainable` flags)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3), loss=keras.losses.BinaryCrossentropy(),
                  metrics=metrics_list)

    # Continue training on new data
    model.fit(X_train, y_train,
              validation_data=(X_val, y_val), epochs=150, shuffle=True,
              batch_size=64, class_weight=class_weight, callbacks=[early_stopping])

    # Evaluate the model on the test set. return_dict=True pairs each value
    # with its metric name directly; zipping with `model.metrics_names` can
    # mislabel or truncate the list on newer Keras versions.
    results = model.evaluate(X_test, y_test, batch_size=64, verbose=0, return_dict=True)
    for name, value in results.items():
        print(name, ': ', value)

    # Predict on the test and validation sets and save both
    _predict_and_save(model, X_test, outputpredictions, "test")
    _predict_and_save(model, X_val, outputvalpredictions, "validation")

    # Save the model
    print("Saving model to ", outputmodel)
    model.save(outputmodel)  # Saves in Keras format


In [None]:
# Sweep: % of training data (in 20% increments) × number of unfrozen layers
Nlist = [20, 40, 60, 80, 100]
num_unfrozen_layers = [0, 1, 3, 6, 7, 8, 9, 11, 12, 15, 16, 18]

# Tag outputs based on which AE embeddings were used to build df (frozen vs retrained).
# AE_SOURCE should be set earlier in the notebook (e.g., "frozen" or "retrained").
# The test mirrors the data-loading cell, where every value other than "frozen"
# loads the retrained embeddings, so retrained runs can never overwrite the
# untagged (frozen) output files.
run_tag = "" if AE_SOURCE == "frozen" else "retrain_autoencoders_"

for pct in Nlist:
    # First pct% of the training rows.
    n_train = int(len(X_train) * (pct / 100.0))

    X_sub = X_train.head(n_train)
    y_sub = y_train.head(n_train)

    for n_unfrozen in num_unfrozen_layers:
        # One model plus test/validation prediction files per (n_unfrozen, pct) pair.
        train_save_model(
            X_sub, y_sub,
            X_val, y_val,
            X_test, y_test,
            f"models_4_21/{run_tag}model_AoUencoder_StanfordFinetune-{n_unfrozen}-{pct}pct.keras",
            f"out/{run_tag}final_test_predictions_AoUencoder_StanfordFinetune-{n_unfrozen}-{pct}pct.csv",
            f"out/{run_tag}final_val_predictions_AoUencoder_StanfordFinetune-{n_unfrozen}-{pct}pct.csv",
            n_unfrozen,
        )