## Dense Neural Network


In [None]:
%matplotlib inline

In [None]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import h5py
import statsmodels.api as sm
import shap

In [None]:
# set random seed
# keras.utils.set_random_seed seeds Python's `random`, NumPy and the backend
# (TensorFlow) generators in one call, so the Keras weight initialisation is
# repeatable as well as the NumPy-based train/validation permutation.
# NOTE: some GPU kernels may still be non-deterministic despite the seed
keras.utils.set_random_seed(1234)

In [None]:
# list the GPU devices visible to TensorFlow
tf.config.list_physical_devices(device_type='GPU')

In [None]:
# "best" trained weights directory
weights_dir = 'weights'
# create the directory up front; exist_ok keeps notebook re-runs from failing
os.makedirs(name=weights_dir, exist_ok=True)

In [None]:
# read the 'x_data' and 'y_data' datasets fully into memory
with h5py.File('output/ukb_sample.h5', mode='r') as hf:
    x_data = hf['x_data'][()]
    y_data = hf['y_data'][()]
# store the targets as a single column, one row per sample
y_data = y_data.reshape((-1, 1))

# drop rows whose target value is missing (NaN)
badrows = np.isnan(y_data.flatten())
x_data = x_data[~badrows, ]
y_data = y_data[~badrows, ]

# drop outlier rows whose target lies 5 or more standard deviations away
# from the mean. The deviation is centred on the mean so the filter also
# works when y is not zero-mean; for centred data the result is unchanged.
y_flat = y_data.flatten()
badrows = np.abs(y_flat - y_flat.mean()) >= (y_flat.std() * 5)
x_data = x_data[~badrows, ]
y_data = y_data[~badrows, ]

In [None]:
# train/validation split
batch_size = 1024
maxlen = 2048  # number of leading columns of x_data kept as model input
split = 0.95  # 95%/5% training/validation split
# round the training-set size to the nearest whole number of batches
n_train = int(round(split * y_data.size / batch_size) * batch_size)
n_val = y_data.size - n_train

# Rounding to a batch multiple can give 0 or at least as many rows as are
# available when the data set is small, which would leave an empty training
# or validation set; fail early with a clear message instead.
if not 0 < n_train < y_data.size:
    raise ValueError(
        'cannot split {} samples into whole training batches of {} and a '
        'non-empty validation set'.format(y_data.size, batch_size))

# draw random samples
randinds = np.random.permutation(np.arange(y_data.size))
x_train = x_data[randinds[:n_train], :maxlen]
y_train = y_data[randinds[:n_train]]

# the remaining shuffled rows form the validation set
x_val = x_data[randinds[n_train:], :maxlen]
y_val = y_data[randinds[n_train:]]

In [None]:
# shapes of the training and validation arrays, in the order x/y train, x/y val
tuple(arr.shape for arr in (x_train, y_train, x_val, y_val))

In [None]:
# Define the Keras model: a single Dense unit with linear activation applied
# directly to the input columns
n_features = x_train.shape[1]
input_snp = keras.Input(shape=(n_features,))
output_layer = keras.layers.Dense(units=1, activation='linear')
x = output_layer(input_snp)

model = keras.Model(inputs=input_snp, outputs=x)

# Adam optimizer minimising the mean squared error
opt = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=opt, loss='mean_squared_error')

In [None]:
# print a text summary of the compiled model
model.summary()

In [None]:
# draw the model graph, annotated with shapes, activations and trainability
plot_options = {
    'show_shapes': True,
    'show_layer_activations': True,
    'show_trainable': True,
}
keras.utils.plot_model(model, **plot_options)

In [None]:
# Callbacks: early stopping on val_loss (patience of 3 epochs) and a
# checkpoint that keeps only the model with the lowest val_loss
best_model_path = os.path.join(weights_dir, 'geno_nn_dense.best.keras')
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=best_model_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# fit model; epochs=1000 is an upper bound, early stopping ends training sooner
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=1000,
    validation_data=(x_val, y_val),
    callbacks=[early_stop, checkpoint],
)

In [None]:
# Save the last update under its own file name. The ModelCheckpoint callback
# has already written the lowest-val_loss model to 'geno_nn_dense.best.keras';
# saving the final-epoch model to that same path would overwrite it, and the
# later "load best model" step would then load the last model instead.
model.save(os.path.join(weights_dir, 'geno_nn_dense.last.keras'))

In [None]:
# plot training and validation loss per epoch on a log-scaled y axis
# (the curves are the compiled loss, not an accuracy, so label them as loss)
plt.figure()
plt.semilogy(history.history['loss'])
plt.semilogy(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')

In [None]:
# load "best" model
# replace the in-memory model with the one stored in the checkpoint file
best_path = os.path.join(weights_dir, 'geno_nn_dense.best.keras')
model = keras.models.load_model(filepath=best_path)

In [None]:
# run the model on the training set first, then on the validation set
y_train_pred, y_val_pred = (model.predict(inputs) for inputs in (x_train, x_val))

In [None]:
# evaluate performance using statsmodels.api.OLS:
# regress the predictions on the actual values (plus an intercept term)
ols_train = sm.OLS(endog=y_train_pred.flatten(),
                   exog=sm.add_constant(y_train.flatten()))
train_fit = ols_train.fit()

# same regression for the validation set
ols_val = sm.OLS(endog=y_val_pred.flatten(),
                 exog=sm.add_constant(y_val.flatten()))
val_fit = ols_val.fit()

In [None]:
# scatter predictions against actual values and overlay the OLS fit lines
fig, ax = plt.subplots(1, 1)

# training set: only every 20th sample is drawn
ax.plot(y_train[::20], y_train_pred[::20], ',', label='train')
a, b = train_fit.params  # intercept, slope
y_range = np.array([y_train.min(), y_train.max()])
ax.plot(y_range, a + b * y_range, 'C0',
        label='R2={:.3f}'.format(train_fit.rsquared))

# validation set: all samples are drawn
ax.plot(y_val, y_val_pred, ',', label='val')
a, b = val_fit.params  # intercept, slope
y_range = np.array([y_val.min(), y_val.max()])
ax.plot(y_range, a + b * y_range, 'C1',
        label='R2={:.3f}'.format(val_fit.rsquared))

ax.legend(loc='best')
ax.set_xlabel('$y$')
ax.set_ylabel(r'$\hat{y}$')

In [None]:
# print the 2x2 correlation matrix of actual vs. predicted values,
# first for the training set, then for the validation set
for y_true, y_hat in ((y_train, y_train_pred), (y_val, y_val_pred)):
    print(np.corrcoef(y_true.flatten(), y_hat.flatten()))