In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import pandas_plink as pk
import h5py
import seaborn as sns
import statsmodels.api as sm

In [None]:
# List the GPU devices TensorFlow can see (an empty list means none is visible).
tf.config.list_physical_devices('GPU')

In [None]:
# Read the feature matrix (x_data) and the target vector (y_data) from the
# HDF5 sample file; the target is reshaped to a single column.
with h5py.File('output/ukb_sample.h5', 'r') as hf:
    x_data = hf['x_data'][()]
    y_data = hf['y_data'][()].reshape((-1, 1))

# Drop rows whose target is missing (NaN), keeping x_data and y_data aligned.
badrows = np.isnan(y_data.ravel())
x_data = x_data[~badrows]
y_data = y_data[~badrows]

# Drop outlier rows whose target lies 5 or more standard deviations away from
# the sample mean. The deviation is measured from the mean rather than from
# zero, so the rule also holds when the target is not centred (for a centred
# target both forms agree).
y_flat = y_data.ravel()
badrows = np.abs(y_flat - y_flat.mean()) >= 5 * y_flat.std()
x_data = x_data[~badrows]
y_data = y_data[~badrows]


In [None]:
# Display the shapes of the feature and target arrays.
x_data.shape, y_data.shape

In [None]:
# Train/validation split: the first n_train rows are used for training and the
# remaining rows for validation. n_train is roughly 95% of the rows, rounded to
# a whole number of batches.
# NOTE(review): rows are split in stored order without shuffling -- confirm
# that the sample order in the input file is random.
batch_size = 1024
maxlen = 2048  # only the first `maxlen` feature columns are used as input
n_samples = y_data.size
n_train = int(round(0.95 * n_samples / batch_size) * batch_size)
n_val = n_samples - n_train

# Rounding to whole batches can leave 0 training rows, or no (even a negative
# number of) validation rows when the data set is small. Fail here with a clear
# message instead of deep inside model.fit().
if n_train <= 0 or n_val <= 0:
    raise ValueError(
        f'cannot split {n_samples} rows into train/validation sets with '
        f'batch_size={batch_size} (n_train={n_train}, n_val={n_val})'
    )

x_train = x_data[:n_train, :maxlen]
y_train = y_data[:n_train]

# Everything after the training rows is validation data.
x_val = x_data[n_train:, :maxlen]
y_val = y_data[n_train:]

In [None]:
# Network input: one feature vector per sample, as wide as the training matrix.
input_snp = keras.Input(shape=(x_train.shape[1], ))

# Multi-layer perceptron (MLP) branch: ReLU hidden layers followed by a single
# linear output unit.
mlp_branch = input_snp
for n_units in (64, 64, 32, 16):
    mlp_branch = keras.layers.Dense(n_units, activation='relu')(mlp_branch)
mlp_branch = keras.layers.Dense(1, activation='linear')(mlp_branch)

# Linear branch: a single linear unit applied directly to the input.
linear_branch = keras.layers.Dense(1, activation='linear')(input_snp)

# The model output is the sum of the two branches.
prediction = keras.layers.add([mlp_branch, linear_branch])

# Build the model and compile it with Adam (explicit learning rate and betas)
# and a mean-squared-error loss.
model = keras.Model(inputs=input_snp, outputs=prediction)
opt = keras.optimizers.Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.99)
model.compile(optimizer=opt, loss='mean_squared_error')

In [None]:
# Print a summary of the model architecture.
model.summary()

In [None]:
# Draw the model graph, annotated with shapes, activations and trainable flags.
keras.utils.plot_model(
    model,
    show_shapes=True,
    show_layer_activations=True,
    show_trainable=True,
    dpi=60,
)

In [None]:
# Callbacks: stop once val_loss has not improved for 3 epochs, and keep the
# model with the lowest val_loss on disk.
best_model_path = os.path.join('weights', 'geno_nn_deepnull.best.keras')
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=best_model_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Train with a generous epoch cap; early stopping decides when to stop.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=1000,
    validation_data=(x_val, y_val),
    callbacks=[early_stop, checkpoint],
)

In [None]:
# Save the final-epoch model under its own file name. Writing it to
# 'geno_nn_deepnull.best.keras' would overwrite the best-val_loss checkpoint
# kept by the ModelCheckpoint callback with the weights of the last epoch,
# which is not necessarily the best one.
os.makedirs('weights', exist_ok=True)  # make sure the target directory exists
model.save(os.path.join('weights', 'geno_nn_deepnull.last.keras'))

In [None]:
# Training and validation loss per epoch on a log-scaled y axis.
# The model is compiled with a mean-squared-error loss and no accuracy metric,
# so the curves are labelled as loss rather than accuracy.
fig, ax = plt.subplots()
ax.semilogy(history.history['loss'], label='train')
ax.semilogy(history.history['val_loss'], label='val')
ax.set_title('model loss')
ax.set_ylabel('loss (MSE)')
ax.set_xlabel('epoch')
ax.legend(loc='upper left');  # trailing ';' hides the Legend repr in the output

In [None]:
# Load the "best" model, i.e. the lowest-val_loss checkpoint written by the
# ModelCheckpoint callback during training. The file name must match that
# callback's filepath ('geno_nn_deepnull.best.keras'); loading any other file
# would evaluate a model that this notebook did not train.
model = keras.models.load_model(
    filepath=os.path.join('weights', 'geno_nn_deepnull.best.keras')
)

In [None]:
# Predict the targets of the training split first, then of the validation split.
y_train_pred, y_val_pred = (model.predict(split) for split in (x_train, x_val))

In [None]:
# Evaluate performance using statsmodels.api.OLS: for each split, regress the
# predictions on the observed targets plus an intercept term.
train_design = sm.add_constant(y_train.ravel())
ols_train = sm.OLS(endog=y_train_pred.ravel(), exog=train_design)
train_fit = ols_train.fit()

val_design = sm.add_constant(y_val.ravel())
ols_val = sm.OLS(endog=y_val_pred.ravel(), exog=val_design)
val_fit = ols_val.fit()

In [None]:
# Scatter the predictions against the observed targets for each split and
# overlay the fitted regression line, with its R2 in the legend.
fig, ax = plt.subplots(1, 1)
splits = (
    # (label, observed, predicted, OLS fit, line colour, scatter stride)
    ('train', y_train, y_train_pred, train_fit, 'C0', 20),  # every 20th point
    ('val', y_val, y_val_pred, val_fit, 'C1', 1),           # every point
)
for label, observed, predicted, fit, colour, stride in splits:
    ax.plot(observed[::stride], predicted[::stride], ',', label=label)
    intercept, slope = fit.params  # constant first, then the slope
    x_range = np.array([observed.min(), observed.max()])
    ax.plot(x_range, intercept + slope * x_range, colour,
            label='R2={:.3f}'.format(fit.rsquared))

ax.legend(loc='best')
ax.set_xlabel('$y$')
ax.set_ylabel(r'$\hat{y}$')

In [None]:
# Print the correlation matrix between observed and predicted targets,
# first for the training split and then for the validation split.
for observed, predicted in ((y_train, y_train_pred), (y_val, y_val_pred)):
    print(np.corrcoef(observed.ravel(), predicted.ravel()))