In [None]:
import os
import sys
# Put the directory two levels above the working directory on the import
# path (presumably so the local `lmmnn` package used below resolves).
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
import time
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from lmmnn.nn import reg_nn_ohe_or_ignore, reg_nn_embed, reg_nn_lmm
from lmmnn.simulation import Count

import tensorflow.keras.backend as K

In [None]:
# After downloading the relevant UK Biobank data, run the ukb_blood_cancer.R ETL.
# Here blood triglycerides (`blood_triglyc`) is the predicted feature.
cols_to_keep = [
    'blood_triglyc',
    'treatment_id', 'operation_id', 'diagnosis_id', 'cancer_id', 'histology_id',
    'weight', 'height_standing', 'gender', 'age', 'smoking',
    'nap', 'sleepiness', 'dozing', 'tobacco',
    'skin_color', 'facial_aging', 'alive_father', 'alive_mother',
    'hand_grip_left', 'hand_grip_right',
]

# Read the ETL output and keep only the columns listed above, in that order.
ukb = pd.read_csv('ukb_triglyc_cancer.csv')[cols_to_keep]
print(ukb.shape)
ukb.head()

In [None]:
# Histogram (50 bins) of the response variable.
ukb['blood_triglyc'].plot.hist(bins=50)
plt.show()

In [None]:
# Compare the number of distinct operation ids with the largest id value.
# NOTE(review): presumably a check that ids are contiguous — confirm.
print(ukb['operation_id'].nunique(dropna=False))
print(ukb['operation_id'].max())

In [None]:
# Number of distinct values in each categorical id column
# (missing values, if any, count as one level, as with len(unique())).
n_cats_treatment = ukb['treatment_id'].nunique(dropna=False)
n_cats_operation = ukb['operation_id'].nunique(dropna=False)
n_cats_diagnosis = ukb['diagnosis_id'].nunique(dropna=False)
n_cats_cancer = ukb['cancer_id'].nunique(dropna=False)
n_cats_histology = ukb['histology_id'].nunique(dropna=False)

print(
    f'no. of treatments: {n_cats_treatment}',
    f'no. of operations: {n_cats_operation}',
    f'no. of diagnoses: {n_cats_diagnosis}',
    f'no. of cancer types: {n_cats_cancer}',
    f'no. of cancer tumor histology: {n_cats_histology}',
    sep='\n',
)

In [None]:
# Rename the five categorical id columns to z0..z4 (the names excluded from
# `x_cols` further down). The result is reassigned instead of using
# `inplace=True`, so the rename is an explicit, chainable step and no frame is
# mutated behind the back of earlier cells.
ukb = ukb.rename(columns={'treatment_id': 'z0',
                          'operation_id': 'z1',
                          'diagnosis_id': 'z2',
                          'cancer_id': 'z3',
                          'histology_id': 'z4'})

In [None]:
# Settings below are read as notebook-level globals by `reg_nn` and forwarded
# to the lmmnn fitting functions.

# Network architecture.
n_neurons = [10, 3]
activation = 'relu'
dropout = []

# Categorical features: one cardinality per id column, in z0..z4 order.
n_cats = [n_cats_treatment, n_cats_operation, n_cats_diagnosis, n_cats_cancer, n_cats_histology]
mode = 'intercepts'
n_sig2bs = 5
est_cors = []

# Spatial options (none are set for this experiment).
n_sig2bs_spatial = 0
q_spatial = None
dist_matrix = None
spatial_embedded_neurons = []

In [None]:
def reg_nn(X_train, X_test, y_train, y_test, n_cats, batch=100, epochs=100, patience=10, reg_type='ohe', verbose=False):
    """Fit one network variant on the training split and score it on the test split.

    Besides its arguments, this function reads notebook-level settings defined
    in other cells: ``x_cols``, ``n_neurons``, ``dropout``, ``activation``,
    ``mode``, ``n_sig2bs``, ``n_sig2bs_spatial``, ``est_cors``, ``q_spatial``,
    ``dist_matrix`` and ``spatial_embedded_neurons``.

    Parameters
    ----------
    X_train, X_test : feature frames passed through to the lmmnn fitting function.
    y_train, y_test : targets for the two splits; ``y_test`` is also used for the MSE.
    n_cats : cardinalities of the categorical features, passed through unchanged.
    batch, epochs, patience : training settings passed through unchanged.
    reg_type : one of ``'ohe'``, ``'lmm'``, ``'ignore'`` or ``'embed'``.
    verbose : passed through to the fitting function.

    Returns
    -------
    tuple
        ``(mse, sigmas, n_epochs, elapsed_seconds)`` where ``mse`` is the mean
        squared error of the predictions against ``y_test``, ``sigmas`` and
        ``n_epochs`` are returned as-is by the fitting function, and
        ``elapsed_seconds`` covers the fitting call only.

    Raises
    ------
    ValueError
        If ``reg_type`` is not one of the supported values.

    Side effects
    ------------
    Clears the Keras session and shows a scatter plot of ``y_test`` vs. predictions.
    """
    start = time.time()
    if reg_type == 'ohe':
        y_pred, sigmas, _, _, n_epochs = reg_nn_ohe_or_ignore(X_train, X_test, y_train, y_test, n_cats, x_cols, batch, epochs, patience,
                                                           n_neurons, dropout, activation,
                                                           mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    elif reg_type == 'lmm':
        y_pred, sigmas, _, _, n_epochs = reg_nn_lmm(X_train, X_test, y_train, y_test, n_cats, q_spatial, x_cols, batch, epochs, patience,
                                                 n_neurons, dropout, activation,
                                                 mode=mode, n_sig2bs=n_sig2bs, n_sig2bs_spatial=n_sig2bs_spatial,
                                                 est_cors=est_cors, dist_matrix=dist_matrix,
                                                 spatial_embed_neurons=spatial_embedded_neurons, verbose=verbose, log_params=False)
    elif reg_type == 'ignore':
        y_pred, sigmas, _, _, n_epochs = reg_nn_ohe_or_ignore(X_train, X_test, y_train, y_test, n_cats, x_cols, batch, epochs, patience,
                                                           n_neurons, dropout, activation,
                                                           mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose, ignore_RE=True)
    elif reg_type == 'embed':
        y_pred, sigmas, _, _, n_epochs = reg_nn_embed(X_train, X_test, y_train, y_test, n_cats, q_spatial, x_cols, batch, epochs, patience,
                                                   n_neurons, dropout, activation,
                                                   mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    else:
        # f-string (not `+`) so the message has a separating space and a
        # non-string reg_type still yields ValueError rather than TypeError.
        raise ValueError(f'{reg_type} is an unknown reg_type')
    end = time.time()
    # Release the fitted model's memory before the next variant is trained.
    gc.collect()
    K.clear_session()
    mse = np.mean((y_pred - y_test)**2)
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.show()
    return mse, sigmas, n_epochs, end - start

In [None]:
# Empty results table; `iterate_reg_types` appends one row per
# (experiment, model type) with MSE, variance estimates, epochs and fit time.
res = pd.DataFrame(columns=[
    'experiment', 'exp_type', 'mse', 'sigma_e_est',
    *(f'sigma_b{k}_est' for k in range(5)),
    'n_epoch', 'time',
])
# Generator whose successive values are used as row labels of `res`.
counter = Count().gen()

def iterate_reg_types(X_train, X_test, y_train, y_test, verbose, experiment=None):
    """Train every model variant on one train/test split and log the results.

    Runs ``reg_nn`` for ``'lmm'``, ``'ohe'``, ``'ignore'`` and ``'embed'`` (in
    that order), prints each test MSE, and then appends one row per variant to
    the notebook-level ``res`` table, using ``next(counter)`` as the row label.
    Variance estimates are recorded only for the ``'lmm'`` run; the other rows
    get NaN in those columns.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : the split, passed through to ``reg_nn``.
    verbose : passed through to ``reg_nn``.
    experiment : value stored in the ``experiment`` column. When omitted, the
        notebook-level loop variable ``i`` is used, which is what existing call
        sites rely on; passing it explicitly avoids that hidden dependency.
    """
    if experiment is None:
        # Backward-compatible fallback to the global loop index.
        experiment = i
    runs = []
    for reg_type in ('lmm', 'ohe', 'ignore', 'embed'):
        mse, sigmas, n_epochs, elapsed = reg_nn(X_train, X_test, y_train, y_test, n_cats,
                                                reg_type=reg_type, verbose=verbose)
        print(' finished %s, mse: %.2f' % (reg_type, mse))
        runs.append((reg_type, mse, sigmas, n_epochs, elapsed))
    # Rows are written only after all four variants finished, so a failed fit
    # never leaves a partially logged experiment in `res`.
    for reg_type, mse, sigmas, n_epochs, elapsed in runs:
        if reg_type == 'lmm':
            # sigmas[0] -> sigma_e_est, sigmas[1][0..4] -> sigma_b0_est..sigma_b4_est
            sigma_values = [sigmas[0]] + [sigmas[1][k] for k in range(5)]
        else:
            sigma_values = [np.nan] * 6
        res.loc[next(counter)] = [experiment, reg_type, mse, *sigma_values, n_epochs, elapsed]

# 5-fold cross-validation with a fixed shuffle seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Response and feature frame.
y = ukb['blood_triglyc']
X = ukb.drop(columns='blood_triglyc')
# Feature columns other than the categorical ids z0..z4.
x_cols = [col for col in X.columns if col not in {'z0', 'z1', 'z2', 'z3', 'z4'}]

In [None]:
# Cross-validation: standardize within each fold, then fit and log all variants.
for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    print('iteration %d' % i)
    # KFold.split yields positional indices, so select with .iloc (label-based
    # .loc is only equivalent on a default RangeIndex). The explicit .copy()
    # makes the column assignments below write to independent frames.
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Standardize the response with training-fold statistics only; the MSEs
    # reported downstream are therefore on the standardized scale.
    y_scaler = StandardScaler()
    y_train = pd.Series(
        y_scaler.fit_transform(y_train.values.reshape(-1, 1)).reshape(X_train.shape[0]),
        index=X_train.index)
    y_test = pd.Series(
        y_scaler.transform(y_test.values.reshape(-1, 1)).reshape(X_test.shape[0]),
        index=X_test.index)

    # Standardize the x_cols features the same way; z0..z4 are left untouched.
    x_scaler = StandardScaler()
    X_train[x_cols] = x_scaler.fit_transform(X_train[x_cols])
    X_test[x_cols] = x_scaler.transform(X_test[x_cols])

    iterate_reg_types(X_train, X_test, y_train, y_test, True)

In [None]:
# Display the accumulated results table (one row per fold and model type).
res

In [None]:
# DataFrame.to_csv does not create missing parent directories; make sure the
# output folder exists so a long run is not lost at the final save.
os.makedirs('../../results', exist_ok=True)
res.to_csv('../../results/res_ukb_blood_triglyc.csv')