In [1]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import RepeatedKFold

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

In [2]:
# Load the exposome data: exposures + covariates, and the phenotype table.
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap
# is needed.
expo_cov = pd.read_csv("expo_cov.csv")
pheno = pd.read_csv("phenotype.csv")

# Keep both names bound: `phenotype` is the name the later cells use.
phenotype = pheno

In [3]:
# Exposome dataframe: exposures + covariates.
# The bare expression on the last line of the cell makes the notebook render
# the frame as the cell output.
expo_cov

Unnamed: 0.1,Unnamed: 0,h_abs_ratio_preg_Log,h_no2_ratio_preg_Log,h_pm10_ratio_preg_None,h_pm25_ratio_preg_None,hs_no2_dy_hs_h_Log,hs_no2_wk_hs_h_Log,hs_no2_yr_hs_h_Log,hs_pm10_dy_hs_h_None,hs_pm10_wk_hs_h_None,...,h_mbmi_None,hs_wgtgain_None,e3_gac_None,h_age_None,h_edumc_None,h_native_None,h_parity_None,hs_child_age_None,hs_c_height_None,hs_c_weight_None
0,1,0.896711,2.872304,25.948498,17.433798,2.530279,2.583284,2.612098,22.535828,20.850005,...,25.510204,17.0,41.000000,28.000000,2,2,0,6.165640,1.220,23.40
1,2,0.892538,2.980008,25.897739,18.470850,1.928600,2.652479,2.761064,14.077763,29.141274,...,26.491508,18.0,41.000000,22.841553,3,2,1,6.992471,1.220,27.60
2,3,0.778723,3.056501,26.087347,18.711547,2.882591,2.591756,2.356163,46.859096,31.530981,...,30.116213,11.0,39.000000,34.232422,3,2,1,6.110883,1.280,37.50
3,4,0.089056,3.089157,14.991380,16.409771,1.390750,2.456717,2.403247,29.817442,25.232778,...,21.048048,21.0,39.285713,32.725529,1,2,1,10.138261,1.345,27.70
4,5,0.604781,3.848211,35.197296,14.889958,3.204449,3.499594,3.307663,29.817442,24.891465,...,22.151022,20.0,43.000000,20.865160,1,2,0,9.451175,1.370,34.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296,1297,-0.099819,2.254548,35.827999,12.954850,1.635889,2.109734,2.393117,16.596676,22.231124,...,33.564014,17.0,40.428570,32.150581,2,2,1,6.809149,1.234,29.70
1297,1298,1.043402,3.819085,26.850889,15.318656,3.129574,3.281948,3.547043,19.266689,25.232778,...,22.491350,9.0,39.571430,31.731691,3,2,0,8.635295,1.200,23.60
1298,1299,0.864024,2.872304,24.950790,18.562578,2.709897,2.664830,2.636456,43.036453,47.726555,...,27.379665,20.0,41.285713,22.031279,3,2,2,6.497034,1.250,26.50
1299,1300,0.270579,3.288932,18.258001,13.714355,2.945648,3.724925,3.467167,13.025550,21.158041,...,20.796730,20.0,41.571430,36.000000,3,2,1,8.114990,1.336,24.60


In [4]:
# Phenotype dataframe (the outcome columns used as targets further down).
# The bare expression on the last line of the cell makes the notebook render
# the frame as the cell output.
phenotype

Unnamed: 0.1,Unnamed: 0,ID,e3_bw,hs_asthma,hs_zbmi_who,hs_correct_raven,hs_Gen_Tot,hs_bmi_c_cat
0,1,1,4100,0,0.30,18,84.0000,2
1,2,2,4158,0,0.41,25,39.0000,2
2,3,3,4110,1,3.33,13,40.0000,4
3,4,4,3270,0,-0.76,28,54.5000,2
4,5,5,3950,0,0.98,19,18.0000,2
...,...,...,...,...,...,...,...,...
1296,1297,1297,2900,0,1.94,31,34.4375,3
1297,1298,1298,3420,0,-0.46,34,28.5625,2
1298,1299,1299,4068,1,1.04,22,13.0000,3
1299,1300,1300,4000,0,-1.18,32,18.3125,2


In [5]:
# Model definition
def get_model(n_inputs, n_outputs, learning_rate=1e-3):
    """Build and compile a fully connected multi-output regression network.

    The network is a stack of ReLU Dense layers (128 -> 64 -> 32 -> 16)
    followed by a linear Dense layer with ``n_outputs`` units, compiled with
    mean absolute error as the loss.

    Parameters
    ----------
    n_inputs : int
        Number of input features.
    n_outputs : int
        Number of target columns predicted jointly.
    learning_rate : float, optional
        Learning rate for the Adam optimizer. The default (1e-3) is Adam's own
        default, i.e. what ``optimizer='adam'`` resolves to, so calling
        ``get_model(n_inputs, n_outputs)`` trains as before.

    Returns
    -------
    keras.Sequential
        The compiled model, ready for ``fit``.
    """
    model = keras.Sequential([
        # Explicit Input object instead of the deprecated `input_dim` kwarg.
        keras.Input(shape=(n_inputs,)),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        # Linear output layer: one unit per regression target.
        layers.Dense(n_outputs),
    ])

    # Previously an Adam instance was created (with the deprecated `lr`
    # keyword) and then ignored because compile() received the string 'adam'.
    # Build the optimizer once and actually hand it to compile().
    optimizer = Adam(learning_rate=learning_rate)

    model.compile(loss='mae', optimizer=optimizer)
    return model

In [6]:
# Model evaluation
def evaluate_model(X, y, n_splits=10, n_repeats=3, epochs=100, random_state=1):
    """Estimate the model's MAE with repeated k-fold cross-validation.

    For every train/test split a fresh model is built with ``get_model``,
    fitted on the training rows and scored on the held-out rows. Each fold's
    score is printed as it completes.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Target values. A 1-D target is treated as a single output column.
    n_splits : int, optional
        Number of folds per repetition (default 10).
    n_repeats : int, optional
        Number of times the k-fold procedure is repeated (default 3).
    epochs : int, optional
        Training epochs per fold (default 100).
    random_state : int, optional
        Seed for the fold shuffling (default 1). It does not seed TensorFlow,
        so weight initialisation still varies between runs.

    Returns
    -------
    list
        One test-set loss (MAE) per fold, ``n_splits * n_repeats`` entries.
    """
    # Positional indexing below needs ndarrays; this also accepts DataFrames.
    X = np.asarray(X)
    y = np.asarray(y)
    if y.ndim == 1:
        # Single target passed as a vector: give it an explicit column axis.
        y = y.reshape(-1, 1)

    n_inputs, n_outputs = X.shape[1], y.shape[1]
    # define evaluation procedure
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats,
                       random_state=random_state)

    results = []
    # enumerate folds
    for train_ix, test_ix in cv.split(X):
        # A new model is created on every iteration; drop the global Keras
        # state left by the previous one so memory does not keep growing.
        keras.backend.clear_session()
        # prepare data
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        # define model
        model = get_model(n_inputs, n_outputs)
        # fit the model
        model.fit(X_train, y_train, verbose=0, epochs=epochs)
        # evaluate the model: with no extra metrics compiled, evaluate()
        # returns the scalar loss, which is the MAE
        mae = model.evaluate(X_test, y_test, verbose=0)
        # store results
        print('>%.3f' % mae)
        results.append(mae)
    return results

In [7]:
# set dataset
# Features: every non-object column of the exposome frame except the
# 'Unnamed: 0' column, cast to float32.
X = (
    expo_cov.drop(columns=['Unnamed: 0'])
    .select_dtypes(exclude=['object'])
    .to_numpy()
    .astype('float32')
)
# Use one scaler per matrix. Re-fitting a single scaler on y would discard the
# parameters learned for X, making it impossible to inverse-transform X later.
x_scaler = preprocessing.MinMaxScaler()
x_scaled = x_scaler.fit_transform(X)  # already an ndarray; no DataFrame round trip
X = x_scaled

# Targets: all phenotype columns except the identifier columns.
y = phenotype.drop(columns=['ID', 'Unnamed: 0']).to_numpy()
y_scaler = preprocessing.MinMaxScaler()
y_scaled = y_scaler.fit_transform(y)
y = y_scaled

# Backward-compatible alias: this name used to end the cell fitted on y.
min_max_scaler = y_scaler

# NOTE(review): both scalers are fitted on the full dataset before the
# cross-validation split, so held-out rows influence the scaling. Fitting the
# scalers inside each fold would avoid this leakage.
# NOTE(review): the reported MAE is in min-max-scaled units, computed jointly
# over all target columns; use y_scaler.inverse_transform to get original units.

# evaluate model
results = evaluate_model(X, y)
# summarize performance
print('MAE: %.3f (%.3f)' % (np.mean(results), np.std(results)))

>0.105
>0.099
>0.093
>0.105
>0.105
>0.096
>0.102
>0.101
>0.097
>0.103
>0.095
>0.105
>0.098
>0.097
>0.102
>0.102
>0.100
>0.093
>0.114
>0.109
>0.104
>0.098
>0.094
>0.102
>0.097
>0.104
>0.102
>0.105
>0.102
>0.103
MAE: 0.101 (0.005)
