In [None]:
import os
import sys
# Resolve the directory two levels above the current working directory and
# make sure it is on the import path (presumably so that the local `lmmnn`
# package imported below can be found -- confirm against the repo layout).
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
import time
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from lmmnn.nn import reg_nn_ohe_or_ignore, reg_nn_embed, reg_nn_lmm, reg_nn_svdkl, reg_nn_cnn
from lmmnn.simulation import Count

import tensorflow.keras.backend as K

In [None]:
# Daily census-tract-level PM2.5 concentrations (US air quality), published by the CDC:
# https://data.cdc.gov/Environmental-Health-Toxicology/Daily-Census-Tract-Level-PM2-5-Concentrations-2016/7vu4-ngxx
# Run the pm25_etl.R script first (presumably it is what produces the CSV read here).
pm25 = pd.read_csv(filepath_or_buffer='../../pm25_df.csv')
print(pm25.shape)
pm25.head(n=5)

In [None]:
# Replace the response column by its natural log and look at its distribution.
# NOTE: the column is overwritten in place, so running this cell a second time
# without reloading `pm25` applies the log twice.
pm25['pm25'] = np.log(pm25['pm25'])
pm25['pm25'].plot.hist(bins=20)
plt.show()

In [None]:
# Number of rows for each distinct (lat, long) pair.
pm25.groupby(by=['lat', 'long']).size()

In [None]:
# Count of distinct location ids, followed by the largest id value
# (comparing the two presumably checks that the ids are contiguous -- confirm).
print(len(pd.unique(pm25['location_id'])))
print(pm25.loc[:, 'location_id'].max())

In [None]:
# One row per distinct (location_id, lat, long) combination; groupby sorts its
# keys, so rows come out ordered by location_id first.
coords = (
    pm25.groupby(['location_id', 'lat', 'long'])
    .size()
    .index
    .to_frame(index=False)
    .to_numpy()
)
# Squared Euclidean distance between every pair of (lat, long) rows; column 0
# (the location id) is left out of the distance computation.
# NOTE(review): the matrix lines up with location ids only if each id maps to a
# single coordinate pair -- verify against the ETL output.
dist_matrix = np.square(squareform(pdist(coords[:, 1:])))
print(dist_matrix.shape)

In [None]:
# Switch to the column names used from here on: D1/D2 for the coordinates and
# z0 for the location id. Cells above this one refer to the old names, so they
# cannot be re-run after this rename without reloading `pm25`.
pm25.rename(columns={'lat': 'D1', 'long': 'D2', 'location_id': 'z0'}, inplace=True)

In [None]:
# --- Model variant -----------------------------------------------------------
mode = 'spatial'
# mode = 'spatial_embedded' # uncomment for LMMNN-E
spatial_embed_neurons = None
# spatial_embed_neurons = [100, 50, 20, 10, 20, 50, 100] # uncomment for LMMNN-E

# --- Settings forwarded to every reg_nn_* helper called in reg_nn ------------
batch = 100
epochs = 500
patience = 10
n_neurons = [10, 3]
dropout = []
activation = 'relu'
verbose = True
qs = []
n_sig2bs = 0
n_sig2bs_spatial = 2
est_cors = []

# --- Number of distinct z0 values; passed to reg_nn_lmm and reg_nn_embed -----
q_spatial = len(pm25['z0'].unique())

# --- Passed to reg_nn_lmm only -----------------------------------------------
Z_non_linear = False
Z_embed_dim_pct = 10
log_params = False
idx = None
shuffle = False

# --- Passed to reg_nn_cnn only -----------------------------------------------
resolution = 100

# --- Defined here but not referenced by the other cells of this notebook -----
time2measure_dict = None

In [None]:
def reg_nn(X_train, X_test, y_train, y_test, reg_type):
    """Fit one model variant on a train/test split and score it on the test set.

    Dispatches on ``reg_type`` to the matching ``lmmnn.nn`` helper and forwards
    the notebook-level configuration (``qs``, ``x_cols``, ``batch``, ``epochs``,
    ``mode``, ``dist_matrix``, ...). Those globals must be defined before the
    function is called.

    Parameters
    ----------
    X_train, X_test
        Train / test features, passed to the helper unchanged.
    y_train, y_test
        Train / test targets, passed to the helper unchanged; ``y_test`` is
        also used to compute the test MSE.
    reg_type : str
        One of ``'ohe'``, ``'lmm'``, ``'ignore'``, ``'embed'``, ``'svdkl'``,
        ``'cnn'``.

    Returns
    -------
    tuple
        ``(mse, sigmas, n_epochs, elapsed)``: the mean squared error between
        the helper's predictions and ``y_test``, the ``sigmas`` and ``n_epochs``
        values exactly as returned by the helper, and the wall-clock seconds
        spent inside the helper call.

    Raises
    ------
    ValueError
        If ``reg_type`` is not one of the supported values.

    Side effects
    ------------
    Clears the Keras session, triggers garbage collection and shows a scatter
    plot of ``y_test`` against the predictions.
    """
    start = time.time()
    if reg_type == 'ohe':
        y_pred, sigmas, _, _, n_epochs = reg_nn_ohe_or_ignore(
            X_train, X_test, y_train, y_test, qs, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    elif reg_type == 'lmm':
        y_pred, sigmas, _, _, n_epochs = reg_nn_lmm(
            X_train, X_test, y_train, y_test, qs, q_spatial, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode,
            n_sig2bs, n_sig2bs_spatial, est_cors, dist_matrix, spatial_embed_neurons,
            verbose, Z_non_linear, Z_embed_dim_pct, log_params, idx, shuffle)
    elif reg_type == 'ignore':
        # Same helper as 'ohe', with the random-effect columns ignored.
        y_pred, sigmas, _, _, n_epochs = reg_nn_ohe_or_ignore(
            X_train, X_test, y_train, y_test, qs, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose, ignore_RE=True)
    elif reg_type == 'embed':
        y_pred, sigmas, _, _, n_epochs = reg_nn_embed(
            X_train, X_test, y_train, y_test, qs, q_spatial, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    elif reg_type == 'svdkl':
        y_pred, sigmas, _, _, n_epochs = reg_nn_svdkl(X_train, X_test, y_train, y_test, qs, x_cols, batch, epochs,
            patience, n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    elif reg_type == 'cnn':
        y_pred, sigmas, _, _, n_epochs = reg_nn_cnn(X_train, X_test, y_train, y_test, qs, x_cols, batch, epochs,
            patience, n_neurons, dropout, activation, mode,
            n_sig2bs, n_sig2bs_spatial, est_cors, resolution, verbose)
    else:
        # %-formatting keeps the words separated and also copes with a
        # non-string reg_type (string concatenation would raise TypeError).
        raise ValueError('%s is an unknown reg_type' % (reg_type,))
    # Stop the clock before the clean-up so only the fit/predict time is measured.
    end = time.time()
    # Release the Keras graph and Python garbage between models to limit
    # memory growth over the many fits in the cross-validation loop.
    K.clear_session()
    gc.collect()
    mse = np.mean((y_pred - y_test)**2)
    plt.scatter(y_test, y_pred, alpha = 0.5)
    # Label the figure so the plots of the different models can be told apart.
    plt.xlabel('y_test')
    plt.ylabel('y_pred')
    plt.title(reg_type)
    plt.show()
    return mse, sigmas, n_epochs, end - start

In [None]:
# Empty results table; iterate_reg_types appends one row per fitted model.
res = pd.DataFrame(
    columns=[
        'experiment', 'exp_type', 'mse',
        'sigma_e_est', 'sigma_b0_est', 'sigma_b1_est',
        'n_epoch', 'time',
    ]
)
# Placeholder value; `counter` is rebound to a row-index generator before the
# cross-validation loop starts.
counter = 0

def iterate_reg_types(X_train, X_test, y_train, y_test, experiment=None):
    """Fit every model variant on one split and append the results to ``res``.

    The variants are run in the order lmm, ohe, ignore, embed, svdkl, cnn via
    ``reg_nn``. After each fit a progress line is printed and one row is written
    to the global ``res`` frame at the index produced by ``next(counter)``, so
    results of earlier models survive if a later model fails.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        The split, forwarded unchanged to ``reg_nn``.
    experiment : optional
        Value stored in the ``experiment`` column (the fold number). When
        omitted, the notebook-level loop variable ``i`` is used, which keeps
        the original four-argument call style working.

    Only the 'lmm' run records variance estimates (``sigmas[0]``,
    ``sigmas[2][0]``, ``sigmas[2][1]``); the other rows hold NaN there.
    """
    if experiment is None:
        # Legacy call style: fall back to the global fold index.
        experiment = i
    # (reg_type passed to reg_nn and stored in `exp_type`, label used in the progress line)
    runs = [
        ('lmm', 'lmmnn'),
        ('ohe', 'ohe'),
        ('ignore', 'ignore'),
        ('embed', 'embed'),
        ('svdkl', 'svdkl'),
        ('cnn', 'cnn'),
    ]
    for reg_type, label in runs:
        mse, sigmas, n_epochs, elapsed = reg_nn(X_train, X_test, y_train, y_test, reg_type=reg_type)
        print(' finished %s, mse: %.4f' % (label, mse))
        if reg_type == 'lmm':
            sigma_ests = [sigmas[0], sigmas[2][0], sigmas[2][1]]
        else:
            sigma_ests = [np.nan, np.nan, np.nan]
        res.loc[next(counter)] = [experiment, reg_type, mse] + sigma_ests + [n_epochs, elapsed]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
counter = Count().gen()
X, y = pm25.drop(['pm25'], axis=1), pm25['pm25']
x_cols = [col for col in X.columns if col not in ['z0']]
# NOTE(review): x_cols_to_scale is computed but never used in this notebook, and
# no scaling is applied to the features -- confirm that this is intended.
x_cols_to_scale = [col for col in x_cols if col not in ['D1', 'D2']]

for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    print('iteration %d' % i)
    # KFold.split yields positional indices, so select rows with .iloc; label
    # based selection is only equivalent while the frame has a default RangeIndex.
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    iterate_reg_types(X_train, X_test, y_train, y_test, experiment=i)

In [None]:
# Display the results table filled in by the cross-validation loop
# (one row per fold and model type).
res

In [None]:
# Write the results table to disk. The output directory is created first so
# that a missing folder cannot make the write fail after the long training run.
os.makedirs('../../results', exist_ok=True)
res.to_csv('../../results/res_pm25.csv')