In [None]:
import os
import sys
# Make the repository root (two directory levels above the working directory)
# importable, so the `lmmnn` package resolves when the notebook is run in place.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

In [None]:
import gc
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler

from lmmnn.nn import reg_nn_ohe_or_ignore, reg_nn_lmm, reg_nn_embed, reg_nn_rnn
from lmmnn.simulation import Count

import tensorflow.keras.backend as K

In [None]:
# Rossmann Store Sales dataset from Kaggle: https://www.kaggle.com/competitions/rossmann-store-sales/
# Run the rossmann_etl.R script first (presumably it is what writes ../../rossmann.csv -- verify).
rossmann = pd.read_csv('../../rossmann.csv')
# Shift store ids down by one (presumably 1-based -> 0-based; confirm against the ETL output).
rossmann['Store'] -= 1
cols_to_drop = ['date', 'year']
rossmann = rossmann.drop(columns=cols_to_drop)
print(rossmann.shape)
rossmann.head()

In [None]:
# Distribution of the raw sales values, 20 bins.
rossmann['Sales'].plot.hist(bins=20)
plt.show()

In [None]:
# Sanity check on the store ids: number of distinct values and the largest id.
store_ids = rossmann['Store']
print(len(store_ids.unique()))
print(store_ids.max())

In [None]:
# Later cells refer to the store id column as 'z0' and to the sales target as 'y'.
rossmann = rossmann.rename(columns={'Store': 'z0', 'Sales': 'y'})

In [None]:
# --- Experiment configuration -------------------------------------------------
# These module-level values are read by `reg_nn` and forwarded to the
# lmmnn.nn fitting functions.

# Forwarded as `mode` to every fitting function.
mode = 'slopes'

# Training-loop settings; used as the default arguments of `reg_nn`.
batch_size = 10
epochs = 500
patience = 10

# Forwarded to the fitting functions; the results table reserves three
# sigma_b columns to match.
n_sig2bs = 3
est_cors = []

# Network settings (presumably hidden-layer widths and per-layer dropout
# rates -- confirm against lmmnn.nn).
n_neurons = [100, 50, 25, 12]
activation = 'relu'
dropout = [0.25, 0.25, 0.25]

# Spatial settings, left empty/None/0 in this notebook.
spatial_embedded_neurons = []
dist_matrix = None
q_spatial = None
n_sig2bs_spatial = 0

# Number of distinct values of the single categorical column 'z0'.
n_cats = [rossmann['z0'].nunique(dropna=False)]

# Maps each distinct value of 't' (ascending) to its rank; passed to the RNN fit.
_measure_times = np.sort(rossmann['t'].unique())
time2measure_dict = dict(zip(_measure_times, range(len(_measure_times)))) # for RNN

# False: test folds come from the K-fold split; True: test on the last 20% of rows by 't'.
pred_future = False # change this for future mode

In [None]:
def reg_nn(X_train, X_test, y_train, y_test, n_cats, batch=batch_size, epochs=epochs, patience=patience, reg_type='ohe', verbose=False):
    """Fit one model variant on a train/test split and report its test MSE.

    Dispatches on `reg_type` to the matching `lmmnn.nn` fitting function.
    Besides its arguments, the function reads these notebook-level globals:
    `x_cols`, `n_neurons`, `dropout`, `activation`, `mode`, `n_sig2bs`,
    `n_sig2bs_spatial`, `est_cors`, `q_spatial`, `dist_matrix`,
    `spatial_embedded_neurons` and `time2measure_dict`.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test features and targets, forwarded unchanged to the fit.
    n_cats
        Forwarded unchanged to the fit.
    batch, epochs, patience
        Training-loop settings, forwarded unchanged to the fit.
    reg_type : {'ohe', 'lmm', 'ignore', 'embed', 'rnn'}
        Which model variant to fit.
    verbose
        Forwarded unchanged to the fit.

    Returns
    -------
    tuple
        `(mse, sigmas, n_epochs, elapsed_seconds)`. For 'rnn' the MSE is the
        first value returned by `reg_nn_rnn`; for every other type it is
        computed here from the predictions.

    Raises
    ------
    ValueError
        If `reg_type` is not one of the supported values.

    Side effects
    ------------
    Runs `gc.collect()` and `K.clear_session()` after fitting, and for every
    non-RNN type shows a scatter plot of `y_test` against the predictions.
    """
    start = time.time()
    if reg_type == 'ohe':
        y_pred, sigmas, _, _, n_epochs = reg_nn_ohe_or_ignore(
            X_train, X_test, y_train, y_test, n_cats, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation,
            mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    elif reg_type == 'lmm':
        y_pred, sigmas, _, _, n_epochs = reg_nn_lmm(
            X_train, X_test, y_train, y_test, n_cats, q_spatial, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation,
            mode=mode, n_sig2bs=n_sig2bs, n_sig2bs_spatial=n_sig2bs_spatial,
            est_cors=est_cors, dist_matrix=dist_matrix,
            spatial_embed_neurons=spatial_embedded_neurons, verbose=verbose, log_params=False)
    elif reg_type == 'ignore':
        y_pred, sigmas, _, _, n_epochs = reg_nn_ohe_or_ignore(
            X_train, X_test, y_train, y_test, n_cats, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation,
            mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose, ignore_RE=True)
    elif reg_type == 'embed':
        y_pred, sigmas, _, _, n_epochs = reg_nn_embed(
            X_train, X_test, y_train, y_test, n_cats, q_spatial, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation,
            mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    elif reg_type == 'rnn':
        rnn_res, sigmas, _, _, n_epochs = reg_nn_rnn(
            X_train, X_test, y_train, y_test, n_cats, x_cols, batch, epochs,
            patience, n_neurons, dropout, activation, mode, time2measure_dict,
            n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    else:
        # %-formatting keeps the message readable (the original concatenation
        # dropped the space) and also copes with a non-string reg_type.
        raise ValueError('%s is an unknown reg_type' % (reg_type,))
    end = time.time()
    # Release Python garbage and reset the Keras session between fits.
    gc.collect()
    K.clear_session()
    if reg_type == 'rnn':  # RNN computes MSE inside function currently
        mse = rnn_res
    else:
        mse = np.mean((y_pred - y_test)**2)
        plt.scatter(y_test, y_pred, alpha=0.5)
        plt.show()
    return mse, sigmas, n_epochs, end - start

In [None]:
# One row per (fold, model variant); filled in by `iterate_reg_types`.
res_columns = [
    'experiment', 'exp_type', 'mse',
    'sigma_e_est', 'sigma_b0_est', 'sigma_b1_est', 'sigma_b2_est',
    'n_epoch', 'time',
]
res = pd.DataFrame(columns=res_columns)
# Generator whose successive values are used as the row labels of `res`.
counter = Count().gen()

def iterate_reg_types(X_train, X_test, y_train, y_test, verbose, experiment=None):
    """Fit every model variant on one split and append the results to `res`.

    Runs `reg_nn` for 'lmm', 'rnn', 'ohe', 'ignore' and 'embed' (in that
    order), printing the test MSE after each fit. Once all five fits have
    finished, one row per variant is appended to the notebook-level `res`
    table, using `next(counter)` as the row label. Only the 'lmm' row gets
    variance estimates (sigma_e and three sigma_b values); the other rows
    hold NaN in those columns.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        The split, forwarded unchanged to `reg_nn`.
    verbose
        Forwarded unchanged to `reg_nn`.
    experiment : optional
        Value stored in the 'experiment' column. When omitted, the
        notebook-level loop variable `i` is used, which keeps existing
        positional calls working.

    Raises
    ------
    NameError
        If `experiment` is omitted and no global `i` exists. This is raised
        before any model is trained.
    """
    if experiment is None:
        # Legacy behaviour: fall back to the enclosing loop's fold counter.
        # Resolving it up front means a missing `i` fails immediately rather
        # than after all five models have been trained.
        experiment = i

    outcomes = []
    for reg_type in ('lmm', 'rnn', 'ohe', 'ignore', 'embed'):
        mse, sigmas, n_epochs, elapsed = reg_nn(
            X_train, X_test, y_train, y_test, n_cats, reg_type=reg_type, verbose=verbose)
        print(' finished %s, mse: %.4f' % (reg_type, mse))
        outcomes.append((reg_type, mse, sigmas, n_epochs, elapsed))

    # Rows are written only after every fit succeeded, so a failing variant
    # never leaves a partially recorded fold in `res`.
    for reg_type, mse, sigmas, n_epochs, elapsed in outcomes:
        if reg_type == 'lmm':
            sigma_values = [sigmas[0], sigmas[1][0], sigmas[1][1], sigmas[1][2]]
        else:
            sigma_values = [np.nan, np.nan, np.nan, np.nan]
        res.loc[next(counter)] = [experiment, reg_type, mse] + sigma_values + [n_epochs, elapsed]

# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

if not pred_future:
    X, y = rossmann.drop('y', axis=1), rossmann['y']
else:
    # test set is "the future" or those obs with largest t
    rossmann.sort_values(['t'], inplace=True)
    X, X_future, y, y_future = train_test_split(
        rossmann.drop('y', axis=1), rossmann['y'], test_size=0.2, shuffle=False)
    # Relabel the retained rows 0..n-1 so the fold indices line up with the labels.
    X.index = np.arange(len(X))
    y.index = np.arange(len(X))

# Feature columns: everything except the categorical store id 'z0'.
x_cols = [col for col in X.columns if col != 'z0']
# 't' is left unscaled; every other feature is standardised per fold.
x_cols_to_scale = [col for col in x_cols if col != 't']

In [None]:
# Cross-validation driver: for each fold, standardise the features on the
# training part only and fit every model variant.
# NOTE: `i` must stay the loop variable name -- `iterate_reg_types` reads it
# to fill the 'experiment' column.
for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    print('iteration %d' % i)
    # KFold.split yields *positional* indices, so rows are selected with
    # .iloc. Label-based selection would only be right while the index is
    # exactly 0..n-1.
    X_train, y_train = X.iloc[train_index].copy(), y.iloc[train_index]
    if pred_future:
        # Future mode: every fold is evaluated on the same held-out tail.
        X_test, y_test = X_future.copy(), y_future.copy()
    else:
        X_test, y_test = X.iloc[test_index].copy(), y.iloc[test_index]
    # Fit the scaler on the training fold only to avoid leaking test statistics.
    scaler = StandardScaler()
    X_train[x_cols_to_scale] = scaler.fit_transform(X_train[x_cols_to_scale])
    X_test[x_cols_to_scale] = scaler.transform(X_test[x_cols_to_scale])
    iterate_reg_types(X_train, X_test, y_train, y_test, True)

In [None]:
# Display the accumulated results table (one row per fold and model variant).
res

In [None]:
# Persist the results table; the file name records which evaluation mode was used.
if pred_future:
    res_file = '../../results/res_rossmann_future.csv'
else:
    res_file = '../../results/res_rossmann_random.csv'
# Create the output directory if needed, so a long training run is not
# followed by a failed write just because '../../results' is missing.
os.makedirs(os.path.dirname(res_file), exist_ok=True)
res.to_csv(res_file)