In [None]:
import os
import sys
# Resolve the directory two levels above the current working directory and
# make sure it is on the import path (presumably so the local `lmmnn`
# package imported further down can be found -- confirm against repo layout).
module_path = os.path.abspath(os.path.join('../..'))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
import time
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from lmmnn.nn import reg_nn_ohe_or_ignore, reg_nn_embed, reg_nn_lmm
from lmmnn.simulation import Count

import tensorflow.keras.backend as K

In [None]:
# Data source: IMDB dataset from Kaggle
#   https://www.kaggle.com/datasets/wrandrall/imdb-new-dataset
# Prerequisite: run the imdb_etl.R script (presumably it produces the CSV
# read below -- confirm).
imdb = pd.read_csv(filepath_or_buffer='../../data/imdb_df2.csv')
print(imdb.shape)
imdb.head(n=5)

In [None]:
# Histogram of the `score` column, which is used as the regression target below.
imdb['score'].plot.hist()
plt.show()

In [None]:
# Print the number of distinct director ids next to the largest id value.
print(len(pd.unique(imdb['director_id'])))
print(imdb['director_id'].max())

In [None]:
# Cardinality of each categorical id column; both counts are used later to
# build `qs`.
n_cats_director = len(pd.unique(imdb['director_id']))
n_cats_type = len(pd.unique(imdb['type_id']))
for summary_line in (f'no. of directors: {n_cats_director}',
                     f'no. of types: {n_cats_type}'):
    print(summary_line)

In [None]:
# Rename the two categorical id columns to z0 / z1 in place. Columns named
# z0, z1, z2 are excluded from the plain feature list (`x_cols`) later on.
imdb.rename(columns=dict(director_id='z0', type_id='z1'), inplace=True)

In [None]:
# Experiment configuration. These module-level names are read as globals by
# `reg_nn` and forwarded to the lmmnn fitting helpers.

# --- Training loop ---
batch, epochs, patience = 100, 100, 10
verbose, log_params = True, False

# --- Network architecture ---
n_neurons = [10, 3]
activation = 'relu'
dropout = []

# --- Categorical (random-effect) features ---
mode = 'intercepts'
qs = [n_cats_director, n_cats_type]  # one cardinality per categorical column
n_sig2bs = 2
est_cors = []
Z_non_linear = False
Z_embed_dim_pct = 10

# --- Spatial options (all left at None / 0 for this experiment) ---
dist_matrix = None
q_spatial = None
n_sig2bs_spatial = 0
spatial_embed_neurons = None

# --- Passed to reg_nn_lmm only ---
idx = None

# --- Defined but not referenced by the cells shown in this notebook ---
# NOTE(review): `spatial_embedded_neurons` looks like a leftover duplicate of
# `spatial_embed_neurons`, and `resultion` is presumably a misspelling of
# "resolution" -- names kept as-is in case something else reads them.
spatial_embedded_neurons = []
time2measure_dict = None
resultion = None

In [None]:
def reg_nn(X_train, X_test, y_train, y_test, reg_type):
    """Fit one network variant on the training split and score it on the test split.

    The model settings (``qs``, ``x_cols``, ``batch``, ``epochs``, ...) are
    not parameters: they are read from module-level globals defined in the
    configuration cells.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test features and targets, forwarded unchanged to the selected
        lmmnn helper.
    reg_type : str
        Which helper to call: ``'lmm'`` -> ``reg_nn_lmm``,
        ``'ignore'`` -> ``reg_nn_ohe_or_ignore`` (with ``ignore_RE=True``),
        ``'embed'`` -> ``reg_nn_embed``.

    Returns
    -------
    tuple
        ``(mse, sigmas, n_epochs, seconds)`` where ``mse`` is the mean squared
        error of the clipped predictions against ``y_test``, ``sigmas`` and
        ``n_epochs`` are passed through from the helper, and ``seconds`` is
        the wall-clock time of the helper call only.

    Raises
    ------
    ValueError
        If ``reg_type`` is not one of the three supported values.

    Side effects
    ------------
    Clears the Keras session, triggers garbage collection and shows a
    scatter plot of true vs. predicted values.
    """
    start = time.time()
    if reg_type == 'lmm':
        y_pred, sigmas, _, _, n_epochs = reg_nn_lmm(
            X_train, X_test, y_train, y_test, qs, q_spatial, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode,
            n_sig2bs, n_sig2bs_spatial, est_cors, dist_matrix, spatial_embed_neurons, verbose, Z_non_linear, Z_embed_dim_pct, log_params, idx)
    elif reg_type == 'ignore':
        y_pred, sigmas, _, _, n_epochs = reg_nn_ohe_or_ignore(
            X_train, X_test, y_train, y_test, qs, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose, ignore_RE=True)
    elif reg_type == 'embed':
        y_pred, sigmas, _, _, n_epochs = reg_nn_embed(
            X_train, X_test, y_train, y_test, qs, q_spatial, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    else:
        # f-string instead of `+`: inserts the missing space before "is" and
        # no longer raises TypeError when reg_type is not a str.
        raise ValueError(f'{reg_type} is an unknown reg_type')
    # Stop the clock before the cleanup below so it is not counted as fit time.
    end = time.time()
    # Release the Keras graph and Python garbage between consecutive fits.
    K.clear_session()
    gc.collect()
    # Clamp predictions to [1, 10] -- presumably the valid IMDB score range; confirm.
    y_pred = np.clip(y_pred, 1, 10)
    mse = np.mean((y_pred - y_test)**2)
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.show()
    return mse, sigmas, n_epochs, end - start

In [None]:
# Empty results table; rows are appended by `iterate_reg_types`, one per
# fitted model.
res = pd.DataFrame(
    columns=[
        'experiment', 'exp_type', 'mse',
        'sigma_e_est', 'sigma_b0_est', 'sigma_b1_est',
        'n_epoch', 'time',
    ]
)
# Generator from lmmnn's Count helper; each next(counter) supplies the row
# label for the next entry written into `res`.
counter = Count().gen()

def iterate_reg_types(X_train, X_test, y_train, y_test, experiment=None):
    """Fit the 'lmm', 'ignore' and 'embed' models on one split and record the results.

    Each model is fitted through ``reg_nn``; once all three have finished,
    one row per model is appended to the module-level ``res`` DataFrame,
    using ``next(counter)`` as the row label.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test features and targets, forwarded unchanged to ``reg_nn``.
    experiment : optional
        Identifier stored in the ``experiment`` column (e.g. the fold number).
        When omitted, the module-level variable ``i`` is used, which keeps
        existing calls made from inside a ``for i, ...`` loop working.
        Passing it explicitly avoids relying on that global.

    Returns
    -------
    None
        Results are written into ``res`` as a side effect.
    """
    if experiment is None:
        # Backward-compatible fallback to the enclosing loop's global counter.
        experiment = i
    no_sigmas = [np.nan, np.nan, np.nan]
    rows = []
    for reg_type in ('lmm', 'ignore', 'embed'):
        mse, sigmas, n_epochs, elapsed = reg_nn(X_train, X_test, y_train, y_test, reg_type=reg_type)
        print(' finished %s, mse: %.4f' % (reg_type, mse))
        # Variance estimates are recorded for the 'lmm' model only; the other
        # two rows get NaN in the sigma columns.
        if reg_type == 'lmm':
            sigma_ests = [sigmas[0], sigmas[1][0], sigmas[1][1]]
        else:
            sigma_ests = no_sigmas
        rows.append([experiment, reg_type, mse, *sigma_ests, n_epochs, elapsed])
    # Write only after all three fits succeeded, so a failed fit never leaves
    # a partially recorded experiment in `res`.
    for row in rows:
        res.loc[next(counter)] = row

# 5-fold cross-validation with a fixed shuffle seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Target is the `score` column; every other column stays in the feature frame.
X = imdb.drop(columns='score')
y = imdb['score']
# Plain feature columns: everything except the z0/z1/z2 categorical id columns.
x_cols = [col for col in X.columns if col not in ('z0', 'z1', 'z2')]

In [None]:
# Run all model types on every cross-validation fold.
# NOTE: the loop variable must stay named `i` -- `iterate_reg_types` reads it
# as the experiment id when none is passed explicitly.
for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    print('iteration %d' % i)
    # KFold.split yields positional row indices, so select with .iloc.
    # Label-based selection only works by coincidence on a default RangeIndex.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    iterate_reg_types(X_train, X_test, y_train, y_test)

In [None]:
# Display the results table accumulated by the cross-validation loop.
res

In [None]:
# Save the results table as CSV. The row index is written too (pandas default).
res.to_csv(path_or_buf='../../results/res_imdb.csv')