In [None]:
# Install TensorFlow pinned to version 2.15 into the notebook runtime.
!pip install tensorflow==2.15

In [None]:
# Extract copnn.zip into the current working directory.
# NOTE(review): presumably this archive provides the `copnn` package imported below -- confirm.
!unzip copnn.zip

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)


In [None]:
import os
import sys
# Put the parent of the working directory on the import path (once only),
# so modules located one level up can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
import time
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform

from copnn.regression import run_regression
from copnn.simulation import Count
from copnn.modes.spatial_categorical import SpatialCategorical
from copnn.distributions import get_distribution

In [None]:
# Used Cars from Craigslist dataset from Kaggle: https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data
# Run the cars_etl.R script beforehand (presumably it produces the CSV read here).
cols_to_drop = ['location_id']
# The location_id column stored in the CSV is discarded; a location_id is
# derived again from the rounded coordinates in a later cell.
cars = pd.read_csv('../data/cars_df5.csv').drop(columns=cols_to_drop)

In [None]:
# Quick look at the loaded data: (rows, columns), then the first rows.
print(cars.shape)
cars.head()

In [None]:
# Histogram (20 bins) of the price column before any transformation.
cars['price'].plot.hist(bins=20)
plt.show()

In [None]:
# Replace price by its natural log and plot the new distribution.
# NOTE: this overwrites the column in place, so running the cell a second time
# takes the log of the already-logged values.
log_price = np.log(cars['price'])
cars['price'] = log_price
cars['price'].plot.hist(bins=20)
plt.show()

In [None]:
# Round both coordinates to one decimal so that nearby listings share a location.
cars = cars.round({'lat': 1, 'long': 1})

In [None]:
# Number of rows for each distinct (lat, long) pair after rounding.
cars.groupby(['lat', 'long']).size()

In [None]:
# One row per distinct (lat, long) pair, numbered consecutively from 0 in
# groupby order; that number becomes the location_id.
location_index = cars.groupby(['lat', 'long']).size().index
location_df = location_index.to_frame()
location_df['location_id'] = np.arange(len(location_df))
location_df.head()

In [None]:
# Attach location_id to every row by joining on (lat, long). A temporary 'id'
# column remembers the original row order, which is restored after the join
# and then dropped again.
# NOTE: not idempotent -- once 'location_id' exists in `cars`, re-running the
# join hits an overlapping column name.
row_order = np.arange(cars.shape[0])
keyed = cars.assign(id=row_order).set_index(['lat', 'long'])
joined = keyed.join(location_df[['location_id']]).reset_index()
cars = joined.sort_values(by=['id']).drop(columns=['id'])
cars.index = np.arange(cars.shape[0])
cars.shape

In [None]:
# Check the coordinates next to the location_id that was just joined in.
cars[['lat', 'long', 'location_id']].head()

In [None]:
# Number of distinct location ids versus the largest id; since ids were
# assigned as 0..n-1, the maximum is expected to be the count minus one.
print(len(cars['location_id'].unique()))
print(cars['location_id'].max())

In [None]:
# Number of distinct model ids versus the largest model id.
print(len(cars['model_id'].unique()))
print(cars['model_id'].max())

In [None]:
# Array with one row per distinct (location_id, lat, long) combination,
# ordered by the groupby keys; columns are location_id, lat, long.
location_keys = cars.groupby(['location_id', 'lat', 'long']).size().index
coords = location_keys.to_frame().values
coords[:5]

In [None]:
# Pairwise distances between locations using the lat/long columns of `coords`
# (column 0, the location_id, is left out), expanded to a square matrix and
# then squared element-wise.
pairwise_dists = pdist(coords[:, 1:])
dist_matrix = np.square(squareform(pairwise_dists))
dist_matrix.shape

In [None]:
# Switch to the column names used by the rest of the notebook:
# coordinates become D1/D2, the two grouping ids become z0 (location) and z1 (model).
column_names = {'lat': 'D1', 'long': 'D2', 'location_id': 'z0', 'model_id': 'z1'}
cars = cars.rename(columns=column_names)

In [None]:
# Confirm the column names after the rename.
cars.columns

In [None]:
def get_mode(mode_par):
  """Return a new mode object for the mode name `mode_par`.

  Recognized names are 'categorical', 'longitudinal', 'spatial' and
  'spatial_categorical'; each one instantiates the matching class with no
  arguments. Any other value raises NotImplementedError.

  NOTE(review): among these classes only SpatialCategorical is imported in
  the visible cells. The other three branches need Categorical, Longitudinal
  and Spatial to be in scope, otherwise they fail with NameError -- confirm.
  """
  if mode_par == 'categorical':
    return Categorical()
  if mode_par == 'longitudinal':
    return Longitudinal()
  if mode_par == 'spatial':
    return Spatial()
  if mode_par == 'spatial_categorical':
    return SpatialCategorical()
  raise NotImplementedError(f'{mode_par} mode not implemented.')

In [None]:
# Experiment configuration. Later cells read these names as globals and pass
# most of them straight through to run_regression.
y_type = 'continuous'
mode = get_mode('spatial_categorical')
batch = 100
epochs = 200
patience = None
n_sig2bs = 1
n_sig2bs_spatial = 2
est_cors = []
n_neurons = [10, 3]
activation = 'relu'
dropout = []
# NOTE(review): not referenced by any visible cell; run_regression receives
# `spatial_embed_neurons` (defined further down) instead -- possibly a leftover.
spatial_embedded_neurons = []
# qs: number of distinct z1 values (one entry); q_spatial: number of distinct z0 values.
qs = [len(cars['z1'].unique())]
q_spatial = len(cars['z0'].unique())
Z_non_linear = False
Z_embed_dim_pct = 10
time2measure_dict = None
pred_future = False # change this for future mode in longitudinal data
spatial_embed_neurons = None
resolution = None
verbose = True
log_params = False
idx = None
shuffle = False
b_true = None
# Each name is resolved with get_distribution and used for one 'copnn' run per fold.
distributions = ['laplace', 'gumbel', 'loggamma', 'logistic', 'skewnorm']
# When True, the cross-validation cell splits by z0 cluster instead of by row.
pred_unknown_clusters = True # change this for unknown locations in test in spatial data
# Passed only to the 'lmmnn' and 'copnn' runs.
sample_n_train = 30000 # keep sample_n_train as high as it can go; the original author reached 30K (default 10K)

In [None]:
# Empty results table: one row is appended per (fold, method, distribution) run.
result_columns = [
    'experiment', 'distribution', 'exp_type', 'mse_no_re', 'mse', 'sigma_e_est',
    'sigma_b0_est', 'sigma_b0_est_spatial', 'sigma_b1_est_spatial', 'rho_est',
    'nll_tr', 'nll_te', 'n_epoch', 'time',
]
res = pd.DataFrame(columns=result_columns)
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Generator whose successive values are used as row labels of `res`.
counter = Count().gen()

# Target is the (log) price; everything else is a candidate input column.
y = cars['price']
X = cars.drop(columns=['price'])

# Feature columns exclude the two grouping ids; the columns to scale
# additionally exclude the coordinates.
x_cols = X.columns[~X.columns.isin(['z0', 'z1'])].tolist()
x_cols_to_scale = [col for col in x_cols if col not in {'D1', 'D2'}]
file_name = 'res_cars_copnn.csv'

In [None]:
def _run_reg_type(X_train, X_test, y_train, y_test, reg_type, verbose, fit_dist=None, **extra_kwargs):
    """Call run_regression for one regression type with the notebook-level configuration.

    `reg_type` and `fit_dist` are the only positional arguments that differ
    between the runs; `extra_kwargs` carries optional keyword arguments such
    as `sample_n_train`.
    """
    return run_regression(
        X_train, X_test, y_train, y_test, qs, q_spatial, x_cols,
        batch, epochs, patience, n_neurons, dropout, activation, reg_type,
        Z_non_linear, Z_embed_dim_pct, mode, y_type, n_sig2bs, n_sig2bs_spatial, est_cors,
        dist_matrix, time2measure_dict, spatial_embed_neurons, resolution, verbose,
        log_params, idx, shuffle, fit_dist, b_true, **extra_kwargs)


def _result_row(i, distribution, exp_type, reg_res, with_sigmas):
    """Build one row for `res`, in the column order of the results table.

    With `with_sigmas` False the four variance-estimate columns are left as None.
    """
    if with_sigmas:
        sigma_estimates = [reg_res.sigmas[0], reg_res.sigmas[1][0],
                           reg_res.sigmas[2][0], reg_res.sigmas[2][1]]
    else:
        sigma_estimates = [None, None, None, None]
    return [i, distribution, exp_type, reg_res.metric_mse_no_re, reg_res.metric_mse,
            *sigma_estimates, reg_res.sig_ratio,
            reg_res.nll_tr, reg_res.nll_te, reg_res.n_epochs, reg_res.time]


def iterate_reg_types(X_train, X_test, y_train, y_test, counter, i, verbose):
    """Run every regression variant on one train/test split and record the results.

    Runs, in order: 'ignore', 'embed', 'lmmnn', and one 'copnn' fit per entry
    of the global `distributions`. Each run appends a row to the global `res`
    frame (row label taken from `counter`), and the whole frame is written to
    CSV at the end of the call.

    Parameters:
        X_train, X_test, y_train, y_test: the split handed to run_regression.
        counter: iterator yielding the next row label for `res`.
        i: fold / experiment number stored in the 'experiment' column.
        verbose: forwarded to run_regression.

    Returns:
        None. Side effects: mutates `res` and writes the CSV file.
    """
    fold_data = (X_train, X_test, y_train, y_test)

    print('  started ignore...')
    res_ignore = _run_reg_type(*fold_data, 'ignore', verbose)
    print('  finished ignore, mse: %.3f' % res_ignore.metric_mse)
    res.loc[next(counter)] = _result_row(i, 'gaussian', 'ignore', res_ignore, with_sigmas=False)

    # The one-hot-encoding run is disabled. To re-enable it, call
    # _run_reg_type(*fold_data, 'ohe', verbose) and record it like 'ignore'.
    print('  skipping ohe...')

    print('  started embedding...')
    res_embed = _run_reg_type(*fold_data, 'embed', verbose)
    print('  finished embed, mse: %.3f' % res_embed.metric_mse)
    res.loc[next(counter)] = _result_row(i, 'gaussian', 'embed', res_embed, with_sigmas=False)

    print('  started lmmnn...')
    res_lmmnn = _run_reg_type(*fold_data, 'lmmnn', verbose, sample_n_train=sample_n_train)
    print('  finished lmmnn, mse: %.3f' % res_lmmnn.metric_mse)
    res.loc[next(counter)] = _result_row(i, 'gaussian', 'lmmnn', res_lmmnn, with_sigmas=True)

    for dist_name in distributions:
        fit_dist = get_distribution(dist_name)
        print(f'  started copnn with marginal: {fit_dist}...')
        res_copnn = _run_reg_type(*fold_data, 'copnn', verbose, fit_dist,
                                  sample_n_train=sample_n_train)
        print(f'  finished copnn dist {fit_dist}, mse: {res_copnn.metric_mse: .3f}')
        # NOTE(review): the 'distribution' column receives the object returned by
        # get_distribution (as before), so its CSV text depends on how that object
        # converts to a string -- confirm this is the intended label.
        res.loc[next(counter)] = _result_row(i, fit_dist, 'copnn', res_copnn, with_sigmas=True)

    # Make sure the target directory exists before saving: otherwise to_csv
    # raises OSError only after the whole fold has been trained.
    out_dir = 'drive/MyDrive/copnn_experiments'
    os.makedirs(out_dir, exist_ok=True)
    res.to_csv(f'{out_dir}/{file_name}')

In [None]:
# Cross-validation driver. Both branches hand independent copies of the
# feature frames to iterate_reg_types, so downstream code (or the scaling
# lines below, if re-enabled) never writes into a slice of `X`.
if not pred_unknown_clusters:
  # Row-wise K-fold: the same z0 cluster can appear in both train and test.
  for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    print('iteration %d' % i)
    X_train, X_test = X.loc[train_index].copy(), X.loc[test_index].copy()
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    # scaler = StandardScaler()
    # X_train[x_cols_to_scale] = scaler.fit_transform(X_train[x_cols_to_scale])
    # X_test[x_cols_to_scale] = scaler.transform(X_test[x_cols_to_scale])
    iterate_reg_types(X_train, X_test, y_train, y_test, counter, i, verbose)
else:
  # Cluster-wise K-fold: the split is over the z0 ids 0..q_spatial-1, so every
  # test cluster is absent from the training rows.
  for i, (train_clusters, test_clusters) in enumerate(kf.split(range(q_spatial))):
    print('iteration %d' % i)
    X_train = X[X['z0'].isin(train_clusters)].copy()
    X_test = X[X['z0'].isin(test_clusters)].copy()
    train_index, test_index = X_train.index, X_test.index
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    print(X_train.shape[0], X_test.shape[0])
    print(y_train.shape[0], y_test.shape[0])
    print(X_train['z0'].unique().shape, X_test['z0'].unique().shape)
    # scaler = StandardScaler()
    # X_train[x_cols_to_scale] = scaler.fit_transform(X_train[x_cols_to_scale])
    # X_test[x_cols_to_scale] = scaler.transform(X_test[x_cols_to_scale])
    iterate_reg_types(X_train, X_test, y_train, y_test, counter, i, verbose)