# Synthetic regression experiments

Convex regression experiments on synthetic problems.<br/>
See the [Notebook parameters](#__synthetic_notebook_params__) cell for the settings.

In [None]:
import os
import sys
import time
import traceback

# Resolve the project root: the working directory when it contains requirements.txt,
# otherwise the parent of the working directory.
if 'requirements.txt' in os.listdir():
    project_path = os.path.abspath('.')
else:
    project_path = os.path.abspath('..')

# Put the project root on sys.path once; nothing is appended when it is already listed.
if project_path not in sys.path:
    sys.path.append(project_path)
print('project_path: {}'.format(project_path))

In [None]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from joblib import Parallel, delayed, Memory
from collections import OrderedDict
from IPython.display import display

from common.util import set_random_seed, eprint
from notebooks.logging_helper import logging_setup, info
logging_setup()

## Notebook parameters <a class="anchor" id="__synthetic_notebook_params__"></a>
The next cell is tagged with <code>parameters</code> so that [papermill](https://papermill.readthedocs.io) can override its values.

In [None]:
# Overridable notebook settings. Integer lists are given as comma-separated strings;
# the cell that follows converts every value to its working type.
experiment_id = '_MISSING_ID'  # Name your experiment here!
loss = 'l2'  # 'l1', 'l2'
target_func = 'l1_quad'
#    'linear': linear function
#    'symm_l1': symmetric L1 norm (even, convex)
#    'trunc_l1': truncated L1 norm (convex)
#    'symm_quad': symmetric quadratic function (even, convex)
#    'trunc_quad': truncated quadratic function (convex)
#    'l1_quad': per coordinate, max(1-x, 0) plus max(x-1, 0)**2, summed over the coordinates
covariate_distr = 'full_dim_normal'
#    Options in [...] are positional and written as bare values separated by ':'
#    (e.g. 'full_dim_uniform:2.0:-2.0'); the 'name=value' parts below only show
#    each option's meaning and default. When min is omitted it defaults to -max.
#    'full_dim_normal[:std=1.0]': full dimensional normal distribution
#    'full_dim_uniform[:max=2.0][:min=-2.0]': full dimensional uniform distribution
#    'embed_uniform[:low_d=3][:meas_noise_std=0.1][:max=3.0][:min=-3.0]':
#        uniform random variable linearly embedded into a larger space with Gaussian measurement noise
#    'poly_uniform[:meas_noise_std=0.1][:max=1.0][:min=-1.0]':
#        uniform random variable with polynomial expansion and Gaussian measurement noise
observation_noise = 'normal'
#    The option of 'normal' is positional too (e.g. 'normal:0.5').
#    'normal[:std=0.3]': Gaussian distribution
#    'rademacher': Rademacher distribution
global_random_seed = None  # nonnegative integer, setting under 10000 turns on caching; None picks a time-based seed
domain_dims = '3,5'  # domain dimensions
nsamples = '100,250'  # number of samples
nruns = 3  # number of experiment runs
ntestsamples = int(1e6)  # number of test samples to generate
parallel_nworkers = 1  # maximum number of parallel workers (make sure you have enough RAM too)
parallel_backend = 'multiprocessing'  # backend name passed to joblib's Parallel

In [None]:
def get_int_tuple(param):
    """Normalize a notebook parameter to a tuple of ints.

    Args:
        param: a comma-separated string such as '3,5', a single integer
            (Python or NumPy), or a list/tuple/NumPy array of integer-like values.

    Returns:
        A tuple of ints for the supported inputs. Any other value (e.g. None)
        is returned unchanged.

    Raises:
        ValueError: if a string token or a sequence item cannot be parsed as an integer.
    """
    if isinstance(param, str):
        # int() tolerates surrounding whitespace, so '3, 5' parses as well.
        return tuple(int(token) for token in param.split(','))
    if isinstance(param, (int, np.integer)):
        # NumPy integers are not instances of int, so they are matched explicitly.
        return (int(param),)
    if isinstance(param, (list, tuple, np.ndarray)):
        # Parameters may arrive as real sequences rather than strings; convert them
        # so that the result is always a tuple of plain ints.
        return tuple(int(item) for item in param)
    return param

# Coerce the notebook parameters to their working types: an optional int seed,
# int tuples for the dimension and sample-size grids, plain ints for the counts.
global_random_seed = None if global_random_seed is None else int(global_random_seed)
domain_dims, nsamples = get_int_tuple(domain_dims), get_int_tuple(nsamples)
nruns, ntestsamples, parallel_nworkers = int(nruns), int(ntestsamples), int(parallel_nworkers)

In [None]:
# Exclusive upper bound used for every drawn seed.
seed_limit = 1e6
if global_random_seed is None:
    # No seed was given: derive one from the sub-second part of the wall clock.
    # The 10000 offset keeps it at or above the value below which caching is enabled.
    global_random_seed = 10000 + int(np.round(seed_limit * (time.time() % 1)))
set_random_seed(global_random_seed)

# Draw one sub-seed per stage; the draw order (setup, data, training, testing) is fixed.
(setup_random_seed, data_random_seed,
 training_random_seed, testing_random_seed) = (np.random.randint(seed_limit) for _ in range(4))
info('random seeds, global:{}, setup:{}, data:{}, training:{}, testing:{}'.format(
    global_random_seed,
    setup_random_seed,
    data_random_seed,
    training_random_seed,
    testing_random_seed,
))

## Estimators

In [None]:
# Reseed with the setup seed before any estimator is constructed.
set_random_seed(setup_random_seed)

# Registry of estimators keyed by display name; insertion order is preserved.
estimators = OrderedDict()


def get_estimator(estimator_name):
    """Return the estimator stored in `estimators` under estimator_name.

    Raises:
        KeyError: if no estimator was registered under that name.
    """
    selected = estimators[estimator_name]
    return selected

In [None]:
# Ordinary Least-Squares estimator, stored in the registry under the key 'OLS'
# (the one name listed in skipped_estimators for the standard plots).
from common.ols import OLSEstimator
estimators['OLS'] = OLSEstimator()

In [None]:
# # LSPA
# from algorithm.lspa.lspa import LSPAEstimator
# estimators['LSPA'] = LSPAEstimator(train_args={'ncenters': 'n**(d/(d+4))', 'nrestarts': 'd', 'nfinalsteps': 'n'})

In [None]:
# # CNLS
# from algorithm.cnls.cnls import CNLSEstimator
# estimators['CNLS_star'] = CNLSEstimator(train_args={'use_L': True})
# estimators['CNLS_ln'] = CNLSEstimator(train_args={'use_L': True, 'override_L': 'np.log(n)'})

In [None]:
# Convex Adaptive Partitioning (CAP), registered with its default constructor arguments
from algorithm.cap.cap import CAPEstimator
estimators['CAP'] = CAPEstimator()
# Optional FastCAP variant (disabled):
# estimators['FastCAP'] = CAPEstimator(train_args={'nranddirs': 5})

In [None]:
# # PCNLS with random Voronoi partition
# from algorithm.pcnls.pcnls_voronoi import PCNLSVoronoiEstimator
# estimators['PCNLS-Voronoi'] = PCNLSVoronoiEstimator()

In [None]:
# Adaptive Max-Affine Partitioning (AMAP), registered with its default constructor arguments
from algorithm.amap.amap import AMAPEstimator
estimators['AMAP'] = AMAPEstimator()

In [None]:
# APCNLS: two variants are registered, both with use_L switched on;
# 'APCNLS_ln' additionally passes the expression string 'np.log(n)' as override_L.
from algorithm.apcnls.apcnls import APCNLSEstimator
estimators['APCNLS_star'] = APCNLSEstimator(train_args=dict(use_L=True))
estimators['APCNLS_ln'] = APCNLSEstimator(train_args=dict(use_L=True, override_L='np.log(n)'))
# estimators['APCNLS_reg'] = APCNLSEstimator(train_args={'use_L': False, 'L_regularizer': 'AUTO'})

### Non-convex regressors

In [None]:
# !pip install xgboost
from algorithm.external.xgboost import XgbEstimator

# Under the L1 loss the estimator is built with the absolute-error objective;
# for any other loss it is built with its default constructor arguments.
if loss == 'l1':
    estimators['XGB'] = XgbEstimator(objective='reg:absoluteerror')
else:
    estimators['XGB'] = XgbEstimator()

In [None]:
# !pip install scikit-learn
# Random forest estimator, registered with its default constructor arguments
from algorithm.external.random_forest import RandomForestEstimator
estimators['RF'] = RandomForestEstimator()

In [None]:
# # !pip install scikit-learn
# from algorithm.external.nearest_neighbors import NearestNeighborsEstimator
# estimators['KNN+'] = NearestNeighborsEstimator(n_neighbors='AFPC', cv=5, afpc_ntrials=10)
# estimators['KNN*'] = NearestNeighborsEstimator(n_neighbors='n**(d/(2+d))', cv=5)

In [None]:
# # !pip install scikit-learn scikit-fda
# from algorithm.external.kernel_regression import KernelRegEstimator
# estimators['kreg_nor'] = KernelRegEstimator('normal')
# estimators['kreg_epa'] = KernelRegEstimator('epanechnikov')
# estimators['kreg_tri'] = KernelRegEstimator('tri_weight')

## Caching

In [None]:
from common.cache import ResultCache

# Caching is pointless without a manually chosen seed, so it is enabled only for
# seeds below 10000 (time-based seeds start at 10000).
result_cache = ResultCache(
    project_path=project_path,
    experiment_id=experiment_id,
    is_enabled=(global_random_seed < 10000),
)
print(f'is_caching_enabled: {result_cache.is_enabled()}')

# Statistics are exported only when caching is on; output_dir stays None otherwise.
output_dir = None
if result_cache.is_enabled():
    output_dir = os.path.join(
        result_cache.cache_dir,
        f'stats-seed{global_random_seed}-r{nruns}-n' + ','.join(map(str, nsamples)),
    )
    os.makedirs(output_dir, exist_ok=True)
    print(f'output_dir: {output_dir}')

## Problem setting

In [None]:
# Defaults for the problem setting; the target-function cell below may replace L.
L = np.inf  # Lipschitz limit (can be set as a function to measure L on the union of the training and test sets)
L_scaler = 1.0  # multiplying L (makes sense when L is measured on the data)
X_mean = 0.0  # offset added to every covariate sample by sample_X

In [None]:
from common.experiment import loss_l1, loss_l2, loss_linf

# Loss callables keyed by name; report_loss_name selects the entry used for reporting.
stat_losses = dict(l1=loss_l1, l2=loss_l2, linf=loss_linf)
report_loss_name = loss

#### Target function

In [None]:
# Select the target function `fstar` and the Lipschitz limit `L` that goes with it.
# fstar maps a 2-D array (one sample per row) to one value per row. Unless noted
# otherwise, L_func returns the largest Euclidean norm of fstar's (sub)gradient over
# the rows of X, reduced with np.max so the scan over the rows stays inside NumPy.
if target_func == 'linear':
    def fstar(X):
        """Sum of the coordinates of every row."""
        return np.sum(X, axis=1)
    def L_func(X):
        """The gradient is the all-ones vector, whose Euclidean norm is sqrt(d)."""
        return np.sqrt(X.shape[1])
    # A constant 1.0 understates the Euclidean Lipschitz constant for d > 1, unlike
    # the data-measured limits of the other targets, so it is derived from d instead.
    L = L_func
elif target_func == 'symm_l1':
    def fstar(X):
        """L1 norm of every row."""
        return np.sum(np.abs(X), axis=1)
    def L_func(X):
        """Largest Euclidean norm of the subgradient sign(x) over the rows."""
        return np.max(np.linalg.norm(np.sign(X), ord=2, axis=1))
    L = L_func
elif target_func == 'trunc_l1':
    def fstar(X):
        """L1 norm of the positive part of every row."""
        return np.sum(np.abs(np.maximum(X, 0.0)), axis=1)
    def L_func(X):
        """Largest Euclidean norm of sign(max(x, 0)) over the rows."""
        return np.max(np.linalg.norm(np.sign(np.maximum(X, 0.0)), ord=2, axis=1))
    L = L_func
elif target_func == 'symm_quad':
    def fstar(X):
        """Half of the squared Euclidean norm of every row."""
        return 0.5 * np.sum(np.square(X), axis=1)
    def L_func(X):
        """Largest Euclidean norm of the gradient x over the rows."""
        return np.max(np.linalg.norm(X, ord=2, axis=1))
    L = L_func
elif target_func == 'trunc_quad':
    def fstar(X):
        """Half of the squared Euclidean norm of the positive part of every row."""
        return 0.5 * np.sum(np.square(np.maximum(X, 0.0)), axis=1)
    def L_func(X):
        """Largest Euclidean norm of the gradient max(x, 0) over the rows."""
        return np.max(np.linalg.norm(np.maximum(X, 0.0), ord=2, axis=1))
    L = L_func
elif target_func == 'l1_quad':
    def fstar(X):
        """Per coordinate: max(1-x, 0) plus max(x-1, 0)**2, summed over every row."""
        return (
            np.sum(np.abs(np.maximum(1.0-X, 0.0)), axis=1)
            + np.sum(np.square(np.maximum(X-1.0, 0.0)), axis=1)
        )
    def L_func(X):
        """Larger of the two per-term gradient-norm maxima over the rows."""
        # NOTE(review): the two terms are active on disjoint coordinates, so a row's
        # full gradient norm is sqrt(a**2 + b**2) of the two norms and can exceed this
        # value; confirm whether the smaller estimate is intended before changing it.
        return max(np.max(np.linalg.norm(np.sign(np.maximum(1.0-X, 0.0)), ord=2, axis=1)),
                   2.0*np.max(np.linalg.norm(np.maximum(X-1.0, 0.0), ord=2, axis=1)))
    L = L_func
else:
    # ValueError is still an Exception, so existing handlers keep working.
    raise ValueError(f'Not supported target_func: {target_func}!')

#### Covariate distribution

In [None]:
# Parse '<name>[:opt1][:opt2]...': the first field selects the distribution and the
# remaining fields are positional options; a missing option takes its default.
_covariate_fields = covariate_distr.split(':')
covariate_distr_name = _covariate_fields[0]
_covariate_opts = _covariate_fields[1:]

if covariate_distr_name == 'full_dim_normal':
    covariate_std = float(_covariate_opts[0]) if len(_covariate_opts) >= 1 else 1.0
    assert covariate_std >= 0.0

    def sample_X(n, d):
        """Draw an (n, d) normal sample scaled by covariate_std and shifted by X_mean."""
        standard_normal = np.random.randn(n, d)
        return X_mean + standard_normal * covariate_std
elif covariate_distr_name == 'full_dim_uniform':
    covariate_max = float(_covariate_opts[0]) if len(_covariate_opts) >= 1 else 2.0
    # The lower bound mirrors the upper one unless it is given explicitly.
    covariate_min = float(_covariate_opts[1]) if len(_covariate_opts) >= 2 else -covariate_max
    assert covariate_min < covariate_max

    def sample_X(n, d):
        """Draw an (n, d) uniform sample on [covariate_min, covariate_max), shifted by X_mean."""
        stretched = np.random.rand(n, d) * (covariate_max - covariate_min)
        return X_mean + stretched + covariate_min
elif covariate_distr_name == 'embed_uniform':
    low_d = int(_covariate_opts[0]) if len(_covariate_opts) >= 1 else 3
    measurement_noise_std = float(_covariate_opts[1]) if len(_covariate_opts) >= 2 else 0.1
    covariate_max = float(_covariate_opts[2]) if len(_covariate_opts) >= 3 else 3.0
    covariate_min = float(_covariate_opts[3]) if len(_covariate_opts) >= 4 else -covariate_max
    assert low_d >= 1
    assert measurement_noise_std >= 0.0
    assert covariate_min < covariate_max

    def sample_X(n, d):
        """Draw (n, d) Gaussian noise and overwrite the first low_d columns with uniform draws."""
        samples = measurement_noise_std * np.random.randn(n, d)
        uniform_block = np.random.rand(n, low_d) * (covariate_max - covariate_min) + covariate_min
        samples[:, :low_d] = uniform_block
        return samples + X_mean
elif covariate_distr_name == 'poly_uniform':
    measurement_noise_std = float(_covariate_opts[0]) if len(_covariate_opts) >= 1 else 0.1
    covariate_max = float(_covariate_opts[1]) if len(_covariate_opts) >= 2 else 1.0
    covariate_min = float(_covariate_opts[2]) if len(_covariate_opts) >= 3 else -covariate_max
    assert measurement_noise_std >= 0.0
    assert covariate_min < covariate_max

    def sample_X(n, d):
        """Draw one uniform value per row and put its powers 0..d-1, plus Gaussian noise, in the columns."""
        samples = measurement_noise_std * np.random.randn(n, d)
        base = np.random.rand(n) * (covariate_max - covariate_min) + covariate_min
        for exponent in range(d):
            # Column 0 receives base**0, i.e. a constant 1 on top of the noise.
            samples[:, exponent] += base**exponent
        return samples + X_mean
else:
    raise Exception(f'Not supported covariate_distr: {covariate_distr}!')

#### Observation noise distribution

In [None]:
# Parse '<name>[:std]': only the 'normal' distribution reads the optional second field.
_noise_fields = observation_noise.split(':')
observation_noise_name = _noise_fields[0]

if observation_noise_name == 'normal':
    observation_noise_std = float(_noise_fields[1]) if len(_noise_fields) > 1 else 0.3

    def sample_noise(n):
        """Draw n zero-mean Gaussian values with standard deviation observation_noise_std."""
        return observation_noise_std * np.random.randn(n)
elif observation_noise_name == 'rademacher':
    def sample_noise(n):
        """Draw n values that are each -1.0 or +1.0."""
        zero_or_one = np.random.randint(0, 2, n)
        return 2.0 * (zero_or_one - 0.5)
else:
    raise Exception(f'Not supported observation_noise: {observation_noise}!')

## Data

In [None]:
from common.experiment import get_random_seed_offset

def get_data(d, n, run, data_random_seed):
    """Generate the training and test data of one experiment run.

    The random seed is reset to data_random_seed plus the offset returned by
    get_random_seed_offset(d, n, run) before anything is sampled.

    Args:
        d: domain dimension.
        n: number of training samples.
        run: index of the experiment run.
        data_random_seed: base seed for data generation.

    Returns:
        (X_train, y_train, X_test, y_test, y_train_noiseless). The training targets
        carry observation noise; the test targets are noise-free fstar values on
        ntestsamples freshly drawn covariates.
    """
    seed = data_random_seed + get_random_seed_offset(d, n, run)
    print(f'seed: {seed}, d:{d}, n:{n}, run:{run}, data_random_seed:{data_random_seed}')
    set_random_seed(seed)

    # Sampling order matters for reproducibility: train covariates, noise, test covariates.
    train_X = sample_X(n, d)
    clean_train_y = fstar(train_X)
    noisy_train_y = clean_train_y + sample_noise(n)

    test_X = sample_X(ntestsamples, d)
    return train_X, noisy_train_y, test_X, fstar(test_X), clean_train_y

### AFPC statistics

In [None]:
from notebooks.afpc_stats import (
    get_afpc_stats, plot_partition_size, plot_partition_epsilon,
)

# Collect the AFPC statistics over the whole (dimension, sample size, run) grid,
# using the loss selected for reporting.
afpc_stats = get_afpc_stats(
    get_data_func=get_data,
    report_loss=stat_losses[report_loss_name],
    data_random_seed=data_random_seed,
    domain_dims=domain_dims,
    nsamples=nsamples,
    nruns=nruns,
)
print('\nData statistics:')
with pd.option_context('display.max_rows', None):
    # Lift the row limit so the full table is shown.
    display(afpc_stats)
if output_dir is not None:
    afpc_stats.to_csv(os.path.join(output_dir, 'data_stats.csv'))

# One figure per domain dimension: partition size on the left, partition epsilon on the right.
for d in domain_dims:
    fig, (size_ax, epsilon_ax) = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))
    plot_partition_size(size_ax, d, nsamples, afpc_stats)
    plot_partition_epsilon(epsilon_ax, d, nsamples, afpc_stats)

## Training

In [None]:
from common.experiment import (
    calc_experiment_result,
    prepare_experiment_calc_funcs,
)

def run_experiment(d, n, estimator_name, run, data_random_seed, training_random_seed):
    """Run one experiment and return it keyed by its configuration.

    The arguments are forwarded to calc_experiment_result together with the
    notebook-level data generator, estimator lookup, losses, logger and
    Lipschitz settings (L, L_scaler).

    Returns:
        ((d, n, estimator_name, run), result), where result is whatever
        calc_experiment_result returns.
    """
    experiment_key = (d, n, estimator_name, run)
    experiment_result = calc_experiment_result(
        d=d,
        n=n,
        estimator_name=estimator_name,
        run=run,
        data_random_seed=data_random_seed,
        training_random_seed=training_random_seed,
        get_data_func=get_data,
        get_estimator_func=get_estimator,
        stat_losses=stat_losses,
        report_loss_name=report_loss_name,
        log_func=info,
        L=L,
        L_scaler=L_scaler,
    )
    return experiment_key, experiment_result

# Prepare the jobs for the whole experiment grid; the result cache is handed over
# so that prepare_experiment_calc_funcs can make use of it.
delayed_funcs = prepare_experiment_calc_funcs(
    run_experiment_func=run_experiment,
    result_cache=result_cache,
    estimators=estimators,
    domain_dims=domain_dims,
    nsamples=nsamples,
    nruns=nruns,
    data_random_seed=data_random_seed,
    training_random_seed=training_random_seed,
)
try:
    # Run the jobs in parallel and store the returned pairs in sorted order.
    results = OrderedDict(sorted(
        Parallel(backend=parallel_backend, n_jobs=parallel_nworkers)(delayed_funcs)
    ))
except Exception:
    eprint(traceback.format_exc())
    # NOTE(review): the pause presumably lets the error output flush before re-raising; confirm.
    time.sleep(3)
    raise
info('All results have been calculated.')

## Evaluation

In [None]:
# Estimator names handed to plot_standard_stats as its skipped_estimators argument.
skipped_estimators = ('OLS',)

In [None]:
from common.experiment import collect_estimator_stats

# Collect, show and (when an output directory exists) export the statistics of
# every registered estimator, in registration order.
all_stats = OrderedDict()
for estimator_name in list(estimators):
    stats = collect_estimator_stats(estimator_name, results)
    all_stats[estimator_name] = stats
    print('\nestimator: {}'.format(estimator_name))
    with pd.option_context('display.max_rows', None):
        display(stats)

if output_dir is not None:
    for stats_name, stats_table in all_stats.items():
        stats_table.to_csv(os.path.join(output_dir, f'stats-{stats_name}.csv'))

In [None]:
from notebooks.evaluation import plot_standard_stats

# Draw the standard plots once per domain dimension, leaving out the skipped estimators.
for d in domain_dims:
    plot_standard_stats(
        d=d,
        all_stats=all_stats,
        report_loss_name=report_loss_name,
        skipped_estimators=skipped_estimators,
    )

### FVU (Fraction of Variance Unexplained) 

In [None]:
from common.experiment import collect_stats_by_name

# Place the FVU mean and std tables side by side under the outer column keys
# 'mean' and 'std', then swap the two outermost column levels and sort the columns
# so that each mean/std pair ends up next to each other.
fvu = pd.concat(
    [collect_stats_by_name(all_stats, stat_name)
     for stat_name in ('test_fvu__mean', 'test_fvu__std')],
    axis=1,
    keys=['mean', 'std'],
)
fvu = fvu.swaplevel(0, 1, axis=1).sort_index(axis=1)

# Report as percentages with one decimal, transposed for display.
print('FVU(%):')
display(np.round(100 * fvu, decimals=1).T)