# setup

In [None]:
from copulae.input import generate_copula_net_input

In [None]:
import jax.numpy as jnp
import jax.scipy.stats as jss
import jax

import numpy as np

In [None]:
import pandas as pd
import scipy
from scipy.stats import bootstrap
import copy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

## utils

In [None]:
def add_train_random_noise(data, num_adds):
  """Append `num_adds` rows of uniform [0, 1) noise below `data`.

  The noise rows have as many columns as `data` and are drawn from the
  global NumPy random state. `data` itself is left untouched; a new array
  is returned.
  """
  n_columns = data.shape[1]
  noise_rows = np.random.rand(num_adds, n_columns)
  return np.vstack([data, noise_rows])

In [None]:
def rank_normalization(X):
  """Replace every column of every array in `X` by its scaled ordinal rank.

  For an array with n rows, each column is replaced by rank / (n + 1),
  where rank is the 1-based 'ordinal' rank of the value inside its column
  (ties are broken by order of appearance). The results therefore lie
  strictly between 0 and 1.

  Args:
    X: iterable of 2-D array-likes (rows are samples, columns are features).
       Each array is ranked independently of the others.

  Returns:
    A list with one new array per input array. The inputs are not modified.
    Floating-point inputs keep their dtype; any other dtype (e.g. integers)
    is returned as float64 so the fractional ranks are not truncated.
  """
  normalized = []
  for z in X:
    z = np.asarray(z)
    gap = 1. / (z.shape[0] + 1)
    # Writing fractional ranks back into an integer array would truncate
    # them all to 0, so only floating dtypes are preserved.
    out_dtype = z.dtype if np.issubdtype(z.dtype, np.floating) else np.float64
    # axis=0 ranks every column independently in a single call.
    ranks = scipy.stats.rankdata(z, method='ordinal', axis=0)
    normalized.append((ranks * gap).astype(out_dtype))
  return normalized

In [None]:
def _kde_pdf_cdf(samples, queries):
  """Fit a Silverman-bandwidth Gaussian KDE on `samples`; return its pdf and cdf at `queries`."""
  kde = jss.gaussian_kde(samples, bw_method='silverman')
  density = kde.evaluate(queries)
  # integrate_box_1d takes scalar bounds, so the CDF is evaluated point by point.
  cumulative = jnp.array([kde.integrate_box_1d(-jnp.inf, q) for q in queries])
  return density, cumulative


def get_set(D_val, data_points):
  """Evaluate KDE marginals fitted on `data_points` at the points in `D_val`.

  Args:
    D_val: array indexed as D_val[0] (x coordinates) and D_val[1]
      (y coordinates) of the points to evaluate.
    data_points: array indexed as data_points[0] (x samples) and
      data_points[1] (y samples) used to fit the two marginal KDEs.

  Returns:
    A tuple (points, I_pdf, cdf_xy) where
      points: `D_val` with a leading axis of length 1 added,
      I_pdf:  product of the two marginal densities at each point, with a
              leading axis of length 1,
      cdf_xy: the two marginal CDF values stacked as (x, y), with a leading
              axis of length 1.
  """
  points = jnp.expand_dims(D_val, axis=0)

  # Both marginals are fitted on `data_points`. The Y marginal previously
  # read a module-level `D`, silently ignoring the argument.
  density_x, cumulative_x = _kde_pdf_cdf(data_points[0], points[0, 0, :])
  density_y, cumulative_y = _kde_pdf_cdf(data_points[1], points[0, 1, :])

  I_pdf = jnp.expand_dims(density_x * density_y, axis=0)
  cdf_xy = jnp.expand_dims(jnp.stack((cumulative_x, cumulative_y)), axis=0)

  return points, I_pdf, cdf_xy

## real data

In [None]:
!git clone https://github.com/yutingng/gen-AC.git

fatal: destination path 'gen-AC' already exists and is not an empty directory.


In [None]:
class Boston():
  """Two-column train/validation split built from the CMU Boston housing file.

  Construction downloads the raw file, rebuilds the feature matrix and the
  target, splits into train/test, rank-normalizes both splits, appends
  uniform noise rows to the training split, and keeps columns 0 and 13.

  Attributes (all column vectors of shape (n, 1)):
    train_x, train_y:           column 0 (flipped as 1 - value) and column 13
                                of the training split.
    validation_x, validation_y: the same two columns of the test split.
  """

  def __init__(self):
    # read
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    raw_df = pd.read_csv(data_url, sep = r"\s+", skiprows = 22, header = None)
    # Each record spans two physical lines of the file: the even rows plus
    # the first two fields of the odd rows are features, and the third field
    # of the odd rows is the target.
    X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    y = raw_df.values[1::2, 2]

    # split
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, random_state = 142857)
    # The target is appended as the last column so it is normalized together
    # with the features.
    X_train = np.concatenate((X_train, y_train[:, None]), axis = 1)
    X_test  = np.concatenate((X_test, y_test[:, None]), axis = 1)

    # norm
    [X_train, X_test] = rank_normalization([X_train, X_test])

    # noise: add 1% extra uniformly random rows to the training split only
    X_train = add_train_random_noise(X_train, int(X_train.shape[0]*0.01))

    # 2d
    # NOTE(review): column 13 is presumably the appended target (requires the
    # raw file to have 11 fields per line) -- confirm.
    train_data = X_train[:, [0, 13]]
    test_data = X_test[:, [0, 13]]

    # flip
    train_data[:, 0] = 1 - train_data[:, 0]
    test_data[:, 0] = 1 - test_data[:, 0]

    self.train_y = train_data[:, 1].reshape(-1, 1)
    self.train_x = train_data[:, 0].reshape(-1, 1)
    self.validation_y = test_data[:, 1].reshape(-1, 1)
    self.validation_x = test_data[:, 0].reshape(-1, 1)

  raw_df = pd.read_csv(data_url, sep = "\s+", skiprows = 22, header = None)


In [None]:
class INTC_MSFT():
  """Train/validation split of the paired INTEL and MS series from the gen-AC checkout.

  Construction reads the two series, splits them into train/test,
  rank-normalizes both splits and appends uniform noise rows to the
  training split.

  Attributes (all column vectors of shape (n, 1)):
    train_x, train_y:           INTEL and MS columns of the training split.
    validation_x, validation_y: INTEL and MS columns of the test split.
  """

  @staticmethod
  def _read_series(path):
    """Read one float per line from `path` and return them as a 1-D array."""
    # The context manager closes the file handle even if parsing fails.
    with open(path, 'r') as handle:
      return np.array([float(line) for line in handle])

  def __init__(self):
    # read
    # GE.data sits in the same directory but was never used downstream, so it
    # is no longer read.
    intel = self._read_series('gen-AC/data/raw/INTC_MSFT_GE/INTEL.data')
    ms = self._read_series('gen-AC/data/raw/INTC_MSFT_GE/MS.data')

    # split
    X = np.concatenate((intel[:, None], ms[:, None]), axis = 1)
    X_train, X_test, _, _ = train_test_split(X, X, shuffle = True, random_state = 142857)

    # norm
    [X_train, X_test] = rank_normalization([X_train, X_test])

    # 2d, noise: add 1% extra uniformly random rows to the training split only
    train_data = X_train[:, [0, 1]]
    train_data = add_train_random_noise(train_data, int(train_data.shape[0]*0.01))
    test_data = X_test[:, [0, 1]]

    self.train_y = train_data[:, 1].reshape(-1, 1)
    self.train_x = train_data[:, 0].reshape(-1, 1)
    self.validation_y = test_data[:, 1].reshape(-1, 1)
    self.validation_x = test_data[:, 0].reshape(-1, 1)


In [None]:
class GOOG_FB():
  """Train/validation split of the paired GOOG and FB close series from the gen-AC checkout.

  Construction reads the two series, splits them into train/test,
  rank-normalizes both splits and appends uniform noise rows to the
  training split.

  Attributes (all column vectors of shape (n, 1)):
    train_x, train_y:           GOOG and FB columns of the training split.
    validation_x, validation_y: GOOG and FB columns of the test split.
  """

  @staticmethod
  def _read_series(path):
    """Read one float per line from `path` and return them as a 1-D array."""
    # The context manager closes the file handle even if parsing fails.
    with open(path, 'r') as handle:
      return np.array([float(line) for line in handle])

  def __init__(self):
    # read
    goog = self._read_series('gen-AC/data/raw/FB_GOOG/goog/close.vals')
    fb = self._read_series('gen-AC/data/raw/FB_GOOG/fb/close.vals')

    # split
    X = np.concatenate((goog[:, None], fb[:, None]), axis = 1)
    X_train, X_test, _, _ = train_test_split(X, X, shuffle=True, random_state=142857)

    # norm
    [X_train, X_test] = rank_normalization([X_train, X_test])

    # 2d, noise: add 1% extra uniformly random rows to the training split only
    train_data = X_train[:, [0, 1]]
    train_data = add_train_random_noise(train_data, int(train_data.shape[0]*0.01))
    test_data = X_test[:, [0, 1]]

    self.train_y = train_data[:, 1].reshape(-1, 1)
    self.train_x = train_data[:, 0].reshape(-1, 1)
    self.validation_y = test_data[:, 1].reshape(-1, 1)
    self.validation_x = test_data[:, 0].reshape(-1, 1)

## synthetic data

In [None]:
def generate_gaussian(rho, sample_size=2000, train_p=0.75):
  """Sample a zero-mean bivariate Gaussian and split it into train/test sets.

  Args:
    rho: off-diagonal entry of the 2x2 covariance matrix (the diagonal is 1).
    sample_size: total number of points to draw.
    train_p: fraction of the points assigned to the training set; the
      training size is int(sample_size * train_p).

  Returns:
    (train_D, test_D), arrays of shape (2, n_train) and
    (2, sample_size - n_train). Row 0 holds the first coordinate and row 1
    the second.

  Raises:
    ValueError: if `train_p` is outside [0, 1].
  """
  if not 0 <= train_p <= 1:
    raise ValueError("train_p must be in [0, 1], got {}".format(train_p))

  covariance = np.array([[1.0, rho], [rho, 1.0]])
  D = np.random.multivariate_normal(mean=np.zeros(2), cov=covariance, size=(sample_size, )).T

  # Generating Train and test data: shuffle the column indices, then cut.
  shuf_indexes = np.random.permutation(sample_size)
  n_train = int(D.shape[1] * train_p)

  train_D = D[:, shuf_indexes[:n_train]]
  test_D = D[:, shuf_indexes[n_train:]]

  return train_D, test_D

class Gauss():
  """Synthetic bivariate Gaussian dataset, min-max scaled and stored as column vectors.

  Attributes (all of shape (n, 1)):
    train_x, train_y:           first and second coordinate of the train split.
    validation_x, validation_y: first and second coordinate of the test split.
  """

  def __init__(self, rho):
    raw_train, raw_test = generate_gaussian(rho)

    # The module-level scaler works on (samples, features), hence the
    # transposes around each call.
    # NOTE(review): the scaler is re-fitted on the test split rather than
    # reusing the training fit -- confirm this is intended.
    scaled_train = scaler.fit_transform(raw_train.T).T
    scaled_test = scaler.fit_transform(raw_test.T).T

    # Row 0 is x and row 1 is y; each becomes an (n, 1) column.
    self.train_x, self.train_y = (row.reshape(-1, 1) for row in scaled_train)
    self.validation_x, self.validation_y = (row.reshape(-1, 1) for row in scaled_test)

In [None]:
# Marshall and Olkin
def clayton_sample(theta):
  """Draw one pair (U1, U2) for parameter `theta`.

  A Gamma(shape=1/theta, scale=1) variable V and two independent unit
  exponentials R are drawn from the global NumPy random state, and
  (1 + R/V) ** (-1/theta) is returned as an array of length 2.
  """
  frailty = np.random.gamma(shape=1 / theta, scale=1)
  exponentials = np.random.exponential(scale=1, size=2)
  scaled = exponentials / frailty
  return (1 + scaled) ** (-1/theta)

# Generate Clayton Copula with N(0,1) margins
def generate_clayton_sample(theta, sample_size=2000, train_p=0.75):
  """Draw `sample_size` pairs via `clayton_sample`, map them through the
  standard normal quantile function, and split into train/test sets.

  Args:
    theta: parameter forwarded to `clayton_sample`.
    sample_size: total number of pairs to draw.
    train_p: fraction of the pairs assigned to the training set; the
      training size is int(sample_size * train_p).

  Returns:
    (train_D, test_D), arrays of shape (2, n_train) and
    (2, sample_size - n_train). Row 0 holds the first coordinate and row 1
    the second.

  Raises:
    ValueError: if `train_p` is outside [0, 1].
  """
  if not 0 <= train_p <= 1:
    raise ValueError("train_p must be in [0, 1], got {}".format(train_p))

  # One (U1, U2) pair per row. The reshape keeps the array two-dimensional
  # even when sample_size is 0.
  uniforms = np.array([clayton_sample(theta) for _ in range(sample_size)]).reshape(-1, 2)
  # A single vectorized ppf call replaces two scalar calls per sample.
  D = scipy.stats.norm.ppf(uniforms).T

  # Generating Train and test data: shuffle the column indices, then cut.
  shuf_indexes = np.random.permutation(sample_size)
  n_train = int(D.shape[1] * train_p)

  train_D = D[:, shuf_indexes[:n_train]]
  test_D = D[:, shuf_indexes[n_train:]]

  return train_D, test_D

class Clayton():
  """Synthetic dataset from `generate_clayton_sample`, min-max scaled and stored as column vectors.

  Attributes (all of shape (n, 1)):
    train_x, train_y:           first and second coordinate of the train split.
    validation_x, validation_y: first and second coordinate of the test split.
  """

  def __init__(self, theta):
    raw_train, raw_test = generate_clayton_sample(theta)

    # The module-level scaler works on (samples, features), hence the
    # transposes around each call.
    # NOTE(review): the scaler is re-fitted on the test split rather than
    # reusing the training fit -- confirm this is intended.
    scaled_train = scaler.fit_transform(raw_train.T).T
    scaled_test = scaler.fit_transform(raw_test.T).T

    # Row 0 is x and row 1 is y; each becomes an (n, 1) column.
    self.train_x, self.train_y = (row.reshape(-1, 1) for row in scaled_train)
    self.validation_x, self.validation_y = (row.reshape(-1, 1) for row in scaled_test)

In [None]:
# Marshall and Olkin
def frank_sample(theta):
  """Draw one pair (U1, U2) for parameter `theta`.

  A log-series variable V with parameter 1 - exp(-theta) and two
  independent unit exponentials R are drawn, and
  -1/theta * log(1 - (1 - exp(-theta)) * exp(-R/V)) is returned as an
  array of length 2.
  """
  one_minus_decay = 1 - np.exp(-theta)
  frailty = scipy.stats.logser.rvs(one_minus_decay)
  exponentials = np.random.exponential(scale=1, size=2)
  scaled = exponentials / frailty
  inner = 1 - (one_minus_decay * np.exp(-scaled))
  return -1/theta * np.log(inner)

# Generate Frank Copula with N(0,1) margins
def generate_frank_sample(theta, sample_size=2000, train_p=0.75):
  """Draw `sample_size` pairs via `frank_sample`, map them through the
  standard normal quantile function, and split into train/test sets.

  Args:
    theta: parameter forwarded to `frank_sample`.
    sample_size: total number of pairs to draw.
    train_p: fraction of the pairs assigned to the training set; the
      training size is int(sample_size * train_p).

  Returns:
    (train_D, test_D), arrays of shape (2, n_train) and
    (2, sample_size - n_train). Row 0 holds the first coordinate and row 1
    the second.

  Raises:
    ValueError: if `train_p` is outside [0, 1].
  """
  if not 0 <= train_p <= 1:
    raise ValueError("train_p must be in [0, 1], got {}".format(train_p))

  # One (U1, U2) pair per row. The reshape keeps the array two-dimensional
  # even when sample_size is 0.
  uniforms = np.array([frank_sample(theta) for _ in range(sample_size)]).reshape(-1, 2)
  # A single vectorized ppf call replaces two scalar calls per sample.
  D = scipy.stats.norm.ppf(uniforms).T

  # Generating Train and test data: shuffle the column indices, then cut.
  shuf_indexes = np.random.permutation(sample_size)
  n_train = int(D.shape[1] * train_p)

  train_D = D[:, shuf_indexes[:n_train]]
  test_D = D[:, shuf_indexes[n_train:]]

  return train_D, test_D

class Frank():
  """Synthetic dataset from `generate_frank_sample`, min-max scaled and stored as column vectors.

  Attributes (all of shape (n, 1)):
    train_x, train_y:           first and second coordinate of the train split.
    validation_x, validation_y: first and second coordinate of the test split.
  """

  def __init__(self, theta):
    raw_train, raw_test = generate_frank_sample(theta)

    # The module-level scaler works on (samples, features), hence the
    # transposes around each call.
    # NOTE(review): the scaler is re-fitted on the test split rather than
    # reusing the training fit -- confirm this is intended.
    scaled_train = scaler.fit_transform(raw_train.T).T
    scaled_test = scaler.fit_transform(raw_test.T).T

    # Row 0 is x and row 1 is y; each becomes an (n, 1) column.
    self.train_x, self.train_y = (row.reshape(-1, 1) for row in scaled_train)
    self.validation_x, self.validation_y = (row.reshape(-1, 1) for row in scaled_test)

# get ds

In [None]:
# Seed NumPy's global random state (used by the noise rows and the synthetic
# samplers above) and create a JAX PRNG key from the same seed value.
np.random.seed(30091985)
key = jax.random.PRNGKey(30091985)

An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.


In [None]:
# Dataset selection: exactly one (ds, data_loader) pair should be active.
# `ds` is the directory name used under data/ for this dataset's CSV files.
# Real-data options:
# data_loader = Boston()
# ds = 'boston'
# data_loader = INTC_MSFT()
# ds = 'intcmsft'
# data_loader = GOOG_FB()
# ds = 'googfb'

# Synthetic options:
# ds = 'gauss1'
# data_loader = Gauss(rho = 0.1)
# ds = 'gauss5'
# data_loader = Gauss(rho = 0.5)
# ds = 'gauss9'
# data_loader = Gauss(rho = 0.9)
# ds = 'clayton1'
# data_loader = Clayton(theta = 1)
# ds = 'clayton5'
# data_loader = Clayton(theta = 5)
# ds = 'clayton10'
# data_loader = Clayton(theta = 10)
# ds = 'frank1'
# data_loader = Frank(theta = 1)
# ds = 'frank5'
# data_loader = Frank(theta = 5)
ds = 'frank10'
data_loader = Frank(theta = 10)

# Stack the (n, 1) x and y columns and drop the trailing axis, giving arrays
# of shape (2, n): row 0 is x, row 1 is y.
D = np.array([data_loader.train_x, data_loader.train_y])[:, :, 0]
D_val = np.array([data_loader.validation_x, data_loader.validation_y])[:, :, 0]

# Build the training tensors from the training data, with bootstrapping
# disabled.
TrainingTensors = generate_copula_net_input(
    D=D,
    bootstrap=False
)

In [None]:
# _, _, cdf_xy_trn = get_set(D, TrainingTensors.X_batches[0])
# np.savetxt('data/{}/trn.csv'.format(ds), cdf_xy_trn[0, :, :].T, delimiter = ',')

In [None]:
# _, _, cdf_xy_tst = get_set(D_val, TrainingTensors.X_batches[0])
# np.savetxt('data/{}/tst.csv'.format(ds), cdf_xy_tst[0, :, :].T, delimiter = ',')

# eval baseline

In [None]:
# Baseline identifiers; each one names a data/<ds>/<baseline>_yhat.csv file
# read by the evaluation loop. The trailing comments give the package each
# group of baselines comes from.
baselines = ['par', # VineCopula
             'bern', 'T', 'TLL1', 'TLL2', 'TLL2nn', 'MR', 'beta', # kdecopula
             'pbern', 'pspl1', 'pspl2'] # penRvine

In [None]:
# The product of the marginal densities depends only on the validation points,
# not on the baseline, so the KDE evaluation is done once up front instead of
# once per baseline.
_, I_pdf, _ = get_set(D_val, TrainingTensors.X_batches[0])

for baseline in baselines:
    # Copula density predicted by this baseline, read from its CSV file.
    copula_density = np.genfromtxt('data/{}/{}_yhat.csv'.format(ds, baseline), delimiter = ',')

    # Joint density = copula density times the product of the marginals.
    points_density = copula_density * I_pdf

    # NOTE(review): `yhat` is not defined in the visible part of the notebook.
    # It is presumably derived from `points_density` in a cell that is not
    # shown -- confirm before relying on this loop.
    res = bootstrap(yhat, np.mean)
    # print(baseline, np.mean(yhat), res.confidence_interval)
    print(np.mean(yhat), res.confidence_interval[0], res.confidence_interval[1])

-1.2809068 -1.432765046839087 -1.1006814066505055
-1.2667893 -1.4141365496082787 -1.0971292123451855
-1.2481375 -1.410240108228669 -1.0506701263538587
-1.2272695 -1.3952562106877384 -1.0299757733825359
-1.2250094 -1.3899939287799046 -1.0175329680341898
-1.2551966 -1.4131092662080398 -1.0612081362407617
-1.2488296 -1.4038185923599946 -1.0602432681881897
-1.2517431 -1.4076652031801824 -1.0646107590062521
-1.2657864 -1.417002078811819 -1.088006827871696
-1.1772825 -1.3588010272679127 -0.9555101304700729
-1.2425157 -1.4062757297373993 -1.0501908768900723
