# setup

Reference implementation of Implicit Generative Copulas (IGC) used in this notebook: https://github.com/TimCJanke/igc

In [1]:
import pandas as pd
import scipy
import copy
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.express as px

from scipy import stats

In [2]:
# Fetch the IGC repository; later cells import `experiments_utils` and `models.igc`,
# presumably from this checkout (confirm against the repository layout).
!git clone https://github.com/TimCJanke/igc

Cloning into 'igc'...
remote: Enumerating objects: 124, done.[K
remote: Counting objects: 100% (124/124), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 124 (delta 50), reused 110 (delta 41), pack-reused 0[K
Receiving objects: 100% (124/124), 21.05 MiB | 22.41 MiB/s, done.
Resolving deltas: 100% (50/50), done.


In [3]:
# Switch the working directory to the checkout; every relative path used afterwards
# (imports and the later gen-AC clone) is resolved from here.
%cd igc

/content/igc


In [4]:
!pip install pybind11==2.6.2 pyvinecopulib==0.5.5 setuptools-scm==6.0.1

Collecting pybind11==2.6.2
  Downloading pybind11-2.6.2-py2.py3-none-any.whl (191 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.4/191.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyvinecopulib==0.5.5
  Downloading pyvinecopulib-0.5.5.tar.gz (23.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setuptools-scm==6.0.1
  Downloading setuptools_scm-6.0.1-py3-none-any.whl (27 kB)
Building wheels for collected packages: pyvinecopulib
  Building wheel for pyvinecopulib (setup.py) ... [?25l[?25hdone
  Created wheel for pyvinecopulib: filename=pyvinecopulib-0.5.5-cp310-cp310-linux_x86_64.whl size=14556919 sha256=c57f9dc35c89feb973b04e7dd8c804ac961315fc3f23dce17f8fba0daf82cf63
  Stored in directory: /root/.cache/pip/wheels/bf/91/96/45e1fb1955095b8a057d148acc15a38654e81c76793705e477
Successfully built pyvin

In [5]:
from experiments_utils import run_experiment
from experiments_utils import random_bicop, get_pvcopfamily, beta_copula_cdf, emp_cdf, gaussian_mixture_copula
import pyvinecopulib as pv
from models.igc import ImplicitGenerativeCopula
from scipy.stats import bootstrap

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

def eval_prob(data_models, data_test, bw, n_eval=10000):
    """Score test points under a Gaussian KDE fitted to generated samples.

    Parameters
    ----------
    data_models : dict
        Maps a model name to an array of samples drawn from that model. Only the
        LAST entry (in dict iteration order) is scored, which matches the single
        array this function has always returned.
    data_test : array-like, 2-D
        Test observations. Also used to select the bandwidth when ``bw`` is not
        a single number.
    bw : None, number, or list/tuple/ndarray
        ``None``      -> cross-validate over ``np.logspace(-1, 1, 10)``.
        sequence      -> cross-validate over the given candidates.
        int / float   -> use that bandwidth directly.
    n_eval : int, default 10000
        At most this many leading rows of ``data_test`` are scored.

    Returns
    -------
    numpy.ndarray
        ``KernelDensity.score_samples`` output (log-density) for
        ``data_test[:n_eval]``.

    Raises
    ------
    ValueError
        If ``data_models`` is empty.
    TypeError
        If ``bw`` is of an unsupported type.
    """
    if not data_models:
        raise ValueError("data_models must contain at least one set of model samples")

    if bw is None or isinstance(bw, (list, tuple, np.ndarray)):
        # use CV to find best bandwidth on the test data
        candidates = np.logspace(-1.0, 1.0, 10) if bw is None else bw
        grid_cv = GridSearchCV(KernelDensity(), param_grid={"bandwidth": candidates})
        grid_cv.fit(data_test)
        bw_opt = grid_cv.best_params_["bandwidth"]
        print(bw_opt)
    elif isinstance(bw, (int, float, np.integer, np.floating)) and not isinstance(bw, bool):
        # Plain ints (e.g. bw=1) used to fall through every branch and crash later
        # with a NameError on `bw_opt`.
        bw_opt = float(bw)
    else:
        raise TypeError(
            f"bw must be None, a number, or a sequence of candidate bandwidths; got {type(bw).__name__}"
        )

    # Only the last model's scores are returned, so fitting a KDE for the earlier
    # entries would be discarded work.
    samples = list(data_models.values())[-1]
    kde_model = KernelDensity(bandwidth=bw_opt).fit(samples)
    return kde_model.score_samples(data_test[0:n_eval])  # log-density of test data under KDE

# data
The INTC-MSFT and GOOG-FB series are read from the gen-AC repository: https://github.com/yutingng/gen-AC.git

In [7]:
import pandas as pd
import scipy
import copy

from sklearn.model_selection import train_test_split

In [8]:
def add_train_random_noise(data, num_adds, rng=None):
  """Append `num_adds` rows of uniform [0, 1) noise below `data`.

  Parameters
  ----------
  data : numpy.ndarray, 2-D
      Existing rows; returned unchanged as the leading rows of the result.
  num_adds : int
      Number of noise rows to append (0 returns a copy of `data`).
  rng : numpy.random.Generator, optional
      Source of randomness. When omitted, NumPy's global random state is used,
      so results are only reproducible if the caller seeded it.

  Returns
  -------
  numpy.ndarray of shape (data.shape[0] + num_adds, data.shape[1])
  """
  n_cols = data.shape[1]
  if rng is None:
    new_data = np.random.rand(num_adds, n_cols)
  else:
    new_data = rng.random((num_adds, n_cols))
  return np.concatenate((data, new_data), axis = 0)

In [9]:
def rank_normalization(X):
  """Replace every column of every array in `X` by its scaled ordinal rank.

  For an array with n rows, each column value becomes rank / (n + 1), with
  ranks 1..n assigned by `scipy.stats.rankdata(..., method='ordinal')`, so all
  outputs lie strictly inside (0, 1). Each array is normalized independently.

  Parameters
  ----------
  X : list, tuple or ndarray of 2-D arrays
      The inputs are never modified.

  Returns
  -------
  Same kind of container as `X` (list -> list, tuple -> tuple, ndarray -> ndarray)
  holding float arrays.
  """
  normalized = []
  for z in X:
    # Work on a float copy: writing fractional ranks into an integer array would
    # silently truncate every value to 0.
    z = np.array(z, dtype=float)
    gap = 1./(z.shape[0]+1)
    for i in range(z.shape[1]):
      z[:, i] = scipy.stats.rankdata(z[:, i], method='ordinal')*gap
    normalized.append(z)

  if isinstance(X, tuple):
    return tuple(normalized)
  if isinstance(X, np.ndarray):
    return np.array(normalized, dtype=float)
  return normalized

In [10]:
# Fetch gen-AC; the stock loaders below read raw series from `gen-AC/data/raw/...`.
# The clone lands in the current working directory, so those relative paths only
# resolve while the working directory stays unchanged.
!git clone https://github.com/yutingng/gen-AC.git

Cloning into 'gen-AC'...
remote: Enumerating objects: 466, done.[K
remote: Counting objects: 100% (466/466), done.[K
remote: Compressing objects: 100% (339/339), done.[K
remote: Total 466 (delta 159), reused 421 (delta 123), pack-reused 0[K
Receiving objects: 100% (466/466), 10.28 MiB | 4.38 MiB/s, done.
Resolving deltas: 100% (159/159), done.


In [11]:
class Boston():
  """Two-column, rank-normalized train/validation split built from the Boston dataset.

  Construction downloads the raw table, splits it, rank-normalizes both splits,
  appends 1% uniform-noise rows to the training split and keeps columns 0 and 13.
  The resulting (n, 1) column vectors are exposed as `train_x`, `train_y`,
  `validation_x` and `validation_y`.
  """

  def __init__(self):
    # read
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    # Raw string: "\s" is not a valid Python escape and warns on recent interpreters.
    raw_df = pd.read_csv(data_url, sep = r"\s+", skiprows = 22, header = None)
    # Each even row is joined with the first two fields of the following odd row;
    # the third field of the odd row is taken as the target.
    X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    y = raw_df.values[1::2, 2]

    # split (fixed seed so the partition is reproducible), then append the target
    # as the last column of each split
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, random_state = 142857)
    X_train = np.concatenate((X_train, y_train[:, None]), axis = 1)
    X_test  = np.concatenate((X_test, y_test[:, None]), axis = 1)

    # norm
    [X_train, X_test] = rank_normalization([X_train, X_test])

    # noise: extra uniform rows amounting to 1% of the training set
    X_train = add_train_random_noise(X_train, int(X_train.shape[0]*0.01))

    # 2d: column 13 is presumably the appended target column -- confirm against
    # the width of the raw table. Fancy indexing returns copies, so the flip
    # below does not touch X_train / X_test.
    train_data = X_train[:, [0, 13]]
    test_data = X_test[:, [0, 13]]

    # flip the first coordinate (u -> 1 - u)
    train_data[:, 0] = 1 - train_data[:, 0]
    test_data[:, 0] = 1 - test_data[:, 0]

    self.train_y = train_data[:, 1].reshape(-1, 1)
    self.train_x = train_data[:, 0].reshape(-1, 1)
    self.validation_y = test_data[:, 1].reshape(-1, 1)
    self.validation_x = test_data[:, 0].reshape(-1, 1)

In [12]:
class INTC_MSFT():
  """Rank-normalized train/validation split of the paired INTEL / MS series.

  Both series are read from the gen-AC checkout (paths are relative to the
  current working directory). The (n, 1) column vectors are exposed as
  `train_x` (INTEL), `train_y` (MS), `validation_x` and `validation_y`.
  """

  @staticmethod
  def _read_floats(path):
    """Parse a text file holding one float per line into a 1-D array."""
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(path, 'r') as handle:
      return np.array(list(map(float, handle.readlines())))

  def __init__(self):
    # read (only the two series of this pair are loaded)
    intel = self._read_floats('gen-AC/data/raw/INTC_MSFT_GE/INTEL.data')
    ms = self._read_floats('gen-AC/data/raw/INTC_MSFT_GE/MS.data')

    # split (fixed seed so the partition is reproducible)
    X = np.concatenate((intel[:, None], ms[:, None]), axis = 1)
    X_train, X_test, _, _ = train_test_split(X, X, shuffle = True, random_state = 142857)

    # norm
    [X_train, X_test] = rank_normalization([X_train, X_test])

    # 2d, noise: extra uniform rows amounting to 1% of the training set
    train_data = X_train[:, [0, 1]]
    train_data = add_train_random_noise(train_data, int(train_data.shape[0]*0.01))
    test_data = X_test[:, [0, 1]]

    self.train_y = train_data[:, 1].reshape(-1, 1)
    self.train_x = train_data[:, 0].reshape(-1, 1)
    self.validation_y = test_data[:, 1].reshape(-1, 1)
    self.validation_x = test_data[:, 0].reshape(-1, 1)

In [13]:
class GOOG_FB():
  """Rank-normalized train/validation split of the paired GOOG / FB series.

  Both series are read from `close.vals` files in the gen-AC checkout (paths are
  relative to the current working directory). The (n, 1) column vectors are
  exposed as `train_x` (GOOG), `train_y` (FB), `validation_x` and `validation_y`.
  """

  @staticmethod
  def _read_floats(path):
    """Parse a text file holding one float per line into a 1-D array."""
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(path, 'r') as handle:
      return np.array(list(map(float, handle.readlines())))

  def __init__(self):
    # read
    goog = self._read_floats('gen-AC/data/raw/FB_GOOG/goog/close.vals')
    fb = self._read_floats('gen-AC/data/raw/FB_GOOG/fb/close.vals')

    # split (fixed seed so the partition is reproducible)
    X = np.concatenate((goog[:, None], fb[:, None]), axis = 1)
    X_train, X_test, _, _ = train_test_split(X, X, shuffle=True, random_state=142857)

    # norm
    [X_train, X_test] = rank_normalization([X_train, X_test])

    # 2d, noise: extra uniform rows amounting to 1% of the training set
    train_data = X_train[:, [0, 1]]
    train_data = add_train_random_noise(train_data, int(train_data.shape[0]*0.01))
    test_data = X_test[:, [0, 1]]

    self.train_y = train_data[:, 1].reshape(-1, 1)
    self.train_x = train_data[:, 0].reshape(-1, 1)
    self.validation_y = test_data[:, 1].reshape(-1, 1)
    self.validation_x = test_data[:, 0].reshape(-1, 1)

# boston

In [14]:
# Load the Boston pair and stack the (n, 1) column vectors into (2, n) arrays;
# run_experiment receives the transposed (n, 2) view.
data_loader = Boston()
train_D = np.stack((data_loader.train_x[:, 0], data_loader.train_y[:, 0]))
test_D = np.stack((data_loader.validation_x[:, 0], data_loader.validation_y[:, 0]))

# Only the IGC model is enabled; every other model family is switched off.
(all_scores,
 data_models_v,
 data_models_y,
 models_joint,
 models_margins) = run_experiment(
    train_D.T,
    test_D.T,
    evaluate_likelihood=True,
    GaussCop=False,
    VineCop=False,
    GMMNCop=False,
    GMMNFull=False,
    GAN=False,
    IGC=True,
    options_nn={"n_neurons": 100, "n_layers": 2, "n_samples_train": 200},
    options_nn_training={"epochs": 10000, "batch_size": 100},
    bw_kde=0.15,
)
all_scores

Training models ...


100%|██████████| 10000/10000 [06:27<00:00, 25.80it/s]


Done.

Sampling data for evalutation...
Done.

Computing evaluation metrics...
LogLikelihood data space...
All done.



Unnamed: 0,NLL_dataspace
igc,0.065757


In [15]:
# Negate the per-sample KDE scores of the test set, then average them.
yhat = np.negative(eval_prob(data_models_y, test_D.T, bw=0.15))
nll = yhat.mean()
nll

0.06575687567575011

In [16]:
# Bootstrap the mean of the per-sample NLL values (scipy defaults for the
# resampling settings) and summarise it as mean +/- deviation.
res = bootstrap((yhat,), np.mean)
rmean = res.bootstrap_distribution.mean()
low = res.confidence_interval.low
high = res.confidence_interval.high
# The interval need not be symmetric around the bootstrap mean; report the larger
# one-sided distance so the +/- notation covers the whole interval.
dev1 = high - rmean
dev2 = rmean - low
dev = max(dev1, dev2)
# "\\pm" keeps the printed LaTeX identical while avoiding the invalid "\p" escape.
print(f'NLL: {nll:.4f} RMean: {rmean:.4f} IC: ({low:.4f},{high:.4f}) Notation: {rmean:.4f}$\\pm${dev:.4f}')

NLL: 0.0658 RMean: 0.0660 IC: (0.0137,0.1414) Notation: 0.0660$\pm$0.0754


# intc-msft

In [17]:
# Load the INTC-MSFT pair and stack the (n, 1) column vectors into (2, n) arrays;
# run_experiment receives the transposed (n, 2) view.
data_loader = INTC_MSFT()
train_D = np.stack((data_loader.train_x[:, 0], data_loader.train_y[:, 0]))
test_D = np.stack((data_loader.validation_x[:, 0], data_loader.validation_y[:, 0]))

# Only the IGC model is enabled; every other model family is switched off.
(all_scores,
 data_models_v,
 data_models_y,
 models_joint,
 models_margins) = run_experiment(
    train_D.T,
    test_D.T,
    evaluate_likelihood=True,
    GaussCop=False,
    VineCop=False,
    GMMNCop=False,
    GMMNFull=False,
    GAN=False,
    IGC=True,
    options_nn={"n_neurons": 100, "n_layers": 2, "n_samples_train": 200},
    options_nn_training={"epochs": 10000, "batch_size": 100},
    bw_kde=0.15,
)
all_scores

Training models ...


100%|██████████| 10000/10000 [18:44<00:00,  8.89it/s]


Done.

Sampling data for evalutation...
Done.

Computing evaluation metrics...
LogLikelihood data space...
All done.



Unnamed: 0,NLL_dataspace
igc,0.139643


In [18]:
# Negate the per-sample KDE scores of the test set, then average them.
yhat = np.negative(eval_prob(data_models_y, test_D.T, bw=0.15))
nll = yhat.mean()
nll

0.13964332222854506

In [19]:
# Bootstrap the mean of the per-sample NLL values (scipy defaults for the
# resampling settings) and summarise it as mean +/- deviation.
res = bootstrap((yhat,), np.mean)
rmean = res.bootstrap_distribution.mean()
low = res.confidence_interval.low
high = res.confidence_interval.high
# The interval need not be symmetric around the bootstrap mean; report the larger
# one-sided distance so the +/- notation covers the whole interval.
dev1 = high - rmean
dev2 = rmean - low
dev = max(dev1, dev2)
# "\\pm" keeps the printed LaTeX identical while avoiding the invalid "\p" escape.
print(f'NLL: {nll:.4f} RMean: {rmean:.4f} IC: ({low:.4f},{high:.4f}) Notation: {rmean:.4f}$\\pm${dev:.4f}')

NLL: 0.1396 RMean: 0.1397 IC: (0.1024,0.1830) Notation: 0.1397$\pm$0.0433


# goog-fb

In [20]:
# Load the GOOG-FB pair and stack the (n, 1) column vectors into (2, n) arrays;
# run_experiment receives the transposed (n, 2) view.
data_loader = GOOG_FB()
train_D = np.stack((data_loader.train_x[:, 0], data_loader.train_y[:, 0]))
test_D = np.stack((data_loader.validation_x[:, 0], data_loader.validation_y[:, 0]))

# Only the IGC model is enabled; every other model family is switched off.
(all_scores,
 data_models_v,
 data_models_y,
 models_joint,
 models_margins) = run_experiment(
    train_D.T,
    test_D.T,
    evaluate_likelihood=True,
    GaussCop=False,
    VineCop=False,
    GMMNCop=False,
    GMMNFull=False,
    GAN=False,
    IGC=True,
    options_nn={"n_neurons": 100, "n_layers": 2, "n_samples_train": 200},
    options_nn_training={"epochs": 10000, "batch_size": 100},
    bw_kde=0.15,
)
all_scores

Training models ...


100%|██████████| 10000/10000 [18:29<00:00,  9.01it/s]


Done.

Sampling data for evalutation...
Done.

Computing evaluation metrics...
LogLikelihood data space...
All done.



Unnamed: 0,NLL_dataspace
igc,-0.302936


In [21]:
# Negate the per-sample KDE scores of the test set, then average them.
yhat = np.negative(eval_prob(data_models_y, test_D.T, bw=0.15))
nll = yhat.mean()
nll

-0.30293628700489994

In [22]:
# Bootstrap the mean of the per-sample NLL values (scipy defaults for the
# resampling settings) and summarise it as mean +/- deviation.
res = bootstrap((yhat,), np.mean)
rmean = res.bootstrap_distribution.mean()
low = res.confidence_interval.low
high = res.confidence_interval.high
# The interval need not be symmetric around the bootstrap mean; report the larger
# one-sided distance so the +/- notation covers the whole interval.
dev1 = high - rmean
dev2 = rmean - low
dev = max(dev1, dev2)
# "\\pm" keeps the printed LaTeX identical while avoiding the invalid "\p" escape.
print(f'NLL: {nll:.4f} RMean: {rmean:.4f} IC: ({low:.4f},{high:.4f}) Notation: {rmean:.4f}$\\pm${dev:.4f}')

NLL: -0.3029 RMean: -0.3028 IC: (-0.3247,-0.2812) Notation: -0.3028$\pm$0.0219
