# setup
Code and raw data files come from the gen-AC repository: https://github.com/yutingng/gen-AC.git

In [1]:
# Fetch the gen-AC repository, which provides the `main` and `train` modules imported below.
!git clone https://github.com/yutingng/gen-AC

Cloning into 'gen-AC'...
remote: Enumerating objects: 466, done.[K
remote: Counting objects: 100% (466/466), done.[K
remote: Compressing objects: 100% (339/339), done.[K
remote: Total 466 (delta 159), reused 421 (delta 123), pack-reused 0[K
Receiving objects: 100% (466/466), 10.28 MiB | 17.20 MiB/s, done.
Resolving deltas: 100% (159/159), done.


In [2]:
# Work from inside the clone so that `main` and `train` are importable.
%cd gen-AC

/content/gen-AC


In [3]:
import os
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt

from main import MixExpPhiStochastic, sampleStochastic, Copula
from train import load_data, load_data2

In [4]:
# Make float64 the default floating-point dtype for newly created tensors.
# `torch.set_default_tensor_type` is deprecated; `set_default_dtype` is the
# supported way to request double precision.
torch.set_default_dtype(torch.float64)

In [5]:
from scipy.stats import bootstrap

# data
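The stock-price series are read from the cloned gen-AC repository (https://github.com/yutingng/gen-AC.git); the Boston housing data is downloaded from lib.stat.cmu.edu.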

In [6]:
import pandas as pd
import scipy
import copy

from sklearn.model_selection import train_test_split

In [7]:
def add_train_random_noise(data, num_adds):
  """Append `num_adds` rows of uniform [0, 1) noise below `data`.

  The noise rows have as many columns as `data` (read from `data.shape[1]`)
  and are drawn from NumPy's global random state. `data` itself is not
  modified; a new, taller array is returned.
  """
  n_cols = data.shape[1]
  noise_rows = np.random.rand(num_adds, n_cols)
  stacked = np.concatenate([data, noise_rows], axis=0)
  return stacked

In [8]:
def rank_normalization(X):
  """Rank-normalise every column of every 2-D array in `X` into (0, 1).

  Each column is replaced by its ordinal ranks (ties broken by order of
  appearance) multiplied by 1 / (n_rows + 1), so the values are
  1/(n+1), 2/(n+1), ..., n/(n+1).

  Parameters
  ----------
  X : iterable of array-like, each of shape (n_rows, n_cols)
      Arrays are normalised independently of one another.

  Returns
  -------
  list of float ndarrays, one per input array, in the same order. The
  inputs are never modified.
  """
  normalized = []
  for z in X:
      # Copy as float: writing fractional ranks into an integer array would
      # silently truncate every value to 0.
      z = np.array(z, dtype=float)
      gap = 1. / (z.shape[0] + 1)
      for i in range(z.shape[1]):
          z[:, i] = scipy.stats.rankdata(z[:, i], method='ordinal') * gap
      normalized.append(z)
  return normalized

In [9]:
# NOTE(review): the working directory is already the first clone (see `%cd gen-AC`
# above), so this creates a nested `gen-AC/gen-AC` checkout. The dataset classes
# below open files under the relative path 'gen-AC/data/raw/...', which resolves
# inside this nested clone.
!git clone https://github.com/yutingng/gen-AC.git

Cloning into 'gen-AC'...
remote: Enumerating objects: 466, done.[K
remote: Counting objects: 100% (466/466), done.[K
remote: Compressing objects: 100% (339/339), done.[K
remote: Total 466 (delta 159), reused 421 (delta 123), pack-reused 0[K
Receiving objects: 100% (466/466), 10.28 MiB | 18.12 MiB/s, done.
Resolving deltas: 100% (159/159), done.


In [10]:
class Boston():
  """Boston housing data reduced to a rank-normalised 2-D train/validation split.

  Constructing the object downloads the raw table, splits it, rank-normalises
  each split independently, appends 1% uniform noise rows to the training
  split and keeps two columns (indices 0 and 13 of features-plus-target).

  Attributes (all column vectors of shape (n, 1)):
    train_x, train_y            -- training pair (column 0 flipped, column 13)
    validation_x, validation_y  -- validation pair, same columns
  """
  def __init__(self):
    # read
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    raw_df = pd.read_csv(data_url, sep = r"\s+", skiprows = 22, header = None)
    # Each record spans two physical rows: the even rows plus the first two
    # values of the odd rows are the features, the third odd-row value is y.
    X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    y = raw_df.values[1::2, 2]

    # split (fixed seed so the partition is reproducible)
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, random_state = 142857)
    # Append the target as the last column so it is rank-normalised too.
    X_train = np.concatenate((X_train, y_train[:, None]), axis = 1)
    X_test  = np.concatenate((X_test, y_test[:, None]), axis = 1)

    # norm (train and test are ranked separately)
    [X_train, X_test] = rank_normalization([X_train, X_test])

    # noise: add uniform rows amounting to 1% of the training set
    X_train = add_train_random_noise(X_train, int(X_train.shape[0]*0.01))

    # 2d -- index 13 is the appended target, assuming X has 13 feature
    # columns (TODO confirm against the raw table layout).
    train_data = X_train[:, [0, 13]]
    test_data = X_test[:, [0, 13]]

    # flip the first variable (u -> 1 - u); fancy indexing above already
    # produced copies, so X_train / X_test are untouched.
    # NOTE(review): presumably done to reverse the direction of dependence -- confirm.
    train_data[:, 0] = 1 - train_data[:, 0]
    test_data[:, 0] = 1 - test_data[:, 0]

    self.train_y = train_data[:, 1].reshape(-1, 1)
    self.train_x = train_data[:, 0].reshape(-1, 1)
    self.validation_y = test_data[:, 1].reshape(-1, 1)
    self.validation_x = test_data[:, 0].reshape(-1, 1)

In [11]:
class INTC_MSFT():
  """Intel / Microsoft series as a rank-normalised 2-D train/validation split.

  Constructing the object reads the two series from disk, splits the paired
  observations, rank-normalises each split independently and appends 1%
  uniform noise rows to the training split.

  Attributes (all column vectors of shape (n, 1)):
    train_x, train_y            -- Intel and Microsoft columns, training split
    validation_x, validation_y  -- the same columns, validation split
  """

  @staticmethod
  def _read_series(path):
    """Parse one float per line from `path`, closing the file afterwards."""
    with open(path, 'r') as series_file:
      return np.array([float(line) for line in series_file])

  def __init__(self):
    # read (files are closed as soon as they are parsed; the GE series that
    # used to be loaded here was never used and is no longer read)
    intel = self._read_series('gen-AC/data/raw/INTC_MSFT_GE/INTEL.data')
    ms = self._read_series('gen-AC/data/raw/INTC_MSFT_GE/MS.data')

    # split (fixed seed so the partition is reproducible)
    X = np.concatenate((intel[:, None], ms[:, None]), axis = 1)
    X_train, X_test, _, _ = train_test_split(X, X, shuffle = True, random_state = 142857)

    # norm (train and test are ranked separately)
    [X_train, X_test] = rank_normalization([X_train, X_test])

    # 2d, noise: uniform rows amounting to 1% of the training set
    train_data = X_train[:, [0, 1]]
    train_data = add_train_random_noise(train_data, int(train_data.shape[0]*0.01))
    test_data = X_test[:, [0, 1]]

    self.train_y = train_data[:, 1].reshape(-1, 1)
    self.train_x = train_data[:, 0].reshape(-1, 1)
    self.validation_y = test_data[:, 1].reshape(-1, 1)
    self.validation_x = test_data[:, 0].reshape(-1, 1)

In [12]:
class GOOG_FB():
  """Google / Facebook closing values as a rank-normalised 2-D split.

  Constructing the object reads the two `close.vals` files, splits the
  paired observations, rank-normalises each split independently and appends
  1% uniform noise rows to the training split.

  Attributes (all column vectors of shape (n, 1)):
    train_x, train_y            -- Google and Facebook columns, training split
    validation_x, validation_y  -- the same columns, validation split
  """

  @staticmethod
  def _read_series(path):
    """Parse one float per line from `path`, closing the file afterwards."""
    with open(path, 'r') as series_file:
      return np.array([float(line) for line in series_file])

  def __init__(self):
    # read (files are closed as soon as they are parsed)
    goog = self._read_series('gen-AC/data/raw/FB_GOOG/goog/close.vals')
    fb = self._read_series('gen-AC/data/raw/FB_GOOG/fb/close.vals')

    # split (fixed seed so the partition is reproducible)
    X = np.concatenate((goog[:, None], fb[:, None]), axis = 1)
    X_train, X_test, _, _ = train_test_split(X, X, shuffle=True, random_state=142857)

    # norm (train and test are ranked separately)
    [X_train, X_test] = rank_normalization([X_train, X_test])

    # 2d, noise: uniform rows amounting to 1% of the training set
    train_data = X_train[:, [0, 1]]
    train_data = add_train_random_noise(train_data, int(train_data.shape[0]*0.01))
    test_data = X_test[:, [0, 1]]

    self.train_y = train_data[:, 1].reshape(-1, 1)
    self.train_x = train_data[:, 0].reshape(-1, 1)
    self.validation_y = test_data[:, 1].reshape(-1, 1)
    self.validation_x = test_data[:, 0].reshape(-1, 1)

# param

In [13]:
# Optimiser settings shared by every experiment in this notebook.
optim_args = dict(
    # NOTE(review): the original remark here read "it is 1e-3 since torch.sum
    # was used instead of torch.mean for loglikelihood". Presumably 1e-3 is the
    # rate for a mean-reduced loss and 1e-5 compensates for the summed loss
    # used in `expt` -- confirm.
    lr=1e-5,
    momentum=0.9,
)

# Training schedule.
num_epochs = 10000
batch_size = 200
chkpt_freq = 500

# Run label; note this is the string 'None', not the None object.
identifier = 'None'

In [14]:
def expt(train_data, test_data,
         net,
         optimizer,
         optim_args,
         identifier,
         num_epochs=1000,
         batch_size=100,
         chkpt_freq=50,
         ):
    """Train `net` by minimising the summed negative log of its 'pdf2' output.

    Every epoch runs one pass over `train_data` in shuffled mini-batches and
    then evaluates the whole of `test_data`, printing the mean per-sample
    training and validation losses.

    Parameters
    ----------
    train_data, test_data : indexable datasets accepted by `DataLoader`
        (2-D tensors in this notebook, one sample per row).
    net : callable as `net(batch, mode='pdf2')`, exposing `net.phi.resample_M(n)`,
        `net.phi.M` and `net.zero_grad()`. Its output is treated as strictly
        positive per-sample values (their log is taken).
    optimizer : torch optimiser already bound to `net`'s parameters.
    optim_args, identifier, chkpt_freq : accepted for interface compatibility;
        not used by this function.
    num_epochs : number of passes over the training data (must be >= 1).
    batch_size : training mini-batch size.

    Returns
    -------
    The `net(..., mode='pdf2')` outputs on the validation data from the last
    epoch (still attached to the autograd graph). Rows follow the shuffled
    loader order, not the order of `test_data`.

    Raises
    ------
    ValueError
        If `num_epochs` < 1 or the validation loader yields no batches.

    Notes
    -----
    The per-epoch diagnostic sampling whose result was never used has been
    removed; training and validation now use separate variable names so the
    reported and returned values can only come from the validation pass.
    """
    if num_epochs < 1:
        raise ValueError('num_epochs must be at least 1')

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=1000000, shuffle=True)

    train_loss_per_epoch = []
    val_p = None

    for epoch in range(num_epochs):
        loss_per_minibatch = []
        for data in train_loader:
            optimizer.zero_grad()

            # Redraw the latent M before every step.
            net.phi.resample_M(100)

            d = data.detach().clone()
            p = net(d, mode='pdf2')

            # Penalise the mean of M for drifting away from 1.
            scaleloss = torch.square(torch.mean(net.phi.M)-1)
            # Summed (not averaged) negative log-likelihood drives the update.
            logloss = -torch.sum(torch.log(p))
            reg_loss = logloss+scaleloss
            reg_loss.backward()

            # Log the per-sample value so it is comparable across batch sizes.
            loss_per_minibatch.append((logloss/p.numel()).detach().numpy())
            optimizer.step()

        train_loss_per_epoch.append(np.mean(loss_per_minibatch))

        # Validation pass; gradients are cleared but autograd is left enabled
        # because `net` may rely on it internally (TODO confirm).
        val_batches = []
        for data in test_loader:
            net.zero_grad()
            net.phi.resample_M(1000)
            d = data.detach().clone()
            val_batches.append(net(d, mode='pdf2'))

        if not val_batches:
            raise ValueError('test_data produced no validation batches')
        # Keep every validation batch, not just the last one.
        val_p = val_batches[0] if len(val_batches) == 1 else torch.cat(val_batches)
        val_loss = -torch.mean(torch.log(val_p))

        print('Epoch %s: Train %s, Val %s' %
          (epoch, train_loss_per_epoch[-1], val_loss.item()))
    return val_p

# boston

In [15]:
# Fresh phi / Copula pair for this dataset, optimised with momentum SGD.
phi = MixExpPhiStochastic()
net = Copula(phi)
optimizer = optim.SGD(
    net.parameters(),
    lr=optim_args['lr'],
    momentum=optim_args['momentum'],
)

In [16]:
# Build the dataset and stack its two (n, 1) columns into (2, n) arrays:
# row 0 is x, row 1 is y.
data_loader = Boston()
train_D = np.stack([data_loader.train_x[:, 0], data_loader.train_y[:, 0]])
test_D = np.stack([data_loader.validation_x[:, 0], data_loader.validation_y[:, 0]])

In [17]:
# Convert to tensors with gradient tracking, then transpose (2, n) -> (n, 2)
# so that each row is one sample.
train_data = torch.tensor(train_D).requires_grad_(True).transpose(0, 1)
test_data = torch.tensor(test_D).requires_grad_(True).transpose(0, 1)

In [18]:
# Train on the Boston pair; `yhat` receives whatever `expt` returns for the
# validation data.
yhat = expt(
    train_data, test_data, net, optimizer, optim_args, identifier,
    num_epochs=num_epochs,
    batch_size=batch_size,
    chkpt_freq=chkpt_freq,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 5000: Train -0.25090145846708967, Val -0.28549457183520555
Epoch 5001: Train -0.26837266172981833, Val -0.2855501917852139
Epoch 5002: Train -0.2676511240971667, Val -0.2836593769066868
Epoch 5003: Train -0.2476343624651166, Val -0.2838649179652852
Epoch 5004: Train -0.2621573450341501, Val -0.2857474432825462
Epoch 5005: Train -0.2463408655386692, Val -0.28782887005815216
Epoch 5006: Train -0.26428187034917416, Val -0.2907016252501981
Epoch 5007: Train -0.25914812167248935, Val -0.28326478574392777
Epoch 5008: Train -0.24425022480938793, Val -0.2854976476589975
Epoch 5009: Train -0.2663496598099066, Val -0.28687252789379103
Epoch 5010: Train -0.25847571684983217, Val -0.28388610671047354
Epoch 5011: Train -0.2668005485120485, Val -0.2857709116392192
Epoch 5012: Train -0.2628615411206889, Val -0.2880979708855997
Epoch 5013: Train -0.25954300400812513, Val -0.28480846128877324
Epoch 5014: Train -0.26087883143209206, 

In [19]:
# Turn the returned validation values into per-sample negative logs.
# Guarded so that re-running this cell is a no-op once `yhat` has already been
# converted to a NumPy array (the unguarded version raised AttributeError).
if torch.is_tensor(yhat):
    yhat = -np.log(yhat.detach().numpy())

In [20]:
# Mean of the per-sample negative logs on the validation split.
nll = yhat.mean()
nll

-0.28892763168125907

In [21]:
# Bootstrap the mean of `yhat` to attach an uncertainty estimate to `nll`.
res = bootstrap((yhat,), np.mean)
rmean = res.bootstrap_distribution.mean()
low = res.confidence_interval.low
high = res.confidence_interval.high
# Report the larger one-sided deviation so that rmean +/- dev covers the
# whole confidence interval.
dev1 = high - rmean
dev2 = rmean - low
dev = max(dev1, dev2)
# "\\pm" prints the same text as before without relying on the invalid
# escape sequence "\p".
print(f'NLL: {nll:.4f} RMean: {rmean:.4f} IC: ({low:.4f},{high:.4f}) Notation: {rmean:.4f}$\\pm${dev:.4f}')

NLL: -0.2889 RMean: -0.2889 IC: (-0.3963,-0.1802) Notation: -0.2889$\pm$0.1086


# intc-msft

In [22]:
# Fresh phi / Copula pair for this dataset, optimised with momentum SGD.
phi = MixExpPhiStochastic()
net = Copula(phi)
optimizer = optim.SGD(
    net.parameters(),
    lr=optim_args['lr'],
    momentum=optim_args['momentum'],
)

In [23]:
# Build the dataset and stack its two (n, 1) columns into (2, n) arrays:
# row 0 is x, row 1 is y.
data_loader = INTC_MSFT()
train_D = np.stack([data_loader.train_x[:, 0], data_loader.train_y[:, 0]])
test_D = np.stack([data_loader.validation_x[:, 0], data_loader.validation_y[:, 0]])

In [24]:
# Convert to tensors with gradient tracking, then transpose (2, n) -> (n, 2)
# so that each row is one sample.
train_data = torch.tensor(train_D).requires_grad_(True).transpose(0, 1)
test_data = torch.tensor(test_D).requires_grad_(True).transpose(0, 1)

In [25]:
# Train on the Intel/Microsoft pair; `yhat` receives whatever `expt` returns
# for the validation data.
yhat = expt(
    train_data, test_data, net, optimizer, optim_args, identifier,
    num_epochs=num_epochs,
    batch_size=batch_size,
    chkpt_freq=chkpt_freq,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 5000: Train -0.19616744232900304, Val -0.162821181131533
Epoch 5001: Train -0.19739350851135312, Val -0.16339823848468954
Epoch 5002: Train -0.2047163125979206, Val -0.1600855378162465
Epoch 5003: Train -0.20260827619612373, Val -0.15785971080454975
Epoch 5004: Train -0.1940781026195894, Val -0.16249578643706625
Epoch 5005: Train -0.20989358654057272, Val -0.16423223266189765
Epoch 5006: Train -0.20197039086923357, Val -0.16393937288127883
Epoch 5007: Train -0.2015614195072463, Val -0.15977036545133913
Epoch 5008: Train -0.20501969704839956, Val -0.1635874968014292
Epoch 5009: Train -0.2072199033826982, Val -0.16436935876435227
Epoch 5010: Train -0.20217644700475268, Val -0.16159570913820326
Epoch 5011: Train -0.2015053860219735, Val -0.1582722596410761
Epoch 5012: Train -0.19641172729358072, Val -0.16186652434940346
Epoch 5013: Train -0.1970051068898978, Val -0.16319116867532624
Epoch 5014: Train -0.203097608510978

In [26]:
# Turn the returned validation values into per-sample negative logs.
# Guarded so that re-running this cell is a no-op once `yhat` has already been
# converted to a NumPy array (the unguarded version raised AttributeError).
if torch.is_tensor(yhat):
    yhat = -np.log(yhat.detach().numpy())

In [27]:
# Mean of the per-sample negative logs on the validation split.
nll = yhat.mean()
nll

-0.17141144973203076

In [28]:
# Bootstrap the mean of `yhat` to attach an uncertainty estimate to `nll`.
res = bootstrap((yhat,), np.mean)
rmean = res.bootstrap_distribution.mean()
low = res.confidence_interval.low
high = res.confidence_interval.high
# Report the larger one-sided deviation so that rmean +/- dev covers the
# whole confidence interval.
dev1 = high - rmean
dev2 = rmean - low
dev = max(dev1, dev2)
# "\\pm" prints the same text as before without relying on the invalid
# escape sequence "\p".
print(f'NLL: {nll:.4f} RMean: {rmean:.4f} IC: ({low:.4f},{high:.4f}) Notation: {rmean:.4f}$\\pm${dev:.4f}')

NLL: -0.1714 RMean: -0.1718 IC: (-0.2396,-0.0983) Notation: -0.1718$\pm$0.0735


# goog-fb

In [29]:
# Fresh phi / Copula pair for this dataset, optimised with momentum SGD.
phi = MixExpPhiStochastic()
net = Copula(phi)
optimizer = optim.SGD(
    net.parameters(),
    lr=optim_args['lr'],
    momentum=optim_args['momentum'],
)

In [30]:
# Build the dataset and stack its two (n, 1) columns into (2, n) arrays:
# row 0 is x, row 1 is y.
data_loader = GOOG_FB()
train_D = np.stack([data_loader.train_x[:, 0], data_loader.train_y[:, 0]])
test_D = np.stack([data_loader.validation_x[:, 0], data_loader.validation_y[:, 0]])

In [31]:
# Convert to tensors with gradient tracking, then transpose (2, n) -> (n, 2)
# so that each row is one sample.
train_data = torch.tensor(train_D).requires_grad_(True).transpose(0, 1)
test_data = torch.tensor(test_D).requires_grad_(True).transpose(0, 1)

In [32]:
# Train on the Google/Facebook pair; `yhat` receives whatever `expt` returns
# for the validation data.
yhat = expt(
    train_data, test_data, net, optimizer, optim_args, identifier,
    num_epochs=num_epochs,
    batch_size=batch_size,
    chkpt_freq=chkpt_freq,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 5000: Train -0.7947801457189906, Val -0.7886121212757362
Epoch 5001: Train -0.8096516521916411, Val -0.7688831768156748
Epoch 5002: Train -0.8037560506245688, Val -0.7919630559961686
Epoch 5003: Train -0.7924390347809017, Val -0.8155047824929762
Epoch 5004: Train -0.8051676256488796, Val -0.7988798067122259
Epoch 5005: Train -0.8315049336049911, Val -0.8004106809675752
Epoch 5006: Train -0.8066957100312109, Val -0.7823320551149967
Epoch 5007: Train -0.8508246821889003, Val -0.7533440133866433
Epoch 5008: Train -0.793283534951069, Val -0.766502834385848
Epoch 5009: Train -0.8748532263324241, Val -0.8080894616362824
Epoch 5010: Train -0.8509631376701975, Val -0.7702162055396217
Epoch 5011: Train -0.8016958102055606, Val -0.7738969933188448
Epoch 5012: Train -0.8183445965525211, Val -0.7857023583827484
Epoch 5013: Train -0.870522968772352, Val -0.770904213298749
Epoch 5014: Train -0.8082559853875851, Val -0.79026686273

In [33]:
# Turn the returned validation values into per-sample negative logs.
# Guarded so that re-running this cell is a no-op once `yhat` has already been
# converted to a NumPy array (the unguarded version raised AttributeError).
if torch.is_tensor(yhat):
    yhat = -np.log(yhat.detach().numpy())

In [34]:
# Mean of the per-sample negative logs on the validation split.
nll = yhat.mean()
nll

-0.7488042571832529

In [35]:
# Bootstrap the mean of `yhat` to attach an uncertainty estimate to `nll`.
res = bootstrap((yhat,), np.mean)
rmean = res.bootstrap_distribution.mean()
low = res.confidence_interval.low
high = res.confidence_interval.high
# Report the larger one-sided deviation so that rmean +/- dev covers the
# whole confidence interval.
dev1 = high - rmean
dev2 = rmean - low
dev = max(dev1, dev2)
# "\\pm" prints the same text as before without relying on the invalid
# escape sequence "\p".
print(f'NLL: {nll:.4f} RMean: {rmean:.4f} IC: ({low:.4f},{high:.4f}) Notation: {rmean:.4f}$\\pm${dev:.4f}')

NLL: -0.7488 RMean: -0.7483 IC: (-0.8411,-0.6481) Notation: -0.7483$\pm$0.1002
