# 0.

In [3]:
from scipy.linalg import cholesky, solve_triangular, LinAlgError
from scipy.linalg import lapack
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pods


# Load the `olympic_100m_men` dataset through pods and split it into the
# input array X and the target array Y.
data = pods.datasets.olympic_100m_men()
X = data["X"]
Y = data["Y"]
# Prediction grid: 500 evenly spaced values spanning the first input column,
# padded by 30 units on each side, stored as a single-column 2-D array.
X_pred = np.linspace(
    X[:, 0].min() - 30,
    X[:, 0].max() + 30,
    num=500,
)[:, np.newaxis]


class RBF:
    """Squared-exponential (RBF) covariance function.

    ``K`` evaluates ``variance * exp(-0.5 * (r / lengthscale) ** 2)`` where
    ``r`` is the Euclidean distance between rows of the two inputs.
    """

    def __init__(self, variance=1., lengthscale=0.1):
        self.lengthscale = lengthscale
        self.variance = variance

    def K(self, X, X2=None):
        """Return the covariance matrix between the rows of ``X`` and ``X2``.

        When ``X2`` is ``None`` the covariance of ``X`` with itself is returned.
        """
        scaled = self._euc_dist(X, X2) / self.lengthscale
        return self.variance * np.exp(-0.5 * scaled**2)

    def _euc_dist(self, X, X2):
        """Pairwise Euclidean distances between rows of ``X`` and ``X2``.

        ``X2=None`` means "distance of ``X`` to itself"; in that case the
        diagonal is forced to exactly zero.
        """
        same_inputs = X2 is None
        other = X if same_inputs else X2

        sq_left = np.sum(np.square(X), 1)
        sq_right = sq_left if same_inputs else np.sum(np.square(other), 1)

        # ||a - b||^2 = -2 a.b + ||a||^2 + ||b||^2
        sq_dist = -2. * np.dot(X, other.T) + (sq_left[:, None] + sq_right[None, :])
        # Round-off can push tiny squared distances below zero; clamp before sqrt.
        sq_dist = np.clip(sq_dist, 0, np.inf)
        if same_inputs:
            np.fill_diagonal(sq_dist, 0.)
        return np.sqrt(sq_dist)


def generate_non_pd_mat():
    """Return a random symmetric 20x20 matrix that is not positive definite.

    A random Gram matrix is built, its smallest eigenvalue is replaced by a
    small negative value, and the matrix is reassembled from the modified
    eigendecomposition. The negative eigenvalue is ``10 ** 3.5`` times a base
    jitter of ``1e-6 * mean(eigenvalues)``, so a jittered Cholesky needs
    several tenfold jitter increases before it succeeds.

    Returns:
        np.ndarray: real symmetric array of shape (20, 20) with exactly one
        negative eigenvalue.
    """
    # Gram matrix of a random 20 x 100 matrix: symmetric positive definite
    # (almost surely).
    A = np.random.randn(20, 100)
    A = A.dot(A.T)
    # A is symmetric, so use eigh: it guarantees real eigenvalues and
    # orthonormal eigenvectors, which the reconstruction below relies on.
    vals, vectors = np.linalg.eigh(A)
    # Zero the smallest eigenvalue first so it does not contribute to the
    # mean used for the base jitter.
    vals[vals.argmin()] = 0
    default_jitter = 1e-6 * np.mean(vals)
    # The zeroed entry is now the minimum; make it negative.
    vals[vals.argmin()] = -default_jitter * (10 ** 3.5)
    # V diag(vals) V^T
    A_corrupt = (vectors * vals).dot(vectors.T)
    return A_corrupt


def custom_cholesky(A, max_tries=5):
    """Lower Cholesky factor of ``A``, adding diagonal jitter if necessary.

    The plain factorisation is tried first. If it fails, ``jitter * I`` is
    added to ``A`` and the factorisation retried, starting from
    ``jitter = 1e-6 * mean(diag(A))`` and multiplying the jitter by 10 after
    every failure.

    Args:
        A: square matrix to factorise.
        max_tries: maximum number of jittered attempts after the plain one.

    Returns:
        Lower-triangular ``L`` with ``L @ L.T`` equal to ``A`` (plus the
        jitter that was needed, if any).

    Raises:
        LinAlgError: if no attempt succeeds, or if the initial jitter is not
            positive (adding it could never make the matrix positive definite).
    """
    # Contiguous layout improves performance; per the original author's note
    # it also affects the computed result.
    A = np.ascontiguousarray(A)

    try:
        return cholesky(A, lower=True)
    except LinAlgError:
        # Expected for (numerically) non-PD input: fall through to jitter.
        pass

    jitter = np.diag(A).mean() * 1e-6
    if not jitter > 0:
        # Zero, negative or NaN jitter cannot repair the matrix; fail fast
        # instead of running retries that are guaranteed to fail.
        raise LinAlgError(
            "Matrix is not positive definite and its mean diagonal is not "
            "positive, so jitter cannot help."
        )

    identity = np.eye(A.shape[0])
    for _ in range(max_tries):
        if not np.isfinite(jitter):
            break
        try:
            return cholesky(A + identity * jitter, lower=True)
        except LinAlgError:
            jitter *= 10

    raise LinAlgError("Matrix is not positive definite, even with jitter.")
    

def symmetrify_matrix(A, upper=False):
    """Make ``A`` symmetric in place by mirroring one triangle onto the other.

    With ``upper=True`` the strict upper triangle is copied onto the lower
    one; otherwise the strict lower triangle is copied onto the upper one.
    The diagonal is left untouched.

    Returns:
        The same array object ``A`` that was passed in.
    """
    rows, cols = np.triu_indices_from(A, k=1)
    if upper:
        # lower <- upper
        A[cols, rows] = A[rows, cols]
        return A
    # upper <- lower
    A[rows, cols] = A[cols, rows]
    return A
    

class GPR:
    """Gaussian-process regression container that can plot draws from its prior.

    Args:
        X: training inputs; ``X.shape[0]`` is used as the number of points.
        y: training targets (stored, not otherwise used by this class).
        kernel: object exposing ``K(X)``; defaults to ``RBF()``.
        mean_function: prior mean values; defaults to a zero column vector
            with one row per training point.
    """

    def __init__(self, X, y, kernel=None, mean_function=None):
        self.X = X
        self.y = y

        # Store the kernel in every case; previously a caller-supplied kernel
        # was silently dropped and `self.kernel` never existed.
        self.kernel = RBF() if kernel is None else kernel

        # Same for the mean function: keep what the caller passed.
        if mean_function is None:
            mean_function = np.zeros(self.X.shape[0]).reshape(-1, 1)
        self.mean_function = mean_function

        # self.K = self.kernel.K(self.X)
        self.mean = np.zeros(self.X.shape[0])

    def plot_sampled_prior(self, size=None):
        """Plot functions drawn from the zero-mean GP prior.

        The functions are evaluated on a 200-point grid covering the range of
        ``self.X`` extended by 10% of that range on both sides.

        Args:
            size: number of sample paths to draw; ``None`` draws a single one.
        """
        plt.figure()

        extension = np.abs(self.X.min() - self.X.max()) * 0.1
        X = np.linspace(self.X.min() - extension,
                        self.X.max() + extension,
                        200).reshape(-1, 1)
        K = self.kernel.K(X)
        # A *lower* factor is required: with K = L L^T, y = L x has covariance
        # K when x ~ N(0, I). scipy's `cholesky` returns the upper factor by
        # default, which would give the wrong covariance. `custom_cholesky`
        # returns the lower factor and adds jitter if K is numerically singular.
        L = custom_cholesky(K)
        n = X.shape[0]
        samples = np.random.multivariate_normal(np.zeros(n), np.eye(n), size=size)  # x ~ N(0, I)

        # `size=None` yields a single 1-D draw; treat it as a batch of one.
        for sample in np.atleast_2d(samples):
            plt.plot(X.ravel(), L @ sample, lw=1, ls='--')  # y = L @ x

        plt.show()

Acquiring resource: rogers_girolami_data

Details of data: 
Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.

Please cite:
A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146

After downloading the data will take up 21949154 bytes of space.

Data will be stored in /home/onoue/ods_data_cache/rogers_girolami_data.

Do you wish to proceed with the download? [yes/no]


Downloading  http://www.dcs.gla.ac.uk/~srogers/firstcourseml/firstcoursemldata.tar.gz -> /home/onoue/ods_data_cache/rogers_girolami_data/firstcoursemldata.tar.gz
|    Downloading  10.177MB     |
|>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>|
Extracting file.


In [4]:
def inverse_low_tri_mat(A):
    """Invert a lower-triangular matrix with LAPACK ``dtrtri``.

    Args:
        A: square matrix, interpreted as lower triangular
            (``lower=True`` is passed to LAPACK).

    Returns:
        The inverse as returned by ``dtrtri``.

    Raises:
        LinAlgError: if LAPACK reports a zero on the diagonal (singular matrix).
        ValueError: if LAPACK reports an illegal argument.
    """
    A_inv, info = lapack.dtrtri(A, lower=True)
    # The status code used to be discarded, so a singular input produced a
    # meaningless result without any error.
    if info > 0:
        raise LinAlgError(
            "Triangular matrix is singular: diagonal element %d is zero." % info
        )
    if info < 0:
        raise ValueError("Illegal value in argument %d of LAPACK dtrtri." % -info)
    return A_inv

# 1. 

In [123]:
class GPR:
    """Gaussian-process regression with Gaussian observation noise.

    Args:
        X: training inputs, passed to ``kernel.K``.
        y: training targets.
        kernel: object exposing ``K(X)``; defaults to ``RBF()``.
        noise: observation-noise variance added to the diagonal of the
            training covariance.
    """

    def __init__(self, X, y, kernel=None, noise=1.):
        self.X = X
        self.y = y
        self.noise = noise

        # Store the kernel in every case; previously a caller-supplied kernel
        # was dropped because the assignment sat inside the `None` branch.
        self.kernel = RBF() if kernel is None else kernel

        # Kernels are evaluated through their `K` method (as `Posterior` does
        # with `kern.K`); the kernel object itself is not callable.
        self._K = self.kernel.K(self.X)

    def _inference_posterior(self):
        """Run exact inference and return the resulting ``Posterior``.

        Factorises ``Ky = K + (noise + 1e-8) I`` as ``LW LW^T`` and solves
        ``Ky alpha = y - m`` with two triangular solves.
        """
        m = 0  # If other mean functions are required, we need some modification here.

        Ky = self._K.copy()
        # unclear why 1e-8 is added. just followed the way of GPy
        Ky += np.eye(Ky.shape[0]) * (self.noise + 1e-8)

        LW = custom_cholesky(Ky)

        # alpha = Ky^{-1} (y - m): forward solve with LW, then back solve with LW^T.
        forward = solve_triangular(LW, self.y - m, lower=True)
        alpha = solve_triangular(LW.T, forward, lower=False)

        return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=self._K)
        



In [None]:
class Posterior:
    r"""Cached summary of a Gaussian-process posterior.

    The posterior is represented through the prior covariance ``K`` together
    with the "Woodbury" quantities: a vector ``v`` and either the inverse
    ``W`` of the Woodbury matrix or its lower Cholesky factor. Derived
    quantities (inverse, mean, covariance, Cholesky of ``K``) are computed on
    first access and cached.

    Args:
        woodbury_chol: lower Cholesky factor of the Woodbury matrix.
        woodbury_vector: the Woodbury vector ``v``.
        K: prior covariance matrix.
        K_chol: optional precomputed Cholesky factor of ``K``.
        woodbury_inv: optional precomputed inverse of the Woodbury matrix.
        mean: optional precomputed posterior mean.
        cov: optional precomputed posterior covariance.
    """

    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, K_chol=None,
                 woodbury_inv=None, mean=None, cov=None):
        self._K_chol = K_chol
        self._K = K
        self._woodbury_chol = woodbury_chol
        self._woodbury_vector = woodbury_vector
        # `woodbury_inv`, `mean` and `cov` are keyword parameters appended after
        # the original ones, so existing positional calls are unaffected.
        self._woodbury_inv = woodbury_inv

        self._mean = mean
        self._covariance = cov

    @property
    def woodbury_chol(self):
        """Lower Cholesky factor of the Woodbury matrix, as given at construction."""
        return self._woodbury_chol

    @property
    def woodbury_vector(self):
        """The Woodbury vector, as given at construction."""
        return self._woodbury_vector

    @property
    def woodbury_inv(self):
        r"""
        The inverse of the woodbury matrix, in the gaussian likelihood case it is defined as
        $$
        (K_{xx} + \Sigma_{xx})^{-1}
        \Sigma_{xx} := \texttt{Likelihood.variance / Approximate likelihood covariance}
        $$

        Computed from ``woodbury_chol`` on first access. Returns ``None`` if
        neither the inverse nor the Cholesky factor was supplied.
        """
        if self._woodbury_inv is None and self._woodbury_chol is not None:
            inv, info = lapack.dpotri(self._woodbury_chol, lower=1)
            if info != 0:
                raise LinAlgError(
                    "LAPACK dpotri failed to invert the Woodbury matrix (info=%d)." % info
                )
            # dpotri only fills the lower triangle; mirror it onto the upper one.
            self._woodbury_inv = np.tril(inv) + np.tril(inv, -1).T
        return self._woodbury_inv

    @property
    def mean(self):
        r"""
        Posterior mean
        $$
        K_{xx}v
        v := \texttt{Woodbury vector}
        $$
        """
        if self._mean is None:
            self._mean = self._K @ self.woodbury_vector
        return self._mean

    @property
    def covariance(self):
        r"""
        Posterior covariance
        $$
        K_{xx} - K_{xx}W_{xx}K_{xx}
        W_{xx} := \texttt{Woodbury inv}
        $$
        """
        if self._covariance is None:
            # For a 2-D Woodbury inverse this equals K - K @ W @ K (see the
            # commented line below); the 3-D form mirrors GPy's expression.
            self._covariance = (
                np.atleast_3d(self._K)
                - np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K),
                               self._K, [1, 0]).T
            ).squeeze()
            # self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K)
        return self._covariance

    @property
    def K_chol(self):
        """
        Cholesky of the prior covariance K
        """
        if self._K_chol is None:
            # Jittered lower Cholesky defined earlier in this file.
            self._K_chol = custom_cholesky(self._K)
        return self._K_chol

    def _raw_predict(self, kern, Xnew, pred_var, full_cov=False):
        """Predictive mean and covariance at ``Xnew``.

        Args:
            kern: kernel object exposing ``K(X, X2)`` and ``K(X)``.
            Xnew: inputs at which to predict.
            pred_var: inputs paired with the Woodbury quantities; used as the
                first argument of ``kern.K(pred_var, Xnew)``.
            full_cov: accepted for API compatibility but currently ignored;
                the full predictive covariance matrix is always returned.

        Returns:
            ``(mu, var)`` where ``mu`` has at least two dimensions and ``var``
            is ``K(Xnew) - Kx^T W Kx`` with ``Kx = K(pred_var, Xnew)``.
        """
        woodbury_vector = self.woodbury_vector
        woodbury_inv = self.woodbury_inv

        Kx = kern.K(pred_var, Xnew)
        mu = Kx.T @ woodbury_vector
        if len(mu.shape) == 1:
            mu = mu.reshape(-1, 1)
        Kxx = kern.K(Xnew)
        var = Kxx - Kx.T @ woodbury_inv @ Kx

        return mu, var

In [5]:
import scipy

In [7]:
# Scratch cell: wrap a 3x3 array of random values in a DataFrame; as the last
# expression of the cell it is rendered by the notebook (see the table below).
pd.DataFrame(np.random.rand(3, 3))

Unnamed: 0,0,1,2
0,0.031719,0.879765,0.773689
1,0.672001,0.743656,0.836873
2,0.320842,0.906843,0.181856


In [None]:
    def plot_sampled_prior(self, size=None):
        """Plot functions drawn from the zero-mean GP prior.

        The functions are evaluated on a 200-point grid covering the range of
        ``self.X`` extended by 10% of that range on both sides.

        NOTE(review): this cell holds an indented method with no enclosing
        ``class`` line; it looks like a working copy of
        ``GPR.plot_sampled_prior`` - confirm where it is meant to live.

        Args:
            size: number of sample paths to draw; ``None`` draws a single one.
        """
        plt.figure()

        extension = np.abs(self.X.min() - self.X.max()) * 0.1
        X = np.linspace(self.X.min() - extension,
                        self.X.max() + extension,
                        200).reshape(-1, 1)
        K = self.kernel.K(X)
        # A *lower* factor is required: with K = L L^T, y = L x has covariance
        # K when x ~ N(0, I). scipy's `cholesky` returns the upper factor by
        # default, which would give the wrong covariance. `custom_cholesky`
        # returns the lower factor and adds jitter if K is numerically singular.
        L = custom_cholesky(K)
        n = X.shape[0]
        samples = np.random.multivariate_normal(np.zeros(n), np.eye(n), size=size)  # x ~ N(0, I)

        # `size=None` yields a single 1-D draw; treat it as a batch of one.
        for sample in np.atleast_2d(samples):
            plt.plot(X.ravel(), L @ sample, lw=1, ls='--')  # y = L @ x

        plt.show()

In [None]:
class FITC(LatentFunctionInference):
    """
    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.

    The function self.inference returns a Posterior object, which summarizes
    the posterior.

    NOTE(review): `LatentFunctionInference`, `diag`, `pdinv`, `tdot`, `jitchol`
    and `dtrtrs` are not defined or imported in the visible part of this file;
    they look like GPy helpers - confirm they are in scope before running.
    """
    # Jitter added to the diagonal of Kmm before it is factorised.
    const_jitter = 1e-6

    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
        """Run FITC inference with inducing inputs ``Z``.

        Args:
            kern: kernel exposing ``K`` and ``Kdiag``.
            X: training inputs.
            Z: inducing inputs; ``Z.shape[0]`` is the number of inducing points.
            likelihood: object exposing ``gaussian_variance`` and
                ``exact_inference_gradients``.
            Y: training targets of shape ``(num_data, output_dim)``.
            mean_function: must be ``None`` (not implemented).
            Y_metadata: forwarded to ``likelihood.gaussian_variance``.

        Returns:
            ``(post, log_marginal, grad_dict)``: the ``Posterior`` object, the
            log marginal likelihood, and a dict with the gradients
            ``dL_dKmm``, ``dL_dKdiag``, ``dL_dKnm`` and ``dL_dthetaL``.

        Raises:
            AssertionError: if ``mean_function`` is not ``None``.
            NotImplementedError: if the noise variance is not a single value.
        """
        assert mean_function is None, "inference with a mean function not implemented"

        num_inducing, _ = Z.shape
        num_data, output_dim = Y.shape

        #make sure the noise is not hetero
        sigma_n = likelihood.gaussian_variance(Y_metadata)
        if sigma_n.size >1:
            raise NotImplementedError("no hetero noise with this implementation of FITC")

        Kmm = kern.K(Z)
        Knn = kern.Kdiag(X)
        Knm = kern.K(X, Z)
        U = Knm

        #factor Kmm
        diag.add(Kmm, self.const_jitter)
        Kmmi, L, Li, _ = pdinv(Kmm)

        #compute beta_star, the effective noise precision
        LiUT = np.dot(Li, U.T)
        sigma_star = Knn + sigma_n - np.sum(np.square(LiUT),0)
        beta_star = 1./sigma_star

        # Compute and factor A
        A = tdot(LiUT*np.sqrt(beta_star)) + np.eye(num_inducing)
        LA = jitchol(A)

        # back substitute to get b, P, v
        URiy = np.dot(U.T*beta_star,Y)
        tmp, _ = dtrtrs(L, URiy, lower=1)
        b, _ = dtrtrs(LA, tmp, lower=1)
        tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
        v, _ = dtrtrs(L, tmp, lower=1, trans=1)
        tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
        P = tdot(tmp.T)

        #compute log marginal
        log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
                       -np.sum(np.log(np.diag(LA)))*output_dim + \
                       0.5*output_dim*np.sum(np.log(beta_star)) + \
                       -0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
                       0.5*np.sum(np.square(b))
        #compute dL_dR
        Uv = np.dot(U, v)
        dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta_star + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1))*beta_star**2


        # Compute dL_dKmm
        vvT_P = tdot(v.reshape(-1,1)) + P
        dL_dK = 0.5*(Kmmi - vvT_P)
        KiU = np.dot(Kmmi, U.T)
        dL_dK += np.dot(KiU*dL_dR, KiU.T)

        # Compute dL_dU
        vY = np.dot(v.reshape(-1,1),Y.T)
        dL_dU = vY - np.dot(vvT_P, U.T)
        dL_dU *= beta_star
        dL_dU -= 2.*KiU*dL_dR

        dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
        grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':dL_dR, 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL}

        #construct a posterior object
        post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)

        # Fixed: the original returned the undefined name `grad_dic`.
        return post, log_marginal, grad_dict

In [122]:
# Build GPy's own GPRegression model on the same (X, Y) data - presumably as a
# reference for the hand-written implementation above. The bare `m` makes the
# notebook display the model's parameter summary (see the table below).
import GPy
m = GPy.models.GPRegression(X, Y)
m

GP_regression.,value,constraints,priors
rbf.variance,1.0,+ve,
rbf.lengthscale,1.0,+ve,
Gaussian_noise.variance,1.0,+ve,
