In [3]:
from torch import nn
import torch
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import collections
from torch.distributions.multivariate_normal import MultivariateNormal
import matplotlib.pyplot as plt
from datageneration import DataGenerator

In [2]:
def scale_shift_uniform(a=0,b=1,*size):
    return torch.rand(size=(size))*(a-b)+b

In [8]:
CNPRegressionDescription = collections.namedtuple(
    'CNPRegressionDescription',
    ('query',
     'target_y',
     'num_total_points',
     'num_context_points'
    ))

## Workflow of the implementation

- CNPS as Gaussian Processes try to learn a distribution over the functional value vector $V=(f(x_1).....f(x_n))$
- At test time any function from this distribution can be approximated
- The function will take into consideration the context points that have beeen give to make one function from this distribution more likely than others.

### Data Generation:
- The training points come from various functions that share some common characteristic
- In this implemenation the different functions come from __one__ Gaussian process
- A GP is a multivariate normal distribution, aka a mean and covariance matrix, where each dimension of the infinite random vector is a, aka random variable, is the functional value for a given input value.


1. The Kernel - __Creates a covariance matrix__: A function that takes in x values and returns a co-variance matrix. Here the Gaussian Kernel, RBF, or squared exponential is used. It computes the differences between all one dimensional feature vectors scales this distance by ``l``, squares it and scales it again by ``sigma_f``. As seen in other implementations (<a href = https://www.inf.ed.ac.uk/teaching/courses/mlpr/2019/notes/gp_minimal.py> GP demo</a>) some noises is added to the diagonal or to the variance of the covariance matrix to ensure a positive definite matrix and thus allow the Cholesky decomposition to be defined.
<br><br>
2. Curve Generator - __Generates functions from a GP__: We first set ``num_context_points`` <br>
__Training__: The number of target points is a random share of the context points. Random ``x_values`` are generated from a ``uniform`` between `-2` and `2`. Each of the ``batch_size`` vectors of context points is 1 by ``num_context_points``.<br>
__Testing__:For testing more ``targets`` (400) are created and are simply set at ``0.001`` intervals between `-2` and `2`.  <br><br>
The Kernel scale parameters are set and the ``x_values`` are past through the ``kernel`` to create the covariance matrix. The covariance matrix is decomposed with the ``cholesky``decomposition. The ``y_values`` are created through the following process:
Given the standardization of a non-standard multivariate normal $Z=\frac{X-\mu}{\sigma}$ we can create the non-standard multivariate normal by $L^-1*Z+\mu=Z$ where $L^-1$ is the Cholesky.

Finally, depending on training/ testing the appropriate number of points are selected from the ``x_values`` and the ``y_values``.

### Training, predicting with GPs:

GPs during training build the posterior distribution conditioned on the observed data.
At test time this distribution serves as the prior which will then be updated using the Bayes rule. Drawing from this distribution or taking the expected value is the prediction.


## Covariance Function for Gaussian Process:
- The Covariance function encodes our believes about the function to be generated/ learned 
- It encodes notion of similarity, i.e. similar inputs produce similar outputs
- Different factors influence the choice of the covariance function
- __Stationary covariance__ function is invariant to translations on the input space
 - I guess this it is constant when the inputs are transformed 
- __Isotropy or isotropic__ covariance functions are functions only of $|X_1-X_2|$, i.e. $K(X_1,X_2)$.
- The function only depends on the distance between the two input vectors
- This implementation uses the squared expnential, aka Radial Basis Function aka Gaussian Kernel

In [9]:
## Training 

## Gaussian Process Curve Sampler

In [85]:
class GPCurvesREader(object):
    def __init__(self,
                batch_size,
                max_num_context,
                x_size=1,
                y_size=1,
                l1_scale=1,
                sigma_scale=0.5,
                testing=False):

        self._batch_size = batch_size 
        #batch_size seems to be the number of batches
        #input is a (batch_size,rows,col, i.e. 64 batches with 10, 1D datapoints)
        self._max_num_context = max_num_context
        self._x_size = x_size 
        self._y_size = y_size
        self._l1_scale = l1_scale
        self._sigma_scale = sigma_scale
        self._testing = testing
    
    def _gaussian_kernel(self, xdata, l1, sigma_f, sigma_noise=2e-2):
        """Generates curves from a Gaussian Process
            GP is a distribution over a vector of functional values
        
        Args:
            xdata: Tensor with shape `[batch_size, num_total_points, x_size]` with
          the values of the x-axis data.
      l1: Tensor with shape `[batch_size, y_size, x_size]`, the scale
          parameter of the Gaussian kernel.
      sigma_f: Float tensor with shape `[batch_size, y_size]`; the magnitude
          of the std.
      sigma_noise: Float, std of the noise that we add for stability.

    Returns:
      The kernel, a float tensor with shape
      `[batch_size, y_size, num_total_points, num_total_points]`.
    """
        
#       number of points per batch
        num_total_points = xdata.shape[1]
        
        
        
        xdata1 = torch.unsqueeze(xdata, axis=1)
        xdata2 = torch.unsqueeze(xdata, axis=2)
#       takes the pairwise difference between for each of the data points in each batch
        diff = xdata1- xdata2
#       the squared exponential is defined as the sum of the squared differences scaled by a factor gamma
#       l1 is simply the length parameter of the Kernel
        norm = (diff[:, None, :, :, :] / l1[:, :, None, None, :]).pow(2)
        
        norm = norm.squeeze(-1)
#       we now have a GP or a description of a multivariate normal distribution
#       from which we can generate functions
        kernel = sigma_f.pow(2)[:, :, None, None]*torch.exp(-0.5*norm)
        
        kernel += (sigma_noise**2)*torch.eye(num_total_points)
        
        return kernel
        
    def generate_curves(self):
        
        """context points are the points used for training - chosen at random
        
        """
        
        num_context = np.random.randint(low=3,
                                    high=self._max_num_context,
                                    dtype=np.int32)
        if self._testing:
#           we are going to generating 400 points to predict
            num_target = 400
            num_total_points = num_target
#           the x-axis is a tensor of depth batch_size 
#           num_target lentgh and width of 1  
            x_values = torch.arange(-2.,2.,1./100, dtype=torch.float32).repeat(1,64)
            x_values = torch.unsqueeze(-1)
        
        else:
#           for training only max_num_context points are generated
#           
            num_target = np.random.randint(low=2,
                                       high=self._max_num_context,
                                       dtype=np.int32)
    
            num_total_points = num_context + num_target
            x_values = scale_shift_uniform(-2,2,
                                           self._batch_size,
                                           num_total_points,
                                           self._x_size)
#       setting gamma and sigma        
        l1 = torch.ones(size=(self._batch_size,
                       self._x_size,
                       self._y_size))*self._l1_scale
        sigma_f = torch.ones(size=(self._batch_size,
                                   self._y_size))*self._sigma_scale
        
        kernel = self._gaussian_kernel(x_values, l1, sigma_f)
        
        cholesky = torch.cholesky(kernel.double()).float()
        
        standard_normals = torch.randn(
            size=(
                self._batch_size,
                self._y_size,
                num_total_points,1))
        
        y_values = torch.matmul(cholesky,standard_normals)
        y_values = torch.transpose(y_values.squeeze(3),2,1)
        
        
        if self._testing:
            # Select the targets
            target_x = x_values
            target_y = y_values
            idx = torch.randperm(num_target)
            x_values.shape
            context_x = x_values[:,idx[:num_context],:]
            context_y = y_values[:,idx[:num_context],:]
            
        else:
#             if not we have values that serve as context and targets
            target_x = x_values[:,:num_target + num_context:]
            target_y = y_values[:,:num_target + num_context:]
#             select target points
            context_x = x_values[:, :num_context, :]
            context_y = y_values[:, :num_context, :]
        
#         query = ((context_x, context_y), target_x)
        
#         return CNPRegressionDescription(
#             query=query,
#             target_y=target_y,
#             num_total_points=target_x.shape[1],
#             num_context_points=num_context)
        data_dict = {
            'context_x':context_x,
            'context_y':context_y,
            'target_x':target_x,
            'target_y':target_y,
        }
        return data_dict

In [None]:
_gaussian_kernel

## Encoder

The encoder creates a representation of the data context data. Some technicalilites to understand are the following: 
- The encoder simply takes a concatenation of the x,y values
- x values are of shape batch_size*number of context points, dimx
- y values are of shape batch_size*number of context points, 1
- the represenation ri can be of the dimensions that we choose.


In [14]:
class Encoder(nn.Module):
    
    
    """This class maps each x_i, y_i context point to a representation r_i
    To learn this Representation we are using a Multi Layer Perceptron
    The input shape will be batch_size, num_context_points, x_dim
    
    The input to the encoder are the value pairs, thus the dimensions are 
    Batch_Size, (dimx+dimy). The Pytorch automatically pases the values sequentially
    through the ANN.
    The last layer will not have an activation function because we want the pure represenation.
    
    Parameters
    ----------
    
    dimx : int
        Dimesion of each x value
    
    dimy : int
        Dimesion of each y value
        
    dimr : int
        Dimension of output representation
    
    dimh : tuple
        Dimension of hidden layers
    
    """
    
    
    def __init__(self, dimx, dimy, dimr, *args):
        super().__init__()
            
        self._dimx = dimx
        self._dimy = dimy
        self._dimr = dimr
        self._dimh = args
        
        
        _first_layer = [nn.Linear(self._dimx+self._dimy, self._dimh[0]),nn.ReLU()]
        
        
        _hidden_layers = list(np.array([
            [nn.Linear(self._dimh[i], self._dimh[i+1]),nn.ReLU()]
            for i in range(len(self._dimh)-2)
        ]).flatten())
        
        _last_layer = [nn.Linear(self._dimh[-2], self._dimh[-1])]
        
        self._layers = _first_layer + _hidden_layers + _last_layer
        
        
        self._process_input = nn.Sequential(*self._layers)
        
    def forward(self, x_values, y_values):
        """
        Takes the context points x and y,
        concatenates them into value pairs
        and passes them through the MLP
        
        Parameters
        ----------
        
        x_values : torch.Tensor 
            Shape (batch_size, dimx)
            
        y_values : torch.Tensor 
            Shape (batch_size, dimy)
        
        """
        
        input_as_pairs = torch.cat((x_values, y_values), dim=1)

        
        return self._process_input(input_as_pairs)
        

## Aggregator

- The aggregator simply creates an aggregation'
- Here we simply take the average of ri

In [15]:
def aggregate(ri_tensor):
    """Takes a tensor of shape (batch_size,num_context_points, dimr) and aggregates it a
    along the second axis so that we have an aggregation across each batch
    
    Parameters
    ----------
    
    ri_tensor : Tensor
        Tensor of the representation of the x and y context points
    """
    
    return ri_tensor.mean(dim=1)
    

## Decoder

- The decoder takes the context points and the representation, passes them through an MLP and returns an output of dimensions two
- These two are used to minimize the negative log conditinal ligelihood which is a function that depends on the mu and sigma
- By minimize this quantitiy we find the correct parameters to the distribution of the Gaussian process from which the context points were drawn.
- We need to get 

In [130]:
class Decoder(nn.Module):
    
    """The decoder takes in x_values, that is the target points and combines them with
    the represenation of the context points by concatenation. The resulting tensor is passed to an MLP that 
    is asked to ouput the parameters for the sought after distribution, in this case
    a normal distribution. Thus we are looking for two parameters. The MLP returns two tensor obejects
    which hold a mean/ variance for each point y. Thus the shape of this output is 
    batch_size,y_values,y_dim, 2
    
    Note the targets consist
    of both the context points as well as the target points, since the context points
    are a subset of the target points.
    
    
    Parameters
    ----------
        
    dimx : int
        Dimension of each x value
    
    dimr : int
        Dimension of each of the representations
    
    *args : tuple
        Dimensions of the hidden layers 
             
    """
    
    def __init__(self,dimx, dimr,dimparam,*args):
        super().__init__()
        
        
        self._dimx = dimx
        self._dimr = dimr
        self._dimparam = dimparam
        self._dimh = args
        
        _first_layer = [nn.Linear(self._dimx+self._dimr, self._dimh[0]),nn.ReLU()]
        
        
        _hidden_layers = list(np.array([
            [nn.Linear(self._dimh[i], self._dimh[i+1]),nn.ReLU()]
            for i in range(len(self._dimh)-1)
        ]).flatten())
        
        _last_layer = [nn.Linear(self._dimh[-1], self._dimparam)]
        
        self._layers = _first_layer + _hidden_layers + _last_layer
        
        
        self._process_input = nn.Sequential(*self._layers)
        
    def forward(self, x_values,r_values):
        
        """Takes x and r values, combines them and passes them twice to MLP. 
        Thus we have one run for mu and one run for sigma"""
        
        
        input_as_pairs = torch.cat((x_values, r_values),dim=1)
        
        dist_params = self._process_input(input_as_pairs)
        return self._process_input(input_as_pairs)     

In [189]:
def calc_loss (mu,sigma, targets):
    """Takes mean and variance arranges them in the appropriate shapes and creates a multivariate normal
        
    Parameters
    ----------
        
    mu : Tensor
        (Batch_size, num_target) dimensional vector with mean estimate for each target
    
    sigma : Tensor
        (Batch_size, num_target) dimensional vector holding the variance estimate for each target
            
    targets : Tensor 
        (batch_size,num_targets, ydim) dimensional vector with the targets
        
        """
#   padding the variance vector with zeros of the diagonal  
    cov_matrix = torch.diag_embed(sigma)
    distribution = MultivariateNormal(loc=mu,covariance_matrix=cov_matrix)
    loss = distribution.log_prob(targets.squeeze(-1))
        
    return distribution,torch.mean(loss)

 ## Sandbox - testing the individual parts of the function

### Kernel function
Expanding the dimesions of the tensors allows us substract each point pairwise
This being a 1-D use case we simply substract each of the data points and get an n by n matrix for each layer.

In [17]:
TRAINING_ITERATIONS = int(2e5)
MAX_CONTEXT_POINTS = 10
PLOT_AFTER = int(2e4)

In [18]:
dataset_train = GPCurvesREader(
    batch_size=64, 
    max_num_context=MAX_CONTEXT_POINTS)
data_train = dataset_train.generate_curves()

torch.Size([64, 1, 11, 1])


In [170]:
batch_size = 64
num_points = 10
dimx = 1
dimy = 1
rdim = 20
hdim = 20
x = scale_shift_uniform(-2,2,batch_size,num_points,dimx)
y = scale_shift_uniform(-2,2,batch_size,num_points,dimy)

x_stacked  = x.view(batch_size*num_points,-1)
y_stacked  = y.view(batch_size*num_points,-1)

encoder = Encoder(1,1,hdim,hdim,hdim,rdim)
r  = encoder.forward(x_stacked,y_stacked).view(batch_size,num_points,-1)
r_aggregate = aggregate(r).unsqueeze(1)
r_aggregate = r_aggregate.repeat(1,num_points,1)

hdim = 128
outdim = 2
decoder = Decoder(dimx,rdim,outdim,hdim,hdim,hdim)

r_stacked = r_aggregate.view(batch_size*num_points,-1)

dist_params = decoder.forward(x_stacked, r_stacked).view(batch_size,num_points,-1)

mu = dist_params[:,:,0]
sigma = torch.diag_embed(dist_params[:,:,1])
sigma_test = torch.ones(10).unsqueeze(0).repeat(64,1)

learned_distribution = decoder.get_distribution(mu,sigma_test)


In [188]:
torch.mean(calc_loss(mu, sigma_test,y)[1])

tensor(-16.0372, grad_fn=<MeanBackward0>)

In [174]:
y.squeeze(-1).shape

torch.Size([64, 10])

In [144]:
mu.shape

torch.Size([64, 10])

In [154]:
dist_params[:,:,0,None].shape

torch.Size([64, 10, 1])

In [1]:
from datasets import SineData

ModuleNotFoundError: No module named 'datasets'