# Libraries

In [36]:
import GPflow
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import tensorflow as tf
%matplotlib inline

import time
import copy
import json
import pandas as pd

# R magic
import rpy2

# the following lines will allow us to convert between Pandas DataFrames and R DataFrames
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.conversion import ri2py

# this loads the R magic extension
%load_ext rpy2.ipython

### Import CSV

In [2]:
# Prior-condition data: skip the header row and CSV column 0, keep columns 1-5.
# With dtype=None, genfromtxt infers a type per column and returns a structured
# array whose fields are auto-named f0..f4 (accessed below as data['f0'], ...).
data_prior = np.genfromtxt(
    "data/for_composititional_analysis_prior.csv",
    delimiter=',',
    usecols=(1, 2, 3, 4, 5),
    skip_header=1,
    dtype=None,
)

In [3]:
# Posterior-condition data: skip the header row and CSV column 0, keep columns 1-6.
# With dtype=None, genfromtxt infers a type per column and returns a structured
# array whose fields are auto-named f0..f5.
data_posterior = np.genfromtxt(
    "data/for_composititional_analysis_posterior.csv",
    delimiter=',',
    usecols=(1, 2, 3, 4, 5, 6),
    skip_header=1,
    dtype=None,
)

In [4]:
# Transformation of array to matrix
def array_to_matrix(x):
    """Turn a 1-D sequence into a column array of floats.

    Every element ``x[i]`` is converted with ``float`` and placed in its own
    row, so a sequence of length n > 0 becomes an array of shape (n, 1).
    An empty sequence yields an empty 1-D array.
    """
    rows = [[float(x[idx])] for idx in range(len(x))]
    return np.array(rows)

# Gaussian Processes
Docs:
- [GP Regression](http://gpflow.readthedocs.io/en/latest/notebooks/regression.html)

In [5]:
def compute(X, Y, kernel_name):
    """Fit a GP regression model on (X, Y) using the named kernel composition.

    The kernel is built by ``get_new_kernel(kernel_name)`` and the model is
    optimised. If optimisation raises, a White kernel with fixed variance
    0.05 is added to the kernel and optimisation is retried once. If the
    retry fails as well, the error is reported and the fallback model is
    returned as-is.

    Only ``Exception`` subclasses are handled, so KeyboardInterrupt and
    SystemExit still stop a long run.

    Parameters:
        X, Y: training inputs and targets, passed straight to GPflow.gpr.GPR.
        kernel_name: composition string understood by get_new_kernel.

    Returns:
        The GPflow GPR model (the White-augmented one if the fallback ran).
    """
    # Get kernel
    kernel = get_new_kernel(kernel_name)

    model = GPflow.gpr.GPR(X, Y, kern=kernel)

    try:
        model.optimize()
    except Exception:
        # Add a fixed-variance white kernel and try once more.
        w = GPflow.kernels.White(1, variance=0.05)
        w.variance.fixed = True
        model = GPflow.gpr.GPR(X, Y, kern=kernel + w)

        try:
            print('Adding White Kernel to', kernel_name)
            model.optimize()
        except Exception as error:
            # Best effort: report the failure and hand back the fallback model.
            print('Exception caught computing', kernel_name, '-', repr(error))

    return model

In [6]:
def lml(model):
    """Log marginal likelihood of a GP.

    Returns ``model.compute_log_likelihood()``. If that call raises an
    ordinary exception, the error is printed and the very low sentinel
    value -999999999 is returned instead. KeyboardInterrupt and SystemExit
    are not intercepted.
    """
    try:
        return model.compute_log_likelihood()
    except Exception as error:
        print('Exception caught in lml', '-', repr(error))
        return -999999999

In [7]:
def predict(gps, X):
    """Evaluate every GP in ``gps`` at the inputs ``X``.

    Parameters:
        gps: dict mapping a key (kernel name) to a model exposing
            ``predict_y(X)`` that returns ``(mean, var)``.
        X: inputs handed unchanged to each model's ``predict_y``.

    Returns:
        dict mapping each key to ``{'mean': [...], 'var': [...]}`` where the
        values are the ``.tolist()`` form of the model's outputs. When a
        model raises an ordinary exception the error is printed and both
        entries fall back to ``[0]``. KeyboardInterrupt and SystemExit are
        not intercepted.
    """
    predictions = {}

    # For every GP, build predictions
    for key, gp in gps.items():
        try:
            mean, var = gp.predict_y(X)
        except Exception as error:
            print('Exception caught in predict', '-', repr(error))
            mean, var = np.array([0]), np.array([0])

        predictions[key] = {'mean': mean.tolist(),
                            'var': var.tolist()}

    return predictions

In [8]:
def get_new_kernel(kernel_string):
    """Build a fresh, non-optimised kernel for a composition string.

    ``l``, ``p`` and ``r`` stand for new one-dimensional Linear,
    PeriodicKernel and RBF kernels; ``+`` and ``*`` combine them with the
    usual Python operator precedence (so ``'l+r*p'`` is ``l + (r * p)``).

    Returns the composed kernel, or the string ``'error'`` when
    ``kernel_string`` is not one of the supported compositions.
    """
    # New base kernels on every call, so no state is shared between models.
    lin = GPflow.kernels.Linear(1)
    per = GPflow.kernels.PeriodicKernel(1)
    rbf = GPflow.kernels.RBF(1)

    # Builders are lazy: only the requested composition is constructed.
    compositions = (
        ('l', lambda: lin),
        ('p', lambda: per),
        ('r', lambda: rbf),

        ('l+r', lambda: lin + rbf),
        ('l+p', lambda: lin + per),
        ('p+r', lambda: per + rbf),

        ('l*r', lambda: lin * rbf),
        ('l*p', lambda: lin * per),
        ('p*r', lambda: per * rbf),

        ('l+r+p', lambda: lin + rbf + per),
        ('l+r*p', lambda: lin + rbf * per),
        ('l*r+p', lambda: lin * rbf + per),
        ('l*p+r', lambda: lin * per + rbf),
        ('l*r*p', lambda: lin * rbf * per),
    )

    for name, build in compositions:
        if kernel_string == name:
            return build()

    return 'error'

In [9]:
def normalize(Y):
    """Standardise ``Y`` to zero mean and unit standard deviation.

    Plain lists and tuples are converted to float arrays first; any other
    input type is used as given. When the standard deviation is zero (for
    example constant or single-element input) the centred values are
    returned instead of dividing by zero, which would give NaN.
    """
    if isinstance(Y, (list, tuple)):
        Y = np.asarray(Y, dtype=float)

    std = np.std(Y)
    mu = np.mean(Y)

    if std == 0:
        # Nothing to scale: every value equals the mean.
        return Y - mu

    return ((Y - mu)/std)
    

In [10]:
def compute_gps(X, Y0):
    """Fit one GP per supported kernel composition on the same data.

    Returns a dict mapping each composition string to the model returned by
    ``compute(X, Y, name)``, in the order listed below.
    """
    # Target normalisation is currently switched off; raw targets are used.
    #Y = normalize(Y0)
    targets = Y0

    kernel_names = (
        'l', 'p', 'r',
        'l+r', 'l+p', 'p+r',
        'l*r', 'l*p', 'p*r',
        'l+r+p', 'l+r*p', 'l*r+p', 'l*p+r', 'l*r*p',
    )

    fitted = {}
    for name in kernel_names:
        fitted[name] = compute(X, targets, name)

    return fitted

In [11]:
def compute_lmls(models):
    """Map every key of ``models`` to ``lml(model)`` for its model."""
    return {name: lml(models[name]) for name in models.keys()}

In [12]:
def gps_to_string(gps):
    """Return a dict with the same keys as ``gps`` and ``str(value)`` as values."""
    return {name: str(gps[name]) for name in gps.keys()}

In [13]:
def dict_max(d):
    """Find the largest value in ``d`` and every key that holds it.

    Returns a ``(keys, maxval)`` tuple where ``keys`` is the list of keys
    whose value equals the maximum, in the dict's iteration order.
    """
    best_value = max(d.values())

    best_keys = []
    for key, value in d.items():
        if value == best_value:
            best_keys.append(key)

    return best_keys, best_value

In [14]:
def save_results(results, filename):
    """Write ``results`` as JSON to ``output/<filename>.json``.

    The target directory is created when it does not exist yet, so the
    function also works in a fresh checkout without an ``output`` folder.
    """
    # Local import keeps this block self-contained; the notebook's import
    # cell does not import os.
    import os

    path = 'output/' + filename + '.json'
    os.makedirs(os.path.dirname(path), exist_ok=True)

    with open(path, 'w') as fp:
        json.dump(results, fp)

In [15]:
# Prediction grid: every integer from 31 to 365*4 inclusive, as a column of shape (n, 1).
Xpredictions = np.linspace(31, 365 * 4, 365 * 4 - 31 + 1).reshape(-1, 1)

Compute Gaussian Process Models for a dataset

In [16]:
def compute_gps_for_dataset(dataset, Xpredictions=Xpredictions):
    """Fit, score and predict with every kernel composition for each id in ``dataset``.

    For every distinct value of ``dataset['f0']`` the matching rows are
    selected, ``f3`` is used as the GP input and ``f4`` as the target. All
    compositions are fitted with ``compute_gps``, scored with
    ``compute_lmls``, the best one is found with ``dict_max`` and
    predictions are made at ``Xpredictions``. Progress and the total run
    time in minutes are printed.

    Returns a dict with the keys ``'gpss_objects'``, ``'gpss'``,
    ``'Xpredictions'``, ``'predictions'``, ``'lmls'`` and ``'maxs'``. The
    per-id dicts are keyed by ``str(id)``.
    """
    started = time.time()

    # Storing the fitted model objects is disabled, so this is returned empty.
    gpss_objects = {}
    gpss, predictions, lmls, maxs = {}, {}, {}, {}

    for group_id in np.unique(dataset['f0']):
        print(group_id)

        # Rows belonging to this id, split into inputs and targets.
        rows = dataset[dataset['f0'] == group_id]
        X = array_to_matrix(rows['f3'])
        Y = array_to_matrix(rows['f4'])

        # Fit all kernel compositions.
        gps = compute_gps(X, Y)
        print('Compute OK')

        # Score them and pick the best-fitting composition(s).
        likelihoods = compute_lmls(gps)
        best = dict_max(likelihoods)
        print('LMLs OK')

        # Predict on the shared grid.
        gps_predictions = predict(gps, Xpredictions)
        print('Predictions OK')

        # Store everything under the string form of the id.
        label = str(group_id)
        #gpss_objects[label] = gps
        gpss[label] = gps_to_string(gps)  # The GP parameters
        predictions[label] = gps_predictions
        lmls[label] = likelihoods
        maxs[label] = best

    elapsed_minutes = round((time.time() - started) / 60)
    print('Minutes:', str(elapsed_minutes))

    return {
        'gpss_objects': gpss_objects,  # Actual objects
        'gpss': gpss,
        'Xpredictions': Xpredictions.tolist(),
        'predictions': predictions,
        'lmls': lmls,
        'maxs': maxs,
    }

In [17]:
def plot(X, Y, mean, var, xx=None):
    """Plot observations with a GP predictive mean and a +/- 2 std band.

    Parameters:
        X, Y: observed inputs and targets, drawn as black crosses.
        mean, var: predictive mean and variance; column 0 of each is used
            for the shaded band.
        xx: inputs the predictions refer to, with the grid in column 0.
            Defaults to the notebook-level ``Xpredictions``.

    A new 12x6 figure is opened for each call. The figure is created before
    anything is drawn; no ``plt.clf()`` is issued first, because that acts
    on whatever figure is current and leaves a stray blank figure behind.
    """
    if xx is None:
        xx = Xpredictions

    plt.figure(figsize=(12, 6))
    plt.plot(X, Y, 'kx', mew=2)
    plt.plot(xx, mean, 'b', lw=2)

    # Half-width of the band: two predictive standard deviations.
    band = 2*np.sqrt(var[:,0])
    plt.fill_between(xx[:,0], mean[:,0] - band, mean[:,0] + band, color='blue', alpha=0.2)
    plt.xlim(31, 365*4)
    #plt.ylim(-2, 2)

In [18]:
def plot_predictions(results, data, target_id, target_kernel, normalize_y=True):
    """Plot the stored predictions of one kernel for one id against its data.

    Parameters:
        results: dict as returned by compute_gps_for_dataset (or reloaded
            from JSON); ``results['predictions'][str(target_id)][target_kernel]``
            must hold ``'mean'`` and ``'var'``.
        data: structured array; rows with ``f0 == target_id`` are plotted,
            using ``f3`` as x and ``f4`` as y.
        target_id: id to select; converted with ``str`` for the lookup.
        target_kernel: kernel composition string, e.g. ``'l+p'``.
        normalize_y: when True (the default, and the previous behaviour) the
            observed y values are standardised before plotting. Pass False
            to plot them on their original scale.

    NOTE(review): compute_gps currently fits on un-normalised targets (its
    normalize call is commented out), so for results produced by that code
    ``normalize_y=False`` puts observations and predictions on the same
    scale. Confirm which setting the stored results were computed with.
    """
    dat = data[data['f0'] == target_id]

    X = array_to_matrix(dat['f3'])
    Y = array_to_matrix(dat['f4'])
    if normalize_y:
        Y = normalize(Y)

    stored = results['predictions'][str(target_id)][target_kernel]
    mean = np.array(stored['mean'])
    var = np.array(stored['var'])

    plot(X, Y, mean, var)

# Compute Gaussian Processes

In [19]:
def debug_filtering(dataset):
    """Debugging hook for narrowing the dataset; currently returns it unchanged.

    Filters that have been used here (disabled):
        dataset = dataset[dataset['f0'] == 59]
        dataset = dataset[dataset['f0'] > 10]
    """
    return dataset

In [20]:
def scenario(dataset, scenario):
    """Return the rows of ``dataset`` whose ``f2`` field equals ``scenario``."""
    matches = dataset['f2'] == scenario
    return dataset[matches]

In [21]:
# Distinct values of field f2 in the prior data (the field scenario() filters on).
np.unique(data_prior['f2'])

array([b'"FB Friends"', b'"Gym members"', b'"Rain"', b'"Salary"',
       b'"Sales"', b'"Temperature"'], 
      dtype='|S13')

### Prior condition

In [22]:
# Disabled run: fits all kernel compositions on the prior data and saves the
# result as output/results_prior.json (the file reloaded in the results-analysis
# section). Uncomment to recompute.
#dataset = debug_filtering(data_prior)
#results_prior = compute_gps_for_dataset(dataset)
#save_results(results_prior, 'results_prior')

### Posterior condition (only evidence)

In [23]:
# Disabled run: same pipeline for the posterior data, restricted to rows with
# f3 < 365-31+1 (= 335), and saved as output/results_posterior.json.
# Uncomment to recompute.
#dataset = debug_filtering(data_posterior)

# Only the evidence
#dataset = dataset[dataset['f3'] < (365-31+1)]

#results_posterior = compute_gps_for_dataset(dataset)
#save_results(results_posterior, 'results_posterior')

# Plots

In [None]:
# NOTE(review): results_posterior is only assigned by the commented-out posterior
# run above or by the "Import the results" cell further down, so this cell raises
# NameError under Restart & Run All unless one of those has been executed first.
plot_predictions(results_posterior, data_posterior, 8, 'l+p')

In [None]:
# NOTE(review): results_prior is only assigned by the commented-out prior run
# above or by the "Import the results" cell further down, so this cell raises
# NameError under Restart & Run All unless one of those has been executed first.
plot_predictions(results_prior, data_prior, 9, 'l+r*p')

# Results analysis

R libraries

In [56]:
%%R
# Run the project's tools.R script inside the embedded R session.
source("tools.R")

Import the results

In [24]:
# Reload the cached results written by save_results for both conditions.
with open('output/results_prior.json') as fp:
    results_prior = json.load(fp)

with open('output/results_posterior.json') as fp:
    results_posterior = json.load(fp)

### Proportions of best fitting kernel composition in the Prior condition, per scenario

In [48]:
# One entry per id. Each value is the (keys, maxval) result of dict_max, which
# comes back from JSON as a two-element list: [best kernel names, best LML].
# NOTE(review): an object-dtype Series of lists may not convert cleanly to an
# R object via pandas2ri; confirm before passing it with %R -i.
maxs_prior = pd.Series(results_prior['maxs'])

In [68]:
# Push maxs_prior into the R session. %R is a line magic: any further lines in
# this cell are parsed as Python, so multi-line R code belongs in a %%R cell.
%R -i maxs_prior



SyntaxError: invalid syntax (<ipython-input-68-904d31d9d068>, line 3)

In [70]:
%%R 
# Scratch check of the R bridge: define a numeric vector `a` in the R session.
a<-c(1,2,3)

In [72]:
# Fetch `a` back from R; the output below shows it arriving as a float array.
%R a

array([ 1.,  2.,  3.])

# DEBUG

In [None]:
# DEBUG: show the stored best-kernel entry (dict_max output) for every id in the prior results.
results_prior['maxs']

In [None]:
# NOTE(review): `data` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Presumably results_posterior (or
# results_prior) was intended; confirm or delete this cell.
data['maxs']