# Libraries

In [112]:
import GPflow
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import tensorflow as tf
%matplotlib inline

import time
import copy
import json

### Import CSV

In [113]:
# Prior-condition observations. Columns 1-5 of the CSV are read (the header row
# is skipped); with dtype=None the result is indexed by field name below
# ('f0', 'f2', 'f3', 'f4').
data_prior = np.genfromtxt(
    "data/for_composititional_analysis_prior_v.csv",
    delimiter=',',
    skip_header=1,
    usecols=(1, 2, 3, 4, 5),
    dtype=None,
)

In [114]:
# Posterior-condition observations. Columns 1-6 of the CSV are read (one more
# than for the prior file); the header row is skipped.
data_posterior = np.genfromtxt(
    "data/for_composititional_analysis_posterior.csv",
    delimiter=',',
    skip_header=1,
    usecols=(1, 2, 3, 4, 5, 6),
    dtype=None,
)

In [115]:
# Transformation of array to matrix
def array_to_matrix(x):
    """Convert a sequence of numbers into an (n, 1) float column matrix.

    Parameters
    ----------
    x : sequence or ndarray
        Values convertible to float (e.g. one field of the structured
        arrays loaded above).

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(x), 1). An empty input yields shape
        (0, 1), so the result is always two-dimensional.
    """
    # Vectorised conversion; reshape(-1, 1) keeps the column layout even
    # when there are no rows.
    return np.asarray(x, dtype=float).reshape(-1, 1)

# Gaussian Processes
Docs:
- [GP Regression](http://gpflow.readthedocs.io/en/latest/notebooks/regression.html)

In [137]:
def compute(X, Y, kernel_name):
    """Build and optimise a GP regression model for one kernel composition.

    Parameters
    ----------
    X, Y : array-like
        Training inputs and targets, passed straight to ``GPflow.gpr.GPR``.
    kernel_name : str
        Composition name understood by ``get_new_kernel`` (e.g. ``'l+r*p'``).

    Returns
    -------
    The GPR model. If the first optimisation raises, the model is rebuilt
    with an additional fixed-variance White kernel and optimised again; if
    that also raises, the (un-optimised) second model is still returned and
    a message is printed.
    """
    # Get kernel
    kernel = get_new_kernel(kernel_name)

    model = GPflow.gpr.GPR(X, Y, kern = kernel)

    try:
        model.optimize()
    except Exception:
        # `except Exception` (not a bare except) so that Ctrl-C / kernel
        # interrupts still stop a long-running fit.
        # Add white kernel with a fixed variance of 0.05; note the same
        # `kernel` object from the failed attempt is reused in the sum.
        w = GPflow.kernels.White(1, variance = 0.05)
        w.variance.fixed = True
        model = GPflow.gpr.GPR(X, Y, kern = kernel + w)

        try:
            print('Adding White Kernel to', kernel_name)
            model.optimize()
        except Exception:
            # Best effort: report and fall through with the model as it is.
            print('Exception caught computing', kernel_name)

    return model

In [117]:
def lml(model):
    """Log marginal likelihood of a GP.

    Returns ``model.compute_log_likelihood()``. If that call raises an
    ordinary exception, a message is printed and the sentinel
    ``-999999999`` is returned so the model ranks last when likelihoods
    are compared. Interrupts (KeyboardInterrupt / SystemExit) are not
    swallowed.
    """
    try:
        return model.compute_log_likelihood()
    except Exception:
        print('Exception caught in lml')
        return -999999999

In [118]:
def get_new_kernel(kernel_string):
    """Create a fresh, non-optimised kernel for a composition name.

    The name combines ``l`` (Linear), ``p`` (PeriodicKernel) and ``r`` (RBF)
    with ``+`` and ``*``; products bind tighter than sums, exactly as in the
    Python expressions below. New base kernels are constructed on every call.

    Returns the kernel object, or the string ``'error'`` when the name is
    not one of the supported compositions.
    NOTE(review): callers in this notebook do not check for ``'error'``.
    """
    # Initial new non-optimized kernels
    lin = GPflow.kernels.Linear(1)
    per = GPflow.kernels.PeriodicKernel(1)
    rbf = GPflow.kernels.RBF(1)

    # Name -> builder; builders are lazy so only the requested composition
    # is actually combined.
    compositions = {
        'l': lambda: lin,
        'p': lambda: per,
        'r': lambda: rbf,

        'l+r': lambda: lin + rbf,
        'l+p': lambda: lin + per,
        'p+r': lambda: per + rbf,

        'l*r': lambda: lin * rbf,
        'l*p': lambda: lin * per,
        'p*r': lambda: per * rbf,

        'l+r+p': lambda: lin + rbf + per,
        'l+r*p': lambda: lin + rbf * per,
        'l*r+p': lambda: lin * rbf + per,
        'l*p+r': lambda: lin * per + rbf,
        'l*r*p': lambda: lin * rbf * per,
    }

    build = compositions.get(kernel_string)
    if build is None:
        return 'error'
    return build()

In [170]:
def normalize(Y):
    """Standardise Y to zero mean and unit standard deviation.

    Parameters
    ----------
    Y : array-like
        Values to standardise (mean and std are taken over all elements).

    Returns
    -------
    numpy.ndarray
        ``(Y - mean) / std``. When the standard deviation is zero (constant
        input or a single value) the centred values (all zeros) are returned
        instead of dividing by zero and producing NaN/inf.
    """
    Y = np.asarray(Y)
    mu = np.mean(Y)
    std = np.std(Y)

    centred = Y - mu
    if std == 0:
        # Constant signal: nothing to scale, avoid 0/0.
        return centred
    return centred / std
    

In [171]:
def compute_gps(X, Y0):
    """Fit one GP per supported kernel composition.

    Y0 is standardised with ``normalize`` first; every model is then fitted
    on (X, normalised Y) via ``compute``.

    Returns a dict mapping composition name -> fitted model, in the order
    listed below.
    """
    kernel_names = (
        'l', 'p', 'r',
        'l+r', 'l+p', 'p+r',
        'l*r', 'l*p', 'p*r',
        'l+r+p', 'l+r*p', 'l*r+p', 'l*p+r', 'l*r*p',
    )

    targets = normalize(Y0)

    fitted = {}
    for kernel_name in kernel_names:
        fitted[kernel_name] = compute(X, targets, kernel_name)

    return fitted

In [121]:
def compute_lmls(models):
    """Map every key of `models` to the log marginal likelihood of its model (via `lml`)."""
    return {name: lml(model) for name, model in models.items()}

In [122]:
def gps_to_string(gps):
    """Return a dict with the same keys as `gps` whose values are `str(value)`."""
    return {name: str(model) for name, model in gps.items()}

In [123]:
def dict_max(d):
    """Find the largest value in `d` and every key that holds it.

    Returns a tuple ``(keys, maxval)`` where `keys` is the list of keys (in
    dict order) whose value equals the maximum.
    """
    top = max(d.values())

    winners = []
    for name, value in d.items():
        if value == top:
            winners.append(name)

    return winners, top

In [124]:
def save_results(results, filename):
    """Write `results` as JSON to ``output/<filename>.json``.

    Parameters
    ----------
    results : JSON-serialisable object
        Passed unchanged to ``json.dump``.
    filename : str
        File name without the ``.json`` extension.

    The target directory is created when it does not exist yet, so the cell
    also works on a fresh checkout.
    """
    import os  # local import keeps this cell self-contained

    path = 'output/' + filename + '.json'
    os.makedirs(os.path.dirname(path), exist_ok=True)

    with open(path, 'w') as fp:
        json.dump(results, fp)

In [125]:
def predict(gps, X):
    """Evaluate every model in `gps` at the inputs `X`.

    For each key, ``model.predict_y(X)`` is called and its (mean, var) pair
    is converted with ``.tolist()``.

    Returns ``{key: {'mean': [...], 'var': [...]}}``.
    """
    def _as_lists(model):
        # One model's predictive mean/variance as plain nested lists.
        mean, var = model.predict_y(X)
        return {'mean': mean.tolist(),
                'var': var.tolist()}

    return {name: _as_lists(model) for name, model in gps.items()}

In [126]:
# Prediction grid: every integer from 31 to 365*4 inclusive, as a column vector.
Xpredictions = np.linspace(31, 365*4, 365*4 - 31 + 1).reshape(-1, 1)

Compute Gaussian Process Models for a dataset

In [127]:
def compute_gps_for_dataset(dataset, Xpredictions=Xpredictions, include_objects=False):
    """Fit, score and evaluate all kernel compositions for every id in a dataset.

    Parameters
    ----------
    dataset : structured array
        Must expose the fields 'f0' (id), 'f3' (inputs) and 'f4' (targets).
    Xpredictions : ndarray, optional
        Inputs at which every fitted model is evaluated. Defaults to the
        module-level grid as it was when this function was defined.
    include_objects : bool, optional
        When True the fitted model objects are also returned under
        'gpss_objects'. Off by default because the result dict is normally
        written with ``save_results`` and the model objects are not expected
        to be JSON-serialisable.

    Returns
    -------
    dict with keys 'gpss' (string form of each model), 'Xpredictions',
    'predictions', 'lmls' and 'maxs' (result of ``dict_max`` on the
    likelihoods), plus 'gpss_objects' when requested. All per-id entries
    are keyed by ``str(id)``.
    """
    ids = np.unique(dataset['f0'])

    gpss_objects = {}
    gpss = {}
    predictions = {}
    lmls = {}
    maxs = {}

    for i in ids:
        print(i)
        # Filter the relevant data
        filtered_data = dataset[dataset['f0'] == i]

        # Get X and Y
        X = array_to_matrix(filtered_data['f3'])
        Y = array_to_matrix(filtered_data['f4'])

        # Compute GPs
        gps = compute_gps(X, Y)
        print('Compute OK')

        # Find the best fitting GP
        likelihoods = compute_lmls(gps)
        best = dict_max(likelihoods)
        print('LMLs OK')

        # Make predictions
        gps_predictions = predict(gps, Xpredictions)
        print('Predictions OK')

        # Save under a string key (JSON object keys must be strings)
        key = str(i)
        if include_objects:
            gpss_objects[key] = gps
        gpss[key] = gps_to_string(gps) # The GP parameters
        predictions[key] = gps_predictions
        lmls[key] = likelihoods
        maxs[key] = best

    results = {
        'gpss': gpss,
        'Xpredictions': Xpredictions.tolist(),
        'predictions': predictions,
        'lmls': lmls,
        'maxs': maxs
    }
    if include_objects:
        results['gpss_objects'] = gpss_objects  # Actual objects

    return results

In [167]:
def plot(X, Y, mean, var):
    """Plot observations together with a predictive mean and a +/- 2 std band.

    `mean` and `var` are expected to be column arrays evaluated on the
    module-level `Xpredictions` grid, which is used for the x axis.

    NOTE(review): a second function named `plot`, with signature
    (m, X, Y), is defined further down in the DEBUG section; once that cell
    has run it replaces this one.
    """
    grid = Xpredictions

    plt.clf()
    plt.figure(figsize=(12, 6))

    # Observations as black crosses, predictive mean as a blue line
    plt.plot(X, Y, 'kx', mew=2)
    plt.plot(grid, mean, 'b', lw=2)

    # Shaded band: mean +/- two standard deviations
    centre = mean[:,0]
    half_width = 2*np.sqrt(var[:,0])
    plt.fill_between(grid[:,0], centre - half_width, centre + half_width, color='blue', alpha=0.2)

    plt.xlim(31, 365*4)
    #plt.ylim(-2, 2)

In [175]:
def plot_predictions(results, data, target_id, target_kernel):
    """Plot the stored predictions of one kernel for one id against its data.

    `results` is a dict as returned by `compute_gps_for_dataset`; `data` is
    the structured array the results were computed from. The observed
    targets are normalised before plotting.
    """
    rows = data[data['f0'] == target_id]

    X = array_to_matrix(rows['f3'])
    Y = normalize(array_to_matrix(rows['f4']))

    # Predictions are stored as nested lists under the string form of the id
    stored = results['predictions'][str(target_id)][target_kernel]
    mean = np.array(stored['mean'])
    var = np.array(stored['var'])

    plot(X, Y, mean, var)

# Compute Gaussian Processes

### Prior condition

In [142]:
# Keep only the rows of the prior data whose id (f0) lies strictly between 7 and 11
dataset = data_prior
dataset = dataset[(dataset['f0'] > 7) & (dataset['f0'] < 11)]

results_prior = compute_gps_for_dataset(dataset)

8
Adding White Kernel to l+r*p
Adding White Kernel to l*p+r
Compute OK
LMLs OK
Predictions OK
9
Adding White Kernel to l+r+p
Adding White Kernel to l+r*p
Adding White Kernel to l*p+r
Compute OK
LMLs OK
Predictions OK
10
Adding White Kernel to p*r
Adding White Kernel to l+r+p
Adding White Kernel to l+r*p
Compute OK
LMLs OK
Predictions OK


In [None]:
# Persist the prior-condition results as output/results.json
save_results(results_prior, filename='results')

### Posterior condition

# Plots

In [None]:
# Get the data: one subset of the prior dataset per scenario label (field f2).
# The labels are compared as bytes and include the literal double quotes.
# (This cell previously read from `data`, which is never defined in the
# notebook; the loaded prior array is `data_prior`.)
temperature = data_prior[data_prior['f2'] == b'"Temperature"']
rain = data_prior[data_prior['f2'] == b'"Rain"']
sales = data_prior[data_prior['f2'] == b'"Sales"']
salary = data_prior[data_prior['f2'] == b'"Salary"']
gym = data_prior[data_prior['f2'] == b'"Gym members"']
fb = data_prior[data_prior['f2'] == b'"FB Friends"']

t0 = time.time()

# Temperature
prior_temperature = compute_gps_for_dataset(temperature)

# Rain
prior_rain = compute_gps_for_dataset(rain)

# Sales
prior_sales = compute_gps_for_dataset(sales)

# Salary
prior_salary = compute_gps_for_dataset(salary)

# Gym
prior_gym = compute_gps_for_dataset(gym)

# Facebook
prior_fb = compute_gps_for_dataset(fb)

t1 = time.time()
# Elapsed wall-clock time in minutes (shown as the cell output)
(t1-t0)/60

In [None]:
# Save each scenario's results to output/<name>.json
save_results(prior_temperature, filename='prior_temperature')
save_results(prior_rain, filename='prior_rain')
save_results(prior_sales, filename='prior_sales')
save_results(prior_salary, filename='prior_salary')
save_results(prior_gym, filename='prior_gym')
save_results(prior_fb, filename='prior_fb')

# In development

# DEBUG

In [None]:
def plot(m, X, Y):
    """Plot observations with model `m`'s predictive mean and +/- 2 std band.

    The model is evaluated via ``m.predict_y`` on every integer from 31 to
    365*4; the y axis is fixed to [-2, 2].

    NOTE(review): this redefines `plot`, replacing the earlier
    `plot(X, Y, mean, var)` that `plot_predictions` calls. After this cell
    runs, `plot_predictions` will call this version with the wrong arguments.
    """
    grid = np.linspace(31, 365*4, int(365*4-31+1)).reshape(-1, 1)
    mean, var = m.predict_y(grid)

    plt.clf()
    plt.figure(figsize=(12, 6))

    # Observations as black crosses, predictive mean as a blue line
    plt.plot(X, Y, 'kx', mew=2)
    plt.plot(grid, mean, 'b', lw=2)

    # Shaded band: mean +/- two standard deviations
    centre = mean[:,0]
    half_width = 2*np.sqrt(var[:,0])
    plt.fill_between(grid[:,0], centre - half_width, centre + half_width, color='blue', alpha=0.2)

    plt.xlim(31, 365*4)
    plt.ylim(-2, 2)

In [None]:
# Id (value of field f0) of the curve examined in the cells below.
# NOTE(review): compute_gps_for_dataset keys its result dicts by str(id),
# so lookups in `results` need str(curve_id), not this int.
curve_id = 1

In [None]:
t0 = time.time()

# Fit all kernel compositions for the prior rows whose id (f0) is below 3.
# (This cell previously read from `data`, which is never defined in the
# notebook; the loaded prior array is `data_prior`.)
one_combination = data_prior[data_prior['f0'] < 3]
results = compute_gps_for_dataset(one_combination)

t1 = time.time()
# Elapsed wall-clock time in minutes (shown as the cell output)
(t1-t0)/60

In [None]:
# Show which ids have predictions (keys are the string form of each id)
results['predictions'].keys()

### Use the same kernel composition

In [None]:
def get_evidence(curve_id, n=68):
    """Return the evidence rows (Xe, Ye) for one curve of the posterior data.

    Parameters
    ----------
    curve_id :
        Value of field 'f0' selecting the curve.
    n : int, optional
        Exclusive upper row index of the evidence slice. Defaults to 68, the
        value the other cells in this section assign to `n`; previously this
        function read an undefined global `n`.

    Returns
    -------
    (Xe, Ye) : tuple of ndarray
        Rows 1 .. n-1 of the curve's inputs ('f3') and targets ('f4') as
        column matrices. Row 0 is skipped, matching the ``[1:n]`` slices
        used elsewhere in this section.

    NOTE(review): the posterior CSV is loaded with one more column than the
    prior one; confirm that 'f3'/'f4' are still the input/target columns.
    Ye is returned un-normalised.
    """
    # The loaded posterior array is `data_posterior` (the old name
    # `posterior_data` is not defined anywhere in the notebook).
    dataset = data_posterior

    # Filter the relevant data
    filtered_data = dataset[dataset['f0'] == curve_id]

    # Get X and Y
    X = array_to_matrix(filtered_data['f3'])
    Y = array_to_matrix(filtered_data['f4'])

    # Evidence slice
    Xe = X[1:n]
    Ye = Y[1:n]

    return (Xe, Ye)

In [None]:
# Get the best fitting kernel composition (string).
# results['maxs'] is keyed by str(id) and each entry is (keys, maxval) from
# dict_max, so [0][0] is the first kernel name that reached the maximum.
best_kernel_name = results['maxs'][str(curve_id)][0][0]

# Get the evidence for the subject
Xe, Ye = get_evidence(curve_id)

# Create a new kernel using that composition and fit it to the evidence
# shown in the Posterior condition
gp = compute(Xe, Ye, best_kernel_name)



In [None]:
# String form of the model fitted to the evidence
a = str(gp)

In [None]:
# Display the model string as the cell output
a

In [None]:
# Scratch cell: take a fitted prior model, swap in the posterior evidence and plot it.
# NOTE(review): this cell cannot run on a fresh kernel as written:
#   - `subject`, `posterior_data` and `data` are not defined anywhere in the
#     notebook (the loaded arrays are `data_posterior` / `data_prior`);
#   - compute_gps_for_dataset has 'gpss_objects' commented out of its return
#     dict, so results['gpss_objects'] raises KeyError;
#   - result dicts are keyed by str(id).

#Best fitting kernel
# (this assignment is immediately overwritten by the next one)
k = copy.deepcopy(results['gpss_objects'][subject][results['maxs'][subject][0][0]])

#Specific (any other) kernel
k = copy.deepcopy(results['gpss_objects'][subject]['l+r*p'])

# Use the posterior data
usePosterior = True

if(usePosterior):
    dataset = posterior_data
else:
    dataset = data


# Filter the relevant data
filtered_data = dataset[dataset['f0'] == subject]

# Get X and Y
X = array_to_matrix(filtered_data['f3'])
Y = array_to_matrix(filtered_data['f4'])


if(usePosterior):
    # Evidence: rows 1..n-1 (row 0 is skipped) replace the model's training data;
    # the model is not re-optimised afterwards
    n = 68
    Xe = X[1:n] 
    Ye = Y[1:n]
    k.X = Xe
    k.Y = Ye
    #k.optimize()
    
    #k.Y = np.concatenate([Ye, k.Y.value[68:]])
    
    
# Plot (uses the debug `plot(m, X, Y)` defined in this section)
plot(k, X, Y)

In [None]:
# Display the model `k` as the cell output
k

In [None]:
# NOTE(review): `name` is not defined anywhere in the notebook, so this line
# raises NameError; presumably the model `k` was meant — confirm.
name.likelihood.variance = 0.01

# Optimise `k` and display it as the cell output
k.optimize()
k

### Prior magic

In [None]:
# NOTE(review): `posterior_data` and `subject` are not defined anywhere in the
# notebook (the loaded posterior array is `data_posterior`) — confirm intent.
dataset = posterior_data

# Filter the relevant data
filtered_data = dataset[dataset['f0'] == subject]

# Get X and Y as column matrices
X = array_to_matrix(filtered_data['f3'])
Y = array_to_matrix(filtered_data['f4'])

In [None]:
# Evidence = rows 1 .. n-1 of the curve (row 0 is skipped)
n = 68 #68
Xevidence = X[1:n] 
Yevidence = Y[1:n]

In [None]:
# Prior model
# NOTE(review): `subject` is undefined and compute_gps_for_dataset does not
# return 'gpss_objects' (the key is commented out), so this lookup fails as written.
m = results['gpss_objects'][subject]['l+p']

# New model on the evidence only, with a fresh (non-optimised) l+p kernel.
# `r` is created but not used in this cell.
l = GPflow.kernels.Linear(1)
p = GPflow.kernels.PeriodicKernel(1)
r = GPflow.kernels.RBF(1)
m2 = GPflow.gpr.GPR(Xevidence, Yevidence, kern=l+p)

In [None]:
# Disabled experiment (the condition is a literal False, so nothing here runs):
# put a Gaussian prior on each parameter of m2, using the prior model's value
# for both arguments of GPflow.priors.Gaussian.
# NOTE(review): m2 is built with kern=l+p above, yet rbf parameters are
# referenced here — confirm which composition this was written for.
if(False):
    # Transfer the parameters
    m2.kern.linear.variance.prior = GPflow.priors.Gaussian(m.kern.linear.variance.value, m.kern.linear.variance.value)
    
    m2.kern.periodickernel.lengthscales.prior = GPflow.priors.Gaussian(m.kern.periodickernel.lengthscales.value, m.kern.periodickernel.lengthscales.value)
    m2.kern.periodickernel.period.prior = GPflow.priors.Gaussian(m.kern.periodickernel.period.value, m.kern.periodickernel.period.value)
    m2.kern.periodickernel.variance.prior = GPflow.priors.Gaussian(m.kern.periodickernel.variance.value, m.kern.periodickernel.variance.value)
    
    m2.kern.rbf.lengthscales.prior = GPflow.priors.Gaussian(m.kern.rbf.lengthscales.value, m.kern.rbf.lengthscales.value)
    m2.kern.rbf.variance.prior = GPflow.priors.Gaussian(m.kern.rbf.variance.value, m.kern.rbf.variance.value)
    
    m2.likelihood.variance.prior = GPflow.priors.Gaussian(m.likelihood.variance.value, m.likelihood.variance.value)

In [None]:
# Display the prior model `m` as the cell output
m

In [None]:
# Copy the prior model's parameter values into m2 (always runs: the condition
# is a literal True).
# NOTE(review): the attribute paths use `kern.prod.*` and rbf parameters,
# while `m` is looked up as 'l+p' and m2 is built with kern=l+p above —
# these paths look written for a different composition (e.g. 'l+r*p'); confirm.
if(True):
    m2.kern.linear.variance = m.kern.linear.variance.value
    
    m2.kern.prod.periodickernel.lengthscales = m.kern.prod.periodickernel.lengthscales.value
    m2.kern.prod.periodickernel.period = m.kern.prod.periodickernel.period.value
    m2.kern.prod.periodickernel.variance = m.kern.prod.periodickernel.variance.value
    
    m2.kern.prod.rbf.lengthscales = m.kern.prod.rbf.lengthscales.value
    m2.kern.prod.rbf.variance = m.kern.prod.rbf.variance.value
    
    m2.likelihood.variance = m.likelihood.variance.value

In [None]:
# Display the new model `m2` as the cell output
m2

In [None]:
#m2.optimize()

In [None]:
# Plot m2's predictions against the full curve (uses the debug `plot(m, X, Y)`)
plot(m2, X, Y)

Prior by itself

In [None]:
# Fit a single l*r+p model on the first rows of one prior curve and plot it.
# (This cell previously read from `data`, which is never defined in the
# notebook; the loaded prior array is `data_prior`.)
dataset = data_prior

# Filter the relevant data (curve id 19 is hard-coded here)
filtered_data = dataset[dataset['f0'] == 19]

# Get X and Y
X = array_to_matrix(filtered_data['f3'])
Y = array_to_matrix(filtered_data['f4'])

# Training subset: rows 1 .. n-1 (row 0 is skipped)
n = 68 #68
Xe = X[1:n] 
Ye = Y[1:n]

# New model
l = GPflow.kernels.Linear(1)
p = GPflow.kernels.PeriodicKernel(1)
r = GPflow.kernels.RBF(1)
m2 = GPflow.gpr.GPR(Xe, Ye, kern=(l*r+p))

m2.optimize()

# Plot against the full curve (uses the debug `plot(m, X, Y)`)
plot(m2, X, Y)

In [None]:
# Log marginal likelihood of m2 (shown as the cell output)
lml(m2)

In [None]:
# Display `m` as the cell output
m

In [None]:
# Display `m2` as the cell output
m2