In [1]:
%matplotlib inline
import edward as ed
from edward.models import Normal
import pandas as pd
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# our functions
import setup
import data
import visualizations
import basis_functions
pd.set_option('float_format', '{:f}'.format)
setup.set_random_seeds(42)
plt.style.use("seaborn-talk")
sns.set_context("talk")

In [17]:
manhattan = data.get_borough_data("data/preprocessed.csv", "Manhattan")
ues_to_msh = data.get_neighborhood_to_neighborhood("Morningside Heights", "Upper East Side-Carnegie Hill", manhattan)

In [1]:
indicator_cols = ["pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude", "manhattan_distance", "pickup_hour", "pickup_timestamp"]
y_cols = ["trip_duration"]

x_train_raw, y_train_raw, x_test_raw, y_test_raw = data.train_test_split(ues_to_msh, 0.1, indicator_cols, y_cols)
x_train = data.standardize_cols(x_train_raw)
x_test = data.standardize_cols(x_test_raw)


NameError: name 'data' is not defined

# Box's Loop - Iteration 3

## Model: Gaussian Process

Gaussian processes (GPs) are a supervised machine learning algorithm that measures the similarity between input data points, using a predefined kernel function, to predict the value of an unseen data point. We like to think about GPs as: defining an infinite distribution on functions over a continuous sapce where we observe some data points and want to assign probabilities to all the ways a line could be drawn through those points. The idea is that we hope these lines (or functions) we draw are similar with high probability to that true function which we will never know. 

Drawing a parallel to our polynomial basis GLM example: instead of choosing a polynomial basis function of degree four, with GPs using a kernel function we'd like to infer the true function from our data. Hence, GPs allow us retain the flexibility of capturing non-linearities in our data but accounting for "infinite" numbers of basis functions. 

### Interpretation of Kernel Function

GPs are parameterized by a pre-determined *kernel function* which is positive-semidefinite covariance matrix that calculated distanced between every pair of $N$ observed points. The Kernel function must be a square matrix and allows use to explore *smoothness* and *periodicity* in our observed data. 

In our analysis we will explore a few kernel functions but note there are many popular kernel functions, some more applicable than others and exploring them all was impossible. 

### Model Overview

A GP can be specified entirely by it's Kernel Function and mean (often assumed to 0): 

Where,

**Prior:** $p(f) = GP(\ 0,\ K(x,x',\theta)\ )$ where $\theta$ = *length_scale* $l_f$, sigma $\sigma_f$

**Likelihood:**  $p(y\ |\ f,\ X,\ \theta) = GP(\ f,\ K(x,x', \theta)\ )$

In [21]:
n_test, d_test = x_test.shape
y_n, y_d = y_train_raw.shape
N, D = x_train.shape

x_test_gp = tf.placeholder(tf.float32, [n_test, d_test])
x_gp = tf.placeholder(tf.float32, [N, D])
y_gp = tf.placeholder(tf.float32, [y_n, y_d])

mu, var = tf.nn.moments(y_gp, 1)

v_i = tf.eye(N) * var

k = ed.rbf(x_gp, lengthscale=0.9996, variance=1.0) + v_i

k_star = ed.rbf(x_test_gp, x_gp)

k_star_star = ed.rbf(x_test_gp)

k_inv = tf.matrix_inverse(k)

mu_star = tf.matmul(tf.matmul(k_star, k_inv), y_gp)

tmp1 = tf.matmul(k_star, k_inv)

tmp2 = tf.matmul(tmp1, k_star, transpose_b=True)

v_star = k_star_star - tmp2

L = tf.cholesky(tf.abs(v_star))

p_y = ed.models.MultivariateNormalTriL(loc=mu_star, scale_tril=L)

expected = mu_star.eval(session=tf.Session(), feed_dict={
    x_gp: x_train.as_matrix(),
    y_gp: y_train_raw.as_matrix(),
    x_test_gp: x_test.as_matrix()
})

print("prediction")
print(expected)
print("actual")
print(y_test_raw)
print("error")
err = y_test_raw - expected
print(np.abs(err))
print("mean abs error")
print(np.abs(err).mean())

prediction
[[ 18.26642227]
 [ 26.42837715]
 [  5.73762751]
 [  8.51581955]
 [ 14.535079  ]
 [ -1.86046875]
 [  8.75221634]
 [  8.05523396]
 [  9.51730537]
 [  6.43311548]
 [  1.9517163 ]
 [ 10.27960205]
 [  7.09504509]
 [  6.02735806]
 [ 10.8634882 ]
 [ 17.25853157]
 [ 18.54484558]
 [ 25.27394867]
 [  4.41430855]
 [ 22.28453636]
 [ 11.94912529]
 [ 11.13212872]
 [ 17.81834984]
 [  9.66673183]
 [ 15.47781277]
 [  4.29307079]
 [  8.08767128]
 [ 12.13451099]
 [ 10.48649025]
 [ 22.55121231]
 [  4.89026785]
 [ 17.43288803]
 [ 17.5728569 ]
 [ 21.16965866]
 [ 17.33369827]
 [ -1.63364589]
 [ 23.55616951]
 [ 15.32825565]
 [ 30.81552505]
 [ 19.00355148]
 [ 27.94857597]
 [ 18.97401047]
 [ 10.6826086 ]
 [ 15.84075451]
 [  9.01343346]
 [  8.49697399]
 [ 10.62432957]
 [ 24.44162178]
 [ 14.22798824]
 [ 11.37508106]
 [ 13.18295288]
 [ 14.06598186]
 [  9.37521458]
 [  4.59340715]
 [ 19.54473686]
 [  4.50634336]
 [ 12.23362827]
 [ 27.1785183 ]
 [ 17.46229744]
 [ 14.40890312]
 [ 23.90852928]
 [ 18.6775550

In [6]:
"""
inf_gp_x = tf.placeholder(tf.float32, [N, D])
inf_gp_y = ed.models.MultivariateNormalTriL(loc=tf.zeros(N), scale_tril=tf.cholesky(ed.util.rbf(inf_gp_x)))
inf_gp_qf = ed.models.Normal(loc=tf.Variable(tf.random_normal([N])),
            scale=tf.nn.softplus(tf.Variable(tf.random_normal([N]))))
inference_gp = ed.KLqp({inf_gp_y: inf_gp_qf}, data={inf_gp_x: x_train_standardized_harlem_to_bat_park.as_matrix(),
                                    inf_gp_y: np.reshape(y_train_harlem_to_bat_park.as_matrix(), (y_train_harlem_to_bat_park.shape[0],))})
inference_gp.run(n_iter=100)
"""

100/100 [100%] ██████████████████████████████ Elapsed: 0s | Loss: 40.634


In [8]:
"""
ed.evaluate("mean_absolute_error",
            data={inf_gp_x: x_test_standardized_harlem_to_bat_park.as_matrix(),
                  inf_gp_qf: np.reshape(y_test_harlem_to_bat_park.as_matrix(), (y_test_harlem_to_bat_park.shape[0]))},
             output_key=inf_gp_qf)
"""

TypeError: Key-value pair in data does not have same shape: (32,), (7,)