# Taxi Trips and Traffic

Where most models use real-time data from users to predict arrival
times at any given moment, we believe they could be improved by including a predictive
element. Our intent is to use the NYC Taxi and Limousine Commission's yellow and green cab data set to estimate density of pickup and dropoffs at any given place and time. We will then use the density as a proxy for traffic to estimate the time it takes to arrive at a destination.

In [1]:
%matplotlib inline
import edward as ed
from edward.models import Normal
import pandas as pd
pd.set_option('float_format', '{:f}'.format)
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# our functions
from setup import set_random_seeds
from data import get_borough_data
from visualizations import visualize_by_borough
set_random_seeds(42)
plt.style.use("seaborn-talk")
sns.set_context("talk")

## Data

We used the
[2015 NYC Yellow Cab Dataset](http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml),
which consists of pickup and dropoff coordinates for trips, along 
with metadata like cost, distance, and number of passengers.

In [None]:
manhattan = get_borough_data("data/preprocessed.csv", "Manhattan")
indices = np.random.choice(manhattan.shape[0], size=10000, replace=False)
axes = visualize_by_borough(manhattan.iloc[indices, :])
plt.show(axes)

## Model

We will begin by trying to model the trip duration between two neighborhoods. Specifically, Morningside Heights and Upper East Side-Carnegie Hill

In [3]:
def get_neighborhood_to_neighborhood(source_neighborhood, sink_neighborhood, full_dataset):
    x = full_dataset.where((full_dataset["pickup_neighborhood_name"] == source_neighborhood) &
                           (full_dataset["dropoff_neighborhood_name"] == sink_neighborhood)).dropna()
    return x
    
def add_arrival_timestamp(x):
    x["dropoff_timestamp"] = x["pickup_timestamp"] + x["trip_duration"]
    return x

def add_dropoff_datetime(x):
    # This takes a while...
    x["dropoff_datetime"] = x["pickup_datetime"] + x["trip_duration"].apply(lambda x: pd.Timedelta(seconds=x))
    return x

def add_dropoff_hour(x):
    if "dropoff_datetime" not in x.columns:
        x = add_dropoff_datetime(x)
    x["dropoff_hour"] = x["dropoff_datetime"].apply(lambda x: x.hour)
    return x
    
def add_pickup_hour(x):
    x["pickup_hour"] = x["pickup_datetime"].apply(lambda x: x.hour)
    return x

# manhattan = add_arrival_timestamp(manhattan)
# ues_to_msh = get_neighborhood_to_neighborhood("Morningside Heights", "Upper East Side-Carnegie Hill", manhattan)
# ues_to_msh = add_dropoff_hour(ues_to_msh)
# ues_to_msh = add_pickup_hour(ues_to_msh)
# visualize_by_borough(ues_to_msh)

Lets graph the trip duration by hour of day

In [4]:
# note that the selection of the trip_duration column here is 
# arbitrary, the count would be the same regardless of column selected
avg_duration = ues_to_msh.groupby("dropoff_hour")["trip_duration"].mean().to_frame("avg_trip_duration")
avg_duration["hour"] = range(24)
sns.lmplot(x="hour", y="avg_trip_duration", data=avg_duration, fit_reg=False)
plt.show()
sns.lmplot(x="dropoff_hour", y="trip_duration", data=ues_to_msh, fit_reg=False)
plt.show()

NameError: name 'ues_to_msh' is not defined

Need to remove some outliers

In [8]:
def remove_outliers(data, col, frac_stddev):
    bound = frac_stddev * data.describe()[col]["std"]
    return data.where(np.abs(data[col]) < bound).dropna()
# ues_to_msh = remove_outliers(ues_to_msh, "trip_duration", 1.0)

re-graph

In [None]:
sns.lmplot(x="dropoff_hour", y="trip_duration", data=ues_to_msh, fit_reg=False)
plt.show()


For some reason, there are also a bunch of -1's. Maybe missing data?

In [11]:
def remove_leq_zero(data, col):
    return data.where(data[col] > 0).dropna()
# ues_to_msh = remove_leq_zero(ues_to_msh, "trip_duration")

re-graph

In [None]:
sns.lmplot(x="dropoff_hour", y="trip_duration", data=ues_to_msh, fit_reg=False)
plt.show()

Add manhattan distance to the dataset

In [5]:
def add_manhattan_distance(data):
    data["manhattan_distance"] = abs(data["pickup_longitude"] - data["dropoff_longitude"] + \
                                 data["pickup_latitude"] - data["dropoff_latitude"])
    return data

from haversine import haversine

def add_manhattan_distance2(data):
    
    def haversineDistance(row):
        pickup_point = (row['pickup_latitude'], row['pickup_longitude'])
        dropoff_point = (row['dropoff_latitude'], row['dropoff_longitude'])
        # Return distance in miles rounded to 2 decimals
        return round( haversine(pickup_point, dropoff_point, miles=True), 2)
    
    data['manhattan_distance'] = data.apply(lambda row: haversineDistance(row),axis=1)
    return data[(0 < data.manhattan_distance) & (data.manhattan_distance < 50)]

def add_trip_speed(data):
    data['trip_speed_mph'] = data['manhattan_distance'] / data['trip_duration']
    return data[(0 < data.trip_speed_mph) & (data.trip_speed_mph < 150)]

# def add_day_of_week(data, includeNames=False):
#     dayDict = ({0:'Monday',1:'Tuesday',2:'Wednesday',
#                 3:'Thursday', 4:'Friday',5:'Saturday',6:'Sunday'})
        
#     if includeNames:
#         data['day_idx'] = pd.to_datetime(df.pickup_date, format='%Y-%m-%d').dt.dayofweek
#         data['weekday'] = data.loc[:, ('day_idx')].apply(lambda row: dayDict.get(row))
#     else:
#         data['day'] = pd.to_datetime(data.pickup_date, format='%Y-%m-%d').dt.dayofweek
#     return data

# def add_all_time(data):
#     data['pickup_datetime'] = pd.to_datetime(data.loc['pickup_datetime'])
#     data['pickup_date'] = data.loc['pickup_datetime'].dt.date
#     data['pickup_time'] = data.loc['pickup_datetime'].dt.time
#     return data



In [42]:
manhattan = get_borough_data("data/preprocessed.csv", "Manhattan")
manhattan = manhattan[0:5]

In [43]:
#manhattan = add_arrival_timestamp(manhattan)
#manhattan = add_pickup_hour(manhattan)
#manhattan = remove_outliers(manhattan, "trip_duration", 1.0)
#manhattan = remove_leq_zero(manhattan, "trip_duration")
#manhattan = add_manhattan_distance2(manhattan)
#manhattan = add_day_of_week(manhattan)

x_ = manhattan.loc[:, ["pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]]

# x_["pickup_longitude"] = x_.pickup_longitude.round(3)
# x_["pickup_latitude"] = x_.pickup_longitude.round(3)
# x_["dropoff_longitude"] = x_.pickup_longitude.round(3)
# x_["dropoff_latitude"] = x_.pickup_longitude.round(3)
x_

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,-73.982155,40.767937,-73.96463,40.765602
1,-73.980415,40.738564,-73.999481,40.731152
2,-73.979027,40.763939,-74.005333,40.710087
3,-74.01004,40.719971,-74.012268,40.706718
4,-73.973053,40.793209,-73.972923,40.78252


In [44]:
y_ = manhattan.loc[:, ["pickup_datetime"]]
y_

Unnamed: 0,pickup_datetime
0,2016-03-14 17:24:55
1,2016-06-12 00:43:35
2,2016-01-19 11:35:24
3,2016-04-06 19:32:31
4,2016-03-26 13:30:55


In [168]:
from scipy.spatial.distance import pdist, squareform

def is_pos_def(x):
    return np.all(np.linalg.eigvals(x) > 0)

# this is an NxD matrix, where N is number of items and D its dimensionalites
sigma = 1.0
n = x_.shape[0]
pairwise_sq_dists = squareform(pdist(x_, 'sqeuclidean'))
print("\n pairwise \n", pairwise_sq_dists)

print("\n exp pairwise \n", np.exp(-pairwise_sq_dists))
K = np.exp(-pairwise_sq_dists / sigma**2)

L = np.linalg.cholesky(K + 1e-15*np.eye(n))
print("\n cholesky \n", L)



 pairwise 
 [[ 0.          0.00326725  0.00476443  0.00881499  0.00107654]
 [ 0.00326725  0.          0.0011238   0.00198381  0.00638441]
 [ 0.00476443  0.0011238   0.          0.00295448  0.00718942]
 [ 0.00881499  0.00198381  0.00295448  0.          0.01402585]
 [ 0.00107654  0.00638441  0.00718942  0.01402585  0.        ]]

 exp pairwise 
 [[ 1.          0.99673809  0.9952469   0.99122374  0.99892404]
 [ 0.99673809  1.          0.99887683  0.99801815  0.99363593]
 [ 0.9952469   0.99887683  1.          0.99704988  0.99283636]
 [ 0.99122374  0.99801815  0.99704988  1.          0.98607205]
 [ 0.99892404  0.99363593  0.99283636  0.98607205  1.        ]]

 cholesky 
 [[ 1.          0.          0.          0.          0.        ]
 [ 0.99673809  0.08070432  0.          0.          0.        ]
 [ 0.9952469   0.0852041   0.04715797  0.          0.        ]
 [ 0.99122374  0.12425229 -0.00104501  0.04511947  0.        ]
 [ 0.99892404 -0.02514992  0.01703172 -0.02088771  0.0281401 ]]


In [115]:
sess = tf.InteractiveSession()

In [203]:
# Not correct -- see pairwise below
def rbf_kernel(X, gamma=-1.0):
    # Gaussian (RBF) kernel k(X,X')
    X = tf.convert_to_tensor(X, dtype=tf.float32)
    gamma = tf.constant(gamma)
    sq_vec = tf.multiply(2., tf.matmul(X, tf.transpose(X)))
    kernel = tf.exp(tf.multiply(gamma, tf.abs(sq_vec)))
    return kernel

from scipy.spatial.distance import pdist, squareform

def pairwise_sq_dist_kernel(X, sigma = 1.0):
    pairwise_sq_dists = squareform(pdist(x_, 'sqeuclidean'))
    K = np.exp(-pairwise_sq_dists / sigma**2)
    return tf.convert_to_tensor(K)

In [213]:
# Not needed but literally checked to make sure the data subset was PSD
# since tf.cholesky requires square + PSD matrix
def is_positive_definite(x):
    print("Is postive semi def: ",np.all(np.linalg.eigvals(x) > 0))

a = x_.round(2)
print("X input:\n", a)

b = tf.convert_to_tensor( x_.round(2) )
Y = pairwise_sq_dist_kernel(b)
Y.eval()

#chol = tf.cholesky(tf.convert_to_tensor(Y))
#chol.eval()

f = MultivariateNormalTriL(loc=tf.cast(tf.zeros(5), tf.float64), scale_tril=tf.cholesky(tf.convert_to_tensor(Y)))
f.eval()


X input:
 [[-73.98  40.77 -73.96  40.77]
 [-73.98  40.74 -74.    40.73]
 [-73.98  40.76 -74.01  40.71]
 [-74.01  40.72 -74.01  40.71]
 [-73.97  40.79 -73.97  40.78]]


array([ 1.78826945,  1.72643273,  1.76500818,  1.62409339,  1.82587397])

In [None]:
from observations import crabs

data, metadata = crabs("~/data")
X_train = data[:100, 3:]
y_train = data[:100, 1]

print(X_train)
N = X_train.shape[0]  # number of data points
D = X_train.shape[1]  # number of features

print("Number of data points: {}".format(N))
print("Number of features: {}".format(D))

In [160]:
X_train.dtype

K = ed.rbf(tf.cast(X_train, tf.float32))
# # print("edward rbf")
# print(type(K))
K.eval()
# chol = tf.cholesky(K)
# chol.eval()

array([[  1.00000000e+00,   1.17370207e-02,   1.07571341e-05, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.17370207e-02,   1.00000000e+00,   1.62827581e-01, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.07571341e-05,   1.62827581e-01,   1.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.00000000e+00,   7.67101049e-01,   1.13379360e-06],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          7.67101049e-01,   9.99511838e-01,   2.76712344e-05],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.13379360e-06,   2.76712344e-05,   1.00000000e+00]], dtype=float32)

In [131]:
from edward.models import Bernoulli, MultivariateNormalTriL
from edward.util import rbf

X = tf.placeholder(tf.float32, [N, D])
f = MultivariateNormalTriL(loc=tf.zeros(N), scale_tril=tf.cholesky(rbf(X)))
y = Bernoulli(logits=f)

qf = Normal(loc=tf.Variable(tf.random_normal([N])),
            scale=tf.nn.softplus(tf.Variable(tf.random_normal([N]))))

inference = ed.KLqp({f: qf}, data={X: X_train, y: y_train})
inference.run(n_iter=100)

InvalidArgumentError: Cholesky decomposition was not successful. The input might not be valid.
	 [[Node: inference_2/sample_2/Cholesky_22 = Cholesky[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](inference_2/sample_2/mul_117)]]

Caused by op 'inference_2/sample_2/Cholesky_22', defined at:
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2808, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-131-0e71642c93d2>", line 13, in <module>
    inference.run(n_iter=100)
  File "/Users/colbywise/Desktop/ProbablisticProgramming/ProbabilisticProgrammingProject/src/edward/edward/inferences/inference.py", line 123, in run
    self.initialize(*args, **kwargs)
  File "/Users/colbywise/Desktop/ProbablisticProgramming/ProbabilisticProgrammingProject/src/edward/edward/inferences/klqp.py", line 107, in initialize
    return super(KLqp, self).initialize(*args, **kwargs)
  File "/Users/colbywise/Desktop/ProbablisticProgramming/ProbabilisticProgrammingProject/src/edward/edward/inferences/variational_inference.py", line 68, in initialize
    self.loss, grads_and_vars = self.build_loss_and_gradients(var_list)
  File "/Users/colbywise/Desktop/ProbablisticProgramming/ProbabilisticProgrammingProject/src/edward/edward/inferences/klqp.py", line 146, in build_loss_and_gradients
    return build_reparam_loss_and_gradients(self, var_list)
  File "/Users/colbywise/Desktop/ProbablisticProgramming/ProbabilisticProgrammingProject/src/edward/edward/inferences/klqp.py", line 618, in build_reparam_loss_and_gradients
    qx_copy = copy(qx, scope=scope)
  File "/Users/colbywise/Desktop/ProbablisticProgramming/ProbabilisticProgrammingProject/src/edward/edward/util/random_variables.py", line 228, in copy
    copy(v, dict_swap, scope, True, copy_q, True)
  File "/Users/colbywise/Desktop/ProbablisticProgramming/ProbabilisticProgrammingProject/src/edward/edward/util/random_variables.py", line 244, in copy
    value, dict_swap, scope, True, copy_q, False)
  File "/Users/colbywise/Desktop/ProbablisticProgramming/ProbabilisticProgrammingProject/src/edward/edward/util/random_variables.py", line 86, in _copy_default
    x = copy(x, *args, **kwargs)
  File "/Users/colbywise/Desktop/ProbablisticProgramming/ProbabilisticProgrammingProject/src/edward/edward/util/random_variables.py", line 268, in copy
    new_op = copy(op, dict_swap, scope, True, copy_q, False)
  File "/Users/colbywise/Desktop/ProbablisticProgramming/ProbabilisticProgrammingProject/src/edward/edward/util/random_variables.py", line 314, in copy
    op_def)
  File "/Users/colbywise/anaconda2/envs/pp/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): Cholesky decomposition was not successful. The input might not be valid.
	 [[Node: inference_2/sample_2/Cholesky_22 = Cholesky[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](inference_2/sample_2/mul_117)]]


In [None]:
#manhattan = add_all_time(manhattan)
manhattan = manhattan.dropna()
x = manhattan.drop(["Unnamed: 0","id", "vendor_id", 
                             "store_and_fwd_flag",
                             "pickup_boro_code", 
                             "dropoff_boro_code",
                             "dropoff_neighborhood_code",
                             "pickup_neighborhood_code",
                             "pickup_datetime",
                             "pickup_neighborhood_name",
                             "dropoff_neighborhood_name",
                             "pickup_boro",
                             "dropoff_boro",
                             "dropoff_timestamp"],
                              axis=1)

y = manhattan["trip_duration"]
manhattan.shape

In [None]:
# the timestamp value was too large and causing divergence
x.loc[:, "pickup_timestamp"] = x["pickup_timestamp"] - x["pickup_timestamp"].mean()
x.reset_index(inplace=True, drop=True)
# note that y's indices need to be reset differently because it is actually a series rather than dataframe
y = y.reset_index(drop=True)

In [None]:
train_indices = np.random.choice(x.shape[0], size=int(x.shape[0] * 0.1), replace=False)

x_test = x.iloc[train_indices, :]
y_test = y.iloc[train_indices]


x_train = x.drop(train_indices)
y_train = y.drop(train_indices)

# convert y to minutes instead of seconds
y_train = y_train / 60
y_test = y_test / 60

print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)
# print("\n x_train\n", x_train[0:5])
# print("\n y_train\n", y_train[0:5])

In [None]:
N, D = x_train.shape
X = tf.placeholder(tf.float32, [None, D])
w = Normal(loc=tf.zeros(D), scale=tf.ones(D))
b = Normal(loc=[0.0], scale=1.0)
Y = Normal(loc=ed.dot(X, w) + b, scale=1.0)

qw = ed.models.NormalWithSoftplusScale(loc=tf.Variable(tf.random_normal([D])),
                                       scale=tf.Variable(tf.random_normal([D])))

qb = ed.models.NormalWithSoftplusScale(loc=tf.Variable(tf.random_normal([1])),
                                       scale=tf.Variable(tf.random_normal([1])))

inference = ed.KLqp({w: qw, b: qb}, data={X: x_train.as_matrix(), Y: y_train.as_matrix()})
inference.run(n_iter=10000)

In [None]:
y_post = Normal(loc=ed.dot(X, qw) + qb, scale=1.0)
# only manhattan distance =>  776342.81
# + pickup timestamp      => 7317449.0
# - pickup timestamp
# + pickup hour           =>  225920.56
# + passenger count       =>  222533.69
ed.evaluate("mean_absolute_error", data={X: x_test.as_matrix(),
                                        y_post: y_test.as_matrix()})

Graph distance to trip duration

In [None]:
sns.lmplot(x="manhattan_distance", y="trip_duration", data=ues_to_msh, fit_reg=True)
plt.show()

Lets model this with a simple GLM

In [None]:
ues_to_msh = ues_to_msh.dropna()
x = ues_to_msh.drop(["Unnamed: 0","id", "vendor_id", "pickup_datetime",
                             "store_and_fwd_flag", "trip_duration",
                             "pickup_boro", "pickup_boro_code", "pickup_neighborhood_name",
                             "pickup_neighborhood_code", "dropoff_boro", "dropoff_boro_code",
                             "dropoff_neighborhood_name", "dropoff_neighborhood_code",
                             "dropoff_timestamp", "dropoff_datetime",
                             "dropoff_hour"], axis=1)
y = ues_to_msh["trip_duration"]

# the timestamp value was too large and causing divergence
x.loc[:, "pickup_timestamp"] = x["pickup_timestamp"] - x["pickup_timestamp"].mean()
x.reset_index(inplace=True, drop=True)
# note that y's indices need to be reset differently because it is actually a series rather than dataframe
y = y.reset_index(drop=True)

In [None]:
print(x.columns)

In [None]:
train_indices = np.random.choice(x.shape[0], size=int(x.shape[0] * 0.1), replace=False)

x_train = x.iloc[train_indices, :]
train_ind_5pm = x_train.where(x["pickup_hour"] == 5).dropna()
print(train_ind_5pm)
return
x_train_5pm_pickup = x[train_ind_5pm]
y_train = y.iloc[train_indices]


x_test = x.drop(train_indices)
y_test = y.drop(train_indices)

# convert y to minutes instead of seconds
y_train = y_train / 60
y_test = y_test / 60

x_train = x_train.loc[:, ["manhattan_distance", "pickup_hour", "passenger_count", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]]
# x_train = x_train.loc[:, ["manhattan_distance", "pickup_hour", "passenger_count", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]]
x_test = x_test.loc[:, ["manhattan_distance", "pickup_hour", "passenger_count", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]]

N, D = x_train.shape
X = tf.placeholder(tf.float32, [None, D])
w = Normal(loc=tf.zeros(D), scale=tf.ones(D))
b = Normal(loc=[0.0], scale=1.0)
Y = Normal(loc=ed.dot(X, w) + b, scale=1.0)

qw = ed.models.NormalWithSoftplusScale(loc=tf.Variable(tf.random_normal([D])),
                                       scale=tf.Variable(tf.random_normal([D])))

qb = ed.models.NormalWithSoftplusScale(loc=tf.Variable(tf.random_normal([1])),
                                       scale=tf.Variable(tf.random_normal([1])))

inference = ed.KLqp({w: qw, b: qb}, data={X: x_train.as_matrix(), Y: y_train.as_matrix()})
inference.run(n_iter=10000)

In [None]:
sess = tf.InteractiveSession()

In [None]:
from scipy.spatial.distance import pdist, squareform
  # this is an NxD matrix, where N is number of items and D its dimensionalites
sigma = 1.0
pairwise_sq_dists = squareform(pdist(x_train, 'sqeuclidean'))
print(pairwise_sq_dists)

print(np.exp(-pairwise_sq_dists))
K = np.exp(-pairwise_sq_dists / sigma**2)
L = np.linalg.cholesky(K + 1e-15*np.eye(n))
print(L)
print(x_train.shape)

K = ed.rbf(tf.cast(x_train, tf.float32))
K.eval()


In [None]:

from edward.models import Bernoulli, MultivariateNormalTriL
from edward.util import rbf


N, D = x_train.shape

X = tf.placeholder(tf.float32, [N, D])
#f = MultivariateNormalTriL(loc=tf.zeros(N), scale_tril=tf.cholesky(rbf(tf.cast(x_train, tf.float32))))
f = MultivariateNormalTriL(loc=tf.zeros(N), scale_tril=(tf.cast(L, tf.float32)))
Y = Normal(loc=tf.zeros(N), scale=f)

qf = ed.models.NormalWithSoftplusScale(loc=tf.Variable(tf.random_normal([N])),
                                       scale=tf.Variable(tf.random_normal([N])))


inference = ed.KLqp({f: qf}, data={X: x_train.as_matrix(), Y: y_train.as_matrix()})
inference.run(n_iter=500)

Check the MSE

In [None]:
y_post = Normal(loc=ed.dot(X, qw) + qb, scale=1.0)
# only manhattan distance =>  776342.81
# + pickup timestamp      => 7317449.0
# - pickup timestamp
# + pickup hour           =>  225920.56
# + passenger count       =>  222533.69
ed.evaluate("mean_absolute_error", data={X: x_test.as_matrix(),
                                        y_post: y_test.as_matrix()})


### Results of a Linear Model
On average we have an absolute error of about 4:30 for each trip