In [1]:
import pandas as pd
import numpy as np
import random as rand

In [2]:
raw = pd.read_csv("yellow-taxis/1january.csv")

In [3]:
# delete unused columns
del raw['trip_distance']
del raw['passenger_count']
del raw['fare_amount']
del raw['tolls_amount']
del raw['taxes_amount']
del raw['tip_amount']
del raw['payment_amount']
del raw['payment_type']

In [4]:
# convert pickup_dtatetime to datetime
raw["pickup_datetime"] = pd.to_datetime(raw["pickup_datetime"])
raw["dropoff_datetime"] = pd.to_datetime(raw["dropoff_datetime"])

In [5]:
#make sure types are okay
print(list(raw.columns.values))
print([raw[i].dtype for i in list(raw.columns.values)])

['pickup_datetime', 'pickup_latitude', 'pickup_longitude', 'dropoff_datetime', 'dropoff_latitude', 'dropoff_longitude']
[dtype('<M8[ns]'), dtype('float64'), dtype('float64'), dtype('<M8[ns]'), dtype('float64'), dtype('float64')]


In [6]:
#convert it to a numpy matrix
#np_raw = raw.as_matrix()

In [7]:
print(np_raw.dtype)
print(raw.ix[0])
print(len(raw))

object
pickup_datetime      2016-01-01 00:00:00
pickup_latitude                  40.7347
pickup_longitude                -73.9904
dropoff_datetime     2016-01-01 00:00:00
dropoff_latitude                 40.7324
dropoff_longitude               -73.9818
Name: 0, dtype: object
10906858


In [16]:
from State import State
class MarkovChain:
    # num centers are we picking for k-means
    k = 10
    def __init__(self, raw):
        self.state_set = set()
        self.id_to_state = {}
        self.adj_matrix = None
        self.raw = raw
        
        self.initialize_centers(self.k)
        self.build_states_kmeans(1000)
        self.add_edges()
        self.make_adjacency_matrix()
        

    def initialize_centers(self, k):
        ind = [i for i in range(len(self.raw))]
        rand.shuffle(ind)
        centers = ind[:k]
        # initialize centers
        ident = 0
        for c_ind in centers:
            # out of convenience, we aren't messing with pickup lat lon
            lat = self.raw.ix[c_ind]["dropoff_latitude"]
            lon = self.raw.ix[c_ind]["dropoff_longitude"]
            s = State((lat, lon), ident)
            self.state_set.add(s)
            self.id_to_state[ident] = s
            ident += 1
    
    def build_states_kmeans(self, iterations):
        # run kmeans algorithm
        for it in range(iterations):
            for ind, row in self.raw.iterrows():
                pos_start, pos_end = self.row_to_positions(row)
                closest_to_start = self.find_closest_state(pos_start)
                closest_to_end = self.find_closest_state(pos_end)
                
                closest_to_start.add_position(pos_start)
                closest_to_end.add_position(pos_end)
            for s in self.state_set:
                s.update_center()
        
    def add_edges(self):
        for ind, row in self.raw.iterrows():
            pos_start, pos_end = self.row_to_positions(row)
            closest_to_start = self.find_closest_state(pos_start)
            closest_to_end = self.find_closest_state(pos_end)

            ##Add this edge to markov state
            closest_to_start.add_destination(closest_to_end.id)
    
    
    def make_adjacency_matrix(self):
        self.adj_matrix = np.ndarray(shape=(len(self.state_set), len(self.state_set)), dtype=float, order='C')
        for i in sorted(self.id_to_state.keys()):
            for j in sorted(self.id_to_state.keys()):
                self.adj_matrix[i][j] = self.transition_probability(i, j)
        
    
    ##
    # GETTERS
    ##
    def get_adjacency_matrix(self):
        return self.adj_matrix
    
    ###
    # HELPER METHODS
    ###
    def find_closest_state(self, pos):
        def distance(state, pos):
            clat, clon = state.center
            return ((clat - pos[0])**2 + (clon - pos[1])**2)**0.5
        closest = None
        min_dist = None
        for state in self.state_set:
            d = distance(state, pos)
            if closest == None or d < min_dist:
                closest = state
                min_dist = d
        return closest
    
    def row_to_positions(self, row):
        lats = row["pickup_latitude"]
        lons = row["pickup_longitude"]
        pos_start = (lats, lons)

        late = row["dropoff_latitude"]
        lone = row["dropoff_longitude"]
        pos_end = (late, lone)
        
        return pos_start, pos_end
    
    def transition_probability(self, i, j):
        return self.id_to_state[i].probability_to(j)


In [57]:
m = MarkovChain(raw[:1000])

In [58]:
adjm = m.get_adjacency_matrix()

In [59]:
np.sum(adjm, axis=1)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

In [60]:
S, U = np.linalg.eig(adjm.T)

In [61]:
print(U.T)

[[  0.00000000e+00 +0.00000000e+00j   0.00000000e+00 +0.00000000e+00j
    0.00000000e+00 +0.00000000e+00j   0.00000000e+00 +0.00000000e+00j
    0.00000000e+00 +0.00000000e+00j   0.00000000e+00 +0.00000000e+00j
    0.00000000e+00 +0.00000000e+00j   1.00000000e+00 +0.00000000e+00j
    0.00000000e+00 +0.00000000e+00j   0.00000000e+00 +0.00000000e+00j]
 [  1.70478868e-01 +0.00000000e+00j   1.92122145e-01 +0.00000000e+00j
    3.79423182e-03 +0.00000000e+00j   9.09437689e-02 +0.00000000e+00j
    9.05368445e-02 +0.00000000e+00j   2.06262230e-01 +0.00000000e+00j
    4.04409463e-03 +0.00000000e+00j  -9.26984487e-01 +0.00000000e+00j
    5.76043140e-02 +0.00000000e+00j   1.11197990e-01 +0.00000000e+00j]
 [  2.21152673e-01 +0.00000000e+00j   2.55189027e-01 +0.00000000e+00j
    6.34671060e-03 +0.00000000e+00j   1.46289905e-01 +0.00000000e+00j
    8.16839458e-04 +0.00000000e+00j   1.66742708e-01 +0.00000000e+00j
   -9.06927356e-01 +0.00000000e+00j  -4.38056245e-02 +0.00000000e+00j
    9.12575804e-02

In [56]:
# U is row major
inv_dist = U.T[0]
print(inv_dist)

[ 1.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j
  0.+0.j]


In [46]:
norm_inv_dist = inv_dist / float(sum(inv_dist))

  if __name__ == '__main__':


In [52]:
inv2 = sorted(norm_inv_dist)

In [27]:
old_norm_inv_dist = sorted(norm_inv_dist)

In [49]:
old_norm_inv_dist

[0.003753188971443896,
 0.0037661874147916323,
 0.036396375154513827,
 0.060658906040522306,
 0.09399390751987341,
 0.096678904976299648,
 0.12510792201095949,
 0.17551245797773041,
 0.1928496884237593,
 0.21128246151010613]