In [1]:
import pandas as pd
import numpy as np
import random as rand

In [2]:
raw = pd.read_csv("yellow-taxis/1january.csv")

In [3]:
# delete unused columns
del raw['trip_distance']
del raw['passenger_count']
del raw['fare_amount']
del raw['tolls_amount']
del raw['taxes_amount']
del raw['tip_amount']
del raw['payment_amount']
del raw['payment_type']

In [4]:
# convert pickup_dtatetime to datetime
raw["pickup_datetime"] = pd.to_datetime(raw["pickup_datetime"])
raw["dropoff_datetime"] = pd.to_datetime(raw["dropoff_datetime"])

In [5]:
#make sure types are okay
print(list(raw.columns.values))
print([raw[i].dtype for i in list(raw.columns.values)])

['pickup_datetime', 'pickup_latitude', 'pickup_longitude', 'dropoff_datetime', 'dropoff_latitude', 'dropoff_longitude']
[dtype('<M8[ns]'), dtype('float64'), dtype('float64'), dtype('<M8[ns]'), dtype('float64'), dtype('float64')]


In [6]:
#convert it to a numpy matrix
#np_raw = raw.as_matrix()

In [7]:
#print(np_raw.dtype)
print(raw.ix[0])
print(len(raw))

pickup_datetime      2016-01-01 00:00:00
pickup_latitude                  40.7347
pickup_longitude                -73.9904
dropoff_datetime     2016-01-01 00:00:00
dropoff_latitude                 40.7324
dropoff_longitude               -73.9818
Name: 0, dtype: object
10906858


In [8]:
from State import State
class MarkovChain:
    # num centers are we picking for k-means
    k = 10
    def __init__(self, raw):
        self.state_set = set()
        self.id_to_state = {}
        self.adj_matrix = None
        self.raw = raw
        
        self.find_optimal_k()
        
        self.initialize_centers(self.k)
        epsilon = 1e-9
        self.build_states_kmeans(1000, epsilon)
        
        self.add_points_edges()
        self.make_adjacency_matrix()
    
#     def find_optimal_k(self):
#         []
    
    def initialize_centers(self, k):
        ind = [i for i in range(len(self.raw))]
        rand.shuffle(ind)
        centers = ind[:k]
        # initialize centers
        ident = 0
        for c_ind in centers:
            # out of convenience, we aren't messing with pickup lat lon
            lat = self.raw.ix[c_ind]["dropoff_latitude"]
            lon = self.raw.ix[c_ind]["dropoff_longitude"]
            s = State((lat, lon), ident)
            self.state_set.add(s)
            self.id_to_state[ident] = s
            ident += 1
    
    def build_states_kmeans(self, iterations, epsilon):
        # run kmeans algorithm
        max_diff = 0
        while iterations > 0 and max_diff > epsilon:
            for ind, row in self.raw.iterrows():
                pos_start, pos_end = self.row_to_positions(row)
                closest_to_start = self.find_closest_state(pos_start)
                closest_to_end = self.find_closest_state(pos_end)
                
                closest_to_start.add_position(pos_start)
                closest_to_end.add_position(pos_end)
            max_diff = 0
            for s in self.state_set:
                max_diff = max(max_diff, s.update_center())
        print(iterations) 

    def add_points_edges(self):
        for s in self.state_set:
            s.clear_stored_data()
        for ind, row in self.raw.iterrows():
            pos_start, pos_end = self.row_to_positions(row)
            closest_to_start = self.find_closest_state(pos_start)
            closest_to_end = self.find_closest_state(pos_end)
            
            fare = self.row_to_fare(row)
            tdistance = self.row_to_distance(row)
            
            #Add points to respective states
            closest_to_start.store_data((pos_start, fare, tdistance))
            closest_to_end.store_data((pos_end))
            
            ##Add this edge to markov state
            closest_to_start.add_destination(closest_to_end.id)
    
    
    def make_adjacency_matrix(self):
        self.adj_matrix = np.ndarray(shape=(len(self.state_set), len(self.state_set)), dtype=float, order='C')
        for i in sorted(self.id_to_state.keys()):
            for j in sorted(self.id_to_state.keys()):
                self.adj_matrix[i][j] = self.transition_probability(i, j)
        
    
    ##
    # GETTERS
    ##
    def get_adjacency_matrix(self):
        return self.adj_matrix
    
    ###
    # HELPER METHODS
    ###
    def find_closest_state(self, pos):
        def distance(state, pos):
            clat, clon = state.center
            return ((clat - pos[0])**2 + (clon - pos[1])**2)**0.5
        closest = None
        min_dist = None
        for state in self.state_set:
            d = distance(state, pos)
            if closest == None or d < min_dist:
                closest = state
                min_dist = d
        return closest
    
    def row_to_positions(self, row):
        lats = row["pickup_latitude"]
        lons = row["pickup_longitude"]
        pos_start = (lats, lons)

        late = row["dropoff_latitude"]
        lone = row["dropoff_longitude"]
        pos_end = (late, lone)
        
        return pos_start, pos_end
    
    def row_to_fare(self, row):
        return row["fare_amount"]
    def row_to_distance(self, row):
        return row["trip_distance"]
    
    def transition_probability(self, i, j):
        return self.id_to_state[i].probability_to(j)


In [9]:
m = MarkovChain(raw[:1000])

1000


In [10]:
adjm = m.get_adjacency_matrix()

In [11]:
np.sum(adjm, axis=1)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

In [12]:
S, U = np.linalg.eig(adjm.T)

In [13]:
print(U.T)

[[ 0.55315859+0.j          0.49349245+0.j          0.15747693+0.j
   0.13422643+0.j          0.12811893+0.j          0.50109092+0.j
   0.24676390+0.j          0.22184944+0.j          0.13814030+0.j
   0.10471934+0.j        ]
 [ 0.10425707+0.j         -0.93417389+0.j          0.09911883+0.j
   0.09044543+0.j         -0.00942470+0.j          0.22853887+0.j
   0.12779319+0.j          0.12395325+0.j          0.09051608+0.j
   0.07897585+0.j        ]
 [ 0.83734684+0.j          0.11041494+0.j         -0.21056009+0.j
  -0.15499751+0.j         -0.14830316+0.j          0.16656596+0.j
  -0.09357585+0.j         -0.38557946+0.j         -0.10409987+0.j
  -0.01721180+0.j        ]
 [-0.46160570+0.j          0.09437083+0.j         -0.23816645+0.j
   0.11988994+0.j          0.12654285+0.j          0.61124211+0.j
   0.34102152+0.j         -0.10213848+0.j         -0.43258380+0.j
  -0.05857283+0.j        ]
 [-0.21964062-0.19568611j  0.06528326+0.10921023j -0.30549779-0.00133182j
   0.54438585+0.j         

In [14]:
# U is row major
inv_dist = U.T[0]
print(inv_dist)

[ 0.55315859+0.j  0.49349245+0.j  0.15747693+0.j  0.13422643+0.j
  0.12811893+0.j  0.50109092+0.j  0.24676390+0.j  0.22184944+0.j
  0.13814030+0.j  0.10471934+0.j]


In [15]:
norm_inv_dist = inv_dist / float(sum(inv_dist))

  if __name__ == '__main__':


In [16]:
inv2 = sorted(norm_inv_dist)

In [17]:
old_norm_inv_dist = sorted(norm_inv_dist)

In [18]:
old_norm_inv_dist

[(0.039088424357382974+0j),
 (0.047822751995688396+0j),
 (0.050102487051565926+0j),
 (0.051563409956903644+0j),
 (0.058781165611778805+0j),
 (0.082809390578115802+0j),
 (0.092109172220886437+0j),
 (0.18420514623926404+0j),
 (0.18704141779422603+0j),
 (0.20647663419418782+0j)]