In [44]:
import pandas as pd
import numpy as np
import random as rand

In [4]:
# Load the trip records from yellow-taxis/1january.csv into the `raw` DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
raw = pd.read_csv("yellow-taxis/1january.csv")

In [23]:
# Drop the columns this analysis does not use.
# A single `drop(..., errors="ignore")` keeps the cell idempotent: re-running
# it after the columns are already gone is a no-op, whereas the previous
# `del raw[col]` statements raised KeyError on a second execution.
raw = raw.drop(
    [
        'trip_distance',
        'passenger_count',
        'fare_amount',
        'tolls_amount',
        'taxes_amount',
        'tip_amount',
        'payment_amount',
        'payment_type',
    ],
    axis=1,
    errors="ignore",
)

In [28]:
# Parse the pickup and dropoff timestamp columns into pandas datetimes.
raw[["pickup_datetime", "dropoff_datetime"]] = (
    raw[["pickup_datetime", "dropoff_datetime"]].apply(pd.to_datetime)
)

In [29]:
# Sanity check: list the remaining column names, then the dtype of each column
# in the same order.
print(raw.columns.tolist())
print(raw.dtypes.tolist())

['pickup_datetime', 'pickup_latitude', 'pickup_longitude', 'dropoff_datetime', 'dropoff_latitude', 'dropoff_longitude']
[dtype('<M8[ns]'), dtype('float64'), dtype('float64'), dtype('<M8[ns]'), dtype('float64'), dtype('float64')]


In [17]:
# Convert the frame to a NumPy array.
# `DataFrame.as_matrix()` was deprecated and later removed from pandas;
# `.values` returns the same ndarray and is available on old and new versions.
# Because the columns mix datetimes and floats, the resulting array has
# dtype `object`.
np_raw = raw.values

In [45]:
# Quick inspection: array dtype, the first record, and the number of rows.
print(np_raw.dtype)
# `.ix` was removed from pandas; `.iloc[0]` selects the first row by position,
# which is what `.ix[0]` returned on the default integer index from read_csv.
print(raw.iloc[0])
print(len(raw))

object
pickup_datetime      2016-01-01 00:00:00
pickup_latitude                  40.7347
pickup_longitude                -73.9904
dropoff_datetime     2016-01-01 00:00:00
dropoff_latitude                 40.7324
dropoff_longitude               -73.9818
Name: 0, dtype: object
10906858


In [47]:
class MarkovChain:
    """Markov chain over trips whose states are k-means cluster centers.

    Construction runs three steps on the trip table ``raw``:

    1. pick ``k`` random pickup locations as the initial state centers,
    2. refine the centers with ``iterations`` rounds of k-means over every
       pickup and dropoff position,
    3. add one edge per trip from the state nearest its pickup to the state
       nearest its dropoff.

    ``raw`` must be a pandas DataFrame with the columns ``pickup_latitude``,
    ``pickup_longitude``, ``dropoff_latitude`` and ``dropoff_longitude``.

    States are instances of ``State``, which is defined elsewhere. This class
    relies on ``State`` being hashable, being constructible from a
    ``(lat, lon)`` tuple, and exposing ``center``, ``add_position(pos)``,
    ``update_center()`` and ``add_dest(state)``.
    """

    # Default number of centers picked for k-means.
    k = 10

    def __init__(self, raw, k=None, iterations=10):
        """Build the chain from ``raw``.

        Args:
            raw: DataFrame of trips (see the class docstring for columns).
            k: number of states; defaults to the class attribute ``k``.
            iterations: number of k-means refinement passes over ``raw``.
        """
        self.state_set = set()
        self.raw = raw
        if k is not None:
            self.k = k
        # The class attribute must be read through `self`; a bare `k` is not
        # in scope inside a method.
        self.initialize_centers(self.k)
        self.build_states_kmeans(iterations)
        self.add_edges()

    def initialize_centers(self, k):
        """Seed ``state_set`` with states centered on ``k`` random pickups.

        At most ``len(self.raw)`` rows are drawn, without replacement.
        """
        n_rows = len(self.raw)
        n_centers = max(0, min(k, n_rows))
        # Sample row positions directly instead of materialising and
        # shuffling a list with one entry per row of the table.
        for c_ind in rand.sample(range(n_rows), n_centers):
            # out of convenience, we aren't messing with dropoff lat lon
            row = self.raw.iloc[c_ind]  # positional lookup; `.ix` is removed
            lat = row["pickup_latitude"]
            lon = row["pickup_longitude"]
            self.state_set.add(State((lat, lon)))

    def build_states_kmeans(self, iterations):
        """Run ``iterations`` (an int) k-means passes over every trip.

        Each pass hands every pickup and dropoff position to its nearest
        state via ``add_position`` and then calls ``update_center`` on every
        state.
        """
        for _ in range(iterations):
            for _, row in self.raw.iterrows():
                pos_start, pos_end = self.row_to_positions(row)
                closest_to_start = self.find_closest_state(pos_start)
                closest_to_end = self.find_closest_state(pos_end)

                closest_to_start.add_position(pos_start)
                closest_to_end.add_position(pos_end)
            for s in self.state_set:
                s.update_center()

    def add_edges(self):
        """Record one transition per trip: pickup state -> dropoff state."""
        for _, row in self.raw.iterrows():
            pos_start, pos_end = self.row_to_positions(row)
            closest_to_start = self.find_closest_state(pos_start)
            closest_to_end = self.find_closest_state(pos_end)

            # Add this edge to the Markov state.
            closest_to_start.add_dest(closest_to_end)

    ###
    # HELPER METHODS
    ###
    def find_closest_state(self, pos):
        """Return the state whose center is nearest to ``pos``.

        Distance is the plain Euclidean distance between the ``(lat, lon)``
        pairs. Ties go to the first state encountered while iterating
        ``state_set``. Returns ``None`` when there are no states.
        """
        def distance(state):
            clat, clon = state.center
            return ((clat - pos[0])**2 + (clon - pos[1])**2)**0.5

        return min(self.state_set, key=distance, default=None)

    def row_to_positions(self, row):
        """Split a trip row into ``((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon))``."""
        lats = row["pickup_latitude"]
        lons = row["pickup_longitude"]
        pos_start = (lats, lons)

        late = row["dropoff_latitude"]
        lone = row["dropoff_longitude"]
        pos_end = (late, lone)

        return pos_start, pos_end
    
    
#     def buildstates_naive(self):
#         ind = [i for i in range(len(raw))]
#         rand.shuffle(ind)
#         centers = ind[:k]
#         #initialize centers
#         for c_ind in centers:
#             # out of convenience, we aren't messing with dropoff lat lon
#             lat = self.raw.ix[c_ind]["pickup_latitude"]
#             lon = self.raw.ix[c_ind]["pickup_longitude"]
#             s = State((lat, lon))
#             state_set.add(s)
#         for ind, row in self.raw.iterrows():
#             lats = self.raw.ix[c_ind]["pickup_latitude"]
#             lons = self.raw.ix[c_ind]["pickup_longitude"]
#             pos_start = (lats, lons)
            
#             late = self.raw.ix[c_ind]["dropoff_latitude"]
#             lone = self.raw.ix[c_ind]["dropoff_longitude"]
#             pos_end = (late, lone)
            
#             closest_to_start = find_closest_state(pos_start)
#             closest_to_end = find_closest_state(pos_end)
            

