Donovan, Brian; Work, Dan (2016): New York City Taxi Trip Data (2010-2013). University of Illinois at Urbana-Champaign. https://doi.org/10.13012/J8PN93H8

### Field Descriptions (from paper):

- **medallion**: a permit to operate a yellow taxi cab in New York City, it is effectively a
(randomly assigned) car ID. See also medallions.
- **hack license**: a license to drive the vehicle, it is effectively a (randomly assigned) driver
ID. See also hack license.
- **vendor id**: e.g., Verifone Transportation Systems (VTS), or Mobile Knowledge Systems
Inc (CMT), implemented as part of the Technology Passenger Enhancements Project.
- **rate_code**: taximeter rate, see NYCT&L description.
- **store_and_fwd_flag**: unknown attribute.
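- **pickup datetime**: start time of the trip, mm-dd-yyyy hh24:mm:ss EDT.
- **dropoff datetime**: end time of the trip, mm-dd-yyyy hh24:mm:ss EDT. (Note: the CSV loaded
below stores both timestamps as yyyy-mm-dd hh:mm:ss, which is the format the parsing code expects.)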
- **passenger count**: number of passengers on the trip, default value is one.
- **trip time in secs**: trip time measured by the taximeter in seconds.
- **trip distance**: trip distance measured by the taximeter in miles.
- **pickup_longitude** and **pickup_latitude**: GPS coordinates at the start of the trip.
- **dropoff longitude** and **dropoff latitude**: GPS coordinates at the end of the trip.

In [1]:
import math 
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
def compute_distance(lat1,lon1,lat2,lon2):
    """Return the great-circle distance between two points, in kilometres.

    Uses the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance in kilometres as a float. NaN inputs propagate to a NaN
        result.
    """
    R = 6371  # Radius of the earth in km

    sin_half_dlat = math.sin(math.radians(lat2 - lat1) / 2)
    sin_half_dlon = math.sin(math.radians(lon2 - lon1) / 2)
    a = (sin_half_dlat * sin_half_dlat +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         sin_half_dlon * sin_half_dlon)

    # Mathematically 0 <= a <= 1, but floating-point rounding can land a hair
    # outside that range (e.g. for near-antipodal points), which would make
    # math.sqrt raise ValueError. Explicit comparisons (rather than min/max)
    # are used so that a NaN value is left untouched and still propagates.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

In [3]:
# Choose an arbitrary day 2013-09-01 to get data from ~385k rides
trip_data = pd.read_csv('2013-09-01_trip_data.csv', index_col=0)
trip_data = trip_data.rename(columns=str.strip)  # fix column names (strip surrounding whitespace)

# Drop irrelevant columns, then any row that still has a missing value
trip_data = trip_data.drop(columns=['store_and_fwd_flag', 'vendor_id', 'rate_code']).dropna()

# Parse the timestamp strings into datetimes (whole column at once)
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
for col in ('pickup_datetime', 'dropoff_datetime'):
    trip_data[col] = pd.to_datetime(trip_data[col], format=TIMESTAMP_FORMAT)

# Reset trip_time_in_secs by calculating from pickup and dropoff datetime (recommended)
trip_data['trip_time_in_secs'] = (trip_data['dropoff_datetime']
                                  - trip_data['pickup_datetime']).dt.total_seconds()

# Convert the metered trip distance from miles to km
KM_PER_MILE = 1.60934
trip_data['trip_distance'] = trip_data['trip_distance'] * KM_PER_MILE

# Ensure that the metered trip distance is GREATER than the straight-line
# (great-circle) distance between pickup and dropoff (recommended): a trip
# cannot be shorter than the direct line between its endpoints.
# compute_distance (defined above) is scalar, so it is evaluated per trip.
straight_line_km = pd.Series(
    [compute_distance(p_lat, p_lon, d_lat, d_lon)
     for p_lat, p_lon, d_lat, d_lon in zip(trip_data['pickup_latitude'],
                                           trip_data['pickup_longitude'],
                                           trip_data['dropoff_latitude'],
                                           trip_data['dropoff_longitude'])],
    index=trip_data.index,
    dtype=float,
)
trip_data = trip_data[trip_data['trip_distance'] > straight_line_km]

# Remove trips of zero (or negative) duration; this also keeps the speed
# computation below free of division by zero.
trip_data = trip_data[trip_data['trip_time_in_secs'] > 0]

# Remove trips where any GPS coordinate is exactly 0.0
gps_columns = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
trip_data = trip_data[(trip_data[gps_columns] != 0.0).all(axis=1)]

# Check for unreasonable average speeds (recommended).
# NOTE(review): the recommendation quoted for this step is <= 125 km/h (78 mph),
# but the cutoff actually applied here is 120 km/h -- confirm which is intended.
speed_kmh = trip_data['trip_distance'] / (trip_data['trip_time_in_secs'] / 3600)
trip_data = trip_data[speed_kmh <= 120]

In [4]:
# Export cleaned data to CSV
# trip_data.to_csv('2013-09-01_trip_data_clean.csv')

### Discretize Data

In [5]:
# trip_data = pd.read_csv('2013-09-01_trip_data_clean.csv', index_col=0)
# trip_data.dropoff_datetime = trip_data.dropoff_datetime.apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
# trip_data.pickup_datetime = trip_data.pickup_datetime.apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

In [6]:
# Load the NYC street-intersection nodes, keeping only name and coordinates,
# and add the coordinates rounded to 3 decimal places. The built-in round()
# is applied per value so these keys are produced the same way as the rounded
# trip coordinates they are later matched against.
nodes = (
    pd.read_csv('nyc_nodes_manhattan.csv')[['name', 'lon', 'lat']]
    .assign(
        lon_rounded=lambda frame: frame['lon'].map(lambda value: round(value, 3)),
        lat_rounded=lambda frame: frame['lat'].map(lambda value: round(value, 3)),
    )
)

In [7]:
# Create a field with a unique trip id: discard the existing index, number the
# rows 0..n-1, and expose that numbering as a leading 'id' column.
trip_data = (
    trip_data
    .reset_index(drop=True)
    .rename_axis('id')
    .reset_index()
)

In [8]:
# Discretize pickup and dropoff location to one of the NYC street intersection nodes
def _attach_node(trips, lon_col, lat_col, node_col):
    """Replace a (longitude, latitude) column pair with the name of a matching node.

    Coordinates are rounded to 3 decimals and inner-joined against the
    module-level ``nodes`` table on the rounded values, so trips without a
    matching node are dropped. When several nodes match one trip, the first
    non-null value per column (in merge order) is kept. The result is indexed
    by trip ``id``; the raw and rounded coordinate columns are removed and
    the node name is stored under ``node_col``.
    """
    keyed = trips.assign(
        lon_rounded=trips[lon_col].map(lambda value: round(value, 3)),
        lat_rounded=trips[lat_col].map(lambda value: round(value, 3)),
    )
    matched = keyed.merge(nodes, on=['lon_rounded', 'lat_rounded'])
    one_row_per_trip = matched.groupby('id').first()
    no_longer_needed = ['lon', 'lat', lon_col, lat_col, 'lon_rounded', 'lat_rounded']
    return one_row_per_trip.drop(columns=no_longer_needed).rename(columns={'name': node_col})


# Pickup side: 'id' is turned back into a column because the dropoff pass groups on it.
trip_data = _attach_node(trip_data, 'pickup_longitude', 'pickup_latitude', 'start_node').reset_index()

# Dropoff side: the result stays indexed by 'id'.
trip_data = _attach_node(trip_data, 'dropoff_longitude', 'dropoff_latitude', 'end_node')

In [9]:
# Discretize times into minutes
def _to_nearest_minute(seconds):
    """Convert seconds to whole minutes, rounding to nearest with halves rounded up.

    The built-in round() rounds halves to the nearest even integer, so
    round(30 / 60) == 0: a 30-second trip that passes the filter below would
    still end up with a trip time of 0 minutes. Rounding halves up avoids that.
    """
    return int(math.floor(seconds / 60 + 0.5))


DAY_START = datetime(2013,9,1,0,0,0)  # start times are measured from this instant

# Remove the (very few) trips under 30 seconds. The .copy() gives an independent
# frame so the new columns below are not written to a slice of the old one.
trip_data = trip_data[trip_data.trip_time_in_secs >= 30].copy()

# Minutes elapsed since DAY_START at pickup
trip_data['start_time'] = trip_data.pickup_datetime.apply(
    lambda pickup: _to_nearest_minute((pickup - DAY_START).total_seconds()))

# Trip duration in minutes
trip_data['trip_time'] = trip_data.trip_time_in_secs.apply(_to_nearest_minute)

In [10]:
# Restrict the table to the fields that are exported, in this order
OUTPUT_FIELDS = [
    'start_node',
    'end_node',
    'start_time',
    'trip_time',
    'passenger_count',
    'trip_distance',
]
trip_data = trip_data.loc[:, OUTPUT_FIELDS]

In [11]:
# Write the discretized trips to a CSV file in the working directory
DISCRETE_TRIPS_CSV = '2013-09-01_trip_data_manhattan_discrete.csv'
trip_data.to_csv(DISCRETE_TRIPS_CSV)