In [4]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point

In [2]:
%qtconsole

# Loading Raw Data

I load the yellow and green taxi trip records (CSV files) for June 2016 and concatenate them into a single dataset.

In [5]:
# Raw trip-record CSV files for June 2016, one per taxi colour.
filename_green, filename_yellow = (
    'green_tripdata_2016-06.csv',
    'yellow_tripdata_2016-06.csv',
)

In [6]:
# Parse both comma-separated monthly files into memory (green first, then yellow).
df_green = pd.read_csv(filename_green, delimiter=',')
df_yellow = pd.read_csv(filename_yellow, delimiter=',')

In [7]:
# Lower-case every header and map the colour-specific datetime columns
# (tpep_* for yellow, lpep_* for green) onto the shared pep_* names, so both
# frames end up with the same datetime column names.
for frame, prefix in ((df_yellow, 'tpep'), (df_green, 'lpep')):
    frame.rename(str.lower, axis='columns', inplace=True)
    frame.rename(
        columns={
            prefix + '_pickup_datetime': 'pep_pickup_datetime',
            prefix + '_dropoff_datetime': 'pep_dropoff_datetime',
        },
        inplace=True,
    )

# These two columns are removed from the green frame only.
# NOTE(review): 'trip_type ' is spelled with a trailing space - presumably it
# matches the raw header; confirm against the CSV.
df_green.drop(columns=['ehail_fee', 'trip_type '], inplace=True)

In [8]:
# Stack the yellow and green trips into one frame.
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# the supported equivalent. sort=True keeps the alphabetically sorted column
# order that append produced here for the non-aligned columns, and it also
# silences the "will change to not sort by default" warning.
df = pd.concat([df_yellow, df_green], sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Below are the first 5 entries of the dataset, which was downloaded from http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml

In [9]:
df.head()

Unnamed: 0,dropoff_latitude,dropoff_longitude,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,payment_type,pep_dropoff_datetime,pep_pickup_datetime,pickup_latitude,pickup_longitude,ratecodeid,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,vendorid
0,40.753979,-73.977463,0.5,6.0,0.3,0.5,2,2,2016-06-09 21:13:08,2016-06-09 21:06:36,40.760937,-73.98336,1,N,0.0,0.0,7.3,0.79,2
1,40.670242,-73.981636,0.5,22.0,0.3,0.5,1,1,2016-06-09 21:35:11,2016-06-09 21:06:36,40.736668,-73.98172,1,N,4.0,0.0,27.3,5.22,2
2,40.742168,-74.004234,0.5,6.5,0.3,0.5,1,1,2016-06-09 21:13:10,2016-06-09 21:06:36,40.751072,-73.994316,1,N,1.56,0.0,9.36,1.26,2
3,40.85154,-73.929466,0.5,26.0,0.3,0.5,1,1,2016-06-09 21:36:10,2016-06-09 21:06:36,40.773891,-73.982361,1,N,1.0,0.0,28.3,7.39,2
4,40.766445,-73.985909,0.5,13.5,0.3,0.5,1,1,2016-06-09 21:23:23,2016-06-09 21:06:36,40.733173,-73.987106,1,N,2.96,0.0,17.76,3.1,2


# Load Shapefiles for Manhattan and Astoria

In [2]:
# Neighbourhood polygons, reprojected to EPSG:4326 (lon/lat) so they can be
# tested against the longitude/latitude columns of the trip records.
nb = gpd.read_file('nynta_18d/nynta.shp')
# `epsg=4326` replaces the deprecated {"init": "epsg:4326"} dict form.
nb = nb.to_crs(epsg=4326)
astoria = nb.loc[nb['NTAName'] == 'Astoria']
# First (expected: only) polygon matching the name; raises IndexError if none match.
astoria_shape = astoria.geometry.iloc[0]

# NOTE(review): the borough layer is used without reprojection - presumably
# this export is already in lon/lat; confirm its CRS.
boroughs = gpd.read_file('Borough Boundaries/geo_export_0de71584-a1b1-48e3-adda-f106fd662625.shp')
manhattan = boroughs.loc[boroughs['boro_name'] == 'Manhattan']
manhattan_shape = manhattan.geometry.iloc[0]

# Astoria Pick Up

Now select all the data points that have a pickup location in Astoria. The boundary of Astoria is the one defined by NYC in the shapefiles.

In [12]:
# Scan the full dataset in fixed-size row chunks and keep the trips whose
# pickup point lies inside the Astoria polygon.
chunk = 100000
div, mod = divmod(df.shape[0], chunk)
print('div: ' + str(div) + ', mod: ' +str(mod))

# Matching rows are collected per chunk and concatenated once at the end.
# Growing a DataFrame with append() inside the loop re-copies all previous
# rows on every iteration, and DataFrame.append was removed in pandas 2.0.
astoria_parts = []
for i in range(div+1):
    lower = i*chunk
    # The last chunk (i == div) only holds the `mod` leftover rows.
    upper = min(lower + chunk, df.shape[0])
    temp = df.iloc[lower:upper]
    geometry = [Point(xy) for xy in zip(temp.pickup_longitude, temp.pickup_latitude)]
    astoria_parts.append(temp.loc[[astoria_shape.contains(x) for x in geometry]])
    print('loop: ' + str(i),end='\r')

astoria_df = pd.concat(astoria_parts)

div: 125, mod: 40196
loop: 125

In [15]:
# Persist every trip that starts in Astoria (the index is written as well).
astoria_df.to_csv(path_or_buf='2016-06 astoria pickup.csv')

## Astoria Dropoff and Manhattan Dropoff

Now having a set of data that corresponds to all the pickups originating in Astoria, I select the subset of data where the dropoff location was in Astoria and also get the subset where the dropoff location was in Manhattan.

In [16]:
# Build every drop-off point once and reuse it for both containment tests.
dropoff_coords = zip(astoria_df.dropoff_longitude, astoria_df.dropoff_latitude)
geometry = [Point(lon_lat) for lon_lat in dropoff_coords]

# One boolean per trip: does the drop-off fall inside the polygon?
ends_in_astoria = [astoria_shape.contains(pt) for pt in geometry]
ends_in_manhattan = [manhattan_shape.contains(pt) for pt in geometry]

astoria_astoria_df = astoria_df.loc[ends_in_astoria]
astoria_manhattan_df = astoria_df.loc[ends_in_manhattan]

In [20]:
# Write both Astoria-origin subsets to disk, one CSV per destination area.
for out_name, subset in (
    ('2016-06 astoria to astoria.csv', astoria_astoria_df),
    ('2016-06 astoria to manhattan.csv', astoria_manhattan_df),
):
    subset.to_csv(out_name)

# Manhattan Pick Up

Now I again take the original dataset for June 2016 and select all the trips whose pickup location is in Manhattan. As most taxi rides took place in Manhattan, I had to parallelize the filtering across 3 processing cores. Using more cores maxed out my available RAM, so I was limited to 3. Furthermore, I saved subsets of the desired dataset directly to the hard drive in order to save memory.

In [5]:
from joblib import Parallel, delayed
import multiprocessing

In [1]:
chunk = 10000
div, mod = divmod(df.shape[0], chunk)
print('div: ' + str(div) + ', mod: ' +str(mod))

# to_csv does not create missing folders, so make sure the target directory
# exists before any worker tries to write into it.
os.makedirs('manhattan/pickup', exist_ok=True)

def processInput(i):
    """Filter chunk ``i`` of ``df`` down to Manhattan pickups and save it as CSV.

    Chunk ``i`` covers rows ``i*chunk`` up to (not including) ``(i+1)*chunk``,
    clipped to the end of ``df``, so the last chunk holds only the leftover
    rows. Rows whose pickup point lies inside ``manhattan_shape`` are written
    to ``manhattan/pickup/manhattan_pickup_<i>.csv``.

    Reads the notebook globals ``df``, ``chunk`` and ``manhattan_shape``.
    Returns ``None``; the CSV file is the only result.
    """
    lower = i*chunk
    upper = min(lower + chunk, df.shape[0])
    temp = df.iloc[lower:upper]
    geometry = [Point(xy) for xy in zip(temp.pickup_longitude, temp.pickup_latitude)]
    temp = temp.loc[[manhattan_shape.contains(x) for x in geometry]]
    temp.to_csv('manhattan/pickup/manhattan_pickup_' + str(i) + '.csv')

num_cores = 3

# The trailing semicolon keeps the returned list of None values out of the cell output.
Parallel(n_jobs=num_cores)(delayed(processInput)(i) for i in range(div+1));

After the above finished, which took hours to complete, I loaded each of the CSV files and combined them into a single dataframe in memory.

In [2]:
# Re-assemble the per-chunk CSV files written by the pickup filter.
# The parts are collected in a list and concatenated once, instead of calling
# DataFrame.append in the loop (repeated copying; removed in pandas 2.0).
manhattan_parts = []
for i in range(div+1):
    print('csv file: ' + str(i),end='\r')
    manhattan_parts.append(pd.read_csv('manhattan/pickup/manhattan_pickup_' + str(i) + '.csv'))
manhattan_df = pd.concat(manhattan_parts)

#removing unnamed columns
# The original line fused this filter with the next statement (a SyntaxError)
# and never assigned its result. The chunk files were saved with their index,
# which read_csv loads back as 'Unnamed: ...' columns, so the filter is
# applied to the re-loaded frame here.
manhattan_df = manhattan_df.loc[:, ~manhattan_df.columns.str.contains('^Unnamed')]




In [11]:
# Cache the combined Manhattan-pickup trips on disk (the index is written as well).
manhattan_df.to_csv(path_or_buf='data/2016-06 manhattan pickup.csv')

In [3]:
# Reload the cached Manhattan-pickup trips from disk.
manhattan_df = pd.read_csv(filepath_or_buffer='data/2016-06 manhattan pickup.csv')

## Manhattan Dropoff

With the subset of data corresponding to pickups in Manhattan, I then found the subset of trips whose dropoff point is also in Manhattan. Again, due to the number of data points, I had to parallelize the filtering across 3 processing cores and save chunks of data to the hard drive. And again, this took hours to complete.

In [2]:
chunk = 10000
div, mod = divmod(manhattan_df.shape[0], chunk)
print('div: ' + str(div) + ', mod: ' +str(mod))

# to_csv does not create missing folders, so make sure the target directory
# exists before any worker tries to write into it.
os.makedirs('manhattan/pickup_dropoff', exist_ok=True)

def processInput(i):
    """Filter chunk ``i`` of ``manhattan_df`` down to Manhattan drop-offs and save it as CSV.

    Chunk ``i`` covers rows ``i*chunk`` up to (not including) ``(i+1)*chunk``,
    clipped to the end of ``manhattan_df``, so the last chunk holds only the
    leftover rows. Rows whose drop-off point lies inside ``manhattan_shape``
    are written to
    ``manhattan/pickup_dropoff/manhattan_pickup_dropoff_<i>.csv``.

    Reads the notebook globals ``manhattan_df``, ``chunk`` and
    ``manhattan_shape``. Returns ``None``; the CSV file is the only result.
    """
    lower = i*chunk
    upper = min(lower + chunk, manhattan_df.shape[0])
    temp = manhattan_df.iloc[lower:upper]
    geometry = [Point(xy) for xy in zip(temp.dropoff_longitude, temp.dropoff_latitude)]
    temp = temp.loc[[manhattan_shape.contains(x) for x in geometry]]
    temp.to_csv('manhattan/pickup_dropoff/manhattan_pickup_dropoff_' + str(i) + '.csv')

num_cores = 3

# The trailing semicolon keeps the returned list of None values out of the cell output.
Parallel(n_jobs=num_cores)(delayed(processInput)(i) for i in range(div+1));

Again, after the above finished, I loaded each of the csv files and created a dataframe in memory.

In [9]:
# Re-assemble the per-chunk CSV files written by the drop-off filter.
# The parts are collected in a list and concatenated once, instead of calling
# DataFrame.append in the loop (repeated copying; removed in pandas 2.0).
manhattan_manhattan_parts = []
for i in range(div+1):
    print('csv file: ' + str(i),end='\r')
    manhattan_manhattan_parts.append(pd.read_csv('manhattan/pickup_dropoff/manhattan_pickup_dropoff_' + str(i) + '.csv'))
manhattan_manhattan_df = pd.concat(manhattan_manhattan_parts)

csv file: 1048

In [7]:
# Save the Manhattan-to-Manhattan trips; the index is left out of the file.
manhattan_manhattan_df.to_csv(
    path_or_buf='data/2016-06 manhattan to manhattan.csv',
    index=False,
)

## Astoria Dropoff

Using the set of data that corresponds to all the pickups originating in Manhattan, I select the subset of data where the dropoff location was in Astoria.

In [31]:
# Turn every drop-off coordinate pair into a point, then keep the trips
# whose drop-off point lies inside the Astoria polygon.
dropoff_coords = zip(manhattan_df.dropoff_longitude, manhattan_df.dropoff_latitude)
geometry = [Point(lon_lat) for lon_lat in dropoff_coords]
ends_in_astoria = [astoria_shape.contains(pt) for pt in geometry]
manhattan_astoria_df = manhattan_df.loc[ends_in_astoria]

In [33]:
# Persist the Manhattan-to-Astoria trips (the index is written as well).
manhattan_astoria_df.to_csv(path_or_buf='2016-06 manhattan to astoria.csv')