In [12]:

import numpy as np
import pandas as pd
import sys, os
# Input/output locations, relative to the notebook's working directory.
rawdir = "../data/raw/"
processeddir = "../data/processed/"

# Trip records. The dtype key keeps its leading space because the raw CSV
# headers are read before their leading whitespace is stripped.
tripdatadf = pd.read_csv(
    os.path.join(rawdir, "trip_data_4.csv"),
    dtype={" store_and_fwd_flag": "object"},
)

# Fare records for the same trips.
tripfaredf = pd.read_csv(
    os.path.join(rawdir, "trip_fare_4.csv"),
)

In [13]:
# Remove leading whitespace from every column header of both raw frames so
# later cells can refer to columns by their clean names.
tripdatadf.columns = [header.lstrip() for header in tripdatadf.columns]
tripfaredf.columns = [header.lstrip() for header in tripfaredf.columns]


In [14]:
# Sanity check: the trip and fare tables should have the same number of rows.
len(tripdatadf.index) == len(tripfaredf.index)

True

In [15]:
# Join each trip record to its fare record on medallion, hack license and
# pickup time.
#
# A left join keeps every trip row, so a trip without a matching fare record
# surfaces as NaN in the fare columns. With merge's default inner join those
# rows would be dropped silently and the null checks below could never fire.
df = tripdatadf.merge(
    tripfaredf,
    how="left",
    on=["medallion", "hack_license", "pickup_datetime"],
)

# Look for failed matches / missing values.
# rate_code comes from the trip table, total_amount from the fare table.
assert not pd.isnull(df["rate_code"]).any(), "null rate_code after merge"
assert not pd.isnull(df["total_amount"]).any(), "trip rows without a matching fare record"

In [16]:
# Parse the pickup/dropoff timestamps into datetime values.
df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])
df["dropoff_datetime"] = pd.to_datetime(df["dropoff_datetime"])

# Trips with a recorded tip of zero are treated as cash-tip trips and given
# an assumed tip of 15% of the fare.
# The mask is captured first: the next statement overwrites tip_amount, so
# the condition could not be re-evaluated afterwards.
row_indexer = df["tip_amount"].eq(0)
df.loc[row_indexer, "tip_amount"] = .15 * df.loc[row_indexer, "fare_amount"]
# Keep total_amount consistent by adding the same assumed tip to it.
df.loc[row_indexer, "total_amount"] = (
    df.loc[row_indexer, "total_amount"] + .15 * df.loc[row_indexer, "fare_amount"]
)

In [17]:
# Keep only the rows that have a dropoff longitude (rows where it is null
# are dropped). Trip distance is not filtered here.
df = df[pd.notnull(df['dropoff_longitude'])]

In [45]:
#add pickup and dropoff "neighbourhood" name
import shapefile
import pyproj
from functools import partial
from shapely.geometry import shape, Point, mapping
from shapely.ops import transform
from multiprocessing import Pool, Process, Array, cpu_count


def load_shapes(filename=rawdir+"taxi_zones/taxi_zones"):
    """Read the taxi-zone shapefile and return its geometries and attributes.

    Parameters
    ----------
    filename : str
        Path of the shapefile, passed to ``shapefile.Reader``.

    Returns
    -------
    shapes : list
        One geometry per shapefile shape, reprojected from EPSG:2263 to
        EPSG:4326.
    neighbourhood_json : dict
        GeoJSON-style ``FeatureCollection`` with one feature per shape, each
        carrying the zone name as a property. The geometries here are taken
        straight from the shapefile, i.e. they are not reprojected.
    df : pandas.DataFrame
        The shapefile records, with column 3 renamed to ``"name"``.
    """
    reader = shapefile.Reader(filename)

    # Coordinate conversion applied to every geometry:
    # NAD_1983_StatePlane_New_York_Long_Island_FIPS_3104_Feet -> wgs84 (lat/lng)
    source_proj = pyproj.Proj(init='epsg:2263', preserve_units=True)
    target_proj = pyproj.Proj(init='epsg:4326')
    project = partial(pyproj.transform, source_proj, target_proj)

    shapes = [
        transform(project, shape(raw_shape.__geo_interface__))
        for raw_shape in reader.shapes()
    ]

    # Attribute table; column 3 holds the zone name.
    df = pd.DataFrame(reader.records()).rename(columns={3: "name"})

    # Build the feature collection from the raw (unprojected) shapes, pairing
    # each geometry with the name stored at the same position in the table.
    raw_geometries = [raw_shape.__geo_interface__ for raw_shape in reader.shapes()]
    features = []
    for position, geometry in enumerate(raw_geometries):
        features.append({
            'type': 'Feature',
            'geometry': geometry,
            'properties': {'name': df.iloc[position]['name']},
        })
    neighbourhood_json = {'type': 'FeatureCollection', 'features': features}

    return shapes, neighbourhood_json, df

def which_neighbourhood(shapes, names, chunk):
    """Label every trip in ``chunk`` with its pickup and dropoff neighbourhood.

    Parameters
    ----------
    shapes : sequence
        Neighbourhood geometries; each must support ``point.within(geometry)``.
    names : indexable
        ``names[i]`` is the name of ``shapes[i]``.
    chunk : pandas.DataFrame
        Rows with ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude`` columns.

    Returns
    -------
    list of tuple
        One ``(pickup_name, dropoff_name)`` pair per row, in row order. A point
        that lies within none of the shapes is labelled ``"OutsideNYC"``.

    Notes
    -----
    The function depends only on its arguments, so it can run in worker
    processes without any shared module-level state.
    """
    output = []
    for _, row in chunk.iterrows():
        pickup_point = shape(Point(row['pickup_longitude'], row['pickup_latitude']))
        dropoff_point = shape(Point(row['dropoff_longitude'], row['dropoff_latitude']))

        # Empty string means "not located yet".
        pickup_name = ""
        dropoff_name = ""

        for i, neighbourhood in enumerate(shapes):
            if pickup_point.within(neighbourhood):
                pickup_name = names[i]
            if dropoff_point.within(neighbourhood):
                dropoff_name = names[i]
            # Stop scanning as soon as both ends of the trip are located.
            if pickup_name and dropoff_name:
                break

        if not pickup_name:
            # outside NYC
            pickup_name = "OutsideNYC"
        if not dropoff_name:
            dropoff_name = "OutsideNYC"
        output.append((pickup_name, dropoff_name))
    return output

# Load the zone geometries, the GeoJSON-style feature collection and the
# attribute table (with its "name" column) using the default shapefile path.
shapes, geojson, namedf = load_shapes()

0              (Lenox Hill West, Upper West Side South)
1                              (OutsideNYC, OutsideNYC)
2                (Midtown Center, TriBeCa/Civic Center)
3                (Midtown North, Upper East Side North)
4          (West Chelsea/Hudson Yards, Lenox Hill West)
5      (Garment District, Penn Station/Madison Sq West)
6              (Times Sq/Theatre District, Murray Hill)
7              (Murray Hill, West Chelsea/Hudson Yards)
8           (Midtown Center, Times Sq/Theatre District)
9                   (Lenox Hill West, Garment District)
10    (Penn Station/Madison Sq West, Lincoln Square ...
11                (Upper West Side North, Midtown East)
12         (LaGuardia Airport, Greenwich Village North)
13                         (SoHo, TriBeCa/Civic Center)
14    (Greenwich Village North, Times Sq/Theatre Dis...
15                           (Union Sq, Midtown Center)
16                       (Midtown Center, Midtown East)
17           (Upper East Side North, East Harlem

In [46]:
from functools import partial

# One worker process per CPU.
num_processes = cpu_count()

# Only the coordinate columns are needed for the lookup, which keeps the data
# sent to each worker small.
latlong_df = df[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']]

# Chunk size as an integer, using ceiling division so there are at most
# num_processes chunks. It is clamped to at least 1 because a step of 0
# (fewer rows than CPUs) would make range() raise ValueError.
chunk_size = max(1, -(-latlong_df.shape[0] // num_processes))

chunks = [latlong_df.iloc[i:i + chunk_size] for i in range(0, latlong_df.shape[0], chunk_size)]

# Bind the shapes and their names so each worker only receives a chunk.
wn_func = partial(which_neighbourhood, shapes, namedf["name"])
with Pool(num_processes) as p:
    # map() returns the per-chunk results in the same order as `chunks`.
    results = p.map(wn_func, chunks)

In [51]:
# Flatten the per-chunk result lists into one list of
# (pickup, dropoff) pairs, preserving chunk order.
output = [pair for chunk_result in results for pair in chunk_result]

In [52]:
# Split the (pickup, dropoff) pairs into the two new name columns.
df["pickup_neighbourhood"] = [pickup for pickup, dropoff in output]
df["dropoff_neighbourhood"] = [dropoff for pickup, dropoff in output]

In [None]:
# Plot the distribution of pickup neighbourhoods.
# NOTE(review): this column holds neighbourhood-name strings, so hist() may
# not give a readable per-neighbourhood count. A bar chart of value_counts()
# may be what is intended — confirm.
df["pickup_neighbourhood"].hist()

In [53]:
# Non-null value count of every column, per pickup neighbourhood.
df.groupby("pickup_neighbourhood").count()

Unnamed: 0_level_0,medallion,hack_license,vendor_id_x,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,...,dropoff_latitude,vendor_id_y,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,dropoff_neighbourhood
pickup_neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Allerton/Pelham Gardens,51,51,51,51,29,51,51,51,51,51,...,51,51,51,51,51,51,51,51,51,51
Alphabet City,50787,50787,50787,50787,25418,50787,50787,50787,50787,50787,...,50787,50787,50787,50787,50787,50787,50787,50787,50787,50787
Arden Heights,171,171,171,171,169,171,171,171,171,171,...,171,171,171,171,171,171,171,171,171,171
Arrochar/Fort Wadsworth,18,18,18,18,13,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18
Astoria,33509,33509,33509,33509,17209,33509,33509,33509,33509,33509,...,33509,33509,33509,33509,33509,33509,33509,33509,33509,33509
Astoria Park,271,271,271,271,139,271,271,271,271,271,...,271,271,271,271,271,271,271,271,271,271
Auburndale,37,37,37,37,27,37,37,37,37,37,...,37,37,37,37,37,37,37,37,37,37
Baisley Park,2252,2252,2252,2252,1041,2252,2252,2252,2252,2252,...,2252,2252,2252,2252,2252,2252,2252,2252,2252,2252
Bath Beach,84,84,84,84,38,84,84,84,84,84,...,84,84,84,84,84,84,84,84,84,84
Battery Park,4168,4168,4168,4168,1870,4168,4168,4168,4168,4168,...,4168,4168,4168,4168,4168,4168,4168,4168,4168,4168


In [None]:
# Write out the whole cleaned dataset.
processeddir = "../data/processed/"
df.to_csv(os.path.join(processeddir,"nyctaxiclean.csv"))

# Stratified sample: at most 20k rows per pickup neighbourhood.
# Neighbourhoods with fewer rows contribute all of them.
np.random.seed(100)  # fixed seed so the sample is reproducible
num_samples = 20000
sample_df = df.groupby('pickup_neighbourhood', group_keys=False).apply(lambda x: x.sample(min(len(x), num_samples)))

# Write the sample itself, not the full frame, to the sample file.
sample_df.to_csv(os.path.join(processeddir,"nyctaxiclean_sample.csv"))