## 1. Load up datasets for 3 months
I will be working with first quarter 2024 data. Would have preferred to load more but not sure my PC can handle the volume.

In [4]:
#load parquet file into dataframe
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import dask.dataframe as dd

""" 
data from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
Using dask DataFrames instead since it is faster, uses less memory and more flexible in handling large datasets.
"""
#Jan 2024
trip_jan = dd.read_parquet('data/fhvhv_tripdata_2024-01.parquet')
trip_jan.head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B03404,B03404,2024-01-01 00:21:47,2024-01-01 00:25:06,2024-01-01 00:28:08,2024-01-01 01:05:39,161,158,2.83,...,4.05,2.75,0.0,0.0,40.18,N,N,N,N,N
1,HV0003,B03404,B03404,2024-01-01 00:10:56,2024-01-01 00:11:08,2024-01-01 00:12:53,2024-01-01 00:20:05,137,79,1.57,...,0.89,2.75,0.0,0.0,6.12,N,N,N,N,N
2,HV0003,B03404,B03404,2024-01-01 00:20:04,2024-01-01 00:21:51,2024-01-01 00:23:05,2024-01-01 00:35:16,79,186,1.98,...,1.6,2.75,0.0,0.0,9.47,N,N,N,N,N
3,HV0003,B03404,B03404,2024-01-01 00:35:46,2024-01-01 00:39:59,2024-01-01 00:41:04,2024-01-01 00:56:34,234,148,1.99,...,1.52,2.75,0.0,0.0,11.35,N,N,N,N,N
4,HV0003,B03404,B03404,2024-01-01 00:48:19,2024-01-01 00:56:23,2024-01-01 00:57:21,2024-01-01 01:10:02,148,97,2.65,...,3.43,2.75,0.0,0.0,28.63,N,N,N,N,N


In [5]:
#Feb 2024
# trip_feb = dd.read_parquet('data/fhvhv_tripdata_2024-02.parquet') 
# trip_feb.head()

In [6]:
#Mar 2024
# trip_mar = dd.read_parquet('data/fhvhv_tripdata_2024-03.parquet') 
# trip_mar.head()

In [7]:
#merge the three dataframes, ensure the merge is optimised to run faster
# trips = dd.concat([trip_jan, trip_feb, trip_mar], axis=0)

trips = trip_jan
trips.head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B03404,B03404,2024-01-01 00:21:47,2024-01-01 00:25:06,2024-01-01 00:28:08,2024-01-01 01:05:39,161,158,2.83,...,4.05,2.75,0.0,0.0,40.18,N,N,N,N,N
1,HV0003,B03404,B03404,2024-01-01 00:10:56,2024-01-01 00:11:08,2024-01-01 00:12:53,2024-01-01 00:20:05,137,79,1.57,...,0.89,2.75,0.0,0.0,6.12,N,N,N,N,N
2,HV0003,B03404,B03404,2024-01-01 00:20:04,2024-01-01 00:21:51,2024-01-01 00:23:05,2024-01-01 00:35:16,79,186,1.98,...,1.6,2.75,0.0,0.0,9.47,N,N,N,N,N
3,HV0003,B03404,B03404,2024-01-01 00:35:46,2024-01-01 00:39:59,2024-01-01 00:41:04,2024-01-01 00:56:34,234,148,1.99,...,1.52,2.75,0.0,0.0,11.35,N,N,N,N,N
4,HV0003,B03404,B03404,2024-01-01 00:48:19,2024-01-01 00:56:23,2024-01-01 00:57:21,2024-01-01 01:10:02,148,97,2.65,...,3.43,2.75,0.0,0.0,28.63,N,N,N,N,N


## 2. Load external data
### i. Taxi Zone 

In [8]:
#Taxi Zone

""" 
    load taxi zones into dataframe
    data from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
"""
zones = pd.read_csv('data/taxi_zone_lookup.csv')
zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [9]:
""" 
    create PUZoneName and DOZoneName column in fhvhv dataframe. 
    PUZoneName is the zone name of the PULocationID, 
    DOZoneName is the zone name of the DOLocationID
"""
trips['PUZoneName'] = trips['PULocationID'].map(zones.set_index('LocationID')['Zone'], meta=('PUZoneName', object))
trips['DOZoneName'] = trips['DOLocationID'].map(zones.set_index('LocationID')['Zone'], meta=('DOZoneName', object))

trips.head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,PUZoneName,DOZoneName
0,HV0003,B03404,B03404,2024-01-01 00:21:47,2024-01-01 00:25:06,2024-01-01 00:28:08,2024-01-01 01:05:39,161,158,2.83,...,0.0,0.0,40.18,N,N,N,N,N,Midtown Center,Meatpacking/West Village West
1,HV0003,B03404,B03404,2024-01-01 00:10:56,2024-01-01 00:11:08,2024-01-01 00:12:53,2024-01-01 00:20:05,137,79,1.57,...,0.0,0.0,6.12,N,N,N,N,N,Kips Bay,East Village
2,HV0003,B03404,B03404,2024-01-01 00:20:04,2024-01-01 00:21:51,2024-01-01 00:23:05,2024-01-01 00:35:16,79,186,1.98,...,0.0,0.0,9.47,N,N,N,N,N,East Village,Penn Station/Madison Sq West
3,HV0003,B03404,B03404,2024-01-01 00:35:46,2024-01-01 00:39:59,2024-01-01 00:41:04,2024-01-01 00:56:34,234,148,1.99,...,0.0,0.0,11.35,N,N,N,N,N,Union Sq,Lower East Side
4,HV0003,B03404,B03404,2024-01-01 00:48:19,2024-01-01 00:56:23,2024-01-01 00:57:21,2024-01-01 01:10:02,148,97,2.65,...,0.0,0.0,28.63,N,N,N,N,N,Lower East Side,Fort Greene


### ii. Peak, Off-Peak and Night

In [10]:
""" 
    Group request datetime into Peak, Off-Peak and Night
"""

#Function to group the request_datetime as Peak:2, OffPeak:1 or Night:0
def group_request_time(fhvhv_row):
    day_of_week = fhvhv_row['request_datetime'].weekday()  # Monday=0, Sunday=6
    hour = fhvhv_row['request_datetime'].hour
    night, off_peak, peak = 0, 1, 2

    if hour >= 23 or hour < 6:
        return night
    elif day_of_week < 5:  # Monday to Friday
        if (6 <= hour < 10) or (15 <= hour < 19):
            return peak
        elif (10 <= hour < 15) or (19 <= hour < 23):
            return off_peak
    else:  # Saturday and Sunday
        if hour >= 6 and hour < 23:
            return off_peak
        else:
            return night

#apply function to dataframe
trips['request_time_group'] = trips.apply(group_request_time, axis=1, meta=('request_time_group', object))

trips.head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,PUZoneName,DOZoneName,request_time_group
0,HV0003,B03404,B03404,2024-01-01 00:21:47,2024-01-01 00:25:06,2024-01-01 00:28:08,2024-01-01 01:05:39,161,158,2.83,...,0.0,40.18,N,N,N,N,N,Midtown Center,Meatpacking/West Village West,0
1,HV0003,B03404,B03404,2024-01-01 00:10:56,2024-01-01 00:11:08,2024-01-01 00:12:53,2024-01-01 00:20:05,137,79,1.57,...,0.0,6.12,N,N,N,N,N,Kips Bay,East Village,0
2,HV0003,B03404,B03404,2024-01-01 00:20:04,2024-01-01 00:21:51,2024-01-01 00:23:05,2024-01-01 00:35:16,79,186,1.98,...,0.0,9.47,N,N,N,N,N,East Village,Penn Station/Madison Sq West,0
3,HV0003,B03404,B03404,2024-01-01 00:35:46,2024-01-01 00:39:59,2024-01-01 00:41:04,2024-01-01 00:56:34,234,148,1.99,...,0.0,11.35,N,N,N,N,N,Union Sq,Lower East Side,0
4,HV0003,B03404,B03404,2024-01-01 00:48:19,2024-01-01 00:56:23,2024-01-01 00:57:21,2024-01-01 01:10:02,148,97,2.65,...,0.0,28.63,N,N,N,N,N,Lower East Side,Fort Greene,0


### iii. Holidays in NYC

In [11]:
#Holidays
from datetime import date
import holidays

ny_holidays = holidays.country_holidays('US', subdiv='NY', years=2024)

#load ny_holidays into dataframe, date and holiday name
ny_holidays_df = pd.DataFrame.from_dict(ny_holidays, orient='index')
ny_holidays_df

Unnamed: 0,0
2024-01-01,New Year's Day
2024-05-27,Memorial Day
2024-06-19,Juneteenth National Independence Day
2024-07-04,Independence Day
2024-09-02,Labor Day
2024-11-11,Veterans Day
2024-11-28,Thanksgiving
2024-12-25,Christmas Day
2024-01-15,Martin Luther King Jr. Day
2024-02-19,Washington's Birthday


In [12]:
#reset index and rename columns
ny_holidays_df.reset_index(inplace=True)
ny_holidays_df.columns = ['date', 'holiday_name']
ny_holidays_df

Unnamed: 0,date,holiday_name
0,2024-01-01,New Year's Day
1,2024-05-27,Memorial Day
2,2024-06-19,Juneteenth National Independence Day
3,2024-07-04,Independence Day
4,2024-09-02,Labor Day
5,2024-11-11,Veterans Day
6,2024-11-28,Thanksgiving
7,2024-12-25,Christmas Day
8,2024-01-15,Martin Luther King Jr. Day
9,2024-02-19,Washington's Birthday


In [13]:
#convert holidays to datetime and create is_holiday flag in the trips dataframe
ny_holidays_df.index = pd.to_datetime(ny_holidays_df.index)
trips['is_holiday'] = trips['request_datetime'].dt.date.isin(ny_holidays_df['date']).astype(int)
trips.head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,PUZoneName,DOZoneName,request_time_group,is_holiday
0,HV0003,B03404,B03404,2024-01-01 00:21:47,2024-01-01 00:25:06,2024-01-01 00:28:08,2024-01-01 01:05:39,161,158,2.83,...,40.18,N,N,N,N,N,Midtown Center,Meatpacking/West Village West,0,1
1,HV0003,B03404,B03404,2024-01-01 00:10:56,2024-01-01 00:11:08,2024-01-01 00:12:53,2024-01-01 00:20:05,137,79,1.57,...,6.12,N,N,N,N,N,Kips Bay,East Village,0,1
2,HV0003,B03404,B03404,2024-01-01 00:20:04,2024-01-01 00:21:51,2024-01-01 00:23:05,2024-01-01 00:35:16,79,186,1.98,...,9.47,N,N,N,N,N,East Village,Penn Station/Madison Sq West,0,1
3,HV0003,B03404,B03404,2024-01-01 00:35:46,2024-01-01 00:39:59,2024-01-01 00:41:04,2024-01-01 00:56:34,234,148,1.99,...,11.35,N,N,N,N,N,Union Sq,Lower East Side,0,1
4,HV0003,B03404,B03404,2024-01-01 00:48:19,2024-01-01 00:56:23,2024-01-01 00:57:21,2024-01-01 01:10:02,148,97,2.65,...,28.63,N,N,N,N,N,Lower East Side,Fort Greene,0,1


### iv. NYC weather

In [14]:
#Get Weather data
from datetime import datetime 
from meteostat import Point, Monthly, Daily, Hourly, Normals, units  
# Set time period 
start = datetime(2024, 1, 1) 
end = datetime(2024, 3, 31)  
# Create Point for NY
location = Point(40.712775, -74.005973)  
# Get daily data 
weather_data = Hourly(location, start, end) 
weather_data = weather_data.convert(units.imperial) 
weather_data = weather_data.fetch()

weather_data.head()

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-01-01 00:00:00,42.8,28.6,57.0,0.0,,260.0,6.8,,1017.0,,3.0
2024-01-01 01:00:00,42.1,27.9,57.0,0.0,,260.0,7.0,,1016.4,,3.0
2024-01-01 02:00:00,42.1,28.8,59.0,0.0,,260.0,5.8,,1016.4,,3.0
2024-01-01 03:00:00,42.1,30.0,62.0,0.0,,250.0,5.8,,1016.4,,3.0
2024-01-01 04:00:00,42.1,30.7,64.0,0.0,,260.0,5.8,,1016.5,,3.0


In [15]:
#reset index and convert time to datetime
weather_data.reset_index(inplace=True)
weather_data.rename(columns={'index': 'time'}, inplace=True)
weather_data['time'] = pd.to_datetime(weather_data['time'])

weather_data.head()

Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
0,2024-01-01 00:00:00,42.8,28.6,57.0,0.0,,260.0,6.8,,1017.0,,3.0
1,2024-01-01 01:00:00,42.1,27.9,57.0,0.0,,260.0,7.0,,1016.4,,3.0
2,2024-01-01 02:00:00,42.1,28.8,59.0,0.0,,260.0,5.8,,1016.4,,3.0
3,2024-01-01 03:00:00,42.1,30.0,62.0,0.0,,250.0,5.8,,1016.4,,3.0
4,2024-01-01 04:00:00,42.1,30.7,64.0,0.0,,260.0,5.8,,1016.5,,3.0


In [16]:
#Select the required columns from weather data and merge with trips dataframe
weather_data = weather_data[['time', 'temp', 'coco']]

trip_weather_df = trips.merge(weather_data, left_on='request_datetime', right_on='time').compute()

#drop time column
trip_weather_df.drop('time', axis=1, inplace=True)

trip_weather_df.head()

+------------------------------+----------------+----------------+
| Merge columns                | left dtype     | right dtype    |
+------------------------------+----------------+----------------+
| ('request_datetime', 'time') | datetime64[us] | datetime64[ns] |
+------------------------------+----------------+----------------+
Cast dtypes explicitly to avoid unexpected results.


Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,PUZoneName,DOZoneName,request_time_group,is_holiday,temp,coco
0,HV0003,B03404,B03404,2024-01-01 01:00:00,2024-01-01 00:41:55,2024-01-01 00:47:33,2024-01-01 01:18:28,107,151,5.73,...,N,N,N,N,Gramercy,Manhattan Valley,0,1,42.1,3.0
1,HV0003,B03404,B03404,2024-01-01 01:00:00,2024-01-01 00:47:44,2024-01-01 00:59:08,2024-01-01 01:24:01,7,258,10.56,...,N,N,N,N,Astoria,Woodhaven,0,1,42.1,3.0
2,HV0003,B03404,B03404,2024-01-01 01:00:00,2024-01-01 00:50:53,2024-01-01 00:51:55,2024-01-01 01:23:44,145,265,17.59,...,N,N,N,N,Long Island City/Hunters Point,Outside of NYC,0,1,42.1,3.0
3,HV0003,B03404,B03404,2024-01-01 01:00:00,2024-01-01 00:53:55,2024-01-01 00:54:43,2024-01-01 01:35:37,170,10,14.71,...,N,N,N,N,Murray Hill,Baisley Park,0,1,42.1,3.0
4,HV0003,B03404,B03404,2024-01-01 01:00:00,2024-01-01 00:58:12,2024-01-01 00:58:53,2024-01-01 01:45:24,89,238,15.49,...,N,N,N,N,Flatbush/Ditmas Park,Upper West Side North,0,1,42.1,3.0
