## 1. Load up datasets for 3 months
I will be working with first quarter 20224 data. Would have preferred to load more but not sure my PC can handle the volume.

In [4]:
#load parquet file into dataframe
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import dask.dataframe as dd

""" 
Here are some benefits of continuing to work with dask DataFrames:
* Faster computation: dask can parallelize computations across multiple cores, making it faster than pandas for large datasets.
* Less memory usage: dask can handle larger-than-memory datasets, so you don't need to worry about running out of RAM.
* More flexible: dask provides more advanced features for working with large datasets, such as parallelizing computations and handling out-of-core data.
"""
#Jan 2024
fhvhv_jan = dd.read_parquet('data/fhvhv_tripdata_2024-01.parquet')

fhvhv_jan.head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B03404,B03404,2024-01-01 00:21:47,2024-01-01 00:25:06,2024-01-01 00:28:08,2024-01-01 01:05:39,161,158,2.83,...,4.05,2.75,0.0,0.0,40.18,N,N,N,N,N
1,HV0003,B03404,B03404,2024-01-01 00:10:56,2024-01-01 00:11:08,2024-01-01 00:12:53,2024-01-01 00:20:05,137,79,1.57,...,0.89,2.75,0.0,0.0,6.12,N,N,N,N,N
2,HV0003,B03404,B03404,2024-01-01 00:20:04,2024-01-01 00:21:51,2024-01-01 00:23:05,2024-01-01 00:35:16,79,186,1.98,...,1.6,2.75,0.0,0.0,9.47,N,N,N,N,N
3,HV0003,B03404,B03404,2024-01-01 00:35:46,2024-01-01 00:39:59,2024-01-01 00:41:04,2024-01-01 00:56:34,234,148,1.99,...,1.52,2.75,0.0,0.0,11.35,N,N,N,N,N
4,HV0003,B03404,B03404,2024-01-01 00:48:19,2024-01-01 00:56:23,2024-01-01 00:57:21,2024-01-01 01:10:02,148,97,2.65,...,3.43,2.75,0.0,0.0,28.63,N,N,N,N,N


In [5]:
#Feb 2024
# fhvhv_feb = dd.read_parquet('data/fhvhv_tripdata_2024-02.parquet') 

# fhvhv_feb.head()

In [6]:
#Mar 2024
# fhvhv_mar = dd.read_parquet('data/fhvhv_tripdata_2024-03.parquet') 

# fhvhv_mar.head()

In [7]:
#merge the three dataframes, ensure the merge is optimised to run faster
# fhvhv = dd.concat([fhvhv_jan, fhvhv_feb, fhvhv_mar], axis=0)
# fhvhv.head(10)

fhvhv = fhvhv_jan

## 2. Load external data
Zone, Weather and Public Holidays 

In [8]:
#Zone

#load zones into dataframe
zones = pd.read_csv('data/taxi_zone_lookup.csv')
zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [9]:
#create PUZoneName and DOZoneName column in fhvhv dataframe. PUZoneName is the zone name of the PULocationID, DOZoneName is the zone name of the DOLocationID
fhvhv['PUZoneName'] = fhvhv['PULocationID'].map(zones.set_index('LocationID')['Zone'], meta=('PUZoneName', object))
fhvhv['DOZoneName'] = fhvhv['DOLocationID'].map(zones.set_index('LocationID')['Zone'], meta=('DOZoneName', object))

fhvhv.head()


Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,PUZoneName,DOZoneName
0,HV0003,B03404,B03404,2024-01-01 00:21:47,2024-01-01 00:25:06,2024-01-01 00:28:08,2024-01-01 01:05:39,161,158,2.83,...,0.0,0.0,40.18,N,N,N,N,N,Midtown Center,Meatpacking/West Village West
1,HV0003,B03404,B03404,2024-01-01 00:10:56,2024-01-01 00:11:08,2024-01-01 00:12:53,2024-01-01 00:20:05,137,79,1.57,...,0.0,0.0,6.12,N,N,N,N,N,Kips Bay,East Village
2,HV0003,B03404,B03404,2024-01-01 00:20:04,2024-01-01 00:21:51,2024-01-01 00:23:05,2024-01-01 00:35:16,79,186,1.98,...,0.0,0.0,9.47,N,N,N,N,N,East Village,Penn Station/Madison Sq West
3,HV0003,B03404,B03404,2024-01-01 00:35:46,2024-01-01 00:39:59,2024-01-01 00:41:04,2024-01-01 00:56:34,234,148,1.99,...,0.0,0.0,11.35,N,N,N,N,N,Union Sq,Lower East Side
4,HV0003,B03404,B03404,2024-01-01 00:48:19,2024-01-01 00:56:23,2024-01-01 00:57:21,2024-01-01 01:10:02,148,97,2.65,...,0.0,0.0,28.63,N,N,N,N,N,Lower East Side,Fort Greene


In [10]:
#Convert the request_datetime to datetime
#fhvhv['request_datetime'] = pd.to_datetime(fhvhv['request_datetime'])

#Function to group the request_datetime as Peak:2, OffPeak:1 or Night:0
def group_request_time(fhvhv_row):
    day_of_week = fhvhv_row['request_datetime'].weekday()  # Monday=0, Sunday=6
    hour = fhvhv_row['request_datetime'].hour
    night, off_peak, peak = 0, 1, 2

    if hour >= 23 or hour < 6:
        return night
    elif day_of_week < 5:  # Monday to Friday
        if (6 <= hour < 10) or (15 <= hour < 19):
            return peak
        elif (10 <= hour < 15) or (19 <= hour < 23):
            return off_peak
    else:  # Saturday and Sunday
        if hour >= 6 and hour < 23:
            return off_peak
        else:
            return night

#apply function to dataframe
fhvhv['request_time_group'] = fhvhv.apply(group_request_time, axis=1, meta=('request_time_group', object))

fhvhv.head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,PUZoneName,DOZoneName,request_time_group
0,HV0003,B03404,B03404,2024-01-01 00:21:47,2024-01-01 00:25:06,2024-01-01 00:28:08,2024-01-01 01:05:39,161,158,2.83,...,0.0,40.18,N,N,N,N,N,Midtown Center,Meatpacking/West Village West,0
1,HV0003,B03404,B03404,2024-01-01 00:10:56,2024-01-01 00:11:08,2024-01-01 00:12:53,2024-01-01 00:20:05,137,79,1.57,...,0.0,6.12,N,N,N,N,N,Kips Bay,East Village,0
2,HV0003,B03404,B03404,2024-01-01 00:20:04,2024-01-01 00:21:51,2024-01-01 00:23:05,2024-01-01 00:35:16,79,186,1.98,...,0.0,9.47,N,N,N,N,N,East Village,Penn Station/Madison Sq West,0
3,HV0003,B03404,B03404,2024-01-01 00:35:46,2024-01-01 00:39:59,2024-01-01 00:41:04,2024-01-01 00:56:34,234,148,1.99,...,0.0,11.35,N,N,N,N,N,Union Sq,Lower East Side,0
4,HV0003,B03404,B03404,2024-01-01 00:48:19,2024-01-01 00:56:23,2024-01-01 00:57:21,2024-01-01 01:10:02,148,97,2.65,...,0.0,28.63,N,N,N,N,N,Lower East Side,Fort Greene,0


In [11]:
#Holidays
from datetime import date
import holidays

ny_holidays = holidays.country_holidays('US', subdiv='NY', years=2024)

ny_holidays

#load ny_holidays into dataframe, date and holiday name

ny_holidays_df = pd.DataFrame.from_dict(ny_holidays, orient='index')

ny_holidays_df

Unnamed: 0,0
2024-01-01,New Year's Day
2024-05-27,Memorial Day
2024-06-19,Juneteenth National Independence Day
2024-07-04,Independence Day
2024-09-02,Labor Day
2024-11-11,Veterans Day
2024-11-28,Thanksgiving
2024-12-25,Christmas Day
2024-01-15,Martin Luther King Jr. Day
2024-02-19,Washington's Birthday


In [13]:
#convert holidays to datetime
ny_holidays_df.index = pd.to_datetime(ny_holidays_df.index)

fhvhv['is_holiday'] = fhvhv['request_datetime'].dt.date.isin(ny_holidays_df.index).astype(int)

fhvhv.head(20)

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,PUZoneName,DOZoneName,request_time_group,is_holiday
0,HV0003,B03404,B03404,2024-01-01 00:21:47,2024-01-01 00:25:06,2024-01-01 00:28:08,2024-01-01 01:05:39,161,158,2.83,...,40.18,N,N,N,N,N,Midtown Center,Meatpacking/West Village West,0,0
1,HV0003,B03404,B03404,2024-01-01 00:10:56,2024-01-01 00:11:08,2024-01-01 00:12:53,2024-01-01 00:20:05,137,79,1.57,...,6.12,N,N,N,N,N,Kips Bay,East Village,0,0
2,HV0003,B03404,B03404,2024-01-01 00:20:04,2024-01-01 00:21:51,2024-01-01 00:23:05,2024-01-01 00:35:16,79,186,1.98,...,9.47,N,N,N,N,N,East Village,Penn Station/Madison Sq West,0,0
3,HV0003,B03404,B03404,2024-01-01 00:35:46,2024-01-01 00:39:59,2024-01-01 00:41:04,2024-01-01 00:56:34,234,148,1.99,...,11.35,N,N,N,N,N,Union Sq,Lower East Side,0,0
4,HV0003,B03404,B03404,2024-01-01 00:48:19,2024-01-01 00:56:23,2024-01-01 00:57:21,2024-01-01 01:10:02,148,97,2.65,...,28.63,N,N,N,N,N,Lower East Side,Fort Greene,0,0
5,HV0003,B03404,B03404,2024-01-01 00:03:47,2024-01-01 00:05:53,2024-01-01 00:06:15,2024-01-01 00:27:53,255,95,7.02,...,24.35,N,N,N,N,Y,Williamsburg (North Side),Forest Hills,0,0
6,HV0003,B03404,B03404,2024-01-01 00:22:51,2024-01-01 00:29:17,2024-01-01 00:29:47,2024-01-01 00:50:08,95,212,11.33,...,30.98,N,N,N,N,Y,Forest Hills,Soundview/Bruckner,0,0
7,HV0003,B03404,B03404,2024-01-01 00:45:34,2024-01-01 00:57:29,2024-01-01 00:57:50,2024-01-01 01:11:27,213,47,3.43,...,20.73,N,N,N,N,Y,Soundview/Castle Hill,Claremont/Bathgate,0,0
8,HV0003,B03404,B03404,2024-01-01 00:11:51,2024-01-01 00:15:46,2024-01-01 00:16:00,2024-01-01 00:28:13,209,114,1.54,...,10.4,N,N,N,N,Y,Seaport,Greenwich Village South,0,0
9,HV0003,B03404,B03404,2024-01-01 00:26:48,2024-01-01 00:33:02,2024-01-01 00:33:15,2024-01-01 00:46:39,113,209,1.72,...,11.38,N,N,N,N,Y,Greenwich Village North,Seaport,0,0


In [14]:
#Weather
from datetime import datetime 
from meteostat import Point, Monthly, Daily, Hourly, Normals, units  
# Set time period 
start = datetime(2024, 1, 1) 
end = datetime(2024, 3, 31)  
# Create Point for NY
location = Point(40.712775, -74.005973)  
# Get daily data 
data = Hourly(location, start, end) 
data = data.convert(units.imperial) 
data = data.fetch()

data

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-01-01 00:00:00,42.8,28.6,57.0,0.0,,260.0,6.8,,1017.0,,3.0
2024-01-01 01:00:00,42.1,27.9,57.0,0.0,,260.0,7.0,,1016.4,,3.0
2024-01-01 02:00:00,42.1,28.8,59.0,0.0,,260.0,5.8,,1016.4,,3.0
2024-01-01 03:00:00,42.1,30.0,62.0,0.0,,250.0,5.8,,1016.4,,3.0
2024-01-01 04:00:00,42.1,30.7,64.0,0.0,,260.0,5.8,,1016.5,,3.0
...,...,...,...,...,...,...,...,...,...,...,...
2024-03-30 20:00:00,60.8,25.7,26.0,0.0,,260.0,12.4,,1007.0,,3.0
2024-03-30 21:00:00,59.0,23.2,25.0,0.0,,270.0,16.2,,1006.7,,3.0
2024-03-30 22:00:00,57.9,26.6,30.0,0.0,,270.0,15.0,,1007.0,,3.0
2024-03-30 23:00:00,55.4,30.2,38.0,0.0,,270.0,16.2,,1008.0,,3.0
