## 1. Load up datasets for 3 months
I will be working with data from the first quarter of 2024 (January to March). I would have preferred to load more, but I am not sure my PC can handle the volume.

In [1]:
#load parquet file into dataframe
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import dask.dataframe as dd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, mean_squared_error
import gc


Create a function to read the required columns from a parquet file

In [2]:
def process_parquet(parquet_file_path):
    """Load the trip columns used in this analysis and derive the wait time.

    Parameters
    ----------
    parquet_file_path : path to a parquet file; passed straight to
        ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The columns ``request_datetime``, ``on_scene_datetime``,
        ``PULocationID``, ``DOLocationID`` and ``trip_time``, plus a
        ``wait_time`` column holding ``on_scene_datetime - request_datetime``
        in seconds (NaN where either timestamp is missing).
    """
    timestamp_columns = ['request_datetime', 'on_scene_datetime']
    columns_needed = timestamp_columns + ['PULocationID', 'DOLocationID', 'trip_time']

    # Only the listed columns are read from the file
    frame = pd.read_parquet(parquet_file_path, columns=columns_needed)

    # Make sure both timestamp columns are datetimes before subtracting them
    for column in timestamp_columns:
        frame[column] = pd.to_datetime(frame[column])

    # Elapsed time between the request and the driver being on scene, in seconds
    elapsed = frame['on_scene_datetime'] - frame['request_datetime']
    frame['wait_time'] = elapsed.dt.total_seconds()

    return frame

In [3]:
# January 2024 trips: read the needed columns and preview the first five rows
trip_jan = process_parquet(parquet_file_path='data/fhvhv_tripdata_2024-01.parquet')
trip_jan.head(n=5)

Unnamed: 0,request_datetime,on_scene_datetime,PULocationID,DOLocationID,trip_time,wait_time
0,2024-01-01 00:21:47,2024-01-01 00:25:06,161,158,2251,199.0
1,2024-01-01 00:10:56,2024-01-01 00:11:08,137,79,432,12.0
2,2024-01-01 00:20:04,2024-01-01 00:21:51,79,186,731,107.0
3,2024-01-01 00:35:46,2024-01-01 00:39:59,234,148,930,253.0
4,2024-01-01 00:48:19,2024-01-01 00:56:23,148,97,761,484.0


In [4]:
# February 2024 trips: read the needed columns and preview the first five rows
trip_feb = process_parquet(parquet_file_path='data/fhvhv_tripdata_2024-02.parquet')
trip_feb.head(n=5)

Unnamed: 0,request_datetime,on_scene_datetime,PULocationID,DOLocationID,trip_time,wait_time
0,2024-02-01 00:13:55,2024-02-01 00:19:59,149,210,420,364.0
1,2024-02-01 00:00:14,2024-02-01 00:03:28,127,136,300,194.0
2,2024-02-01 00:13:57,2024-02-01 00:15:44,127,69,848,107.0
3,2024-02-01 00:40:21,NaT,169,169,541,
4,2024-02-01 00:14:14,2024-02-01 00:15:55,142,152,658,101.0


In [5]:
# March 2024 trips: read the needed columns and preview the first five rows
trip_mar = process_parquet(parquet_file_path='data/fhvhv_tripdata_2024-03.parquet')
trip_mar.head(n=5)

Unnamed: 0,request_datetime,on_scene_datetime,PULocationID,DOLocationID,trip_time,wait_time
0,2024-03-01 00:42:28,2024-03-01 00:45:18,148,144,170,170.0
1,2024-03-01 00:54:31,2024-03-01 00:55:55,79,114,291,84.0
2,2024-03-01 00:18:51,2024-03-01 00:20:27,232,148,425,96.0
3,2024-03-01 00:24:09,2024-03-01 00:30:33,148,233,766,384.0
4,2024-03-01 00:05:27,2024-03-01 00:07:42,48,237,1688,135.0


In [6]:
# Stack the three monthly frames into one frame for the quarter.
# ignore_index=True discards the per-month row labels and renumbers from 0.
trips = pd.concat(
    objs=[trip_jan, trip_feb, trip_mar],
    ignore_index=True,
)

trips.head(n=5)

Unnamed: 0,request_datetime,on_scene_datetime,PULocationID,DOLocationID,trip_time,wait_time
0,2024-01-01 00:21:47,2024-01-01 00:25:06,161,158,2251,199.0
1,2024-01-01 00:10:56,2024-01-01 00:11:08,137,79,432,12.0
2,2024-01-01 00:20:04,2024-01-01 00:21:51,79,186,731,107.0
3,2024-01-01 00:35:46,2024-01-01 00:39:59,234,148,930,253.0
4,2024-01-01 00:48:19,2024-01-01 00:56:23,148,97,761,484.0


In [7]:
# To save memory, drop the per-month frames now that `trips` holds the combined
# data, then ask the garbage collector to run.
del trip_jan, trip_feb, trip_mar
gc.collect()

0

## 2. Data Preprocessing
### 2.1. Check for nulls

In [8]:
# Shape of the combined dataframe as (rows, columns)
trips.shape

(60303866, 6)

In [9]:
# Count the missing values in each column
trips.isna().sum()

request_datetime            0
on_scene_datetime    15797599
PULocationID                0
DOLocationID                0
trip_time                   0
wait_time            15797599
dtype: int64

In [10]:
# Drop every row that has a missing value in any column; there is more than
# enough data left afterwards.
trips = trips.dropna(axis=0, how='any')
trips.shape

(44506267, 6)

### 2.4. Grouping and Loading additional data
#### 2.4.1. Peak, off-peak and night time

In [11]:
""" 
    Group request datetime into Peak, Off-Peak and Night
    Night: 10pm to 6am
    Off-Peak: Weekdays, 10am to 3pm and 7pm to 10pm. Weekends, 6am to 10pm
    Peak: Weekdays, 6am to 10am and 3pm to 7pm
"""

# Hour of day and weekday number (Monday=0 ... Sunday=6) of each request
hours = trips['request_datetime'].dt.hour
day_of_week = trips['request_datetime'].dt.weekday

# Monday to Friday are weekdays; everything else is the weekend
weekday_mask = day_of_week < 5
weekend_mask = ~weekday_mask

# Hour windows. Series.between is inclusive on both ends, so between(6, 9)
# covers 06:00 up to (but not including) 10:00 for whole-number hours.
peak_mask = hours.between(6, 9) | hours.between(15, 18)
off_peak_mask = hours.between(10, 14) | hours.between(19, 21)

# Every trip starts as Night (0); daytime windows then overwrite it
trips['request_time_group'] = 0

# Weekdays: Peak (2) and Off-Peak (1)
trips.loc[weekday_mask & peak_mask, 'request_time_group'] = 2
trips.loc[weekday_mask & off_peak_mask, 'request_time_group'] = 1

# Weekends: 06:00 to 22:00 is all Off-Peak (1)
trips.loc[weekend_mask & hours.between(6, 21), 'request_time_group'] = 1

trips.head(n=5)

Unnamed: 0,request_datetime,on_scene_datetime,PULocationID,DOLocationID,trip_time,wait_time,request_time_group
0,2024-01-01 00:21:47,2024-01-01 00:25:06,161,158,2251,199.0,0
1,2024-01-01 00:10:56,2024-01-01 00:11:08,137,79,432,12.0,0
2,2024-01-01 00:20:04,2024-01-01 00:21:51,79,186,731,107.0,0
3,2024-01-01 00:35:46,2024-01-01 00:39:59,234,148,930,253.0,0
4,2024-01-01 00:48:19,2024-01-01 00:56:23,148,97,761,484.0,0


#### 2.4.2. Holidays in NYC

In [12]:
#Holidays
from datetime import date
import holidays

# Build the 2024 holiday calendar for country 'US', subdivision 'NY'.
ny_holidays = holidays.country_holidays('US', subdiv='NY', years=2024)

#load ny_holidays into dataframe, date and holiday name
# orient='index' turns each holiday date into an index label, with the holiday
# name in a single column labelled 0.
ny_holidays_df = pd.DataFrame.from_dict(ny_holidays, orient='index')
ny_holidays_df

Unnamed: 0,0
2024-01-01,New Year's Day
2024-05-27,Memorial Day
2024-06-19,Juneteenth National Independence Day
2024-07-04,Independence Day
2024-09-02,Labor Day
2024-11-11,Veterans Day
2024-11-28,Thanksgiving
2024-12-25,Christmas Day
2024-01-15,Martin Luther King Jr. Day
2024-02-19,Washington's Birthday


In [13]:
# Move the holiday dates out of the index into a regular column, then give the
# two columns descriptive names.
ny_holidays_df = ny_holidays_df.reset_index()
ny_holidays_df.columns = ['date', 'holiday_name']
ny_holidays_df

Unnamed: 0,date,holiday_name
0,2024-01-01,New Year's Day
1,2024-05-27,Memorial Day
2,2024-06-19,Juneteenth National Independence Day
3,2024-07-04,Independence Day
4,2024-09-02,Labor Day
5,2024-11-11,Veterans Day
6,2024-11-28,Thanksgiving
7,2024-12-25,Christmas Day
8,2024-01-15,Martin Luther King Jr. Day
9,2024-02-19,Washington's Birthday


In [14]:
#convert holiday dates to datetime and create is_holiday flag in the trips dataframe
# The dates live in the 'date' column (the index is just 0..n-1), so that column
# is what gets converted. It is cast to the same datetime dtype as
# request_datetime so the membership test compares like with like.
holiday_dates = pd.to_datetime(ny_holidays_df['date']).astype(trips['request_datetime'].dtype)

# normalize() sets each request timestamp to midnight of its day, which keeps the
# comparison vectorised instead of building one Python date object per row.
trips['is_holiday'] = trips['request_datetime'].dt.normalize().isin(holiday_dates).astype(int)
trips.head()

Unnamed: 0,request_datetime,on_scene_datetime,PULocationID,DOLocationID,trip_time,wait_time,request_time_group,is_holiday
0,2024-01-01 00:21:47,2024-01-01 00:25:06,161,158,2251,199.0,0,1
1,2024-01-01 00:10:56,2024-01-01 00:11:08,137,79,432,12.0,0,1
2,2024-01-01 00:20:04,2024-01-01 00:21:51,79,186,731,107.0,0,1
3,2024-01-01 00:35:46,2024-01-01 00:39:59,234,148,930,253.0,0,1
4,2024-01-01 00:48:19,2024-01-01 00:56:23,148,97,761,484.0,0,1


#### 2.4.3. NYC weather

In [15]:
#Get Weather data
from datetime import datetime 
from meteostat import Point, Monthly, Daily, Hourly, Normals, units  

# Set time period.
# Hourly() takes `end` as a timestamp rather than a calendar date, so
# datetime(2024, 3, 31) would stop at midnight and leave 31 March without
# weather. Ending at 2024-04-01 00:00 covers all of 31 March, plus the hour that
# late-evening requests round up to.
start = datetime(2024, 1, 1)
end = datetime(2024, 4, 1)

# Create Point for NY
location = Point(40.712775, -74.005973)

# Get hourly data, converted to imperial units
# NOTE(review): Hourly returns timestamps in UTC unless a `timezone` is passed,
# while the trip timestamps look like local New York time - confirm and align
# before relying on the merged weather values.
weather_data = Hourly(location, start, end)
weather_data = weather_data.convert(units.imperial)
weather_data = weather_data.fetch()

weather_data.head()

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-01-01 00:00:00,42.8,28.6,57.0,0.0,,260.0,6.8,,1017.0,,3.0
2024-01-01 01:00:00,42.1,27.9,57.0,0.0,,260.0,7.0,,1016.4,,3.0
2024-01-01 02:00:00,42.1,28.8,59.0,0.0,,260.0,5.8,,1016.4,,3.0
2024-01-01 03:00:00,42.1,30.0,62.0,0.0,,250.0,5.8,,1016.4,,3.0
2024-01-01 04:00:00,42.1,30.7,64.0,0.0,,260.0,5.8,,1016.5,,3.0


In [16]:
#reset index and convert time to datetime
# Move the hourly timestamp out of the index into a regular 'time' column.
# The guard makes this cell safe to re-run: without it a second run would add
# another column, end up with two 'time' columns and fail in pd.to_datetime.
if 'time' not in weather_data.columns:
    # An unnamed index comes out of reset_index() as 'index', hence the rename
    weather_data = weather_data.reset_index().rename(columns={'index': 'time'})
weather_data['time'] = pd.to_datetime(weather_data['time'])

weather_data.head()

Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
0,2024-01-01 00:00:00,42.8,28.6,57.0,0.0,,260.0,6.8,,1017.0,,3.0
1,2024-01-01 01:00:00,42.1,27.9,57.0,0.0,,260.0,7.0,,1016.4,,3.0
2,2024-01-01 02:00:00,42.1,28.8,59.0,0.0,,260.0,5.8,,1016.4,,3.0
3,2024-01-01 03:00:00,42.1,30.0,62.0,0.0,,250.0,5.8,,1016.4,,3.0
4,2024-01-01 04:00:00,42.1,30.7,64.0,0.0,,260.0,5.8,,1016.5,,3.0


In [18]:
# Round each request time to the nearest hour so it can be matched to the hourly
# weather rows. Lowercase 'h' is used because the uppercase 'H' alias is
# deprecated in recent pandas releases.
trips['rounded_request_datetime'] = trips['request_datetime'].dt.round('h')

In [20]:
# Attach the hourly weather readings to each trip by matching the rounded
# request hour against the weather 'time' column. A left join keeps every trip,
# including those with no matching weather row.
trips_weather_df = pd.merge(
    trips,
    weather_data,
    how='left',
    left_on='rounded_request_datetime',
    right_on='time',
)
trips_weather_df.head(n=5)

Unnamed: 0,request_datetime,on_scene_datetime,PULocationID,DOLocationID,trip_time,wait_time,request_time_group,is_holiday,rounded_request_datetime,time,...,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
0,2024-01-01 00:21:47,2024-01-01 00:25:06,161,158,2251,199.0,0,1,2024-01-01 00:00:00,2024-01-01 00:00:00,...,28.6,57.0,0.0,,260.0,6.8,,1017.0,,3.0
1,2024-01-01 00:10:56,2024-01-01 00:11:08,137,79,432,12.0,0,1,2024-01-01 00:00:00,2024-01-01 00:00:00,...,28.6,57.0,0.0,,260.0,6.8,,1017.0,,3.0
2,2024-01-01 00:20:04,2024-01-01 00:21:51,79,186,731,107.0,0,1,2024-01-01 00:00:00,2024-01-01 00:00:00,...,28.6,57.0,0.0,,260.0,6.8,,1017.0,,3.0
3,2024-01-01 00:35:46,2024-01-01 00:39:59,234,148,930,253.0,0,1,2024-01-01 01:00:00,2024-01-01 01:00:00,...,27.9,57.0,0.0,,260.0,7.0,,1016.4,,3.0
4,2024-01-01 00:48:19,2024-01-01 00:56:23,148,97,761,484.0,0,1,2024-01-01 01:00:00,2024-01-01 01:00:00,...,27.9,57.0,0.0,,260.0,7.0,,1016.4,,3.0
