Baseline Model

In [54]:
import numpy as np
import pandas as pd
import mlflow

In [55]:
# Column names applied to the taxi trip file in place of its own header row
column_names = ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
                'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount',
                'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount']

# Read the taxi data once. skiprows=1 discards the file's header line and
# `names` supplies the column labels, so no second pass is needed to rename
# columns. nrows keeps the 60,000-row cap: previously a capped read was
# immediately overwritten by an uncapped re-read of the entire file.
dfTaxi = pd.read_csv(
    '2018_Yellow_Taxi_Trip_Data.csv',
    header=None,
    names=column_names,
    skiprows=1,
    nrows=60_000,
)

In [56]:
# Preview the first five taxi rows
dfTaxi.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,03/31/2018 03:45:57 PM,03/31/2018 03:50:56 PM,0,0.7,1.0,N,239.0,239.0,2.0,5.5,0.0,0.5,0.0,0.0,0.3,6.3
1,1,03/31/2018 03:53:58 PM,03/31/2018 03:56:36 PM,0,0.2,1.0,N,239.0,239.0,2.0,3.5,0.0,0.5,0.0,0.0,0.3,4.3
2,1,03/31/2018 03:59:56 PM,03/31/2018 04:08:31 PM,0,1.7,1.0,N,239.0,141.0,2.0,8.0,0.0,0.5,0.0,0.0,0.3,8.8
3,2,03/31/2018 03:05:51 PM,03/31/2018 03:29:28 PM,1,4.63,1.0,N,229.0,249.0,1.0,19.5,0.0,0.5,4.06,0.0,0.3,24.36
4,2,03/31/2018 03:06:04 PM,03/31/2018 03:17:37 PM,1,0.98,1.0,N,100.0,246.0,1.0,8.5,0.0,0.5,1.86,0.0,0.3,11.16


In [57]:
# Load the first 10,000 permitted-event records, then preview the top rows
dfEvent = pd.read_csv(
    filepath_or_buffer="NYC_Permitted_Event_Information_-_Historical.csv",
    nrows=10_000,
)
dfEvent.head(n=5)

Unnamed: 0,Event ID,Event Name,Start Date/Time,End Date/Time,Event Agency,Event Type,Event Borough,Event Location,Event Street Side,Street Closure Type,Community Board,Police Precinct
0,368421,Big Apple Circus,11/18/2017 07:00:00 PM,11/18/2017 08:00:00 PM,Parks Department,Special Event,Manhattan,"Damrosch Park: Damrosch Park ,Damrosch Park: T...",,,7,20
1,330050,Mt. Eden Farmer's Market,11/16/2017 08:00:00 AM,11/16/2017 04:00:00 PM,Parks Department,Special Event,Bronx,Mount Eden Malls: Mount Eden Malls,,,4,44
2,314111,Columbia Greenmarket Thursday,11/21/2017 08:00:00 AM,11/21/2017 05:00:00 PM,Street Activity Permit Office,Farmers Market,Manhattan,BROADWAY between WEST 114 STREET and WEST 1...,East,Sidewalk and Curb Lane Closure,9,26
3,369850,Lawn Maintenance,11/23/2017 12:00:00 AM,11/23/2017 11:58:00 PM,Parks Department,Construction,Manhattan,Madison Square Park: Center Lawn,,,5,13
4,335783,"October, November December model aircraft flying",11/22/2017 09:00:00 AM,11/22/2017 08:00:00 PM,Parks Department,Special Event,Staten Island,LaTourette Park & Golf Course: Model Airplane ...,,,2,122


In [58]:
# Inspect the column labels of the event frame
dfEvent.columns

Index(['Event ID', 'Event Name', 'Start Date/Time', 'End Date/Time',
       'Event Agency', 'Event Type', 'Event Borough', 'Event Location',
       'Event Street Side', 'Street Closure Type', 'Community Board',
       'Police Precinct'],
      dtype='object')

In [59]:
# Inspect the column labels of the taxi frame
dfTaxi.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount'],
      dtype='object')

In [None]:
# Parse the timestamp columns used for matching. All three are parsed with the
# same 12-hour "MM/DD/YYYY HH:MM:SS AM/PM" format string.
DATETIME_FORMAT = '%m/%d/%Y %I:%M:%S %p'
dfTaxi['tpep_pickup_datetime'] = pd.to_datetime(dfTaxi['tpep_pickup_datetime'], format=DATETIME_FORMAT)
dfEvent['Start Date/Time'] = pd.to_datetime(dfEvent['Start Date/Time'], format=DATETIME_FORMAT)
dfEvent['End Date/Time'] = pd.to_datetime(dfEvent['End Date/Time'], format=DATETIME_FORMAT)


def _most_common_location(locations):
    """Return the most frequent non-null value of ``locations``.

    Ties resolve to the first entry of ``Series.mode()``. Returns None when
    every value is null, where indexing the empty mode result would raise.
    """
    modes = locations.mode()
    return modes.iloc[0] if not modes.empty else None


# For each event, collect the pickup zones of trips that started inside the
# event's [start, end] window (both ends inclusive).
matches = []
event_windows = dfEvent[['Start Date/Time', 'End Date/Time', 'Event Location']]
for start, end, location in event_windows.itertuples(index=False, name=None):
    in_window = dfTaxi['tpep_pickup_datetime'].between(start, end)
    if not in_window.any():
        # Nothing to record for this event; skipping also keeps empty frames
        # out of the concat below.
        continue

    pickups_during_event = dfTaxi.loc[in_window, ['PULocationID']].copy()
    pickups_during_event['event_location'] = location
    matches.append(pickups_during_event)

# Concatenate all matched pickups; pd.concat rejects an empty list, so fall
# back to an empty frame with the expected columns.
if matches:
    dfMatched = pd.concat(matches)
else:
    dfMatched = pd.DataFrame(columns=['PULocationID', 'event_location'])

# Group by PULocationID and pick the most common event location
if dfMatched.empty:
    pu_to_event_dict = {}
else:
    pu_to_event_dict = (
        dfMatched.groupby('PULocationID')['event_location']
        .agg(_most_common_location)
        .to_dict()
    )
print(pu_to_event_dict)

In [None]:
# Assign every pickup to the 15-minute interval it falls in
dfTaxi['pickup_bin'] = dfTaxi['tpep_pickup_datetime'].dt.floor('15min')

# One row per interval, holding the number of pickups recorded in it
trips_per_bin = dfTaxi.groupby('pickup_bin').size()
trip_counts = trips_per_bin.rename('num_trips').reset_index()

print(trip_counts)
print(trip_counts.columns)

In [48]:
# Preprocess event data: floor each start to its 15-minute bin and record the
# event length in minutes
dfEvent['time_bin'] = dfEvent['Start Date/Time'].dt.floor('15min')
dfEvent['event_duration'] = (dfEvent['End Date/Time'] - dfEvent['Start Date/Time']).dt.total_seconds() / 60

# Expand events to cover all the 15-minute bins they span. The range starts at
# the floored start (time_bin) so the generated timestamps sit on the same
# 15-minute grid as trip_counts['pickup_bin']; starting at the raw start time
# produced off-grid values (e.g. 08:05, 08:20) that could never match a bin.
# Events with a missing start or end are skipped because date_range rejects NaT.
timed_events = dfEvent.dropna(subset=['time_bin', 'End Date/Time'])
event_bins = [
    bin_start
    for first_bin, end in zip(timed_events['time_bin'], timed_events['End Date/Time'])
    for bin_start in pd.date_range(start=first_bin, end=end, freq='15min')
]
# Explicit dtype keeps the merge key datetime-typed even when no bins exist
event_bins = pd.Series(event_bins, dtype='datetime64[ns]').drop_duplicates()

# Associate the bins with events
event_bins_df = pd.DataFrame({'pickup_bin': event_bins, 'has_event': 1})

# Drop a 'has_event' column left by an earlier run of this cell so the merge
# never produces suffixed duplicates, then merge on the pickup bin
base_counts = trip_counts.drop(columns=['has_event'], errors='ignore')
trip_counts = base_counts.merge(event_bins_df, on='pickup_bin', how='left')

# Bins with no event come out of the left merge as NaN; mark them 0
trip_counts['has_event'] = trip_counts['has_event'].fillna(0).astype(int)

# Feature engineering
trip_counts['hour'] = trip_counts['pickup_bin'].dt.hour
trip_counts['day_of_week'] = trip_counts['pickup_bin'].dt.dayofweek
# dayofweek numbers Monday as 0, so 5 and 6 are Saturday and Sunday
trip_counts['is_weekend'] = trip_counts['day_of_week'].isin([5, 6]).astype(int)


# View the final trip_counts dataframe
print(trip_counts)


             pickup_bin  num_trips  has_event  hour  day_of_week  is_weekend
0   2008-12-31 13:00:00          1          0    13            2           0
1   2018-03-03 15:15:00          1          1    15            5           1
2   2018-03-03 16:30:00          1          1    16            5           1
3   2018-03-03 19:00:00          1          1    19            5           1
4   2018-03-03 20:30:00          1          1    20            5           1
..                  ...        ...        ...   ...          ...         ...
175 2018-04-01 17:00:00         38          0    17            6           1
176 2018-04-01 17:45:00          1          0    17            6           1
177 2018-04-01 20:15:00          1          0    20            6           1
178 2018-04-01 23:30:00          1          0    23            6           1
179 2018-04-01 23:45:00          2          0    23            6           1

[180 rows x 6 columns]


In [49]:
# Model inputs: the event flag plus the calendar features; target: trips per bin
feature_columns = ['has_event', 'hour', 'day_of_week', 'is_weekend']
X = trip_counts[feature_columns]
y = trip_counts['num_trips']

In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from mlflow.models.signature import infer_signature

In [23]:
import ray
#import torch
from ray.train import ScalingConfig, RunConfig
#from ray.train.lightning import TorchTrainer, RayDDPStrategy, RayLightningEnvironment
#from pytorch_lightning import Trainer, LightningModule

In [24]:
#mlflow.set_tracking_uri("http://129.114.25.171:8000/") 

In [25]:
# initialize ray
#ray.init(ignore_reinit_error=True)

In [53]:
# Cast the features to float32 and hold out 20% of the rows for evaluation
X = X.astype('float32')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Look up the pickup bins belonging to each split via the preserved row index
bins_train = trip_counts.loc[X_train.index, 'pickup_bin']
bins_test = trip_counts.loc[X_test.index, 'pickup_bin']

mlflow.set_experiment("taxi_trip_prediction")

with mlflow.start_run():
    n_estimators = 100
    random_state = 42

    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("random_state", random_state)

    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("mean_squared_error", mean_squared_error(y_test, y_pred))

    # Infer the signature from real model inputs and the model's own
    # predictions. Passing y_train recorded the training target's dtype as the
    # output schema rather than what predict() actually returns.
    input_sig = infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(model, "random_forest_model", signature=input_sig)

    # Pair each held-out bin with its prediction, ordered by time
    output_df = pd.DataFrame({
        'pickup_bin': bins_test.values,
        'predicted_num_trips': y_pred
    }).sort_values('pickup_bin')

print(output_df)

KeyboardInterrupt: 

In [49]:
# Display the per-bin predictions built by the training cell
output_df

Results for n_estimators=50
            pickup_bin  predicted_num_trips  n_estimators  r2_score
24 2018-03-03 23:30:00                 1.00            50  0.975042
5  2018-03-04 02:15:00                 1.02            50  0.975042
15 2018-03-04 02:30:00                 1.02            50  0.975042
27 2018-03-04 04:15:00                 1.02            50  0.975042
0  2018-03-04 04:30:00                 1.02            50  0.975042 

Results for n_estimators=87
            pickup_bin  predicted_num_trips  n_estimators  r2_score
24 2018-03-03 23:30:00             1.000000            87  0.975471
5  2018-03-04 02:15:00             1.011494            87  0.975471
15 2018-03-04 02:30:00             1.011494            87  0.975471
27 2018-03-04 04:15:00             1.011494            87  0.975471
0  2018-03-04 04:30:00             1.011494            87  0.975471 

Results for n_estimators=125
            pickup_bin  predicted_num_trips  n_estimators  r2_score
24 2018-03-03 23:30:00     

In [50]:
import pickle

# Serialize the fitted model to model.pkl in the working directory
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [52]:
# End whichever MLflow run is currently active. The training cell opens its
# run in a `with` block, so this presumably only matters when that cell was
# interrupted or a run was started manually — TODO confirm it is still needed.
mlflow.end_run()