In [1]:
import numpy as np
import pandas as pd

In [3]:
# read in taxi data frame
dfTaxi = pd.read_csv("2018_Yellow_Taxi_Trip_Data.csv", nrows=1_000, header=0)

# initialize column names
column_names = ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
                'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount',
                'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount']

# reread dataframe to reinitialize columns
dfTaxi = pd.read_csv('2018_Yellow_Taxi_Trip_Data.csv', header=None, names=column_names, skiprows=1)

In [4]:
dfTaxi.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,03/31/2018 03:45:57 PM,03/31/2018 03:50:56 PM,0.0,0.7,1.0,N,239.0,239.0,2.0,5.5,0.0,0.5,0.0,0.0,0.3,6.3
1,1,03/31/2018 03:53:58 PM,03/31/2018 03:56:36 PM,0.0,0.2,1.0,N,239.0,239.0,2.0,3.5,0.0,0.5,0.0,0.0,0.3,4.3
2,1,03/31/2018 03:59:56 PM,03/31/2018 04:08:31 PM,0.0,1.7,1.0,N,239.0,141.0,2.0,8.0,0.0,0.5,0.0,0.0,0.3,8.8
3,2,03/31/2018 03:05:51 PM,03/31/2018 03:29:28 PM,1.0,4.63,1.0,N,229.0,249.0,1.0,19.5,0.0,0.5,4.06,0.0,0.3,24.36
4,2,03/31/2018 03:06:04 PM,03/31/2018 03:17:37 PM,1.0,0.98,1.0,N,100.0,246.0,1.0,8.5,0.0,0.5,1.86,0.0,0.3,11.16


In [5]:
# read in event data
dfEvent = pd.read_csv("NYC_Permitted_Event_Information_-_Historical.csv", nrows=1_000)
dfEvent.head()

Unnamed: 0,Event ID,Event Name,Start Date/Time,End Date/Time,Event Agency,Event Type,Event Borough,Event Location,Event Street Side,Street Closure Type,Community Board,Police Precinct
0,368421,Big Apple Circus,11/18/2017 07:00:00 PM,11/18/2017 08:00:00 PM,Parks Department,Special Event,Manhattan,"Damrosch Park: Damrosch Park ,Damrosch Park: T...",,,7,20
1,330050,Mt. Eden Farmer's Market,11/16/2017 08:00:00 AM,11/16/2017 04:00:00 PM,Parks Department,Special Event,Bronx,Mount Eden Malls: Mount Eden Malls,,,4,44
2,314111,Columbia Greenmarket Thursday,11/21/2017 08:00:00 AM,11/21/2017 05:00:00 PM,Street Activity Permit Office,Farmers Market,Manhattan,BROADWAY between WEST 114 STREET and WEST 1...,East,Sidewalk and Curb Lane Closure,9,26
3,369850,Lawn Maintenance,11/23/2017 12:00:00 AM,11/23/2017 11:58:00 PM,Parks Department,Construction,Manhattan,Madison Square Park: Center Lawn,,,5,13
4,335783,"October, November December model aircraft flying",11/22/2017 09:00:00 AM,11/22/2017 08:00:00 PM,Parks Department,Special Event,Staten Island,LaTourette Park & Golf Course: Model Airplane ...,,,2,122


In [39]:
dfEvent.columns

Index(['Event ID', 'Event Name', 'Start Date/Time', 'End Date/Time',
       'Event Agency', 'Event Type', 'Event Borough', 'Event Location',
       'Event Street Side', 'Street Closure Type', 'Community Board',
       'Police Precinct', 'LocationList', 'start_time', 'end_time',
       'event_time_15min'],
      dtype='object')

In [29]:
dfTaxi.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount'],
      dtype='object')

In [21]:
# Read event data into pandas and extract unique list of locations
# Get unique event locations
unique_event_locations = dfEvent['Event Location'].unique()

# Create a simple mapping from event location strings to readable IDs
event_location_to_taxi_map = {
    event_loc: f"location_{i}" for i, event_loc in enumerate(unique_event_locations)
}

In [25]:
# Feature Engineering 
# convert event time and other features to numeric
dfEvent['start_time'] = pd.to_datetime(dfEvent['Start Date/Time'], format="%m/%d/%Y %I:%M:%S %p")
dfEvent['end_time'] = pd.to_datetime(dfEvent['End Date/Time'], format="%m/%d/%Y %I:%M:%S %p")

dfPre = dfEvent.copy()
dfPre['event_time'] = dfPre['start_time'] - pd.Timedelta(minutes=15)

dfPost = dfEvent.copy()
dfPost['event_time'] = dfPost['end_time']
dfPost['demand_type'] = 'post_event'

dfCombined = pd.concat([dfPre, dfPost], ignore_index=True)

dfCombined['hour'] = dfCombined['event_time'].dt.hour
dfCombined['day_of_week'] = dfCombined['event_time'].dt.dayofweek

dfCombined['mapped_location'] = dfCombined['Event Location'].map(event_location_to_taxi_map)

In [82]:
dfTaxi['pickup_time'] = pd.to_datetime(dfTaxi['tpep_pickup_datetime'], format="%m/%d/%Y %I:%M:%S %p")
dfTaxi['pickup_time_15min'] = dfTaxi['pickup_time'].dt.floor('15min')
dfEvent = dfEvent[dfEvent['Event Location'] != '']

# count trips per location and 15-minute interval
trip_counter = dfTaxi.groupby(['PULocationID', 'pickup_time_15min']).size().reset_index(name='num_trips')

# merge trip counts with event data based on location and time
dfEvent['start_time_15min'] = dfEvent['start_time'].dt.floor('15min')
dfEvent['end_time_15min'] = dfEvent['end_time'].dt.floor('15min')

unique_locations = dfEvent['Event Location'].unique()
event_location_to_taxi_map = {loc: f'location_{i}' for i, loc in enumerate(unique_locations)}

# Convert 'Event Location' to numeric again
dfEvent['mapped_location'] = pd.to_numeric(dfEvent['Event Location'], errors='coerce')
print (dfEvent['mapped_location'])

trip_counter['PULocationID'] = pd.to_numeric(trip_counter['PULocationID'], errors='coerce')
print(trip_counter)
# Merge for before-event rides 
dfStartMerged = pd.merge(
    dfEvent,
    trip_counter,
    left_on=['mapped_location', 'start_time_15min'],
    right_on=['PULocationID', 'pickup_time_15min'],
    how='left'
)

# Merge for after-event rides
dfEndMerged = pd.merge(
    dfStartMerged,
    trip_counter,
    left_on=['mapped_location', 'end_time_15min'],
    right_on=['PULocationID', 'pickup_time_15min'],
    how='left'
)

# Drop redundant columns from right-side merge if needed
dfMerged = dfEndMerged.drop(columns=['PULocationID_y', 'pickup_time_15min_y'])
dfMerged.rename(columns={'PULocationID_x': 'PULocationID', 'pickup_time_15min_x': 'pickup_time_15min'}, inplace=True)

print(dfEvent['mapped_location'])

print(dfEvent['Event Location'].dropna().unique()[:20])

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
995   NaN
996   NaN
997   NaN
998   NaN
999   NaN
Name: mapped_location, Length: 1000, dtype: float64
      PULocationID   pickup_time_15min  num_trips
0              1.0 2018-03-31 16:15:00          1
1              1.0 2018-03-31 18:30:00          1
2              1.0 2018-03-31 19:00:00          2
3              3.0 2018-03-31 16:00:00          1
4              3.0 2018-04-01 02:45:00          1
...            ...                 ...        ...
5686         265.0 2018-04-01 03:15:00          1
5687         265.0 2018-04-01 03:45:00          1
5688         265.0 2018-04-01 04:00:00          2
5689         265.0 2018-04-01 04:15:00          2
5690         265.0 2018-04-01 04:30:00          2

[5691 rows x 3 columns]
0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
995   NaN
996   NaN
997   NaN
998   NaN
999   NaN
Name: mapped_location, Length: 1000, dtype: float64
['Damrosch Park: Damrosch Park ,Damrosch Park: Tent

In [83]:
# Extract hour and day of week from the datetime column
dfMerged['hour'] = dfMerged['start_time_15min'].dt.hour
dfMerged['day_of_week'] = dfMerged['start_time_15min'].dt.dayofweek

X = dfMerged[['hour', 'day_of_week', 'mapped_location']]
X = pd.get_dummies(X, columns=['mapped_location'])  # Convert categorical locations

y = dfMerged['num_trips_x']


print(y)

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
995   NaN
996   NaN
997   NaN
998   NaN
999   NaN
Name: num_trips_x, Length: 1000, dtype: float64


In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [53]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# initialize and train the model 
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

ValueError: Input y contains NaN.

In [None]:
# evaluate model
mse = mean_squared_error(y_test, y_pred)
print(mse)