In [1]:
!pip install pyarrow



In [2]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [4]:
# Yellow-taxi trip data for 2025-01, read straight from the remote parquet file.
trips_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"
rides = pd.read_parquet(trips_url)

In [5]:
# Preview the first ten rows of the raw trip records.
rides.head(n=10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,N,229,237,1,10.0,3.5,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,N,236,237,1,5.1,3.5,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,N,141,141,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0
3,2,2025-01-01 00:14:27,2025-01-01 00:20:01,3.0,0.52,1.0,N,244,244,2,7.2,1.0,0.5,0.0,0.0,1.0,9.7,0.0,0.0,0.0
4,2,2025-01-01 00:21:34,2025-01-01 00:25:06,3.0,0.66,1.0,N,244,116,2,5.8,1.0,0.5,0.0,0.0,1.0,8.3,0.0,0.0,0.0
5,2,2025-01-01 00:48:24,2025-01-01 01:08:26,2.0,2.63,1.0,N,239,68,2,19.1,1.0,0.5,0.0,0.0,1.0,24.1,2.5,0.0,0.0
6,1,2025-01-01 00:14:47,2025-01-01 00:16:15,0.0,0.4,1.0,N,170,170,1,4.4,3.5,0.5,2.35,0.0,1.0,11.75,2.5,0.0,0.0
7,1,2025-01-01 00:39:27,2025-01-01 00:51:51,0.0,1.6,1.0,N,234,148,1,12.1,3.5,0.5,2.0,0.0,1.0,19.1,2.5,0.0,0.0
8,1,2025-01-01 00:53:43,2025-01-01 01:13:23,0.0,2.8,1.0,N,148,170,1,19.1,3.5,0.5,3.0,0.0,1.0,27.1,2.5,0.0,0.0
9,2,2025-01-01 00:00:02,2025-01-01 00:09:36,1.0,1.71,1.0,N,237,262,2,11.4,1.0,0.5,0.0,0.0,1.0,16.4,2.5,0.0,0.0


In [6]:
# Calculate ride duration in minutes.
# The subtraction yields a timedelta Series; .dt.total_seconds() converts it in one
# vectorized step instead of calling a Python lambda once per row.
rides['ride_duration'] = (
    rides.tpep_dropoff_datetime - rides.tpep_pickup_datetime
).dt.total_seconds() / 60

In [7]:
# Keep only rides lasting from 1 to 60 minutes (both bounds inclusive).
duration_in_range = rides.ride_duration.between(1, 60)
rides_min = rides[duration_in_range]
rides_min.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,ride_duration
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,N,229,237,1,...,3.5,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0,8.35
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,N,236,237,1,...,3.5,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0,2.55
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,N,141,141,1,...,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0,1.95
3,2,2025-01-01 00:14:27,2025-01-01 00:20:01,3.0,0.52,1.0,N,244,244,2,...,1.0,0.5,0.0,0.0,1.0,9.7,0.0,0.0,0.0,5.566667
4,2,2025-01-01 00:21:34,2025-01-01 00:25:06,3.0,0.66,1.0,N,244,116,2,...,1.0,0.5,0.0,0.0,1.0,8.3,0.0,0.0,0.0,3.533333


In [8]:
# Columns fed to the model: pickup/drop-off zone IDs are treated as categories,
# trip distance as a plain number.
cat_features = [
    "PULocationID",
    "DOLocationID",
]
num_features = [
    "trip_distance",
]

In [9]:
# Convert categorical columns to strings
rides[cat_features] = rides[cat_features].astype(str)

In [10]:
# Prepare training data
train_features = rides[cat_features + num_features].to_dict(orient='records')

In [11]:
# Vectorizer
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features)

In [12]:
# Target
target_column = 'ride_duration'
y_train = rides[target_column].values

In [14]:
# Fit an ordinary least-squares regression on the training matrix
# (fit returns the estimator itself, so construction and fitting can be chained).
model = LinearRegression().fit(X_train, y_train)

# Predict on the same rows the model was trained on
y_predicted = model.predict(X_train)

In [15]:
# RMSE Calculation (measured on the training data itself).
# The `squared=False` flag of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6, so take the square root of the MSE explicitly.
rmse = mean_squared_error(y_train, y_predicted) ** 0.5
print(f'RMSE: {rmse}')

RMSE: 38.10992032931662


In [None]:
# Overlay the distributions of predicted and actual ride durations.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its replacement. Both series are drawn on one explicit axes.
fig, ax = plt.subplots()
sns.histplot(y_predicted, label='prediction', kde=True, stat='density', bins=50, ax=ax)
sns.histplot(y_train, label='actual', kde=True, stat='density', bins=50, ax=ax)

ax.set(xlabel='ride duration (minutes)', ylabel='density')
ax.legend();

