In [13]:
import os
# Change to the macrohive root directory
os.chdir('/Users/alighazizadeh/Documents/GitHub/mlops-zoomcamp/01-intro')
!pwd

/Users/alighazizadeh/Documents/GitHub/mlops-zoomcamp/01-intro


In [2]:
import numpy as np
import pandas as pd

import sklearn
import matplotlib
#from src.data.dataset import MotorImageryDataset


from datetime import datetime
import importlib

# Train linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

%matplotlib inline
import matplotlib.pyplot as plt
# from sklearn.metrics import r2_score
import torch.nn.functional as F
import math
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler

from collections import OrderedDict
from sklearn.preprocessing import StandardScaler
# Fit DictVectorizer
from sklearn.feature_extraction import DictVectorizer


In [14]:
df=pd.read_parquet('data/yellow_tripdata_2023-01.parquet')
print(f'number of columns: {len(df.columns)}')


number of columns: 19


In [18]:

trip_duration = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
df['duration'] = trip_duration


time_filter = (df.tpep_pickup_datetime < '2023-01-31') & (df.tpep_dropoff_datetime > '2023-01-01')

df_time_filter = df[time_filter]

print(f'number of rows before time filter: {len(df)}')
print(f'number of rows after time filter: {len(df_time_filter)}')

# Get dates outside the time filter
dates_outside_filter = df[~time_filter]['tpep_pickup_datetime'].dt.date.unique()
print("\nUnique dates outside the time filter:")
for date in sorted(dates_outside_filter):
    print(date)

trip_durtaion_std_jan=df_time_filter['duration'].std()
print(f'trip_durtaion_std_jan: {trip_durtaion_std_jan:.2f}')

number of rows before time filter: 3066766
number of rows after time filter: 2966359

Unique dates outside the time filter:
2008-12-31
2022-10-24
2022-10-25
2022-12-31
2023-01-31
2023-02-01
trip_durtaion_std_jan: 42.79


In [19]:
duration_filter = (df['duration'] >= 1) & (df['duration'] <= 60)
df_time_duration_filter = df[time_filter & duration_filter]


print(f'original rows: {len(df)}')
print(f'rows after time filter: {len(df_time_filter)}')
print(f'rows after time and duration filter: {len(df_time_duration_filter)}')
print(f'pct rows kept: {(len(df_time_duration_filter)/len(df_time_filter))*100:.2f}%')



original rows: 3066766
rows after time filter: 2966359
rows after time and duration filter: 2910919
pct rows kept: 98.13%


In [20]:
# Convert location IDs to strings and create list of dicts
dicts = df_time_duration_filter[['PULocationID', 'DOLocationID']].astype(str).to_dict(orient='records')
dv = DictVectorizer()
X = dv.fit_transform(dicts)
print(len(df_time_duration_filter))
print(f"number of columns of feature matrix: {X.shape[1]} should be equal to unique number of location ids {len(df_time_duration_filter['PULocationID'].unique())} + {len(df_time_duration_filter['DOLocationID'].unique())} ")


2910919
number of columns of feature matrix: 515 should be equal to unique number of location ids 255 + 260 


In [21]:
# Get target variable
y = df_time_duration_filter['duration'].values

# Initialize and train model
lr = LinearRegression()
lr.fit(X, y)

# Make predictions on training data
y_pred = lr.predict(X)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f'RMSE on training data: {rmse:.2f}')


RMSE on training data: 7.61


In [22]:
# Test on validation data from Feb 2023
df_test=pd.read_parquet('data/yellow_tripdata_2023-02.parquet')
time_test_filter = (df_test.tpep_pickup_datetime < '2023-02-28') & (df_test.tpep_dropoff_datetime > '2023-02-01')

trip_test_duration = (df_test.tpep_dropoff_datetime - df_test.tpep_pickup_datetime).dt.total_seconds() / 60
df_test['duration'] = trip_test_duration
duration_test_filter = (df_test['duration'] >= 1) & (df_test['duration'] <= 60)

df_test_time_filter = df_test[time_test_filter]
df_test_time_duration_filter = df_test[time_test_filter & duration_test_filter]


print(f'original rows: {len(df_test)}')
print(f'rows after time filter: {len(df_test_time_filter)}')
print(f'rows after time and duration filter: {len(df_test_time_duration_filter)}')
print(f'pct rows kept: {(len(df_test_time_duration_filter)/len(df_test_time_filter))*100:.2f}%')

# Get dates outside the time filter
dates_outside_filter = df_test[~time_test_filter]['tpep_pickup_datetime'].dt.date.unique()
print("\nUnique dates outside the time filter:")
for date in sorted(dates_outside_filter):
    print(date)

original rows: 2913955
rows after time filter: 2812777
rows after time and duration filter: 2756446
pct rows kept: 98.00%

Unique dates outside the time filter:
2008-12-31
2009-01-01
2023-01-31
2023-02-28
2023-03-01
2023-03-06
2023-03-07


In [23]:

# Get unique location IDs from training and test data
train_pu_locations = set(df_time_duration_filter['PULocationID'].unique())
train_do_locations = set(df_time_duration_filter['DOLocationID'].unique())
test_pu_locations = set(df_test_time_duration_filter['PULocationID'].unique())
test_do_locations = set(df_test_time_duration_filter['DOLocationID'].unique())

# Find intersections
pu_intersection = train_pu_locations.intersection(test_pu_locations)
do_intersection = train_do_locations.intersection(test_do_locations)

print(f"Number of PULocationID intersections: {len(pu_intersection)}")
print(f"Number of DOLocationID intersections: {len(do_intersection)}")
print(f" number of unique PULocationID in test data: {len(test_pu_locations)}")
print(f" number of unique DOLocationID in test data: {len(test_do_locations)}")

# Check if all test locations are in train locations
test_only_pu = test_pu_locations - train_pu_locations
test_only_do = test_do_locations - train_do_locations

print("\nPULocationIDs in test but not in train:", len(test_only_pu))
if len(test_only_pu) > 0:
    print("PULocationIDs:", sorted(test_only_pu))

print("\nDOLocationIDs in test but not in train:", len(test_only_do)) 
if len(test_only_do) > 0:
    print("DOLocationIDs:", sorted(test_only_do))


if len(test_only_pu) == 0 & len(test_only_do) == 0:
    print("All test locations are in train locations")
else:
    print("Some test locations are not in train locations")


Number of PULocationID intersections: 250
Number of DOLocationID intersections: 260
 number of unique PULocationID in test data: 253
 number of unique DOLocationID in test data: 261

PULocationIDs in test but not in train: 3
PULocationIDs: [59, 105, 204]

DOLocationIDs in test but not in train: 1
DOLocationIDs: [105]
Some test locations are not in train locations


In [24]:
# As precaution, let's use the same locations in test as in training
train_locations = set(df_time_duration_filter['PULocationID'].unique()) | set(df_time_duration_filter['DOLocationID'].unique())

# Filter test data to only include locations that were in training data
valid_locations_mask = (
    df_test_time_duration_filter['PULocationID'].isin(train_locations) & 
    df_test_time_duration_filter['DOLocationID'].isin(train_locations)
)

df_test_time_duration_ID_filter = df_test_time_duration_filter[valid_locations_mask].copy()

# Create a DictVectorizer with all possible combinations from training data
dv = DictVectorizer(sparse=True)

# Fit DictVectorizer on training data only
train_dicts = df_time_duration_filter[['PULocationID', 'DOLocationID']].astype(str).to_dict(orient='records')
X = dv.fit_transform(train_dicts)

# Transform test data using the fitted DictVectorizer
test_dicts = df_test_time_duration_ID_filter[['PULocationID', 'DOLocationID']].astype(str).to_dict(orient='records')
X_test = dv.transform(test_dicts)

print(f"Training features: {X.shape[1]}")
print(f"Test features: {X_test.shape[1]}")

# Retrain the model with the new features
lr = LinearRegression()
y = df_time_duration_filter['duration'].values
lr.fit(X, y)
y_pred = lr.predict(X)
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f'RMSE on training data: {rmse:.2f}')

# Now predict
y_test = df_test_time_duration_ID_filter['duration'].values
y_pred_test = lr.predict(X_test)

rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f'RMSE on test data: {rmse_test:.2f}')

Training features: 515
Test features: 515
RMSE on training data: 7.61
RMSE on test data: 7.83
