# Project Deliverable 3 - Group 33
#### Group Members: Bethany Findlay, Charlotte Albert, Kaykay Akpama, Kosi Udechukwu

## Notebook Set-Up

In [12]:
#Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from sklearn import inspection
from sklearn import metrics
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.inspection import PartialDependenceDisplay
from sklearn.linear_model import LinearRegression
from sklearn.metrics import PredictionErrorDisplay
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [13]:
# Load the cleaned dataset written by the earlier project deliverables.
# The path is relative to the directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer="project_deliverable_1_cleaned.csv")

In [14]:
# Feature engineering carried over unchanged from Deliverable 2.
# Timestamps that cannot be parsed become NaT (errors='coerce') rather than raising.
pickup_timestamps = pd.to_datetime(df['pickup_datetime'], errors='coerce')
df['pickup_datetime'] = pickup_timestamps
df['pickup_minute'] = pickup_timestamps.dt.minute
# Time of day as a decimal hour, e.g. 14:30 -> 14.5.
# NOTE(review): relies on a 'pickup_hour' column already present in the CSV.
df['pickup_time_fractional'] = df['pickup_hour'] + df['pickup_minute'] / 60

# Weekday name -> integer code, Monday=0 through Sunday=6.
# Values of 'pickup_day' that are not in the mapping come out as NaN.
day_mapping = {
    day_name: day_index
    for day_index, day_name in enumerate([
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    ])
}
df['pickup_day_num'] = df['pickup_day'].map(day_mapping)

## 1. Full Pipeline Construction

In [15]:
# Predictor columns: the expanded feature set from Deliverable 2.
# The order is kept as-is because it fixes the column order of X.
numeric_features = [
    'trip_distance_km',
    # pickup timing
    'pickup_day_num', 'pickup_hour', 'pickup_minute', 'pickup_time_fractional',
    # pickup and drop-off coordinates
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]

# Column the models are trained to predict.
target = 'trip_duration'

In [16]:
# Numeric preprocessing: impute first, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),  # missing values -> column median
    ("scaler", StandardScaler()),                   # rescale each feature
])

# Apply the numeric steps to the selected feature columns only.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
])

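The SimpleImputer fills any missing numeric values with the column median, and StandardScaler then puts all numeric features on a similar scale, helping models train more effectively; this matters most for distance-based models such as KNN, where features with large ranges would otherwise dominate. The ColumnTransformer is used so the preprocessing steps apply only to the selected numeric columns. This keeps the workflow clear and ensures the same steps are used during both training and testing.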

In [17]:
# Build the two model pipelines (KNN Regression & Linear Regression).
# Each pipeline gets its own clone of the preprocessing ColumnTransformer.
# Pipeline.fit fits its steps in place, so if both pipelines held the same
# `preprocessor` object, fitting one model would overwrite the preprocessing
# state the other model relies on.
linreg_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", LinearRegression())
])

knn_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    # Each prediction is based on the 10 nearest training points.
    ("model", KNeighborsRegressor(n_neighbors=10))
])

The two model families selected for comparison are Linear Regression and KNN Regression, as in Deliverable 2. Linear Regression provides a simple baseline model that is fast and easy to interpret. KNN Regression captures more complex patterns by basing each prediction on nearby data points.

In [18]:
# Separate predictors from the target, then hold out 20% of the rows for testing.
# The fixed random_state makes the split reproducible across runs.
X, y = df[numeric_features], df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# fit and evaluate with RMSE, MAE, R^2
def evaluate(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object with a ``predict`` method
        Must already be fitted.
    X_test : features passed to ``model.predict``.
    y_test : true target values matching ``X_test``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)`` in that order.
    """
    y_hat = model.predict(X_test)
    # RMSE is the square root of the mean squared error.
    squared_error_mean = mean_squared_error(y_test, y_hat)
    return (
        mean_absolute_error(y_test, y_hat),
        np.sqrt(squared_error_mean),
        r2_score(y_test, y_hat),
    )

# Train both pipelines on the training split, then score each on the test split.
# The bracketed list is built before any scoring starts, so both fits finish
# first; fit() returns the fitted pipeline itself.
linreg_metrics, knn_metrics = (
    evaluate(fitted_pipeline, X_test, y_test)
    for fitted_pipeline in [
        linreg_model.fit(X_train, y_train),
        knn_model.fit(X_train, y_train),
    ]
)

The evaluation function uses MAE, RMSE, and R^2 because these three metrics together give an overall view of model performance for a regression task. MAE shows the average size of the errors, RMSE penalizes larger mistakes more strongly, and R^2 shows how much of the variation in trip duration the model can explain. Using multiple metrics helps to compare the models against one another.

In [20]:
# Print both models' test metrics in one shared layout for side-by-side comparison.
# Each metrics tuple is ordered (MAE, RMSE, R^2), as returned by evaluate().
# Model sections are separated by a single blank line.
print("\n\n".join(
    f"{label} Results:\nMAE: {mae:.3f} secs\nRMSE: {rmse:.3f} secs\nR^2: {r2:.3f}"
    for label, (mae, rmse, r2) in [
        ("Linear Regression", linreg_metrics),
        ("KNN Regression", knn_metrics),
    ]
))

Linear Regression Results:
MAE: 276.784 secs
RMSE: 407.309 secs
R^2: 0.604

KNN Regression Results:
MAE: 199.656 secs
RMSE: 316.123 secs
R^2: 0.761


## 2. Systematic Hyperparameter Tuning

## 3. Final Model Selection and Evaluation

## 4. Integration of TA Feedback

## 5. Final Reflection

## 6. Individual Contributions

## BONUS: Kaggle Submission