# Build an ML model mini project

In [1]:
!pip install -q pyarrow scikit-learn matplotlib pandas

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


In [3]:
import ssl
import certifi


def _certifi_https_context():
    """Build a default SSL context that trusts certifi's CA bundle."""
    return ssl.create_default_context(cafile=certifi.where())


# Replace the ssl module's default HTTPS context factory with the certifi-backed
# one. NOTE(review): presumably this is so the HTTPS download of the dataset in
# the next cell verifies against certifi's certificates -- confirm it is still
# needed on the target runtime, since it overrides a private ssl attribute.
ssl._create_default_https_context = _certifi_https_context

In [4]:
# --- Load: January 2022 Yellow Taxi trips (reading parquet requires pyarrow) ---
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet"
df = pd.read_parquet(url)

# Preview the first five rows before any cleaning is applied.
display(df.head())

# --- Clean: discard every row that has a NULL in any column, then derive
# 'trip_duration' in minutes as (dropoff - pickup) seconds / 60. ---
df = df.dropna().assign(
    trip_duration=lambda trips: (
        trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    ).dt.total_seconds() / 60.0
)

# --- Modelling columns: the target and the features used to predict it ---
target_variable = 'total_amount'
feature_cols = [
    'VendorID', 'trip_distance', 'payment_type',
    'PULocationID', 'DOLocationID', 'trip_duration',
]

# (Optional) Cap the frame at 300,000 rows to keep memory manageable on Colab;
# the fixed random_state makes the sample repeatable.
if len(df) > 300_000:
    df = df.sample(n=300_000, random_state=42)

# Final sanity check: no NULLs in the modelling columns, then report the
# dtypes of those columns and the overall frame shape.
model_cols = feature_cols + [target_variable]
df = df.dropna(subset=model_cols)
print(df[model_cols].dtypes)
print(df.shape)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


VendorID           int64
trip_distance    float64
payment_type       int64
PULocationID       int64
DOLocationID       int64
trip_duration    float64
total_amount     float64
dtype: object
(300000, 20)


In [5]:
# Separate predictors from the target; copies keep later edits from touching df.
X = df.loc[:, feature_cols].copy()
y = df.loc[:, target_variable].astype(float).copy()

# Hold out 20% of the rows for testing; the seed fixes which rows are held out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

# Show the shapes of both partitions as the cell output.
(X_train.shape, X_test.shape)


((240000, 6), (60000, 6))

In [6]:
# Naive baseline: predict the training-set mean of the target for every test
# row. Any real model should achieve a lower MAE than this.
train_mean = y_train.mean()
y_pred_baseline = np.full(len(y_test), train_mean, dtype=float)
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
print(f"Baseline MAE (predict mean): {mae_baseline:,.2f}")


Baseline MAE (predict mean): 9.10


In [7]:
# Column groups: continuous measurements are standardised, while the integer
# ID/code columns are treated as categories and one-hot encoded.
numeric_features = ['trip_distance', 'trip_duration']
categorical_features = ['VendorID', 'payment_type', 'PULocationID', 'DOLocationID']

scale_numeric = ('num', StandardScaler(), numeric_features)
# handle_unknown='ignore' lets the encoder accept category values at predict
# time that were not present when it was fitted.
encode_categorical = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

# Preprocessing and the regressor are chained so both are fitted on the
# training rows only.
linreg_pipe = Pipeline(steps=[('prep', preprocessor), ('model', LinearRegression())])

# Fit on the training split, then score on the held-out split.
y_pred_lin = linreg_pipe.fit(X_train, y_train).predict(X_test)
mae_lin = mean_absolute_error(y_test, y_pred_lin)
print(f"Linear Regression MAE: {mae_lin:,.2f} (Baseline: {mae_baseline:,.2f})")


Linear Regression MAE: 2.76 (Baseline: 9.10)


In [8]:
# Random forest on the same preprocessing as the linear model.
#
# The previous configuration (200 trees, unlimited depth, min_samples_split=2)
# did not finish: the recorded run of this cell ended in KeyboardInterrupt, so
# mae_rf was never defined for the comparison printed by the tuning cell.
# Fully grown trees on the full training split are the likely cost driver, so
# tree size and per-tree sample size are bounded here:
#   - n_estimators / max_depth / min_samples_split use values that also appear
#     in the tuning grid, so this model stays comparable to the tuned one;
#   - max_samples=0.3 makes each tree bootstrap 30% of the training rows.
# NOTE(review): these limits trade some accuracy for a tractable fit; relax
# them if the runtime allows.
rf_pipe = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', RandomForestRegressor(
        n_estimators=100,
        max_depth=20,
        min_samples_split=10,
        max_samples=0.3,
        n_jobs=-1,        # build the trees in parallel on all available cores
        random_state=8    # fixed seed for repeatable forests
    ))
])

# Fit on the training split, then score on the held-out split.
rf_pipe.fit(X_train, y_train)
y_pred_rf = rf_pipe.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"Random Forest MAE: {mae_rf:,.2f} (Linear: {mae_lin:,.2f} | Baseline: {mae_baseline:,.2f})")


KeyboardInterrupt: 

In [None]:
# Hyper-parameter search for the random forest.
# The grid has 2 * 3 * 3 = 18 candidates; with cv=3 that is 54 fits, plus one
# final refit of the best candidate on the whole training split.
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10],
}

# The forest itself is parallel (n_jobs=-1), so every individual fit -- and the
# final refit and prediction -- uses all cores.
rf_grid = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', RandomForestRegressor(n_jobs=-1, random_state=8))
])

# Parallelism is applied at one level only. Running the search with n_jobs=-1
# on top of an n_jobs=-1 forest nests two layers of workers, which can
# oversubscribe the CPU and keep many forests in memory at the same time.
# With n_jobs=1 the candidates are evaluated one after another; the selected
# parameters and scores are unaffected because the seeds are fixed.
grid = GridSearchCV(
    rf_grid,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # sklearn maximises scores, hence negated MAE
    cv=3,
    n_jobs=1,
    verbose=1
)

grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)
# best_score_ is the negated MAE, so flip the sign for reporting.
print("Best CV MAE:", -grid.best_score_)

# best_estimator_ is the pipeline refitted on the full training split with the
# best parameters; evaluate it once on the held-out test split.
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test)
mae_best = mean_absolute_error(y_test, y_pred_best)
print(f"Tuned RF MAE: {mae_best:,.2f} (RF: {mae_rf:,.2f} | Linear: {mae_lin:,.2f} | Baseline: {mae_baseline:,.2f})")
