In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('nytaxi2022.csv')
df.head()

  df = pd.read_csv('nytaxi2022.csv')


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,01/01/2022 12:35:40 AM,01/01/2022 12:53:29 AM,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,01/01/2022 12:33:43 AM,01/01/2022 12:42:07 AM,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,01/01/2022 12:53:21 AM,01/01/2022 01:02:19 AM,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,01/01/2022 12:25:21 AM,01/01/2022 12:35:23 AM,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,01/01/2022 12:36:48 AM,01/01/2022 01:14:20 AM,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [3]:
df.shape

(39656098, 19)

In [4]:
import pandas as pd

# convert "" or whitespace-only cells to NA so pandas counts them as missing
df = df.replace(r"^\s*$", pd.NA, regex=True)

The code above just turns empty or whitespace-only strings into proper missing values. 

* As CSV files often store "empty" cells as "" or spaces. By default, pandas won't treat " " as missing. Converting them to pd.NA makes your missingness checks and imputations work correctly. 

In [5]:
# number of missing row within a columns
df.isna().sum()            # per-column counts

VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count          1368303
trip_distance                  0
RatecodeID               1368303
store_and_fwd_flag       1368303
PULocationID                   0
DOLocationID                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge     1368303
airport_fee              1368303
dtype: int64

In [6]:
N = len(df)
cols = ["passenger_count","RatecodeID","store_and_fwd_flag","congestion_surcharge","airport_fee"]

miss_tbl = (
    df[cols].isna().sum()
      .to_frame("missing")
      .assign(pct=lambda t: (t["missing"]/N*100).round(2))
)
miss_tbl  # quick table

# Are the *same* rows missing all five?
co_missing = df[cols].isna().all(axis=1)
co_missing.sum(), (co_missing.mean()*100).round(2)  # count & %

(np.int64(1368303), np.float64(3.45))

The code computes how much data is missing in a specific set of columns, then checks if it is the same rows missing across all of them. Firstly, it builds `miss_tbl`: for each column in ["passenger_count", "RatecodeID", "store_and_fwd_flag", "congestion_surcharge", "airport_fee"], it counts missing values and adds a percentage of total rows (N). Next, co_missing = df[cols].isna().all(axis=1) flags rows where all five of those columns are missing; co_missing.sum() gives the number of such rows and `co_missing.mean()*100` gives their percentage. 
This quickly shows both per-column gaps and whether there's a shared "co-missing" pattern-useful for deciding whether to impute together, drop those rows, or treat them specifically.

Output confirms that the same 1,368,303 (~3.45%) are missing values in these columns: passenger_count, RatecodeID, store_and_fwd_flag, congestion_surcharge, airport_fee. Since co_missing.sum() is 1,368,303 and that matches the per-column missing counts you showed earlier, it means the same 1,368,303 rows are missing values in every one of those columns (≈3.45% of the dataset). There are no rows where only a subset of those columns is missing.

In [7]:
TARGET = "total_amount"

allowed = ["tpep_pickup_datetime","tpep_dropoff_datetime",
    "passenger_count","trip_distance","RatecodeID",
    "PULocationID","DOLocationID","payment_type","extra"]

leakage = ["fare_amount","mta_tax","tip_amount","tolls_amount",
    "improvement_surcharge","congestion_surcharge","airport_fee", TARGET]

present = [c for c in df.columns if c in allowed]
leak_present = [c for c in df.columns if c in leakage and c != TARGET]

# optional: remove near-constant and very-missing among the allowed set
missing = df[present].isna().mean().sort_values(ascending=False)
const_like = [c for c in present if df[c].nunique(dropna=True) <= 1]
very_missing = [c for c in present if missing[c] > 0.40] #40% threshold; tweak

keep = [c for c in present if c not in const_like and c not in very_missing]

print("Keep: ", keep)
print("DROP (leakage): ", leak_present)
print("DROP (constant): ", const_like)
print("DROP (very missing): ", very_missing)

Keep:  ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type', 'extra']
DROP (leakage):  ['fare_amount', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee']
DROP (constant):  []
DROP (very missing):  []


Builds a *clean feature list* and tells you what to keep vs drop, with guards against target leakage and junk columns. 

* TARGET = your label;
- allowed = features you are allowed to use;
- leakage = columns that directly compost `total_amount` (and must not be features)

* present = intersection of allowed with the columns actually in df.
* leak_present = any leakage columns that exist in df (to confirm whether the column is excluded)

It then scores the present features:

missing = percent missing per column,

const_like = columns with ≤1 distinct non-NA value (no signal),

very_missing = features with >40% missing (threshold you can tweak).

keep = present minus (const_like ∪ very_missing). That’s your final whitelist.

In [None]:
import numpy as np
m = df[keep + [TARGET]].copy()
for c in ["tpep_pickup_datetime","tpep_dropoff_datetime"]:
    m[c] = pd.to_datetime(m[c], errors="coerce", utc=True)

m["trip_duration_min"] = (m["tpep_dropoff_datetime"] - m["tpep_pickup_datetime"]).dt.total_seconds()/60
m["pickup_hour"] = m["tpep_pickup_datetime"].dt.hour.astype("Int8")
m["pickup_dow"]  = m["tpep_pickup_datetime"].dt.dayofweek.astype("Int8")
m["pickup_month"]  = m["tpep_pickup_datetime"].dt.month.astype("Int8")


  m[c] = pd.to_datetime(m[c], errors="coerce", utc=True)
  m[c] = pd.to_datetime(m[c], errors="coerce", utc=True)


In [9]:
print("df: ", df.shape)

df:  (39656098, 19)


In [10]:
print("m. ", m.shape)

m.  (39656098, 13)


# Sanity + fix dtype

In [11]:
print("df:", df.shape)                 # should be ~ (39656098, 19)
print("m:",  m.shape)                  # should be (same rows, ~13 cols)

# If m shows 0 rows here, your df got filtered earlier — re-read the CSV
# df = pd.read_csv("nytaxi2022.csv", low_memory=False)

# Make sure numerics are truly numeric (bad strings -> NaN)
for c in ["passenger_count","trip_distance","extra","total_amount"]:
    m[c] = pd.to_numeric(m[c], errors="coerce")

# Keep engineered ints as floats so sklearn sees NaN correctly
m["trip_duration_min"] = m["trip_duration_min"].astype("float32")
m["pickup_hour"]       = m["pickup_hour"].astype("float32")
m["pickup_dow"]        = m["pickup_dow"].astype("float32")

# Cast categoricals
for c in ["RatecodeID","PULocationID","DOLocationID","payment_type"]:
    m[c] = m[c].astype("category")

df: (39656098, 19)
m: (39656098, 13)


In [23]:
m.to_parquet("data/nytaxi2022_cleaned.parquet", index=False)

In [22]:
m.to_csv("data/nytaxi2022_cleaned.csv", index=False)

# random train/test split and keep all learned preprocessing inside a pipeline

In [None]:
from sklearn.model_selection import train_test_split

NUM = ["passenger_count","trip_distance","extra","trip_duration_min","pickup_hour","pickup_dow"]
CAT = ["RatecodeID","PULocationID","DOLocationID","payment_type"]
TARGET = "total_amount"

X, y = m[NUM + CAT], m[TARGET].astype("float32")
print("final X,y:", X.shape, y.shape)   # must show non-zero rows

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.30, random_state=42)

final X,y: (39656098, 10) (39656098,)


In [13]:
len(Xtr), len(Xte), len(ytr), len(yte)
# expected ≈ (27,759,268, 11,896,830, 27,759,268, 11,896,830)

(27759268, 11896830, 27759268, 11896830)

In [18]:
Xtr.isna().sum()

passenger_count      958331
trip_distance             0
extra                     0
trip_duration_min         0
pickup_hour               0
pickup_dow                0
RatecodeID           958331
PULocationID              0
DOLocationID              0
payment_type              0
dtype: int64

### Baseline Ridge-Regression 

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# quick knobs for speed; set to None to use all rows
N_TRAIN = 1_000_000
N_TEST  = 500_000

# preprocessing (fit on train only inside pipeline)
pre_ridge = ColumnTransformer([
    ("num", Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc",  MaxAbsScaler())          # keeps sparse matrix fast
    ]), NUM),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh",  OneHotEncoder(handle_unknown="ignore", min_frequency=5))
    ]), CAT),
], sparse_threshold=1.0)

ridge = Pipeline([
    ("pre", pre_ridge),
    ("est", Ridge(alpha=1.0, solver="sag", max_iter=2000, random_state=42))
])

# sample for faster iteration
idx_tr = Xtr.sample(n=N_TRAIN, random_state=42).index if N_TRAIN else Xtr.index
idx_te = Xte.sample(n=N_TEST,  random_state=42).index if N_TEST  else Xte.index

ridge.fit(Xtr.loc[idx_tr], ytr.loc[idx_tr])
pred = ridge.predict(Xte.loc[idx_te])

mae = mean_absolute_error(yte.loc[idx_te], pred)
try:
    rmse = mean_squared_error(yte.loc[idx_te], pred, squared=False)  # newer sklearn
except TypeError:
    rmse = np.sqrt(mean_squared_error(yte.loc[idx_te], pred))        # older sklearn
r2  = r2_score(yte.loc[idx_te], pred)
print(f"[Ridge] MAE={mae:.3f}  RMSE={rmse:.3f}  R2={r2:.4f}")


[Ridge] MAE=5.872  RMSE=16.987  R2=0.4314


### Baseline Random Forest

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# reuse N_TRAIN / N_TEST from above

pre_tree = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), NUM),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh",  OneHotEncoder(handle_unknown="ignore", min_frequency=20))  # collapse rares a bit more
    ]), CAT),
])

rf = Pipeline([
    ("pre", pre_tree),
    ("est", RandomForestRegressor(
        n_estimators=200,
        max_depth=16,
        min_samples_leaf=10,
        max_features=0.5,
        n_jobs=-1,
        random_state=42,
        verbose=1
    ))
])

idx_tr = Xtr.sample(n=N_TRAIN, random_state=42).index if N_TRAIN else Xtr.index
idx_te = Xte.sample(n=N_TEST,  random_state=42).index if N_TEST  else Xte.index

rf.fit(Xtr.loc[idx_tr], ytr.loc[idx_tr])
pred = rf.predict(Xte.loc[idx_te])

mae = mean_absolute_error(yte.loc[idx_te], pred)
try:
    rmse = mean_squared_error(yte.loc[idx_te], pred, squared=False)
except TypeError:
    rmse = np.sqrt(mean_squared_error(yte.loc[idx_te], pred))
r2  = r2_score(yte.loc[idx_te], pred)
print(f"[RandomForest] MAE={mae:.3f}  RMSE={rmse:.3f}  R2={r2:.4f}")


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 10.8min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.7s finished


[RandomForest] MAE=1.740  RMSE=14.881  R2=0.5637


In [16]:
# --- Step 1: grow trees (keep other params fixed) ---
from time import perf_counter
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Assumes you already have: Xtr, Xte, ytr, yte, NUM, CAT

# Build preprocessor if not already defined
if 'pre_tree' not in globals():
    pre_tree = ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), NUM),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("oh",  OneHotEncoder(handle_unknown="ignore", min_frequency=20))
        ]), CAT),
    ])

# Use a sample for speed while testing (adjust or set to full later)
N_TRAIN = 1_000_000
N_TEST  = 500_000
if 'idx_tr' not in globals():
    idx_tr = Xtr.sample(n=min(N_TRAIN, len(Xtr)), random_state=42).index
if 'idx_te' not in globals():
    idx_te = Xte.sample(n=min(N_TEST, len(Xte)), random_state=42).index

def _rmse(y_true, y_pred):
    try:
        return mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        return np.sqrt(mean_squared_error(y_true, y_pred))

def try_n(n_estimators):
    model = Pipeline([
        ("pre", pre_tree),
        ("est", RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=16,
            min_samples_leaf=10,
            max_features=0.5,
            bootstrap=True, oob_score=True,
            n_jobs=-1, random_state=42, verbose=0
        ))
    ])
    t0 = perf_counter(); model.fit(Xtr.loc[idx_tr], ytr.loc[idx_tr]); fit_s = perf_counter() - t0
    pred = model.predict(Xte.loc[idx_te])
    oob = model.named_steps["est"].oob_score_
    r2  = r2_score(yte.loc[idx_te], pred)
    mae = mean_absolute_error(yte.loc[idx_te], pred)
    rmse = _rmse(yte.loc[idx_te], pred)
    print(f"n={n_estimators:>4} | OOB R2={oob:.4f} | Test R2={r2:.4f} | MAE={mae:.3f} | RMSE={rmse:.3f} | fit={fit_s:.1f}s")
    return {"n_estimators": n_estimators, "oob_r2": oob, "test_r2": r2,
            "test_mae": mae, "test_rmse": rmse, "fit_s": round(fit_s,2), "model": model}

results = [try_n(n) for n in (200, 400, 800)]

# Keep the best by Test R²
best = max(results, key=lambda d: d["test_r2"])
best_n = best["n_estimators"]; best_model = best["model"]
print(f"\nBest so far: n={best_n} | Test R2={best['test_r2']:.4f} | OOB R2={best['oob_r2']:.4f}")


n= 200 | OOB R2=0.7778 | Test R2=0.5637 | MAE=1.740 | RMSE=14.881 | fit=722.0s
n= 400 | OOB R2=0.7778 | Test R2=0.5637 | MAE=1.738 | RMSE=14.880 | fit=1333.1s
n= 800 | OOB R2=0.7781 | Test R2=0.5638 | MAE=1.737 | RMSE=14.878 | fit=2834.6s

Best so far: n=800 | Test R2=0.5638 | OOB R2=0.7781


Changing the number of tree (n_estimators) and keep everything else fixed to see if more trees improve accuracy (OOB/Test R^2) and by how much.

Test R² barely changes from 200→800 trees (0.5637 → 0.5638), while fit time ~4×.

OOB R² is also flat (~0.778).

➡️ Already variance-limited; adding trees won’t help much. keep 200 (for speed) or 400 (final) and tune leaf size / depth / max_features next.

In [17]:
# uses: pre_tree, Xtr, Xte, ytr, yte
from time import perf_counter
import numpy as np, pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# sample for speed (reuse your idx_tr/idx_te if already defined)
N_TRAIN, N_TEST = 1_000_000, 500_000
if 'idx_tr' not in globals(): idx_tr = Xtr.sample(n=min(N_TRAIN, len(Xtr)), random_state=42).index
if 'idx_te' not in globals(): idx_te = Xte.sample(n=min(N_TEST, len(Xte)),  random_state=42).index

def _rmse(a,b):
    try: from sklearn.metrics import mean_squared_error as mse; return mse(a,b,squared=False)
    except TypeError: from sklearn.metrics import mean_squared_error as mse; import numpy as np; return np.sqrt(mse(a,b))

def try_cfg(max_depth, min_leaf, max_feats, n_estimators=400):
    model = Pipeline([
        ("pre", pre_tree),
        ("est", RandomForestRegressor(
            n_estimators=n_estimators, max_depth=max_depth,
            min_samples_leaf=min_leaf, max_features=max_feats,
            bootstrap=True, oob_score=True, n_jobs=-1, random_state=42, verbose=0
        ))
    ])
    t0=perf_counter(); model.fit(Xtr.loc[idx_tr], ytr.loc[idx_tr]); t=perf_counter()-t0
    pred = model.predict(Xte.loc[idx_te])
    return {
        "max_depth":max_depth, "min_leaf":min_leaf, "max_feats":max_feats,
        "oob_r2":model.named_steps["est"].oob_score_,
        "test_r2":r2_score(yte.loc[idx_te], pred),
        "mae":mean_absolute_error(yte.loc[idx_te], pred),
        "rmse":_rmse(yte.loc[idx_te], pred),
        "fit_s":round(t,1), "model":model
    }

cands = [(16,10,0.5),(16,5,0.5),(20,10,0.5),(20,5,0.5),(16,10,0.8),(20,5,0.8)]
rows = [try_cfg(*p) for p in cands]
pd.DataFrame([{k:v for k,v in r.items() if k!="model"} for r in rows]).sort_values("test_r2", ascending=False)
best = max(rows, key=lambda r: r["test_r2"])
best["max_depth"], best["min_leaf"], best["max_feats"], best["test_r2"], best["oob_r2"]


(20, 5, 0.5, 0.5652632225083358, 0.7792799752766375)

The code above is a tiny hyperparameter sweep for Random Forest that keep preprocessing inside the pipeline (no leakage), uses a big but manageable sample for speed, and reports both OOB (bagging's internal estimate) and held-out Test metrics to choose the best bias/variance trade-off before training anything huge

`max_depth` => deeper trees fit more detail (lower bias, higher variance (slower))
`min_sample_leaf` => Smaller leaves allow purer leaves (lower bias, higher variance)
`max_features` => more features considered per split (trees are more correlated and complex) -> lower bias, higher variance (and slower). Smaller values de-correlate trees and can improve generalization

`test_r2` => r^2 on your held-out test slice. Good standard - optimize for this
`oob_r^2` => out-of-bag computed R^2 computed from the training bootstrap samples. It's a fast, internal estimate; good for trend checking, but the test R^2 win when results disagree 

`fit_s` => wall-clock training time. Use it as a tiebreaker-prefer the fastest config within a hair of the best R^2

In [None]:
# pip install -U lightgbm   # macOS once: brew install libomp
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

NUM = ["passenger_count","trip_distance","extra","trip_duration_min","pickup_hour","pickup_dow"]
CAT = ["RatecodeID","PULocationID","DOLocationID","payment_type"]
FEATS = NUM + CAT

# Dtypes (very important for LGBM)
Xtr = Xtr.copy(); Xte = Xte.copy()
for c in CAT:
    Xtr[c] = Xtr[c].astype("category")
    Xte[c] = Xte[c].astype("category")
for c in NUM:
    Xtr[c] = pd.to_numeric(Xtr[c], errors="coerce")
    Xte[c] = pd.to_numeric(Xte[c], errors="coerce")

# Build train/valid from TRAIN ONLY
N_TRAIN, N_VALID = 1_000_000, 300_000
chunk = Xtr.sample(n=min(N_TRAIN+N_VALID, len(Xtr)), random_state=42)
idx_tr = chunk.sample(n=min(N_TRAIN, len(chunk)), random_state=42).index
idx_va = chunk.drop(idx_tr).index

train_ds = lgb.Dataset(Xtr.loc[idx_tr, FEATS], label=ytr.loc[idx_tr], categorical_feature=CAT, free_raw_data=False)
valid_ds = lgb.Dataset(Xtr.loc[idx_va, FEATS], label=ytr.loc[idx_va], categorical_feature=CAT, free_raw_data=False)

params = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    num_leaves=64,
    feature_fraction=0.8,   # colsample_bytree
    bagging_fraction=0.8,   # subsample
    bagging_freq=1,
    min_data_in_leaf=100,
    verbose=-1
)

# NOTE: use callbacks for early stopping & logging
gbm = lgb.train(
    params,
    train_ds,
    num_boost_round=5000,
    valid_sets=[valid_ds],
    valid_names=["valid"],
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)]
)

print("Best iteration:", gbm.best_iteration)
print("Best valid RMSE:", gbm.best_score["valid"]["rmse"])

# Evaluate on the FULL TEST set
pred = gbm.predict(Xte[FEATS], num_iteration=gbm.best_iteration)
try:
    rmse = mean_squared_error(yte, pred, squared=False)
except TypeError:
    rmse = np.sqrt(mean_squared_error(yte, pred))
mae = mean_absolute_error(yte, pred)
r2  = r2_score(yte, pred)
print(f"[LightGBM] MAE={mae:.3f}  RMSE={rmse:.3f}  R2={r2:.4f}")

Training until validation scores don't improve for 100 rounds
[100]	valid's rmse: 5.91303
[200]	valid's rmse: 5.90146
Early stopping, best iteration is:
[140]	valid's rmse: 5.88559
Best iteration: 140
Best valid RMSE: 5.885585734779989
[LightGBM] MAE=1.732  RMSE=116.728  R2=0.0206


In [28]:
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

def rmse_safe(y_true, y_pred):
    try: return mean_squared_error(y_true, y_pred, squared=False)
    except TypeError: return (mean_squared_error(y_true, y_pred))**0.5

# pick an upper cap *from training only* (no leakage)
cap = ytr.quantile(0.999)  # e.g., ~ $200–$300; adjust if you like

# evaluate on a trimmed TEST (same rule applied)
mask_trim = (yte >= 0) & (yte <= cap)
print("Trim keep %:", float(mask_trim.mean())*100)

pred_trim = pred[mask_trim]
yte_trim  = yte[mask_trim]

print("MAE (trimmed):", mean_absolute_error(yte_trim, pred_trim))
print("RMSE(trimmed):", rmse_safe(yte_trim, pred_trim))
print("R2  (trimmed):", r2_score(yte_trim, pred_trim))

# optional: clipped version instead of dropping
yte_clip  = yte.clip(lower=0, upper=cap)
pred_clip = np.clip(pred, 0, cap)
print("RMSE(clipped):", rmse_safe(yte_clip, pred_clip))
print("R2  (clipped):", r2_score(yte_clip, pred_clip))


Trim keep %: 99.12651521455716
MAE (trimmed): 15.526301938516246
RMSE(trimmed): 24.320588348923422
R2  (trimmed): -0.9449624630175439
RMSE(clipped): 24.42364182107185
R2  (clipped): -0.8489099367166202


In [29]:
# from TRAIN ONLY, choose a target cap (no leakage)
y_cap = ytr.quantile(0.999)

mask_tr = (
    (Xtr["trip_distance"] > 0) &
    (Xtr["trip_duration_min"] > 0) & (Xtr["trip_duration_min"] <= 180) &
    (ytr >= 0) & (ytr <= y_cap)
)
mask_te = (
    (Xte["trip_distance"] > 0) &
    (Xte["trip_duration_min"] > 0) & (Xte["trip_duration_min"] <= 180) &
    (yte >= 0) & (yte <= y_cap)
)

Xtr_clean, ytr_clean = Xtr.loc[mask_tr], ytr.loc[mask_tr]
Xte_clean, yte_clean = Xte.loc[mask_te], yte.loc[mask_te]


In [30]:
# assume y_cap was computed from ytr, e.g. y_cap = ytr.quantile(0.999)
mask_tr = ((Xtr["trip_distance"]>0) &
           (Xtr["trip_duration_min"]>0) & (Xtr["trip_duration_min"]<=180) &
           (ytr>=0) & (ytr<=y_cap))
mask_te = ((Xte["trip_distance"]>0) &
           (Xte["trip_duration_min"]>0) & (Xte["trip_duration_min"]<=180) &
           (yte>=0) & (yte<=y_cap))

Xtr_c, ytr_c = Xtr.loc[mask_tr].copy(), ytr.loc[mask_tr].copy()
Xte_c, yte_c = Xte.loc[mask_te].copy(), yte.loc[mask_te].copy()


In [31]:
import pandas as pd, numpy as np, lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

NUM = ["passenger_count","trip_distance","extra","trip_duration_min","pickup_hour","pickup_dow"]
CAT = ["RatecodeID","PULocationID","DOLocationID","payment_type"]
FEATS = NUM + CAT

# cast/align categories
for c in CAT:
    Xtr_c[c] = Xtr_c[c].astype("category")
    Xte_c[c] = Xte_c[c].astype("category").cat.set_categories(Xtr_c[c].cat.categories)
for c in NUM:
    Xtr_c[c] = pd.to_numeric(Xtr_c[c], errors="coerce")
    Xte_c[c] = pd.to_numeric(Xte_c[c], errors="coerce")

# small train/valid split from TRAIN ONLY
N_TR, N_VA = 1_000_000, 300_000
chunk = Xtr_c.sample(min(N_TR+N_VA, len(Xtr_c)), random_state=42)
tr_idx = chunk.sample(min(N_TR, len(chunk)), random_state=42).index
va_idx = chunk.drop(tr_idx).index

dtr = lgb.Dataset(Xtr_c.loc[tr_idx, FEATS], label=ytr_c.loc[tr_idx], categorical_feature=CAT)
dva = lgb.Dataset(Xtr_c.loc[va_idx, FEATS], label=ytr_c.loc[va_idx], categorical_feature=CAT)

params = dict(objective="regression", metric="rmse",
              learning_rate=0.05, num_leaves=64,
              feature_fraction=0.8, bagging_fraction=0.8, bagging_freq=1,
              min_data_in_leaf=100, verbose=-1)

gbm = lgb.train(params, dtr, num_boost_round=5000,
                valid_sets=[dva], valid_names=["valid"],
                callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)])

# test metrics
def rmse_safe(y, p):
    try: return mean_squared_error(y, p, squared=False)
    except TypeError: return (mean_squared_error(y, p))**0.5

pred = gbm.predict(Xte_c[FEATS], num_iteration=gbm.best_iteration)
print("MAE:", mean_absolute_error(yte_c, pred))
print("RMSE:", rmse_safe(yte_c, pred))
print("R2 :",  r2_score(yte_c, pred))


Training until validation scores don't improve for 100 rounds
[100]	valid's rmse: 2.6478
[200]	valid's rmse: 2.58561
[300]	valid's rmse: 2.56948
[400]	valid's rmse: 2.56854
[500]	valid's rmse: 2.56437
[600]	valid's rmse: 2.56342
[700]	valid's rmse: 2.56295
[800]	valid's rmse: 2.56277
Early stopping, best iteration is:
[741]	valid's rmse: 2.56175
MAE: 1.7695322221889778
RMSE: 3.8811028946485253
R2 : 0.9486536236439578


In [33]:
import pathlib, joblib, time
pathlib.Path("models").mkdir(exist_ok=True)
model_path = f"models/lightgbm_total_amount_{int(time.time())}.txt"
gbm.save_model(model_path, num_iteration=gbm.best_iteration)

meta = {
    "best_iter": int(gbm.best_iteration),
    "metrics": {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)},
    "features": FEATS,
    "cat_levels": {c: list(Xtr_c[c].cat.categories) for c in CAT} if 'Xtr_c' in globals() else {c: list(Xtr[c].cat.categories) for c in CAT}
}
joblib.dump(meta, "models/lightgbm_meta.joblib")
print("Saved:", model_path)

Saved: models/lightgbm_total_amount_1758450341.txt


In [34]:
# ==== Ridge + RandomForest (fast) on CLEANED data, with printed results ====
# Requires: Xtr_c, ytr_c, Xte_c, yte_c already built; NUM, CAT defined.

import time, numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- features ---
NUM = ["passenger_count","trip_distance","extra","trip_duration_min","pickup_hour","pickup_dow"]
CAT = ["RatecodeID","PULocationID","DOLocationID","payment_type"]
FEATS = NUM + CAT

# version-safe RMSE
def rmse_safe(y_true, y_pred):
    try:
        return mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        return (mean_squared_error(y_true, y_pred))**0.5

# unified preprocessor (no leakage)
pre = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), NUM),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh",  OneHotEncoder(handle_unknown="ignore", min_frequency=50))
    ]), CAT),
], sparse_threshold=1.0)

results = []

print(">>> Data shapes (cleaned):")
print("Xtr_c:", Xtr_c[FEATS].shape, "| Xte_c:", Xte_c[FEATS].shape, "\n")

# ---- 1) Ridge baseline ----
ridge = Pipeline([("pre", pre), ("est", Ridge(alpha=1.0, random_state=42))])
t0 = time.perf_counter()
ridge.fit(Xtr_c[FEATS], ytr_c)
t_ridge = time.perf_counter() - t0
pred_r = ridge.predict(Xte_c[FEATS])

ridge_mae  = mean_absolute_error(yte_c, pred_r)
ridge_rmse = rmse_safe(yte_c, pred_r)
ridge_r2   = r2_score(yte_c, pred_r)

print("=== Ridge(alpha=1.0) ===")
print(f"Fit time: {t_ridge:.1f}s")
print(f"MAE:  {ridge_mae:.4f}")
print(f"RMSE: {ridge_rmse:.4f}")
print(f"R2:   {ridge_r2:.4f}\n")

results.append(dict(
    model="Ridge(alpha=1.0)",
    MAE=ridge_mae, RMSE=ridge_rmse, R2=ridge_r2, fit_s=round(t_ridge,1)
))

# ---- 2) Random Forest (fast, subsampled) ----
N_TRAIN_RF = min(2_000_000, len(Xtr_c))  # adjust down if you want it faster
idx_rf = Xtr_c.sample(n=N_TRAIN_RF, random_state=42).index

rf = Pipeline([
    ("pre", pre),
    ("est", RandomForestRegressor(
        n_estimators=400, max_depth=16, min_samples_leaf=10, max_features=0.5,
        n_jobs=-1, random_state=42, oob_score=True, verbose=1
    ))
])
t0 = time.perf_counter()
rf.fit(Xtr_c.loc[idx_rf, FEATS], ytr_c.loc[idx_rf])
t_rf = time.perf_counter() - t0
pred_rf = rf.predict(Xte_c[FEATS])

rf_mae  = mean_absolute_error(yte_c, pred_rf)
rf_rmse = rmse_safe(yte_c, pred_rf)
rf_r2   = r2_score(yte_c, pred_rf)
rf_oob  = rf.named_steps["est"].oob_score_

print("=== RandomForest (400 trees, depth=16, leaf=10, max_features=0.5; ~2M train) ===")
print(f"Fit time: {t_rf:.1f}s | OOB R2: {rf_oob:.4f}")
print(f"MAE:  {rf_mae:.4f}")
print(f"RMSE: {rf_rmse:.4f}")
print(f"R2:   {rf_r2:.4f}\n")

results.append(dict(
    model="RandomForest(400,d16,leaf10,mf=0.5; ~2M train)",
    MAE=rf_mae, RMSE=rf_rmse, R2=rf_r2, OOB_R2=rf_oob, fit_s=round(t_rf,1)
))

# ---- 3) (optional) LightGBM row if you've trained `gbm` already ----
try:
    import lightgbm as lgb  # just to confirm availability
    # ensure categories align for test (harmless if already aligned)
    for c in CAT:
        if Xtr_c[c].dtype.name != "category":
            Xtr_c[c] = Xtr_c[c].astype("category")
        Xte_c[c] = Xte_c[c].astype("category").cat.set_categories(Xtr_c[c].cat.categories)

    pred_lgb = gbm.predict(Xte_c[FEATS], num_iteration=getattr(gbm, "best_iteration", None))
    lgb_mae  = mean_absolute_error(yte_c, pred_lgb)
    lgb_rmse = rmse_safe(yte_c, pred_lgb)
    lgb_r2   = r2_score(yte_c, pred_lgb)

    print(f"=== LightGBM (best_iter={getattr(gbm,'best_iteration', 'NA')}) ===")
    print(f"MAE:  {lgb_mae:.4f}")
    print(f"RMSE: {lgb_rmse:.4f}")
    print(f"R2:   {lgb_r2:.4f}\n")

    results.append(dict(
        model=f"LightGBM(best_iter={getattr(gbm,'best_iteration','NA')})",
        MAE=lgb_mae, RMSE=lgb_rmse, R2=lgb_r2, fit_s=np.nan
    ))
except Exception as _e:
    print("(LightGBM row skipped — no `gbm` in memory or prediction failed.)\n")

# ---- Summary table ----
res_df = pd.DataFrame(results).sort_values("R2", ascending=False)
print("=== Summary (sorted by R2) ===")
print(res_df.to_string(index=False))
try:
    display(res_df)  # pretty view in notebooks
except Exception:
    pass


>>> Data shapes (cleaned):
Xtr_c: (27184185, 10) | Xte_c: (11580608, 10) 

=== Ridge(alpha=1.0) ===
Fit time: 96.5s
MAE:  3.3040
RMSE: 6.2279
R2:   0.8678



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 85.2min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    7.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   33.1s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:  1.2min finished


=== RandomForest (400 trees, depth=16, leaf=10, max_features=0.5; ~2M train) ===
Fit time: 5177.0s | OOB R2: 0.9706
MAE:  1.8432
RMSE: 3.9636
R2:   0.9464

=== LightGBM (best_iter=741) ===
MAE:  1.7695
RMSE: 3.8811
R2:   0.9487

=== Summary (sorted by R2) ===
                                         model      MAE     RMSE       R2  fit_s   OOB_R2
                       LightGBM(best_iter=741) 1.769532 3.881103 0.948654    NaN      NaN
RandomForest(400,d16,leaf10,mf=0.5; ~2M train) 1.843154 3.963584 0.946448 5177.0 0.970571
                              Ridge(alpha=1.0) 3.303989 6.227938 0.867783   96.5      NaN


Unnamed: 0,model,MAE,RMSE,R2,fit_s,OOB_R2
2,LightGBM(best_iter=741),1.769532,3.881103,0.948654,,
1,"RandomForest(400,d16,leaf10,mf=0.5; ~2M train)",1.843154,3.963584,0.946448,5177.0,0.970571
0,Ridge(alpha=1.0),3.303989,6.227938,0.867783,96.5,


1. LightGBM
2. Random Forest
3. Ridge

----------------------------------------------------------------------------------------------------------------
                                            Ignore the bottom                                                   
----------------------------------------------------------------------------------------------------------------

In [25]:
TARGET = "total_amount"
NUM = ["passenger_count","trip_distance","extra","trip_duration_min","pickup_hour","pickup_dow"]
CAT = ["RatecodeID","PULocationID","DOLocationID","payment_type"]

ms = m.sort_values("tpep_pickup_datetime")
cut = int(len(ms)*0.70)
train, test = ms.iloc[:cut], ms.iloc[cut:]

Xtr, ytr = train[NUM + CAT], train[TARGET].astype("float32")
Xte, yte = test [NUM + CAT], test [TARGET].astype("float32")

len(train), len(test)


(27759268, 11896830)

In [16]:
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_dt

# required columns present?
req = ["tpep_pickup_datetime","tpep_dropoff_datetime",
       "trip_duration_min","pickup_hour","pickup_dow"]
print("all present:", set(req).issubset(m.columns))

# dtypes look right?
print("pickup dtype:", m["tpep_pickup_datetime"].dtype, "| dropoff dtype:", m["tpep_dropoff_datetime"].dtype)
print("duration dtype:", m["trip_duration_min"].dtype, "| hour dtype:", m["pickup_hour"].dtype)

# NAs
m[req].isna().sum()


all present: True
pickup dtype: datetime64[ns, UTC] | dropoff dtype: datetime64[ns, UTC]
duration dtype: float64 | hour dtype: Int8


tpep_pickup_datetime     0
tpep_dropoff_datetime    0
trip_duration_min        0
pickup_hour              0
pickup_dow               0
dtype: int64

Time-based split (avoid leakage)

In [17]:
TARGET = "total_amount"
NUM = ["passenger_count","trip_distance","extra","trip_duration_min","pickup_hour","pickup_dow"]
CAT = ["RatecodeID","PULocationID","DOLocationID","payment_type"]

ms = m.sort_values("tpep_pickup_datetime")
cut = int(len(ms)*0.70)
train, test = ms.iloc[:cut], ms.iloc[cut:]

Xtr, ytr = train[NUM + CAT], train[TARGET].astype("float32")
Xte, yte = test [NUM + CAT], test [TARGET].astype("float32")

len(train), len(test)


(27759268, 11896830)

In [24]:
# 1) Check that our own RMSE on the VALIDATION subset matches LightGBM's best_score
val_pred = gbm.predict(Xtr.loc[idx_va, FEATS], num_iteration=gbm.best_iteration)
print("Our valid RMSE:", rmse_safe(ytr.loc[idx_va], val_pred))
print("LGBM valid RMSE:", gbm.best_score["valid"]["rmse"])

# 2) Predict on TEST and inspect shapes and ranges
pred = gbm.predict(Xte[FEATS], num_iteration=gbm.best_iteration)
print("Shapes (pred, yte):", pred.shape, yte.shape)
print("pred min/median/max:", float(np.nanmin(pred)), float(np.nanmedian(pred)), float(np.nanmax(pred)))
print("yte  min/median/max:", float(np.nanmin(yte)),  float(np.nanmedian(yte)),  float(np.nanmax(yte)))
print("NaNs (pred, yte):", int(np.isnan(pred).sum()), int(pd.isna(yte).sum()))

# 3) Final metrics on TEST
print("MAE:", mean_absolute_error(yte, pred))
print("RMSE:", rmse_safe(yte, pred))
print("R2:",  r2_score(yte, pred))


Our valid RMSE: 5.885585734779989
LGBM valid RMSE: 5.885585734779989
Shapes (pred, yte): (11896830,) (11896830,)
pred min/median/max: -86.56216676212419 16.04786345244758 292.2178636107599
yte  min/median/max: -1301.8499755859375 15.960000038146973 401095.625
NaNs (pred, yte): 0 0
MAE: 1.732025154858528
RMSE: 116.7277107813011
R2: 0.020595296317712064


* Splitting by time to match how the model will be used in real lift: trained on the past, predicts the future.

* A random split can quietly inflate scores for time-dependent data like taxi trips

Why time-based split?
* Realistic evaluation: Testing on later periods simulates that
* Avoid look-ahead leakage: Global steps (impute medians, scaling, ont-hot categories) must be learned on train only. If future rows is mix into training data, future information might be leak
* Handles temporal correlation: Trips close in time share weather, traffic, events, policy changes. Random splits sprinkle near-identical conditions across train/test -> over-optimistic metrics
* Detects drift: Fare/behavior shift by month/season. A chronological split shows how well the moel generalizes to a new regime

In [21]:
import sklearn, sys, inspect
print("sklearn:", sklearn.__version__)
print("python:", sys.executable)
inspect.signature(__import__("sklearn.metrics").metrics.mean_squared_error)


sklearn: 1.7.2
python: /Users/chrisnjw/.pyenv/versions/myproj-311/bin/python


<Signature (y_true, y_pred, *, sample_weight=None, multioutput='uniform_average')>

In [22]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(yte, pred)

try:
    rmse = mean_squared_error(yte, pred, squared=False)  # new API
except TypeError:
    rmse = np.sqrt(mean_squared_error(yte, pred))        # fallback for old sklearn

r2  = r2_score(yte, pred)
(mae, rmse, r2)

(6.399168288871747, np.float64(55.5506942344371), 0.07257096419322462)

In [23]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

baseline = np.full_like(yte, fill_value=float(ytr.mean()))
b_mae = mean_absolute_error(yte, baseline)
try:
    b_rmse = mean_squared_error(yte, baseline, squared=False)
except TypeError:
    b_rmse = np.sqrt(mean_squared_error(yte, baseline))

mae, rmse, r2, b_mae, b_rmse

(6.399168288871747,
 np.float64(55.5506942344371),
 0.07257096419322462,
 11.60715389251709,
 np.float64(57.711281748161035))

Baseline (Ridge) with proper preprocessing in a pipleline

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

TARGET = "total_amount"
NUM = ["passenger_count","trip_distance","extra","trip_duration_min","pickup_hour","pickup_dow"]
CAT = ["RatecodeID","PULocationID","DOLocationID","payment_type"]

Xtr, ytr = train[NUM+CAT], train[TARGET].astype("float32")
Xte, yte = test [NUM+CAT], test [TARGET].astype("float32")

pre_tree = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), NUM),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh",  OneHotEncoder(handle_unknown="ignore", min_frequency=5))  # collapse rare cats
    ]), CAT),
])

rf = Pipeline([
    ("pre", pre_tree),
    ("est", RandomForestRegressor(
        n_estimators=300,
        max_depth=20,            # cap depth for speed/overfit control
        min_samples_leaf=5,
        n_jobs=-1,
        random_state=42
    ))
])

# warm up on a sample; then remove these two lines for full fit
idx = Xtr.sample(n=1_000_000, random_state=42).index
rf.fit(Xtr.loc[idx], ytr.loc[idx])

# quick eval on a slice; then remove head(...) for full eval
pred = rf.predict(Xte.head(500_000))

# metrics (robust to sklearn version)
mae = mean_absolute_error(yte.head(500_000), pred)
try:
    rmse = mean_squared_error(yte.head(500_000), pred, squared=False)
except TypeError:
    rmse = np.sqrt(mean_squared_error(yte.head(500_000), pred))
r2 = r2_score(yte.head(500_000), pred)
(mae, rmse, r2)


(1.8779829572318527, np.float64(6.585973441997223), 0.8788138401564854)

y = dollar fare I am predicting
X = numeric + categorical predictors based on my time-based split

* Train 300 trees, each grown (up to depth 20) on the 1M training rows after OHE 

MAE = 1.88 dollars
RMSE = 6.59
R^2 = 0.879