# Model Training — Recycling Rate (%)

Goal: Train a robust regressor with categorical-heavy data.
- Recreate minimal FE so this notebook is standalone
- Baselines (Dummy, RandomForest), then CatBoost (categorical-native)
- Stratified holdout by target bins for stable evaluation
- Save best model + metrics


In [None]:
# setup
import sys, os, warnings
warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.abspath("../src"))

from data.preprocess import load_data
import pandas as pd
import numpy as np

# load RAW
df = load_data()  # uses  preprocess.py logic (relative, abs, else URL)

# minimal FE (safe to re-run)
if "Latitude" not in df.columns:
    latlon = df["Landfill Location (Lat, Long)"].str.split(",", n=1, expand=True)
    df["Latitude"]  = latlon[0].astype(float)
    df["Longitude"] = latlon[1].str.strip().astype(float)

for col in ["Landfill Name","Landfill Location (Lat, Long)"]:
    if col in df.columns:
        df.drop(columns=col, inplace=True)

# simple interaction that helped: Waste × Method
if "Waste_Method" not in df.columns:
    df["Waste_Method"] = df["Waste Type"] + "|" + df["Disposal Method"]

# optional density bin
if "Density_Bin" not in df.columns:
    try:
        df["Density_Bin"] = pd.qcut(df["Population Density (People/km²)"], 4,
                                    labels=["Low","Mid-Low","Mid-High","High"], duplicates="drop")
    except Exception:
        df["Density_Bin"] = pd.cut(df["Population Density (People/km²)"], 4,
                                   labels=["Low","Mid-Low","Mid-High","High"])

target_col = "Recycling Rate (%)"

# feature sets for sklearn (OneHot model)
cat_cols = [c for c in ["City/District","Waste Type","Disposal Method","Density_Bin"] if c in df.columns]
num_cols = [c for c in df.columns if c not in cat_cols + [target_col, "Waste_Method"]]

X = df[cat_cols + num_cols].copy()
y = df[target_col].astype(float).copy()

# stratified split by target bins (stabilizes holdout)
from sklearn.model_selection import train_test_split
y_bins = pd.qcut(y, q=10, duplicates="drop").astype(str)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y_bins)

len(df), len(cat_cols), len(num_cols), X_tr.shape, X_te.shape


[INFO] using relative path


(850, 4, 9, (680, 13), (170, 13))

In [2]:
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

num_pipe = Pipeline([("impute", SimpleImputer(strategy="median"))])
cat_pipe = Pipeline([("impute", SimpleImputer(strategy="most_frequent")),
                     ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))])
pre = ColumnTransformer([("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)])

dummy = Pipeline([("pre", pre), ("model", DummyRegressor(strategy="mean"))])
dummy.fit(X_tr, y_tr)
pred_d = dummy.predict(X_te)

print({"DUMMY_MAE": round(mean_absolute_error(y_te, pred_d),3),
       "DUMMY_RMSE": round(mean_squared_error(y_te, pred_d, squared=False),3),
       "DUMMY_R2": round(r2_score(y_te, pred_d),3)})


{'DUMMY_MAE': np.float64(13.916), 'DUMMY_RMSE': np.float64(16.119), 'DUMMY_R2': -0.0}


In [3]:
import os, joblib
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
pipe = Pipeline([("pre", pre), ("model", rf)])
pipe.fit(X_tr, y_tr)
pred = pipe.predict(X_te)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
rf_metrics = {"MAE": round(mean_absolute_error(y_te, pred),3),
              "RMSE": round(mean_squared_error(y_te, pred, squared=False),3),
              "R2": round(r2_score(y_te, pred),3)}
print(rf_metrics)

os.makedirs("../models", exist_ok=True)
joblib.dump(pipe, "../models/model_rf.pkl")
print("Saved: ../models/model_rf.pkl")


{'MAE': np.float64(14.533), 'RMSE': np.float64(16.765), 'R2': -0.082}
Saved: ../models/model_rf.pkl


In [4]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
import pandas as pd

def make_pipe(model): return Pipeline([("pre", pre), ("model", model)])
kf = KFold(n_splits=5, shuffle=True, random_state=42)

models = {
    "RandomForest": RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1),
    "ExtraTrees":   ExtraTreesRegressor(n_estimators=600, random_state=42, n_jobs=-1),
    "GBR":          GradientBoostingRegressor(random_state=42)
}
rows = []
for name, mdl in models.items():
    p = make_pipe(mdl)
    rmse = -cross_val_score(p, X, y, scoring="neg_root_mean_squared_error", cv=kf, n_jobs=-1).mean()
    mae  = -cross_val_score(p, X, y, scoring="neg_mean_absolute_error", cv=kf, n_jobs=-1).mean()
    rows.append([name, mae, rmse])

cmp = pd.DataFrame(rows, columns=["model","cv_MAE","cv_RMSE"]).sort_values("cv_RMSE")
cmp


Unnamed: 0,model,cv_MAE,cv_RMSE
0,RandomForest,14.439324,16.802474
2,GBR,14.688593,17.219227
1,ExtraTrees,14.717835,17.314825


In [5]:
from sklearn.inspection import permutation_importance
import pandas as pd

orig_feat_names = list(X.columns)  # the exact columns given to the pipeline
r = permutation_importance(pipe, X_te, y_te, n_repeats=10, random_state=42, n_jobs=-1)
imp_df = pd.DataFrame({"feature": orig_feat_names, "importance": r.importances_mean}) \
           .sort_values("importance", ascending=False).head(20)
imp_df


Unnamed: 0,feature,importance
2,Disposal Method,0.011379
9,Landfill Capacity (Tons),0.000744
10,Year,-0.002786
0,City/District,-0.004047
5,Population Density (People/km²),-0.005229
3,Density_Bin,-0.006565
7,Cost of Waste Management (₹/Ton),-0.006762
4,Waste Generated (Tons/Day),-0.008042
12,Longitude,-0.012033
11,Latitude,-0.012631


In [6]:
# If not installed in your venv: uncomment the next line, run once
# %pip install catboost -q

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# CatBoost uses raw categoricals, so include the interaction explicitly
cb_cat_cols = [c for c in ["City/District","Waste Type","Disposal Method","Density_Bin","Waste_Method"] if c in df.columns]
cb_num_keep = ["Waste Generated (Tons/Day)","Population Density (People/km²)","Municipal Efficiency Score (1-10)",
               "Cost of Waste Management (₹/Ton)","Awareness Campaigns Count","Landfill Capacity (Tons)","Year"]
cb_num_keep = [c for c in cb_num_keep if c in df.columns]

X_cb = df[cb_cat_cols + cb_num_keep].copy()
y_cb = df[target_col].astype(float).copy()

# stratified holdout again (same idea)
y_bins_cb = pd.qcut(y_cb, q=10, duplicates="drop").astype(str)
Xtr, Xte, ytr, yte = train_test_split(X_cb, y_cb, test_size=0.2, random_state=42, stratify=y_bins_cb)

train_pool = Pool(Xtr, ytr, cat_features=cb_cat_cols)
valid_pool = Pool(Xte, yte, cat_features=cb_cat_cols)

cb = CatBoostRegressor(
    iterations=2000, learning_rate=0.08, depth=6, l2_leaf_reg=5,
    loss_function="RMSE", eval_metric="RMSE",
    random_seed=42, od_type="Iter", od_wait=120, verbose=False
)
cb.fit(train_pool, eval_set=valid_pool, verbose=False)

pred_cb = cb.predict(valid_pool)
cb_metrics = {"CB_MAE": round(mean_absolute_error(yte, pred_cb),3),
              "CB_RMSE": round(mean_squared_error(yte, pred_cb, squared=False),3),
              "CB_R2": round(r2_score(yte, pred_cb),3)}
print(cb_metrics)

import os
os.makedirs("../models", exist_ok=True)
cb.save_model("../models/model_catboost.cbm")
print("Saved: ../models/model_catboost.cbm")


{'CB_MAE': np.float64(13.892), 'CB_RMSE': np.float64(16.089), 'CB_R2': 0.004}
Saved: ../models/model_catboost.cbm


In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

# use same X_cb / y_cb / cb_cat_cols / cb_num_keep as above
cat_idx = [X_cb.columns.get_loc(c) for c in cb_cat_cols]

# stratify labels must be plain labels (not Interval)
y_bins_all = pd.qcut(y_cb, q=10, duplicates="drop").astype(str)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = [
    {"depth": d, "learning_rate": lr, "l2_leaf_reg": l2, "bagging_temperature": bt}
    for d in [4,6,8]
    for lr in [0.03, 0.06]
    for l2 in [3, 6]
    for bt in [0, 0.5]
]

def cv_rmse(params):
    rmses = []
    for tr_idx, va_idx in skf.split(X_cb, y_bins_all):
        tr_pool = Pool(X_cb.iloc[tr_idx], y_cb.iloc[tr_idx], cat_features=cat_idx)
        va_pool = Pool(X_cb.iloc[va_idx], y_cb.iloc[va_idx], cat_features=cat_idx)
        m = CatBoostRegressor(
            iterations=2000, od_type="Iter", od_wait=120, random_seed=42,
            loss_function="RMSE", eval_metric="RMSE", verbose=False,
            depth=params["depth"], learning_rate=params["learning_rate"],
            l2_leaf_reg=params["l2_leaf_reg"], bagging_temperature=params["bagging_temperature"]
        )
        m.fit(tr_pool, eval_set=va_pool, verbose=False)
        p = m.predict(va_pool)
        rmses.append(mean_squared_error(y_cb.iloc[va_idx], p, squared=False))
    return float(np.mean(rmses)), float(np.std(rmses))

rows = []
for p in grid:
    mean_rmse, sd_rmse = cv_rmse(p)
    rows.append({**p, "cv_rmse": mean_rmse, "cv_sd": sd_rmse})

res_df = pd.DataFrame(rows).sort_values("cv_rmse").reset_index(drop=True)
res_df.head(5)


Unnamed: 0,depth,learning_rate,l2_leaf_reg,bagging_temperature,cv_rmse,cv_sd
0,8,0.03,6,0.0,16.064816,0.10249
1,8,0.03,6,0.5,16.064816,0.10249
2,6,0.03,3,0.5,16.080788,0.088043
3,6,0.03,3,0.0,16.080788,0.088043
4,6,0.06,6,0.5,16.081617,0.098125


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import json, os

best = res_df.iloc[0].to_dict()
print("Best (CV):", best)

# stratified holdout (same as before)
y_bins = pd.qcut(y_cb, q=10, duplicates="drop").astype(str)
Xtr, Xte, ytr, yte = train_test_split(X_cb, y_cb, test_size=0.2, random_state=42, stratify=y_bins)

tr_pool = Pool(Xtr, ytr, cat_features=cat_idx)
te_pool = Pool(Xte, yte, cat_features=cat_idx)

best_cb = CatBoostRegressor(
    iterations=2000, od_type="Iter", od_wait=120, random_seed=42,
    loss_function="RMSE", eval_metric="RMSE", verbose=False,
    depth=int(best["depth"]), learning_rate=float(best["learning_rate"]),
    l2_leaf_reg=float(best["l2_leaf_reg"]), bagging_temperature=float(best["bagging_temperature"])
)
best_cb.fit(tr_pool, eval_set=te_pool, verbose=False)

pred = best_cb.predict(te_pool)
mae  = mean_absolute_error(yte, pred)
rmse = mean_squared_error(yte, pred, squared=False)
r2   = r2_score(yte, pred)

print({"CB_tuned_holdout_MAE": round(mae,3), "CB_tuned_holdout_RMSE": round(rmse,3), "CB_tuned_holdout_R2": round(r2,3)})

os.makedirs("../models", exist_ok=True)
best_cb.save_model("../models/model_best_catboost.cbm")
with open("../models/metrics.json","w") as f:
    json.dump({"model":"catboost","params":best,
               "MAE":float(mae),"RMSE":float(rmse),"R2":float(r2)}, f, indent=2)
print("Saved: ../models/model_best_catboost.cbm and ../models/metrics.json")


Best (CV): {'depth': 8.0, 'learning_rate': 0.03, 'l2_leaf_reg': 6.0, 'bagging_temperature': 0.0, 'cv_rmse': 16.06481644403686, 'cv_sd': 0.10249034777362172}
{'CB_tuned_holdout_MAE': np.float64(13.917), 'CB_tuned_holdout_RMSE': np.float64(16.117), 'CB_tuned_holdout_R2': 0.0}
Saved: ../models/model_best_catboost.cbm and ../models/metrics.json


In [9]:
imp_vals = best_cb.get_feature_importance(tr_pool)
feat_names = list(X_cb.columns)
top = sorted(zip(feat_names, imp_vals), key=lambda x: x[1], reverse=True)[:15]
top


[('Cost of Waste Management (₹/Ton)', np.float64(27.55757047542828)),
 ('Disposal Method', np.float64(26.16968675017754)),
 ('Density_Bin', np.float64(20.42021329794006)),
 ('Municipal Efficiency Score (1-10)', np.float64(14.974423150847173)),
 ('Population Density (People/km²)', np.float64(10.87810632560696)),
 ('City/District', np.float64(0.0)),
 ('Waste Type', np.float64(0.0)),
 ('Waste_Method', np.float64(0.0)),
 ('Waste Generated (Tons/Day)', np.float64(0.0)),
 ('Awareness Campaigns Count', np.float64(0.0)),
 ('Landfill Capacity (Tons)', np.float64(0.0)),
 ('Year', np.float64(0.0))]

In [12]:
# --- K-fold target encoding (no leakage) + CatBoost ---
import numpy as np, pandas as pd, os, json
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor, Pool

target_col = "Recycling Rate (%)"

# ensure interaction present
if "Waste_Method" not in df.columns:
    df["Waste_Method"] = df["Waste Type"] + "|" + df["Disposal Method"]

# categoricals kept raw for CatBoost
cat_cols_cb = [c for c in ["City/District","Waste Type","Disposal Method","Density_Bin","Waste_Method"] if c in df.columns]

# core numerics (keep it simple)
num_keep = [
    "Waste Generated (Tons/Day)","Population Density (People/km²)","Municipal Efficiency Score (1-10)",
    "Cost of Waste Management (₹/Ton)","Awareness Campaigns Count","Landfill Capacity (Tons)","Year"
]
num_keep = [c for c in num_keep if c in df.columns]

X_all = df[cat_cols_cb + num_keep].copy()
y_all = df[target_col].astype(float).copy()

# stratified split on target (stable holdout)
y_bins = pd.qcut(y_all, q=10, duplicates="drop").astype(str)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.2, random_state=42, stratify=y_bins)

# ---- leakage-safe K-fold target encoding on TRAIN only ----
enc_cols = [c for c in ["City/District","Waste Type","Disposal Method","Waste_Method"] if c in X_tr.columns]

def kfold_target_encode(X_train, y_train, X_valid, col, n_splits=5, noise=0.01):
    oof = pd.Series(index=X_train.index, dtype=float)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for tr_idx, va_idx in kf.split(X_train):
        tr_ids = X_train.iloc[tr_idx].index
        va_ids = X_train.iloc[va_idx].index
        means = y_train.loc[tr_ids].groupby(X_train.loc[tr_ids, col]).mean()
        oof.loc[va_ids] = X_train.loc[va_ids, col].map(means)
    global_mean = y_train.mean()
    oof = oof.fillna(global_mean)
    if noise and noise > 0:
        oof = oof * (1 + noise*np.random.randn(len(oof)))
    mapping = y_train.groupby(X_train[col]).mean()
    te_valid = X_valid[col].map(mapping).fillna(global_mean)
    return oof.values, te_valid.values

Xtr = X_tr.copy(); Xte = X_te.copy()
te_feature_names = []
for c in enc_cols:
    tr_vals, te_vals = kfold_target_encode(Xtr, y_tr, Xte, c, n_splits=5, noise=0.01)
    newc = f"TE_{c.replace('/','_').replace(' ','_')}"
    Xtr[newc] = tr_vals
    Xte[newc] = te_vals
    te_feature_names.append(newc)

# cat feature indices (positions of raw categoricals in current dataframe)
cat_idx = [Xtr.columns.get_loc(c) for c in cat_cols_cb]

# --- CatBoost (use tuned params found earlier) ---
best_params = dict(depth=8, learning_rate=0.03, l2_leaf_reg=6, bagging_temperature=0.5)
cb_te = CatBoostRegressor(
    iterations=2000, od_type="Iter", od_wait=120, random_seed=42,
    loss_function="RMSE", eval_metric="RMSE", verbose=False, **best_params
)

train_pool = Pool(Xtr, y_tr, cat_features=cat_idx)
test_pool  = Pool(Xte, y_te, cat_features=cat_idx)

cb_te.fit(train_pool, eval_set=test_pool, verbose=False)
pred = cb_te.predict(test_pool)

mae  = mean_absolute_error(y_te, pred)
rmse = mean_squared_error(y_te, pred, squared=False)
r2   = r2_score(y_te, pred)

print({"CB_TE_holdout_MAE": round(mae,3), "CB_TE_holdout_RMSE": round(rmse,3), "CB_TE_holdout_R2": round(r2,3)})

# save artifacts
os.makedirs("../models", exist_ok=True)
cb_te.save_model("../models/model_best_catboost_te.cbm")
with open("../models/metrics_target_encoding.json","w") as f:
    json.dump({"model":"catboost_te","params":best_params,
               "MAE":float(mae),"RMSE":float(rmse),"R2":float(r2),
               "te_features": te_feature_names}, f, indent=2)
print("Saved: ../models/model_best_catboost_te.cbm and ../models/metrics_target_encoding.json")


{'CB_TE_holdout_MAE': np.float64(13.915), 'CB_TE_holdout_RMSE': np.float64(16.111), 'CB_TE_holdout_R2': 0.001}
Saved: ../models/model_best_catboost_te.cbm and ../models/metrics_target_encoding.json
