## Pipeline + GridSearchCV
<p> Define Column Groups</p>

In [3]:
import os

import pandas as pd

# The dataset location can be overridden with the FOOD_WASTE_DATA_PATH environment
# variable so the notebook also runs on machines other than the original author's;
# when the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "FOOD_WASTE_DATA_PATH",
    r"C:\Users\User\Desktop\Messy-Food-Waste-Prediction\data\cleaned_data\cleaned_food_waste_data.csv",
)
data = pd.read_csv(DATA_PATH)

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Separate the regression target from the raw (not yet preprocessed) features
y = data["food_waste_kg"]
X = data.drop(columns=["food_waste_kg"])

# Column groups: each list is routed to its own preprocessing branch
numerical_cols = [
    "meals_served",
    "kitchen_staff",
    "temperature_C",
    "humidity_percent",
    "past_waste_kg",
]
categorical_cols = ["waste_category", "special_event"]
ordinal_col = ["staff_experience"]

# Category order handed to the ordinal encoder (one inner list per ordinal column)
# NOTE(review): "Unknown" is listed last, i.e. ranked after "Expert" -- confirm this is intended
experience_order = [["Beginner", "Intermediate", "Expert", "Unknown"]]

In [5]:
# Numeric branch: fill gaps with the column mean, then standardize
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Nominal branch: fill gaps with the most frequent value, then one-hot encode
# (first level of each feature dropped, unseen levels ignored at transform time)
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)

# Ordinal branch: fill gaps with the most frequent value, then encode using
# the explicit category order given in `experience_order`
ord_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal", OrdinalEncoder(categories=experience_order)),
    ]
)

In [6]:
# Send each column group through its matching sub-pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numerical_cols),
        ("cat", cat_pipeline, categorical_cols),
        ("ord", ord_pipeline, ordinal_col),
    ]
)

In [7]:
# End-to-end estimator: preprocessing followed by a seeded gradient-boosting regressor
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", GradientBoostingRegressor(random_state=42)),
    ]
)

In [8]:
# Hyper-parameter candidates for the "model" step of the pipeline
# (the `model__` prefix addresses that step by name)
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[3, 5],
    model__learning_rate=[0.05, 0.1],
)

In [9]:
# Raw target and features; nothing is encoded or scaled before the split --
# all preprocessing happens inside the pipeline
y = data["food_waste_kg"]
X = data.drop(columns=["food_waste_kg"])

# Hold out 10% of the rows for the final evaluation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# 5-fold cross-validated search over `param_grid`, scored by R², using all cores
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

In [10]:
from sklearn.metrics import r2_score, mean_squared_error

# Predict on the held-out test split with the estimator selected by the grid search
y_pred = grid.predict(X_test)
print("Best Parameters:", grid.best_params_)
print("R² Score:", r2_score(y_test, y_pred))
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword is
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works everywhere.
print("RMSE:", mean_squared_error(y_test, y_pred) ** 0.5)

Best Parameters: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 200}
R² Score: 0.8008142089061814
RMSE: 4.802962761499908




## Build & train models

In [11]:
!pip install xgboost



In [15]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.base import clone

# Candidate regressors to compare, keyed by the display name used in the results
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(objective="reg:squarederror", random_state=42)
}

results = {}               # model name -> {"MAE", "RMSE", "R²"} measured on the test split
best_model_name = None
best_r2 = -float("inf")    # sentinel below any achievable R², so the first model always wins
best_pipeline = None

# NOTE(review): the winner is chosen by its score on the test split, so the reported
# R² is an optimistic estimate; the tuned estimator held by `grid` is not among the
# candidates compared here.
for name, model in models.items():
    # Every candidate gets its own unfitted copy of the preprocessor. Pipeline.fit
    # fits its steps in place, so sharing one instance would let a later fit
    # overwrite the preprocessing state inside an earlier (possibly saved) pipeline.
    pipeline = Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("regressor", model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Evaluation on the held-out test split
    mae = mean_absolute_error(y_test, y_pred)
    # sqrt(MSE) instead of `squared=False`, which is deprecated in scikit-learn 1.4
    # and removed in 1.6
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2 = r2_score(y_test, y_pred)

    results[name] = {
        "MAE": mae,
        "RMSE": rmse,
        "R²": r2
    }

    # Track best model (highest test R² seen so far)
    if r2 > best_r2:
        best_r2 = r2
        best_model_name = name
        best_pipeline = pipeline

# Save the best model pipeline (preprocessing + regressor) as a single artifact
joblib.dump(best_pipeline, "best_food_waste_model.pkl")
print(f"✅ Saved best model: {best_model_name} with R² = {best_r2:.4f}")





✅ Saved best model: Random Forest with R² = 0.7776


In [16]:

import pandas as pd

# Tabulate the metrics: one row per model, with the model name moved from the
# index into a leading "Model" column
results_df = (
    pd.DataFrame(results)
    .T
    .rename_axis("Model")
    .reset_index()
)

# Persist the comparison table without the positional index
results_df.to_csv("model_results.csv", index=False)
