<div align="center">
    
# 5.0 Modeling

## 5.1 Table of Contents<a id='5.1_Table_of_Contents'></a>
* [5.1 Table of Contents](#5.1_Table_of_Contents)
* [5.2 Introduction](#5.2_Introduction)
* [5.3 Library Imports](#5.3_Library_Imports)
* [5.4 Load Data & Pipeline](#5.4_Load_Data_Pipeline)
* [5.5 Train/Test Split](#5.5_Train_Test_Split)
* [5.6 Baseline Models](#5.6_Baseline_Models)
* [5.7 Cross-Validation](#5.7_Cross_Validation)
* [5.8 Model Selection (Grid Search)](#5.8_Model_Selection)
* [5.9 Evaluation on Test Set](#5.9_Evaluation_on_Test_Set)
* [5.10 Save Artifacts](#5.10_Save_Artifacts)
* [5.11 Summary](#5.11_Summary)

## 5.2 Introduction<a id='5.2_Introduction'></a>

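This notebook loads the feature-engineered dataset and the saved preprocessing pipeline, trains baseline models, compares them with cross-validation, tunes the best candidate with a grid search, evaluates it on a held-out test set, and saves the resulting artifacts (fitted model, metrics report, and test-set predictions).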

## 5.3 Library Imports<a id='5.3_Library_Imports'></a>

In [None]:
import os, json
import numpy as np
import pandas as pd
import joblib
from pprint import pprint

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report,
    mean_absolute_error, mean_squared_error, r2_score
)

import matplotlib.pyplot as plt

# Single seed passed as `random_state` to the train/test split, the CV
# splitters and the random-forest estimators in the cells below.
SEED = 17
# Also seed NumPy's global RNG for anything that draws from np.random directly.
np.random.seed(SEED)


## 5.4 Load Data & Pipeline<a id='5.4_Load_Data_Pipeline'></a>

In [None]:
# Inputs read from disk: the featured dataset (CSV) and the serialized
# preprocessing transformer (joblib).
DATA_PATH = "../data/processed/data_03_featured.csv"
PIPELINE_PATH = "../models/preprocess_pipeline.joblib"

df = pd.read_csv(DATA_PATH)
print("Data shape:", df.shape)
df.head()

TARGET = "target_column_name"  # <-- CHANGE THIS
# Validate with an explicit raise: `assert` statements are stripped when Python
# runs with -O, which would let a wrong TARGET slip through to a confusing
# KeyError further down.
if TARGET not in df.columns:
    raise ValueError(f"Target '{TARGET}' not found in dataframe columns.")

# SECURITY NOTE: joblib.load unpickles the file and can execute arbitrary code;
# only load pipeline files that come from a trusted source.
preprocess: ColumnTransformer = joblib.load(PIPELINE_PATH)
print("Loaded preprocess pipeline.")


## 5.5 Train/Test Split<a id='5.5_Train_Test_Split'></a>

In [None]:
# Separate the label from the features: y is the TARGET column,
# X is every remaining column of df.
y = df[TARGET]
X = df.drop(TARGET, axis=1)

def infer_task_type(y_series, threshold_unique=20):
    """Guess whether a target column calls for classification or regression.

    Args:
        y_series: pandas Series holding the target values.
        threshold_unique: Largest number of distinct non-null values for which
            a target that is not label-like by dtype is still treated as a set
            of class labels.

    Returns:
        The string "classification" or "regression".
    """
    dtype = y_series.dtype
    # Label-like dtypes are classification targets no matter how many distinct
    # values they hold. Matching on dtype names alone misses pandas' dedicated
    # string dtypes (named "string" / "str"), so those are detected explicitly.
    label_like = (
        dtype.name in ("object", "bool", "category")
        or pd.api.types.is_string_dtype(dtype)
    )
    if label_like:
        return "classification"

    # Heuristic for everything else: few distinct values -> class labels.
    if y_series.nunique(dropna=True) <= threshold_unique:
        return "classification"
    return "regression"

# Decide once whether this is a classification or a regression problem;
# the cells below branch on TASK.
TASK = infer_task_type(y)
print("Inferred task type:", TASK)

# Hold out 20% of the rows for testing. For classification the split is
# stratified on y so train and test keep the same class proportions.
split_kwargs = {"test_size": 0.2, "random_state": SEED}
if TASK == "classification":
    split_kwargs.update(stratify=y)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
X_train.shape, X_test.shape

## 5.6 Baseline Models<a id='5.6_Baseline_Models'></a>

In [None]:
# Two baselines per task: a linear model and a random forest.
if TASK != "classification":
    candidates = {
        "linreg": LinearRegression(),
        "rf_reg": RandomForestRegressor(n_estimators=300, random_state=SEED),
    }
else:
    candidates = {
        "logreg": LogisticRegression(max_iter=1000),
        "rf_clf": RandomForestClassifier(n_estimators=200, random_state=SEED),
    }

# Wrap each estimator behind the loaded preprocessing step so that the raw
# feature frame can be passed straight to fit/predict.
models = {
    label: Pipeline(steps=[("preprocess", preprocess), ("model", estimator)])
    for label, estimator in candidates.items()
}

print("Candidates:", list(models.keys()))


## 5.7 Cross-Validation<a id='5.7_Cross_Validation'></a>

In [None]:
# Choose the 5-fold splitter and the metric for the task. Both metrics follow
# scikit-learn's "higher is better" convention (RMSE is negated), so the best
# model is simply the one with the largest mean score.
if TASK == "classification":
    cv, scoring = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED), "f1_macro"
else:
    cv, scoring = KFold(n_splits=5, shuffle=True, random_state=SEED), "neg_root_mean_squared_error"

# Score every candidate pipeline on the training split only.
cv_scores = {}
for name, pipeline in models.items():
    fold_scores = cross_val_score(
        pipeline, X_train, y_train, scoring=scoring, cv=cv, n_jobs=-1
    )
    summary = {"mean": fold_scores.mean(), "std": fold_scores.std(), "all": fold_scores}
    cv_scores[name] = summary
    print(f"{name}: {scoring} = {summary['mean']:.4f} ± {summary['std']:.4f}")

# Keep the candidate with the highest mean CV score.
best_name = max(cv_scores.items(), key=lambda item: item[1]["mean"])[0]
print("Best by CV:", best_name, "->", cv_scores[best_name]["mean"])
best_baseline = models[best_name]


## 5.8 Model Selection (Grid Search)<a id='5.8_Model_Selection'></a>

In [None]:
# Hyper-parameter grids keyed by (task, winning baseline). Keys are prefixed
# with "model__" because they address the "model" step of the pipeline.
# Any combination without an entry (e.g. plain linear regression) gets an
# empty grid, which means "skip the search".
grids_by_candidate = {
    ("classification", "rf_clf"): {
        "model__n_estimators": [200, 400],
        "model__max_depth": [None, 10, 20],
        "model__min_samples_split": [2, 5],
    },
    ("classification", "logreg"): {
        "model__C": [0.5, 1.0, 2.0],
        "model__penalty": ["l2"],
        "model__solver": ["lbfgs"],
    },
    ("regression", "rf_reg"): {
        "model__n_estimators": [300, 600],
        "model__max_depth": [None, 12, 24],
        "model__min_samples_split": [2, 5],
    },
}
param_grid = grids_by_candidate.get((TASK, best_name), {})

if not param_grid:
    # Nothing to tune: just fit the winning baseline on the training split.
    print("No grid search performed (empty grid). Using best baseline.")
    best_model = best_baseline.fit(X_train, y_train)
else:
    # Exhaustive search using the same folds and metric as the baseline CV.
    search = GridSearchCV(
        estimator=best_baseline,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
        verbose=0,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    print("Best params:", search.best_params_)
    print("Best CV score:", search.best_score_)


## 5.9 Evaluation on Test Set<a id='5.9_Evaluation_on_Test_Set'></a>

In [None]:
# best_model is already fitted on the training split at this point (either as
# GridSearchCV's refit best_estimator_ or through the explicit baseline fit in
# the model-selection cell), so it is not trained a second time here.
y_pred = best_model.predict(X_test)

if TASK == "classification":
    # Probability of the second class (column 1), used for ROC AUC below.
    # Check for the capability explicitly instead of swallowing every
    # exception, so genuine failures are not hidden.
    y_proba = None
    if hasattr(best_model, "predict_proba"):
        y_proba = best_model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rec = recall_score(y_test, y_pred, average="macro", zero_division=0)
    # zero_division=0 keeps F1 consistent with precision/recall above and
    # avoids undefined-metric warnings for classes that are never predicted.
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    print(f"Accuracy: {acc:.4f} | Precision(macro): {prec:.4f} | Recall(macro): {rec:.4f} | F1(macro): {f1:.4f}")

    # ROC AUC is only reported for binary targets.
    if y_proba is not None and y_test.nunique() == 2:
        auc = roc_auc_score(y_test, y_proba)
        print(f"ROC AUC: {auc:.4f}")

    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

else:
    mae = mean_absolute_error(y_test, y_pred)
    # mean_squared_error(..., squared=False) was deprecated in scikit-learn 1.4
    # and removed in 1.6; taking the square root of the MSE gives the same
    # RMSE for a single target column on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"MAE: {mae:.4f} | RMSE: {rmse:.4f} | R^2: {r2:.4f}")

    # Residual plot: points should scatter evenly around the zero line.
    plt.figure(figsize=(6,4))
    plt.scatter(y_pred, y_test - y_pred, s=12)
    plt.axhline(0, ls='--')
    plt.xlabel("Predicted")
    plt.ylabel("Residuals")
    plt.title("Residuals vs Predicted")
    plt.show()

## 5.10 Save Artifacts<a id='5.10_Save_Artifacts'></a>

In [None]:
# Make sure both output directories exist before writing anything.
os.makedirs("../models", exist_ok=True)
os.makedirs("../data/processed", exist_ok=True)

# 1) Fitted model (preprocessing + estimator pipeline).
MODEL_PATH = "../models/best_model.joblib"
joblib.dump(best_model, MODEL_PATH)

# 2) JSON report. Only mean/std of the CV scores are stored, cast to plain
# floats; the per-fold "all" arrays are NumPy arrays and not JSON-serializable.
report = {
    "task": TASK,
    "cv_scores": {
        name: {"mean": float(stats["mean"]), "std": float(stats["std"])}
        for name, stats in cv_scores.items()
    },
}
if TASK == "classification":
    report["test_metrics"] = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "precision_macro": float(precision_score(y_test, y_pred, average="macro", zero_division=0)),
        "recall_macro": float(recall_score(y_test, y_pred, average="macro", zero_division=0)),
        # zero_division=0 for consistency with precision/recall.
        "f1_macro": float(f1_score(y_test, y_pred, average="macro", zero_division=0)),
    }
else:
    report["test_metrics"] = {
        "mae": float(mean_absolute_error(y_test, y_pred)),
        # `squared=False` was removed from mean_squared_error in
        # scikit-learn 1.6; sqrt(MSE) works on every version.
        "rmse": float(np.sqrt(mean_squared_error(y_test, y_pred))),
        "r2": float(r2_score(y_test, y_pred)),
    }

REPORT_PATH = "../models/model_report.json"
# Explicit encoding so the file does not depend on the platform default.
with open(REPORT_PATH, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

# 3) Test-set predictions next to the true values.
preds = pd.DataFrame({"y_true": y_test})
preds["y_pred"] = y_pred
PRED_PATH = "../data/processed/predictions_test.csv"
preds.to_csv(PRED_PATH, index=False)

print("✅ Saved:")
print(" -", MODEL_PATH)
print(" -", REPORT_PATH)
print(" -", PRED_PATH)

## 5.11 Summary<a id='5.11_Summary'></a>

In [None]:
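# Checklist / next steps:
# - Verify that TARGET names the correct column.
# - Compare the cross-validation scores with the test-set metrics; a large gap
#   between them suggests overfitting.
# - If the model overfits, consider stronger regularization or feature reduction.
# - If the model underfits, explore a more complex model or additional features.
# - Extend the grid search or try other algorithms (XGBoost, LightGBM, etc.).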