# Training a Healthy Meal Classifier

This notebook trains a simple classifier that predicts whether a meal is
"healthy" or "unhealthy" based on its nutritional values.

- Input data: `dataset/healthy_eating_dataset.csv`
- Target column: `is_healthy` (0 = unhealthy, 1 = healthy)
- Features used:
  - calories
  - protein_g
  - carbs_g
  - fat_g
  - fiber_g
  - sugar_g
  - sodium_mg

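We will train a Logistic Regression model with class balancing to handle
the imbalance between healthy and unhealthy meals, tune its decision
threshold on a validation split, evaluate it on a held-out test set, and
save the trained pipeline (together with the feature list and the chosen
threshold) to `models/health_classifier.joblib`.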

This model will later be used inside the agent as a tool.


# Imports

In [None]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    balanced_accuracy_score
)

import joblib



# Resolve the project folders relative to the notebook's working directory.
# The project root is taken to be the PARENT of the current working
# directory, so this cell expects the notebook to be run from a folder that
# sits directly under the project root.
BASE_DIR = os.path.dirname(os.path.abspath(os.getcwd()))
DATA_DIR = os.path.join(BASE_DIR, "dataset")
MODELS_DIR = os.path.join(BASE_DIR, "models")

# Create the output folder for trained models up front (no-op if it exists).
os.makedirs(MODELS_DIR, exist_ok=True)

print("Base dir:", BASE_DIR)
print("Data dir:", DATA_DIR)
print("Models dir:", MODELS_DIR)


# Load dataset

In [None]:
# Location of the raw CSV inside the dataset directory.
dataset_filename = "healthy_eating_dataset.csv"
healthy_csv_path = os.path.join(DATA_DIR, dataset_filename)
print("Healthy dataset path:", healthy_csv_path)

# Read the whole file into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(healthy_csv_path)
print("Healthy dataset shape:", df.shape)

# Last expression of the cell: the notebook renders the first rows.
df.head()


## General info

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

# Absolute number of rows per label value.
print("\nClass distribution for is_healthy:")
label_counts = df["is_healthy"].value_counts()
print(label_counts)

# Same breakdown expressed as fractions of the dataset.
print("\nClass proportion:")
label_shares = df["is_healthy"].value_counts(normalize=True)
print(label_shares)


## Cleaning and sanity check

In [None]:
# Model inputs; the order here is the column order of the feature matrix
# built from the DataFrame later on.
FEATURE_COLUMNS = [
    "calories",
    "protein_g",
    "carbs_g",
    "fat_g",
    "fiber_g",
    "sugar_g",
    "sodium_mg",
]

# Label column (0 = unhealthy, 1 = healthy).
TARGET_COLUMN = "is_healthy"

# Remove exact duplicates and report how many rows were dropped.
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Removed {before - after} duplicate rows.")

# Drop rows with a missing value in any feature or in the target.
required_columns = FEATURE_COLUMNS + [TARGET_COLUMN]
df = df.dropna(subset=required_columns)

# Simple numerical sanity filter: every feature must be non-negative and
# calories must additionally be strictly positive.
non_negative = (df[FEATURE_COLUMNS] >= 0).all(axis=1)
has_calories = df["calories"] > 0
df = df[non_negative & has_calories]

print("\nDataset shape after basic cleaning:", df.shape)

# Last expression of the cell: summary statistics of the cleaned features.
df[FEATURE_COLUMNS].describe()


# Train/test split

In [None]:
# Feature matrix and label vector extracted from the cleaned DataFrame.
X = df[FEATURE_COLUMNS].values
y = df[TARGET_COLUMN].values

# 1) Hold out 20% of the data as the final test benchmark (stratified).
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 2) Carve a validation split out of the remaining data; it is used to
#    choose the decision threshold. 20% of train_full => 16% of the dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    stratify=y_train_full,
    random_state=42,
)

print("Train size:", len(X_train))
print("Val size:  ", len(X_val))
print("Test size: ", len(X_test))


# Pipeline

In [None]:
# Logistic regression classifier; class_weight="balanced" is used to handle
# the imbalance between the two classes.
log_reg = LogisticRegression(
    solver="liblinear",
    max_iter=500,
    class_weight="balanced",
)

# Standardize the features, then classify. The step names ("scaler", "clf")
# are part of the saved pipeline object.
pipeline = Pipeline([("scaler", StandardScaler()), ("clf", log_reg)])

# Last expression of the cell: the notebook renders the pipeline.
pipeline


# Training

In [None]:
# Fit scaler + classifier on the training split only.
pipeline.fit(X_train, y_train)

# Validation probabilities, kept for threshold selection. Column 1 is read
# as the probability of class 1 (healthy) -- assumes the labels are 0/1.
val_probabilities = pipeline.predict_proba(X_val)
y_proba_val = val_probabilities[:, 1]

print("Training completed.")


# Evaluate (default threshold)

In [None]:
# Test-set probabilities; column 1 is read as P(class 1 = healthy).
test_probabilities = pipeline.predict_proba(X_test)
y_proba = test_probabilities[:, 1]

# Hard predictions at the default 0.5 cutoff.
y_pred_05 = (y_proba >= 0.5).astype(int)

print("=== Evaluation with default threshold 0.5 ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_05):.3f}")
# ROC-AUC is computed from the probabilities, not the thresholded labels.
print(f"ROC-AUC:  {roc_auc_score(y_test, y_proba):.3f}")

print("\nClassification report:")
report_05 = classification_report(y_test, y_pred_05)
print(report_05)

print("Confusion matrix:")
matrix_05 = confusion_matrix(y_test, y_pred_05)
print(matrix_05)


## Threshold for healthy class

In [None]:


def pick_threshold_balanced_accuracy(y_true, y_proba, step=0.01):
    """Pick the probability cutoff that maximizes balanced accuracy.

    Candidate thresholds ``0, step, 2 * step, ...`` up to 1.0 are scanned in
    increasing order. A sample is predicted positive (1) when its
    probability is ``>=`` the threshold. On ties the lowest threshold wins,
    because a later candidate must be strictly better to replace it.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Predicted probability of the positive class (label 1),
            one value per sample.
        step: Spacing between candidate thresholds; must be > 0.

    Returns:
        A pair ``(best_thr, (bal_acc, precision, recall, f1))``.
        ``best_thr`` is a plain ``float``. The metrics are evaluated at that
        threshold; precision, recall and F1 refer to label 1 and are
        reported as 0 when undefined.

    Raises:
        ValueError: If ``step`` is not strictly positive.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step!r}")

    # Accept any array-like (list, Series, ndarray) for the comparison below.
    y_proba = np.asarray(y_proba)

    # Build the grid from integer multiples of `step` instead of a float-step
    # np.arange(0, 1 + step, step), whose element count depends on rounding
    # and can overshoot 1.0. The small tolerance keeps the endpoint when
    # 1 / step is an integer up to floating-point error.
    n_steps = int(np.floor(1.0 / step + 1e-9))
    thresholds = np.minimum(np.arange(n_steps + 1) * step, 1.0)

    best_thr = 0.5
    best_score = -1.0

    for thr in thresholds:
        y_pred = (y_proba >= thr).astype(int)
        bal_acc = balanced_accuracy_score(y_true, y_pred)

        if bal_acc > best_score:
            best_score = bal_acc
            best_thr = float(thr)

    # The extra metrics are only reported for the winning threshold, so they
    # are computed once here rather than for every candidate.
    y_pred_best = (y_proba >= best_thr).astype(int)
    prec = precision_score(y_true, y_pred_best, pos_label=1, zero_division=0)
    rec = recall_score(y_true, y_pred_best, pos_label=1, zero_division=0)
    f1 = f1_score(y_true, y_pred_best, pos_label=1, zero_division=0)

    return best_thr, (best_score, prec, rec, f1)

# Choose the cutoff on the validation split only, so the test set stays
# untouched for the final benchmark.
best_thr, val_metrics = pick_threshold_balanced_accuracy(y_val, y_proba_val, step=0.01)
bal_acc, prec, rec, f1 = val_metrics

print("Selected threshold (max balanced accuracy):", round(best_thr, 2))
print("Validation metrics at threshold:",
      f"balanced_acc={bal_acc:.3f}, precision={prec:.3f}, recall={rec:.3f}, f1={f1:.3f}")


# Evaluate (tuned threshold)

In [None]:
# Positive-class probabilities on the held-out test split.
y_proba_test = pipeline.predict_proba(X_test)[:, 1]

# Two sets of hard predictions from the same probabilities:
# the 0.5 baseline and the threshold chosen on the validation split.
y_pred_05 = (y_proba_test >= 0.5).astype(int)
y_pred_best = (y_proba_test >= best_thr).astype(int)

print("=== TEST with threshold 0.5 (baseline) ===")
baseline_matrix = confusion_matrix(y_test, y_pred_05)
print(baseline_matrix)
baseline_report = classification_report(y_test, y_pred_05)
print(baseline_report)

print("\n=== TEST with tuned threshold (from VAL) ===")
print("Chosen threshold:", best_thr)
tuned_matrix = confusion_matrix(y_test, y_pred_best)
print(tuned_matrix)
tuned_report = classification_report(y_test, y_pred_best)
print(tuned_report)


# Save the model

In [None]:
# Destination of the serialized model inside the models directory.
model_path = os.path.join(MODELS_DIR, "health_classifier.joblib")

# Everything needed at inference time travels together: the fitted
# pipeline, the expected feature order, and the tuned cutoff (stored as a
# plain Python float).
model_bundle = dict(
    pipeline=pipeline,
    feature_columns=FEATURE_COLUMNS,
    decision_threshold=float(best_thr),
)

joblib.dump(model_bundle, model_path)

print(f"Saved health classifier to: {model_path}")
print("Decision threshold stored:", best_thr)


# Sanity check

In [None]:
# Reload the bundle from disk to confirm it round-trips.
loaded = joblib.load(model_path)
loaded_pipeline, loaded_features, loaded_thr = (
    loaded[key] for key in ("pipeline", "feature_columns", "decision_threshold")
)

print("Loaded feature columns:", loaded_features)
print("Loaded decision threshold:", loaded_thr)

# Score a single test row (kept 2-D by slicing) with the reloaded pipeline
# and apply the stored threshold.
sample = X_test[:1]
proba = loaded_pipeline.predict_proba(sample)[0][1]
pred = 1 if proba >= loaded_thr else 0

print("Example probability of being healthy:", proba)
print("Example prediction with threshold:", pred)
