In [1]:
import sys
sys.path.append("..")

import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

from src.preprocessing import get_preprocessor


def train_and_select_best_model(
    data_path: str,
    model_path: str = "../models/best_insurance_model.pkl"
) -> Pipeline:
    """Train several regressors on the insurance data and persist the best one.

    The CSV at ``data_path`` is loaded, the ``charges`` column is used as the
    target and every other column as a feature. The data is split 80/20
    (``random_state=42``), each candidate model is wrapped in a pipeline
    together with the project preprocessor, fitted on the training split and
    scored on the test split. The pipeline with the highest test R2 is dumped
    to ``model_path`` with joblib.

    NOTE: the winner is chosen using the same test split that the reported
    score comes from, so the printed "best" R2 is an optimistic estimate.

    Args:
        data_path: Path to a CSV file containing a ``charges`` column.
        model_path: Destination file for the serialized best pipeline. Its
            parent directory is created if it does not exist yet.

    Returns:
        The fitted pipeline that achieved the highest R2 on the test split.

    Raises:
        KeyError: If the CSV has no ``charges`` column.
        RuntimeError: If no candidate produced a comparable (non-NaN) R2
            score, so there is nothing meaningful to save.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from pathlib import Path

    # Load data
    df = pd.read_csv(data_path)

    X = df.drop("charges", axis=1)
    y = df["charges"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Candidate estimators; hyperparameters are fixed, no tuning is done here.
    models = {
        "LinearRegression": LinearRegression(),
        "DecisionTree": DecisionTreeRegressor(max_depth=6, random_state=42),
        "SVR": SVR(kernel="rbf", C=100, gamma="scale"),
        "GradientBoosting": GradientBoostingRegressor(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=3,
            random_state=42
        )
    }

    best_model = None
    best_score = -float("inf")

    for name, model in models.items():
        # A new preprocessor is requested for every pipeline so the
        # candidates do not share a preprocessing object.
        pipeline = Pipeline(
            steps=[
                ("preprocessor", get_preprocessor()),
                ("model", model)
            ]
        )

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        print(f"{name} | R2: {r2:.4f} | MAE: {mae:.2f}")

        # Strict ">" means the first model wins ties; a NaN score never
        # compares greater and is therefore never selected.
        if r2 > best_score:
            best_score = r2
            best_model = pipeline

    if best_model is None:
        # Avoid silently pickling ``None`` when every score was NaN.
        raise RuntimeError(
            "No model produced a valid R2 score; nothing to save."
        )

    print(f"\nBest Model R2 Score: {best_score:.4f}")

    # joblib.dump does not create missing directories, so make sure the
    # destination folder exists before writing.
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(best_model, model_path)
    print(f"Best model saved to {model_path}")

    return best_model


if __name__ == "__main__":
    # Entry point when run as a script or notebook cell: train on the
    # insurance CSV and save the winner to the default model path.
    train_and_select_best_model(data_path="../data/insurance.csv")


LinearRegression | R2: 0.7836 | MAE: 4181.19
DecisionTree | R2: 0.8062 | MAE: 2980.24
SVR | R2: 0.0550 | MAE: 6314.11
GradientBoosting | R2: 0.8804 | MAE: 2456.45

Best Model R2 Score: 0.8804
Best model saved to ../models/best_insurance_model.pkl
