## Imports

In [1]:
import numpy as np
import pandas as pd
import os
import sys

import matplotlib.pyplot as plt
import seaborn as sns

parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from preprocess.preprocess import Preprocess
from regression_tree.regression_tree import MyDecisionTreeRegressor
from random_forest.random_forest import MyRandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
import copy
from sklearn.base import clone
import importlib
import eval.eval as eval_mod

importlib.reload(eval_mod)
from eval.eval import plot_used_cars_results

sns.set_theme(style="whitegrid")

## Preprocessing

In [2]:
pp = Preprocess('student_habits_performance.csv', 'exam_score', scale=True)
results_df = pd.DataFrame(
    
)

## Data splitting

In [3]:
X_train, X_test, y_train, y_test = pp.preprocess_data()

## Define configs, Hold-out accuracy

In [4]:
experiment_configs = [
    {
        "name": "Linear Regression",
        "model": Ridge(),
        "params": {} 
    },
    {
        "name": "Decision Tree",
        "model": DecisionTreeRegressor(random_state=42),
        "params": {"random_state": 42}
    },
    {
        "name": "Random Forest (10 Trees)",
        "model": RandomForestRegressor(random_state=42),
        "params": {"random_state": 42}
    },
    {
        "name": "MyDecisionTreeRegressor post-pruned",
        "model": MyDecisionTreeRegressor(max_depth=20, min_samples=20),
        "params": {"pruned": True, "max_depth": 20, "min_samples": 20}
    },
    {
        "name": "MyDecisionTreeRegressor without post-pruning",
        "model": MyDecisionTreeRegressor(max_depth=20, min_samples=20),
        "params": {"pruned": False, "max_depth": 20, "min_samples": 20}
    },
    {
        "name": "MyRandomForestRegressor 100 trees, 0.5 max features",
        "model": MyRandomForestRegressor(max_depth=20, min_samples=10, n_estimators=100, max_features=0.5),
        "params": {"max_depth":20, "min_samples":10, "n_estimators":100, "max_features":0.5}
    },
    {
        "name": "MyRandomForestRegressor 100 trees, log2 max features",
        "model": MyRandomForestRegressor(max_depth=20, min_samples=10, n_estimators=100, max_features="log2"),
        "params": {"max_depth":20, "min_samples":10, "n_estimators":100, "max_features":"log2"}
    },
    {
        "name": "MyRandomForestRegressor 500 trees, 0.6 max features",
        "model": MyRandomForestRegressor(max_depth=20, min_samples=10, n_estimators=500, max_features=0.6),
        "params": {"max_depth":20, "min_samples":10, "n_estimators":500, "max_features":0.6}
    },
    
]

models_store = {}
preds_test = {}

results = []

# Loop for models
for config in experiment_configs:
    model = config["model"]
    name = config["name"]
    params = config["params"]
    
    model.fit(X_train, y_train)

    # post prune for custom tree 
    if "MyDecisionTreeRegressor" in name and params.get("pruned"):
        model.post_prune_with_cross_validation(X_train, y_train, n_splits=10)

    # predict
    y_pred = model.predict(X_test)

    # evaluate
    r2 = float(r2_score(y_test, y_pred))
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    # store for later plotting
    models_store[name] = model
    preds_test[name] = y_pred

    results.append({"model_name": name, "rsquared": r2, "rmse": rmse, "params": params})

results_df = pd.DataFrame(results).sort_values(["rmse"], ascending=True).reset_index(drop=True)
results_df[["model_name", "rsquared", "rmse", "params"]]

Unnamed: 0,model_name,rsquared,rmse,params
0,Linear Regression,0.878542,5.260726,{}
1,Random Forest (10 Trees),0.810991,6.56256,{'random_state': 42}
2,"MyRandomForestRegressor 500 trees, 0.6 max fea...",0.706762,8.174142,"{'max_depth': 20, 'min_samples': 10, 'n_estima..."
3,"MyRandomForestRegressor 100 trees, 0.5 max fea...",0.65714,8.838751,"{'max_depth': 20, 'min_samples': 10, 'n_estima..."
4,MyDecisionTreeRegressor post-pruned,0.638145,9.080289,"{'pruned': True, 'max_depth': 20, 'min_samples..."
5,MyDecisionTreeRegressor without post-pruning,0.638099,9.080858,"{'pruned': False, 'max_depth': 20, 'min_sample..."
6,Decision Tree,0.599856,9.548612,{'random_state': 42}
7,"MyRandomForestRegressor 100 trees, log2 max fe...",0.321791,12.431224,"{'max_depth': 20, 'min_samples': 10, 'n_estima..."


## 10-fold CV accuracy

In [None]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

cv_rows = []

for config in experiment_configs:
    name = config["name"]
    base_model = config["model"]
    params = config["params"]

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_train), start=1):
        # Create folds
        X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
        y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

        if hasattr(base_model, "get_params"):
            model = clone(base_model)
        else:
            model = copy.deepcopy(base_model)

        model.fit(X_fold_train, y_fold_train)

        # Post-prune for the pruned custom decision tree regressor
        if "MyDecisionTreeRegressor" in name and params.get("pruned"):
            model.post_prune_with_cross_validation(X_fold_train, y_fold_train, n_splits=10)

        y_pred = model.predict(X_fold_val)

        cv_rows.append(
            {
                "model_name": name,
                "fold": fold_idx,
                "r2": float(r2_score(y_fold_val, y_pred)),
                "rmse": float(np.sqrt(mean_squared_error(y_fold_val, y_pred))),
            }
        )

cv_long_df = pd.DataFrame(cv_rows)

df_results = (
    cv_long_df.groupby("model_name")
    .agg(mean_r2=("r2", "mean"), std_r2=("r2", "std"), mean_rmse=("rmse", "mean"), std_rmse=("rmse", "std"))
    .reset_index()
    .sort_values(["mean_rmse"], ascending=True)
)

df_results

In [None]:
plot_used_cars_results(
    dataset_name="student_habits_performance.csv",
    results_df=results_df,
    df_results=df_results,
    cv_long_df=cv_long_df,
    y_test=y_test,
    preds_test=preds_test,
    models_store=models_store,
    X_test=X_test,
    feature_names=getattr(pp, "feature_names_", None),
)