## Imports

In [6]:
import numpy as np
import pandas as pd
import os
import sys

parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from preprocess.preprocess import Preprocess
from regression_tree.regression_tree import MyDecisionTreeRegressor
from random_forest.random_forest import MyRandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
import copy
from sklearn.base import clone

## Preprocessing

In [2]:
pp = Preprocess('student_habits_performance.csv', 'exam_score', scale=True)
results_df = pd.DataFrame(
    
)

## Data splitting

In [3]:
X_train, X_test, y_train, y_test = pp.preprocess_data()

## Define configs, Hold-out accuracy

In [None]:
experiment_configs = [
    {
        "name": "Linear Regression",
        "model": Ridge(),
        "params": {} 
    },
    {
        "name": "Decision Tree",
        "model": DecisionTreeRegressor(random_state=42),
        "params": {"random_state": 42}
    },
    {
        "name": "Random Forest (10 Trees)",
        "model": RandomForestRegressor(random_state=42),
        "params": {"random_state": 42}
    },
    {
        "name": "MyDecisionTreeRegressor post-pruned",
        "model": MyDecisionTreeRegressor(max_depth=20, min_samples=20),
        "params": {"pruned": True, "max_depth": 20, "min_samples": 20}
    },
    {
        "name": "MyDecisionTreeRegressor without post-pruning",
        "model": MyDecisionTreeRegressor(max_depth=20, min_samples=20),
        "params": {"pruned": False, "max_depth": 20, "min_samples": 20}
    },
    {
        "name": "MyRandomForestRegressor 100 trees, 0.5 max features",
        "model": MyRandomForestRegressor(max_depth=20, min_samples=20, n_estimators=100, max_features=0.5),
        "params": {"max_depth":20, "min_samples":20, "n_estimators":100, "max_features":0.5}
    },
    {
        "name": "MyRandomForestRegressor 100 trees, log2 max features",
        "model": MyRandomForestRegressor(max_depth=20, min_samples=20, n_estimators=100, max_features="log2"),
        "params": {"max_depth":20, "min_samples":20, "n_estimators":100, "max_features":"log2"}
    },
    {
        "name": "MyRandomForestRegressor 500 trees, 0.9 max features",
        "model": MyRandomForestRegressor(max_depth=20, min_samples=20, n_estimators=500, max_features=0.9),
        "params": {"max_depth":20, "min_samples":20, "n_estimators":500, "max_features":0.9}
    },
    
]

results = []

# Loop for models
for config in experiment_configs:
    model = config["model"]
    name = config["name"]
    params = config["params"]
    
    # Train
    model.fit(X_train, y_train)
    
    # because names aren't just the model name now
    if "MyDecisionTreeRegressor" in name and params.get("pruned"):
        model.post_prune_with_cross_validation(X_train, y_train, n_splits=10)

    
    # Predict
    y_pred = model.predict(X_test)
    
    # Evaluate
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Store results
    results.append({
        "model_name": name,
        "rsquared": r2,
        "rmse": rmse,
        "params": params
    })

# Create dataframe of the results
results_df = pd.DataFrame(results)

print(results_df)

                                          model_name  rsquared       rmse  \
0                                  Linear Regression  0.878542   5.260726   
1                                      Decision Tree  0.599856   9.548612   
2                           Random Forest (10 Trees)  0.810991   6.562560   
3                MyDecisionTreeRegressor post-pruned  0.638145   9.080289   
4       MyDecisionTreeRegressor without post-pruning  0.638099   9.080858   
5  MyRandomForestRegressor 100 trees, 0.5 max fea...  0.515312  10.509054   
6  MyRandomForestRegressor 100 trees, log2 max fe...  0.132820  14.056815   
7  MyRandomForestRegressor 500 trees, 0.5 max fea...  0.813154   6.524901   

                                              params  
0                                                 {}  
1                               {'random_state': 42}  
2                               {'random_state': 42}  
3  {'pruned': True, 'max_depth': 20, 'min_samples...  
4  {'pruned': False, 'max_depth

## 10-fold CV accuracy

In [None]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

results_cv = []


for config in experiment_configs:
    name = config["name"]
    base_model = config["model"]
    params = config["params"]
    
    
    fold_r2 = []
    fold_rmse = []
    
    for train_idx, val_idx in cv.split(X_train):
        
        # Create folds
        X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
        y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]
        
        if hasattr(base_model, 'get_params'):
             model = clone(base_model)  
        else:
             model = copy.deepcopy(base_model)  
        
        # Fit base model on the fold
        model.fit(X_fold_train, y_fold_train)
        
        # Post-prune for the pruned custom decision tree regressor
        if "MyDecisionTreeRegressor" in name and config["params"].get("pruned"):
            model.post_prune_with_cross_validation(X_fold_train, y_fold_train, n_splits=10)
        
        # Predictions
        y_pred = model.predict(X_fold_val)
        
        # Calculate metrics
        fold_r2.append(r2_score(y_fold_val, y_pred))
        fold_rmse.append(np.sqrt(mean_squared_error(y_fold_val, y_pred)))
    
    # Store results
    results_cv.append({
        "model_name": name,
        "mean_r2": np.mean(fold_r2),
        "mean_rmse": np.mean(fold_rmse)
    })

# Create datafram
df_results = pd.DataFrame(results_cv)
print(df_results)