In [6]:
import numpy as np
import pandas as pd
import os
import sys


parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from preprocess.preprocess import Preprocess
from regression_tree.regression_tree import MyDecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_validate, KFold

In [7]:
pp = Preprocess('used_cars.csv', 'price', scale=True)
results_df = pd.DataFrame(
    
)

In [8]:
X_train, X_test, y_train, y_test = pp.preprocess_data()

In [9]:
experiment_configs = [
    {
        "name": "Linear Regression",
        "model": Ridge(),
        "params": {} 
    },
    {
        "name": "Decision Tree",
        "model": DecisionTreeRegressor(random_state=42),
        "params": {"random_state": 42}
    },
    {
        "name": "Random Forest (10 Trees)",
        "model": RandomForestRegressor(random_state=42),
        "params": {"random_state": 42}
    },
    {
        "name": "MyDecisionTreeRegressor",
        "model": MyDecisionTreeRegressor(max_depth=15, min_samples=5),
        "params": {"pruned": True, "max_depth": 15}
    },
    {
        "name": "MyDecisionTreeRegressor",
        "model": MyDecisionTreeRegressor(max_depth=15, min_samples=5),
        "params": {"pruned": False, "max_depth": 15}
    }
]

results = []

# Loop for models
for config in experiment_configs:
    model = config["model"]
    name = config["name"]
    params = config["params"]
    
    # Train
    model.fit(X_train, y_train)
    
    if name == 'MyDecisionTreeRegressor':
        if config['params']['pruned']:
            best_alpha = model.post_prune_with_cross_validation(X_train, y_train, n_splits=5)

    
    # Predict
    y_pred = model.predict(X_test)
    
    # Evaluate
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Store results
    results.append({
        "model_name": name,
        "rsquared": r2,
        "rmse": rmse,
        "params": params
    })

# Create dataframe of the results
results_df = pd.DataFrame(results)

print(results_df)

                 model_name  rsquared           rmse  \
0         Linear Regression  0.012690  177683.581164   
1             Decision Tree  0.052981  174020.266609   
2  Random Forest (10 Trees)  0.026014  176480.546720   
3   MyDecisionTreeRegressor  0.059307  173438.010790   
4   MyDecisionTreeRegressor  0.059274  173441.053380   

                               params  
0                                  {}  
1                {'random_state': 42}  
2                {'random_state': 42}  
3   {'pruned': True, 'max_depth': 15}  
4  {'pruned': False, 'max_depth': 15}  


In [10]:
import numpy as np
import pandas as pd
import copy
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone



cv = KFold(n_splits=10, shuffle=True, random_state=42)

results_cv = []


for config in experiment_configs:
    name = config["name"]
    base_model = config["model"]
    params = config["params"]
    
    
    fold_r2 = []
    fold_rmse = []
    
    for train_idx, val_idx in cv.split(X_train):
        
        # Create folds
        X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
        y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]
        
        if hasattr(base_model, 'get_params'):
             model = clone(base_model) # sklearn models
        else:
             model = copy.deepcopy(base_model) # custom models
        
        # Post-prune for the custom decision tree regressor
        if name == "MyDecisionTreeRegressor":
            model.fit(X_fold_train, y_fold_train)
            model.post_prune_with_cross_validation(X_fold_train, y_fold_train, n_splits=5)
        # Else just fit
        else:
            model.fit(X_fold_train, y_fold_train)
        
        # Predictions
        y_pred = model.predict(X_fold_val)
        
        # Calculate metrics
        fold_r2.append(r2_score(y_fold_val, y_pred))
        fold_rmse.append(np.sqrt(mean_squared_error(y_fold_val, y_pred)))
    
    # Store results
    results_cv.append({
        "model_name": name,
        "mean_r2": np.mean(fold_r2),
        "mean_rmse": np.mean(fold_rmse)
    })

# Create datafram
df_results = pd.DataFrame(results_cv)
print(df_results)

                 model_name   mean_r2     mean_rmse
0         Linear Regression  0.452197  41982.596433
1             Decision Tree  0.476268  40010.725821
2  Random Forest (10 Trees)  0.615425  35260.793260
3   MyDecisionTreeRegressor  0.569471  36950.860007
4   MyDecisionTreeRegressor  0.569471  36950.860007
