In [1]:
'''Imports'''
import numpy as np
import pandas as pd
# scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# regressors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
# CV
from sklearn.model_selection import  KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.base import clone
import warnings
import matplotlib.pyplot as plt
import re
# Seed meant for reproducible runs.
# NOTE(review): none of the visible cells read this name; the estimators and
# CV splitters below pass the literal 42 themselves.
randomstate = 42
# Blanket filter: hides every warning category for the rest of the session,
# which includes any warnings libraries emit while models are being fitted.
warnings.filterwarnings('ignore')

In [3]:
'''Pre-Processing'''
# Raw training table, kept untouched so later views can start from it again.
data = pd.read_csv('health_insurance_train.csv')

# Numerical view: drop the columns named in num_clean, encode the 'yes'/'no'
# flags as 1/0 and turn empty strings into NaN so a later dropna() removes them.
num_clean = ["race", "region"]
num_data = (
    data
    .drop(columns=num_clean)
    .replace({'yes': 1, 'no': 0})
    .replace('', np.nan)
)
def convert_edu_years(age_str):
    """Convert an education label into a number of years.

    The label is searched for ``<digits>years`` or ``<digits>-<digits>years``.
    A single number is returned as a float; a range returns the midpoint of
    its two bounds. Missing values, empty strings and labels without such a
    pattern yield NaN.
    """
    # Missing or blank entries carry no information.
    if pd.isna(age_str) or age_str == '':
        return np.nan

    found = re.search(r'(\d+)(?:-(\d+))?years', str(age_str))
    if found is None:
        return np.nan

    lower, upper = found.group(1), found.group(2)
    if not upper:
        # Only one number present, e.g. '12years'.
        return float(lower)
    # Range such as '13-15years': use the average of both ends.
    return (float(lower) + float(upper)) / 2
# Finish the numerical view: parse the education labels into years and keep
# only rows without any missing value.
num_data = num_data.assign(
    education=num_data['education'].apply(convert_edu_years)
).dropna()

# Decision-tree view: every predictor (target removed) with the listed
# categorical columns one-hot encoded, first level of each dropped.
# NOTE(review): 'race' is not in the get_dummies column list, so it stays as
# loaded - confirm it is numeric before x_dec is fed to an estimator.
# NOTE(review): this view has its own dropna(); its rows are not guaranteed to
# line up with y_num - verify before pairing them.
desired_order = ['other', 'south', 'west', 'northcentral']
dec_data = data.drop(columns="whrswk")
dec_data = dec_data.assign(
    region=pd.Categorical(dec_data['region'], categories=desired_order, ordered=True)
)
dec_data = pd.get_dummies(
    dec_data,
    columns=['hhi', 'whi', 'hhi2', 'hispanic', 'region'],
    drop_first=True,
    dtype=int,
)
dec_data = dec_data.replace('', np.nan)
dec_data = dec_data.assign(
    education=dec_data['education'].apply(convert_edu_years)
).dropna()
x_dec = dec_data

# Split the numerical view into the target vector and the feature matrix.
y_num = num_data["whrswk"].to_numpy()
x_num = num_data.drop(columns="whrswk").to_numpy()
num_data = x_num  # the name is rebound to the feature ndarray from here on

# Two scaled copies of the features. Both scalers are fitted on the full
# matrix, i.e. before any cross-validation split is made.
stanscaler = StandardScaler()
minmaxscaler = MinMaxScaler()
x_num_stan = stanscaler.fit_transform(x_num)
x_num_minmax = minmaxscaler.fit_transform(x_num)

x_num.shape


(4500, 9)

In [None]:
'''Cross Validation'''
def nested_cv(X, y, model, param_grid, outer_cv=5, inner_cv=5, model_name="", random_state=42):
    """
    Tune hyperparameters once with a grid search, then score them with K-fold CV (MAE).

    Procedure
    - If ``param_grid`` is non-empty, run ``GridSearchCV`` with ``inner_cv``
      shuffled folds on the entire dataset and keep ``best_params_``.
    - Evaluate a fresh clone of ``model`` configured with those parameters on
      ``outer_cv`` shuffled folds and record the MAE of every fold.

    NOTE(review): the search in the first step sees every row, so the test part
    of each outer fold has already influenced the selected hyperparameters. The
    reported scores are therefore optimistically biased compared with textbook
    nested CV, where a separate search runs inside each outer training fold.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted with ``np.array``.
    y : array-like
        Target values; converted with ``np.array``.
    model : estimator
        Prototype estimator. It is only cloned, never fitted in place.
    param_grid : dict
        Grid handed to ``GridSearchCV``. A falsy value skips tuning.
    outer_cv : int
        Number of folds used for the evaluation loop.
    inner_cv : int
        Number of folds used inside the grid search.
    model_name : str
        Label used in the progress messages.
    random_state : int
        Seed for both shuffled splitters. The default of 42 is the value that
        used to be hard-coded.

    Returns
    -------
    tuple
        ``(outer_scores, best_params)``: the list of per-fold MAE values and
        the parameter dict that was applied (empty when nothing was tuned).
    """
    outer_scores = []

    # MAE is an error, so the scorer is negated for GridSearchCV's "higher is better" rule.
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    X = np.array(X)
    y = np.array(y)

    # STEP 1: one grid search over the whole dataset.
    best_params = {}
    if param_grid:
        print(f"{model_name} - Finding best parameters using inner CV...")
        inner_splitter = KFold(n_splits=inner_cv, shuffle=True, random_state=random_state)

        grid_search = GridSearchCV(
            estimator=clone(model),
            param_grid=param_grid,
            cv=inner_splitter,
            scoring=mae_scorer,
            n_jobs=-1
        )

        grid_search.fit(X, y)
        best_params = grid_search.best_params_
        print(f"{model_name} - Best parameters found: {best_params}")

    # STEP 2: evaluation folds with the fixed parameters. A separate name keeps
    # the integer ``outer_cv`` argument intact instead of rebinding it.
    outer_splitter = KFold(n_splits=outer_cv, shuffle=True, random_state=random_state)

    for fold, (train_idx, test_idx) in enumerate(outer_splitter.split(X, y)):
        X_train_outer, X_test_outer = X[train_idx], X[test_idx]
        y_train_outer, y_test_outer = y[train_idx], y[test_idx]

        # Fresh clone per fold so no fitted state carries over.
        current_model = clone(model)
        if best_params:
            current_model.set_params(**best_params)

        current_model.fit(X_train_outer, y_train_outer)
        y_pred = current_model.predict(X_test_outer)
        mae_score = mean_absolute_error(y_test_outer, y_pred)

        outer_scores.append(mae_score)

        print(f"{model_name} - Fold {fold + 1}: MAE = {mae_score:.4f}")

    return outer_scores, best_params

def simple_cv(X, y, model, outer_cv=5, model_name="", random_state=42):
    """
    Plain K-fold cross-validation of a fixed-hyperparameter model, scored with MAE.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted with ``np.array``.
    y : array-like
        Target values; converted with ``np.array``.
    model : estimator
        Prototype estimator. A fresh clone is fitted on every fold.
    outer_cv : int
        Number of shuffled folds.
    model_name : str
        Label used in the progress messages.
    random_state : int
        Seed for the shuffled splitter. The default of 42 is the value that
        used to be hard-coded.

    Returns
    -------
    list
        MAE of each fold, in fold order.
    """
    outer_scores = []

    X = np.array(X)
    y = np.array(y)

    # Separate name so the integer ``outer_cv`` argument is not overwritten.
    splitter = KFold(n_splits=outer_cv, shuffle=True, random_state=random_state)

    for fold, (train_idx, test_idx) in enumerate(splitter.split(X, y)):
        X_train_outer, X_test_outer = X[train_idx], X[test_idx]
        y_train_outer, y_test_outer = y[train_idx], y[test_idx]

        # Fresh clone per fold so no fitted state carries over.
        current_model = clone(model)
        current_model.fit(X_train_outer, y_train_outer)
        y_pred = current_model.predict(X_test_outer)
        mae_score = mean_absolute_error(y_test_outer, y_pred)

        outer_scores.append(mae_score)

        print(f"{model_name} - Fold {fold + 1}: MAE = {mae_score:.4f}")

    return outer_scores

def nested_cv_no_tuning(X, y, models_dict, outer_cv=5, random_state=42):
    """
    K-fold cross-validation (MAE) of several predefined models, without tuning.

    Every model in ``models_dict`` is evaluated on the same shuffled folds.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted with ``np.array``.
    y : array-like
        Target values; converted with ``np.array``.
    models_dict : dict
        Maps a display name to an estimator prototype with fixed parameters.
    outer_cv : int
        Number of shuffled folds.
    random_state : int
        Seed for the shuffled splitter. It is now actually passed to ``KFold``;
        previously the argument was ignored and 42 was always used.

    Returns
    -------
    dict
        For each model name a dict with ``'scores'`` (per-fold MAE list),
        ``'mean_mae'``, ``'std_mae'`` and ``'model'`` (the prototype passed in,
        not a fitted clone).
    """
    results = {}

    X = np.array(X)
    y = np.array(y)

    # One splitter for all models; with an integer seed every call to split()
    # reproduces the same folds, so the models are compared on identical data.
    outer_cv_splitter = KFold(n_splits=outer_cv, shuffle=True, random_state=random_state)

    for model_name, model in models_dict.items():
        print(f"\n{model_name} - Starting outer CV...")
        outer_scores = []

        for fold, (train_idx, test_idx) in enumerate(outer_cv_splitter.split(X, y)):
            X_train_outer, X_test_outer = X[train_idx], X[test_idx]
            y_train_outer, y_test_outer = y[train_idx], y[test_idx]

            # Fresh clone per fold so no fitted state carries over.
            current_model = clone(model)
            current_model.fit(X_train_outer, y_train_outer)
            y_pred = current_model.predict(X_test_outer)

            mae_score = mean_absolute_error(y_test_outer, y_pred)
            outer_scores.append(mae_score)

            print(f"{model_name} - Fold {fold + 1}: MAE = {mae_score:.4f}")

        # Summary statistics are computed once and reused for storage and output.
        mean_mae = np.mean(outer_scores)
        std_mae = np.std(outer_scores)

        results[model_name] = {
            'scores': outer_scores,
            'mean_mae': mean_mae,
            'std_mae': std_mae,
            'model': model
        }

        print(f"{model_name} - Average MAE: {mean_mae:.4f} ± {std_mae:.4f}")

    return results

In [6]:
# Search spaces for the tuned experiments: each entry pairs an estimator
# prototype with the grid handed to GridSearchCV (an empty grid skips tuning).
models = {
    'DummyRegressor': {
        'model': DummyRegressor(strategy='median'),  # Baseline; can also try 'mean'
        'param_grid': {}  # No hyperparameters to tune
    },
    'SGDRegressor': {
        'model': SGDRegressor(random_state=42),
        'param_grid': {
            'penalty': ['l2', 'l1', 'elasticnet'],
            'alpha': [0.0001, 0.001, 0.01, 0.1],
            'l1_ratio': [0.15, 0.3, 0.5, 0.7, 0.85],
            'learning_rate': ['constant', 'optimal', 'invscaling'],
            'eta0': [0.01, 0.1, 0.5],
            'max_iter': [1000, 2000]
        }
    },
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']
        }
    },
    'RandomForestRegressor': {
        'model': RandomForestRegressor(random_state=42),
        'param_grid': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'DecisionTreeRegressor': {
        'model': DecisionTreeRegressor(random_state=42),
        'param_grid': {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            # None considers all features. It replaces the former 'auto' alias,
            # which meant the same for tree regressors but is rejected by
            # current scikit-learn; with warnings silenced the failed fits
            # would only show up as NaN scores.
            'max_features': [None, 'sqrt', 'log2']
        }
    }
}

# Hand-picked configurations for the "chosen parameters" experiments.
# Keys are the display names used in the result tables; order is preserved.
hp_models = dict(
    DummyRegressor=DummyRegressor(strategy='median'),
    SGDRegressor=SGDRegressor(
        penalty='l2', alpha=0.001, l1_ratio=0.15,
        learning_rate='constant', eta0=0.01,
        max_iter=1000, random_state=42,
    ),
    KNeighborsRegressor=KNeighborsRegressor(
        n_neighbors=5, weights='uniform', metric='euclidean',
    ),
    RandomForestRegressor=RandomForestRegressor(
        n_estimators=100, max_depth=10,
        min_samples_split=2, min_samples_leaf=1,
        random_state=42,
    ),
    DecisionTreeRegressor=DecisionTreeRegressor(
        max_depth=10, min_samples_split=5, min_samples_leaf=2,
        random_state=42,
    ),
)

STANDARD SCALING - TUNED PARAMETERS

In [7]:
results = {}

# Tune and evaluate every configured model on the standard-scaled features.
for model_name, model_config in models.items():
    print(f"\n{'='*50}")
    print(f"Evaluating {model_name}")
    print(f"{'='*50}")

    scores, params = nested_cv(
        X=x_num_stan,
        y=y_num,
        model=model_config['model'],
        param_grid=model_config['param_grid'],
        outer_cv=5,
        inner_cv=3,
        model_name=model_name,
    )

    results[model_name] = dict(
        scores=scores,
        params=params,
        mean_mae=np.mean(scores),
        std_mae=np.std(scores),
    )

    print(f"{model_name} - Mean MAE: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

# Summary table: one line per model.
print(f"\n{'='*60}")
print("FINAL RESULTS SUMMARY")
print(f"{'='*60}")

for model_name, result in results.items():
    print(f"{model_name:20} | MAE: {result['mean_mae']:.4f} ± {result['std_mae']:.4f}")

# Best model by mean MAE, leaving the dummy baseline out of the ranking.
real_models = {
    name: entry for name, entry in results.items() if name != 'DummyRegressor'
}
best_model_name = min(real_models, key=lambda name: real_models[name]['mean_mae'])
best_result = real_models[best_model_name]

print(f"\nBest Real Model: {best_model_name}")
print(f"Best MAE: {best_result['mean_mae']:.4f} ± {best_result['std_mae']:.4f}")

# Relative gain of the best model over the dummy baseline.
dummy_mae = results['DummyRegressor']['mean_mae']
print(f"\nDummy Regressor Baseline MAE: {dummy_mae:.4f}")
print(f"Improvement over dummy: {((dummy_mae - best_result['mean_mae']) / dummy_mae * 100):.2f}%")


Evaluating DummyRegressor
DummyRegressor - Fold 1: MAE = 15.9833
DummyRegressor - Fold 2: MAE = 16.1278
DummyRegressor - Fold 3: MAE = 16.1233
DummyRegressor - Fold 4: MAE = 16.1600
DummyRegressor - Fold 5: MAE = 15.0933
DummyRegressor - Mean MAE: 15.8976 ± 0.4067

Evaluating SGDRegressor
SGDRegressor - Finding best parameters using inner CV...
SGDRegressor - Best parameters found: {'alpha': 0.001, 'eta0': 0.01, 'l1_ratio': 0.15, 'learning_rate': 'invscaling', 'max_iter': 1000, 'penalty': 'l1'}
SGDRegressor - Fold 1: MAE = 12.2887
SGDRegressor - Fold 2: MAE = 12.3402
SGDRegressor - Fold 3: MAE = 12.2013
SGDRegressor - Fold 4: MAE = 12.1837
SGDRegressor - Fold 5: MAE = 12.2572
SGDRegressor - Mean MAE: 12.2542 ± 0.0572

Evaluating KNeighborsRegressor
KNeighborsRegressor - Finding best parameters using inner CV...
KNeighborsRegressor - Best parameters found: {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'uniform'}
KNeighborsRegressor - Fold 1: MAE = 11.6068
KNeighborsRegressor - 

In [None]:
'''Plotter'''
def plot(x, y, title="Line Plot", xlabel="X", ylabel="Y"):
    """Show y against x as a semi-transparent line on a 10x6 figure with a grid.

    ``title``, ``xlabel`` and ``ylabel`` label the figure; nothing is returned.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(x, y, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    plt.show()

MINMAX SCALING - TUNED PARAMETERS

In [8]:
results = {}

# Tune and evaluate every configured model on the min-max-scaled features.
for model_name, model_config in models.items():
    print(f"\n{'='*50}")
    print(f"Evaluating {model_name}")
    print(f"{'='*50}")

    scores, params = nested_cv(
        X=x_num_minmax,
        y=y_num,
        model=model_config['model'],
        param_grid=model_config['param_grid'],
        outer_cv=5,
        inner_cv=3,
        model_name=model_name,
    )

    results[model_name] = dict(
        scores=scores,
        params=params,
        mean_mae=np.mean(scores),
        std_mae=np.std(scores),
    )

    print(f"{model_name} - Mean MAE: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

# Summary table: one line per model.
print(f"\n{'='*60}")
print("FINAL RESULTS SUMMARY")
print(f"{'='*60}")

for model_name, result in results.items():
    print(f"{model_name:20} | MAE: {result['mean_mae']:.4f} ± {result['std_mae']:.4f}")

# Best model by mean MAE, leaving the dummy baseline out of the ranking.
real_models = {
    name: entry for name, entry in results.items() if name != 'DummyRegressor'
}
best_model_name = min(real_models, key=lambda name: real_models[name]['mean_mae'])
best_result = real_models[best_model_name]

print(f"\nBest Real Model: {best_model_name}")
print(f"Best MAE: {best_result['mean_mae']:.4f} ± {best_result['std_mae']:.4f}")

# Relative gain of the best model over the dummy baseline.
dummy_mae = results['DummyRegressor']['mean_mae']
print(f"\nDummy Regressor Baseline MAE: {dummy_mae:.4f}")
print(f"Improvement over dummy: {((dummy_mae - best_result['mean_mae']) / dummy_mae * 100):.2f}%")


Evaluating DummyRegressor
DummyRegressor - Fold 1: MAE = 15.9833
DummyRegressor - Fold 2: MAE = 16.1278
DummyRegressor - Fold 3: MAE = 16.1233
DummyRegressor - Fold 4: MAE = 16.1600
DummyRegressor - Fold 5: MAE = 15.0933
DummyRegressor - Mean MAE: 15.8976 ± 0.4067

Evaluating SGDRegressor
SGDRegressor - Finding best parameters using inner CV...
SGDRegressor - Best parameters found: {'alpha': 0.0001, 'eta0': 0.01, 'l1_ratio': 0.15, 'learning_rate': 'invscaling', 'max_iter': 1000, 'penalty': 'l1'}
SGDRegressor - Fold 1: MAE = 12.3291
SGDRegressor - Fold 2: MAE = 12.3211
SGDRegressor - Fold 3: MAE = 12.1288
SGDRegressor - Fold 4: MAE = 12.1639
SGDRegressor - Fold 5: MAE = 12.2686
SGDRegressor - Mean MAE: 12.2423 ± 0.0818

Evaluating KNeighborsRegressor
KNeighborsRegressor - Finding best parameters using inner CV...
KNeighborsRegressor - Best parameters found: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}
KNeighborsRegressor - Fold 1: MAE = 11.6933
KNeighborsRegressor -

STANDARD SCALING - CHOSEN PARAMETERS

In [9]:
results = {}

# Evaluate the hand-picked configurations on the standard-scaled features.
cv_results = nested_cv_no_tuning(
    X=x_num_stan,
    y=y_num,
    models_dict=hp_models,
    outer_cv=5,
    random_state=42,
)

# Keep the detailed per-model results under the shared name as well.
results = cv_results

# Summary table: one line per model.
print(f"\n{'='*60}")
print("FINAL RESULTS SUMMARY")
print(f"{'='*60}")

for model_name, result in cv_results.items():
    print(f"{model_name:20} | MAE: {result['mean_mae']:.4f} ± {result['std_mae']:.4f}")

# Best model by mean MAE, leaving the dummy baseline out of the ranking.
real_models = {
    name: entry for name, entry in results.items() if name != 'DummyRegressor'
}
best_model_name = min(real_models, key=lambda name: real_models[name]['mean_mae'])
best_result = real_models[best_model_name]

print(f"\nBest Real Model: {best_model_name}")
print(f"Best MAE: {best_result['mean_mae']:.4f} ± {best_result['std_mae']:.4f}")

# Relative gain of the best model over the dummy baseline.
dummy_mae = results['DummyRegressor']['mean_mae']
print(f"\nDummy Regressor Baseline MAE: {dummy_mae:.4f}")
print(f"Improvement over dummy: {((dummy_mae - best_result['mean_mae']) / dummy_mae * 100):.2f}%")


DummyRegressor - Starting outer CV...
DummyRegressor - Fold 1: MAE = 15.9833
DummyRegressor - Fold 2: MAE = 16.1278
DummyRegressor - Fold 3: MAE = 16.1233
DummyRegressor - Fold 4: MAE = 16.1600
DummyRegressor - Fold 5: MAE = 15.0933
DummyRegressor - Average MAE: 15.8976 ± 0.4067

SGDRegressor - Starting outer CV...
SGDRegressor - Fold 1: MAE = 12.6406
SGDRegressor - Fold 2: MAE = 12.5829
SGDRegressor - Fold 3: MAE = 12.0241
SGDRegressor - Fold 4: MAE = 12.6157
SGDRegressor - Fold 5: MAE = 12.9282
SGDRegressor - Average MAE: 12.5583 ± 0.2942

KNeighborsRegressor - Starting outer CV...
KNeighborsRegressor - Fold 1: MAE = 11.9193
KNeighborsRegressor - Fold 2: MAE = 11.9647
KNeighborsRegressor - Fold 3: MAE = 11.5840
KNeighborsRegressor - Fold 4: MAE = 12.0538
KNeighborsRegressor - Fold 5: MAE = 11.5587
KNeighborsRegressor - Average MAE: 11.8161 ± 0.2046

RandomForestRegressor - Starting outer CV...
RandomForestRegressor - Fold 1: MAE = 11.5255
RandomForestRegressor - Fold 2: MAE = 11.452

MINMAX SCALING - CHOSEN PARAMETERS

In [10]:
results = {}

# Evaluate the hand-picked configurations on the min-max-scaled features.
cv_results = nested_cv_no_tuning(
    X=x_num_minmax,
    y=y_num,
    models_dict=hp_models,
    outer_cv=5,
    random_state=42,
)

# Keep the detailed per-model results under the shared name as well.
results = cv_results

# Summary table: one line per model.
print(f"\n{'='*60}")
print("FINAL RESULTS SUMMARY")
print(f"{'='*60}")

for model_name, result in cv_results.items():
    print(f"{model_name:20} | MAE: {result['mean_mae']:.4f} ± {result['std_mae']:.4f}")

# Best model by mean MAE, leaving the dummy baseline out of the ranking.
real_models = {
    name: entry for name, entry in results.items() if name != 'DummyRegressor'
}
best_model_name = min(real_models, key=lambda name: real_models[name]['mean_mae'])
best_result = real_models[best_model_name]

print(f"\nBest Real Model: {best_model_name}")
print(f"Best MAE: {best_result['mean_mae']:.4f} ± {best_result['std_mae']:.4f}")

# Relative gain of the best model over the dummy baseline.
dummy_mae = results['DummyRegressor']['mean_mae']
print(f"\nDummy Regressor Baseline MAE: {dummy_mae:.4f}")
print(f"Improvement over dummy: {((dummy_mae - best_result['mean_mae']) / dummy_mae * 100):.2f}%")


DummyRegressor - Starting outer CV...
DummyRegressor - Fold 1: MAE = 15.9833
DummyRegressor - Fold 2: MAE = 16.1278
DummyRegressor - Fold 3: MAE = 16.1233
DummyRegressor - Fold 4: MAE = 16.1600
DummyRegressor - Fold 5: MAE = 15.0933
DummyRegressor - Average MAE: 15.8976 ± 0.4067

SGDRegressor - Starting outer CV...
SGDRegressor - Fold 1: MAE = 12.3968
SGDRegressor - Fold 2: MAE = 12.3582
SGDRegressor - Fold 3: MAE = 12.2589
SGDRegressor - Fold 4: MAE = 12.3894
SGDRegressor - Fold 5: MAE = 12.2609
SGDRegressor - Average MAE: 12.3329 ± 0.0609

KNeighborsRegressor - Starting outer CV...
KNeighborsRegressor - Fold 1: MAE = 11.9427
KNeighborsRegressor - Fold 2: MAE = 11.9551
KNeighborsRegressor - Fold 3: MAE = 11.5511
KNeighborsRegressor - Fold 4: MAE = 12.2478
KNeighborsRegressor - Fold 5: MAE = 11.7813
KNeighborsRegressor - Average MAE: 11.8956 ± 0.2287

RandomForestRegressor - Starting outer CV...
RandomForestRegressor - Fold 1: MAE = 11.5110
RandomForestRegressor - Fold 2: MAE = 11.465