In [61]:
'''Imports'''
import numpy as np
import pandas as pd
# scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# regressors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import  KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.base import clone
import warnings
import matplotlib.pyplot as plt
import re
randomstate = 42
warnings.filterwarnings('ignore')

In [83]:
'''Pre-Processing'''
data = pd.read_csv('health_insurance_train.csv')

# Numerical Data
num_clean = ["race", "region",]
num_data = data.drop(num_clean, axis=1)
num_data = num_data.replace({'yes': 1, 'no': 0})
num_data = num_data.replace('', np.nan)
def convert_edu_years(age_str):
    if pd.isna(age_str) or age_str == '':
        return np.nan
    age_str = str(age_str)
    pattern = r'(\d+)(?:-(\d+))?years'
    match = re.search(pattern, age_str)
    if match:
        if match.group(2):  # If there's a second number (range format)
            num1 = float(match.group(1))
            num2 = float(match.group(2))
            return (num1 + num2) / 2  # Return average
        else:  # Single number format
            return float(match.group(1))
    return np.nan
num_data['education'] = num_data['education'].apply(convert_edu_years)
num_data = num_data.dropna()

# Decision Tree
dec_data = data
dec_data = dec_data.drop("whrswk", axis=1)
desired_order = ['other', 'south', 'west', 'northcentral']
dec_data['region'] = pd.Categorical(dec_data['region'], categories=desired_order,ordered=True)
dec_data = pd.get_dummies(dec_data, columns=['hhi', 'whi', 'hhi2', 'hispanic', 'region'], drop_first=True, dtype=int)
dec_data = dec_data.replace('', np.nan)
dec_data['education'] = dec_data['education'].apply(convert_edu_years)
dec_data = dec_data.dropna()

stanscaler = StandardScaler()
minmaxscaler = MinMaxScaler()

y_num = num_data["whrswk"].to_numpy()
num_data = num_data.drop("whrswk", axis=1).to_numpy()
x_num = num_data
stanscaler.fit(x_num)
x_num_stan = stanscaler.transform(x_num)
minmaxscaler.fit(x_num)
x_num_minmax = minmaxscaler.transform(x_num)

x_dec = dec_data

x_num.shape


(4500, 9)

In [70]:
def nested_cv(X, y, model, param_grid, outer_cv=5, inner_cv=5, model_name=""):
    """
    Nested cross-validation using MAE as scoring metric
    """
    outer_scores = []
    best_params_list = []
    
    # Create MAE scorer (lower is better)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    
    # Convert to numpy arrays to avoid pandas/sklearn compatibility issues
    X = np.array(X)
    y = np.array(y)
    
    # Outer CV loop
    outer_cv = KFold(n_splits=outer_cv, shuffle=True, random_state=42)
    
    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
        X_train_outer, X_test_outer = X[train_idx], X[test_idx]
        y_train_outer, y_test_outer = y[train_idx], y[test_idx]
        
        # For DummyRegressor or models without hyperparameters, skip inner CV
        if model_name == "DummyRegressor" or not param_grid:
            current_model = clone(model)  # Use clone to avoid modifying original
            current_model.fit(X_train_outer, y_train_outer)
            y_pred = current_model.predict(X_test_outer)
            mae_score = mean_absolute_error(y_test_outer, y_pred)
            best_params = {}
        else:
            # Inner CV for hyperparameter tuning with MAE
            grid_search = GridSearchCV(
                estimator=clone(model),  # Use clone to avoid contamination
                param_grid=param_grid,
                cv=inner_cv,  # Pass integer directly
                scoring=mae_scorer,
                n_jobs=-1
                # Removed random_state parameter
            )
            
            # Fit on outer training fold
            grid_search.fit(X_train_outer, y_train_outer)
            
            # Get best model and evaluate on outer test fold using MAE
            best_model = grid_search.best_estimator_
            y_pred = best_model.predict(X_test_outer)
            mae_score = mean_absolute_error(y_test_outer, y_pred)
            best_params = grid_search.best_params_
        
        outer_scores.append(mae_score)
        best_params_list.append(best_params)
        
        print(f"{model_name} - Fold {fold + 1}: MAE = {mae_score:.4f}, Best params: {best_params}")
    
    return outer_scores, best_params_list

In [81]:
models = {
    'DummyRegressor': {
        'model': DummyRegressor(strategy='mean'),  # Can also try 'median'
        'param_grid': {}  # No hyperparameters to tune
    },
    'SGDRegressor': {
        'model': SGDRegressor(random_state=42),
        'param_grid': {
            'penalty': ['l2', 'l1', 'elasticnet'],
            'alpha': [0.0001, 0.001, 0.01, 0.1],
            'l1_ratio': [0.15, 0.3, 0.5, 0.7, 0.85],
            'learning_rate': ['constant', 'optimal', 'invscaling'],
            'eta0': [0.01, 0.1, 0.5],
            'max_iter': [1000, 2000]
        }
    },
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']
        }
    },
    'RandomForestRegressor': {
        'model': RandomForestRegressor(random_state=42),
        'param_grid': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'DecisionTreeRegressor': {
        'model': DecisionTreeRegressor(random_state=42),
        'param_grid': {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['auto', 'sqrt', 'log2']
        }
    }
}

STANDARD SCALING

In [82]:
results = {}

for model_name, model_config in models.items():
    print(f"\n{'='*50}")
    print(f"Evaluating {model_name}")
    print(f"{'='*50}")
    
    scores, params = nested_cv(
        x_num_stan, y_num, 
        model_config['model'], 
        model_config['param_grid'],
        outer_cv=5,
        inner_cv=3,
        model_name=model_name
    )
    
    results[model_name] = {
        'scores': scores,
        'params': params,
        'mean_mae': np.mean(scores),
        'std_mae': np.std(scores)
    }
    
    print(f"{model_name} - Mean MAE: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

# Print summary results
print(f"\n{'='*60}")
print("FINAL RESULTS SUMMARY")
print(f"{'='*60}")

for model_name, result in results.items():
    print(f"{model_name:20} | MAE: {result['mean_mae']:.4f} ± {result['std_mae']:.4f}")

# Find best model (excluding dummy)
real_models = {k: v for k, v in results.items() if k != 'DummyRegressor'}
best_model_name = min(real_models.keys(), key=lambda x: real_models[x]['mean_mae'])
best_result = real_models[best_model_name]

print(f"\nBest Real Model: {best_model_name}")
print(f"Best MAE: {best_result['mean_mae']:.4f} ± {best_result['std_mae']:.4f}")

# Compare with dummy baseline
dummy_mae = results['DummyRegressor']['mean_mae']
print(f"\nDummy Regressor Baseline MAE: {dummy_mae:.4f}")
print(f"Improvement over dummy: {((dummy_mae - best_result['mean_mae']) / dummy_mae * 100):.2f}%")

# Display best parameters for each model
print(f"\n{'='*60}")
print("BEST PARAMETERS FOR EACH MODEL")
print(f"{'='*60}")

for model_name, result in results.items():
    if model_name != 'DummyRegressor' and result['params']:
        print(f"\n{model_name}:")
        # Find the most frequent best parameters across folds
        from collections import Counter
        
        # Convert parameters to tuples for counting
        param_tuples = [tuple(sorted(params.items())) for params in result['params']]
        param_counts = Counter(param_tuples)
        
        # Get the most common parameter set
        most_common_params_tuple = param_counts.most_common(1)[0][0]
        most_common_params = dict(most_common_params_tuple)
        
        print(f"  Most frequent best parameters:")
        for param, value in most_common_params.items():
            print(f"    {param}: {value}")
        
        # Also show parameter frequency distribution
        print(f"  Parameter frequency: {param_counts.most_common()}")
    elif model_name == 'DummyRegressor':
        print(f"\n{model_name}: No tunable parameters")
    else:
        print(f"\n{model_name}: No hyperparameters tuned")

# Display the best parameters for the winning model specifically
print(f"\n{'='*60}")
print("BEST PARAMETERS FOR WINNING MODEL")
print(f"{'='*60}")
print(f"Model: {best_model_name}")

if best_result['params']:
    from collections import Counter
    
    param_tuples = [tuple(sorted(params.items())) for params in best_result['params']]
    param_counts = Counter(param_tuples)
    
    most_common_params_tuple = param_counts.most_common(1)[0][0]
    most_common_params = dict(most_common_params_tuple)
    
    print("Recommended parameters for final model:")
    for param, value in most_common_params.items():
        print(f"  {param}: {value}")
    
    # Show how consistent the parameter selection was
    consistency = param_counts.most_common(1)[0][1] / len(best_result['params']) * 100
    print(f"Parameter consistency across folds: {consistency:.1f}%")
else:
    print("No hyperparameters to tune for this model")


Evaluating DummyRegressor
DummyRegressor - Fold 1: MAE = 17.0528, Best params: {}
DummyRegressor - Fold 2: MAE = 16.7509, Best params: {}
DummyRegressor - Fold 3: MAE = 17.0202, Best params: {}
DummyRegressor - Fold 4: MAE = 17.3073, Best params: {}
DummyRegressor - Fold 5: MAE = 16.6741, Best params: {}
DummyRegressor - Mean MAE: 16.9611 ± 0.2273

Evaluating SGDRegressor
SGDRegressor - Fold 1: MAE = 12.5225, Best params: {'alpha': 0.001, 'eta0': 0.1, 'l1_ratio': 0.5, 'learning_rate': 'invscaling', 'max_iter': 1000, 'penalty': 'elasticnet'}
SGDRegressor - Fold 2: MAE = 12.3470, Best params: {'alpha': 0.1, 'eta0': 0.01, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'max_iter': 1000, 'penalty': 'l1'}
SGDRegressor - Fold 3: MAE = 12.2013, Best params: {'alpha': 0.001, 'eta0': 0.01, 'l1_ratio': 0.15, 'learning_rate': 'invscaling', 'max_iter': 1000, 'penalty': 'l1'}
SGDRegressor - Fold 4: MAE = 12.2181, Best params: {'alpha': 0.01, 'eta0': 0.01, 'l1_ratio': 0.15, 'learning_rate': 'optimal'

In [None]:
'''Plotter'''
def plot(x, y, title="Line Plot", xlabel="X", ylabel="Y"):
    plt.figure(figsize=(10, 6))
    plt.plot(x, y, alpha=0.7)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()

MINMAX SCALING

In [84]:
results = {}

for model_name, model_config in models.items():
    print(f"\n{'='*50}")
    print(f"Evaluating {model_name}")
    print(f"{'='*50}")
    
    scores, params = nested_cv(
        x_num_minmax, y_num, 
        model_config['model'], 
        model_config['param_grid'],
        outer_cv=5,
        inner_cv=3,
        model_name=model_name
    )
    
    results[model_name] = {
        'scores': scores,
        'params': params,
        'mean_mae': np.mean(scores),
        'std_mae': np.std(scores)
    }
    
    print(f"{model_name} - Mean MAE: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

# Print summary results
print(f"\n{'='*60}")
print("FINAL RESULTS SUMMARY")
print(f"{'='*60}")

for model_name, result in results.items():
    print(f"{model_name:20} | MAE: {result['mean_mae']:.4f} ± {result['std_mae']:.4f}")

# Find best model (excluding dummy)
real_models = {k: v for k, v in results.items() if k != 'DummyRegressor'}
best_model_name = min(real_models.keys(), key=lambda x: real_models[x]['mean_mae'])
best_result = real_models[best_model_name]

print(f"\nBest Real Model: {best_model_name}")
print(f"Best MAE: {best_result['mean_mae']:.4f} ± {best_result['std_mae']:.4f}")

# Compare with dummy baseline
dummy_mae = results['DummyRegressor']['mean_mae']
print(f"\nDummy Regressor Baseline MAE: {dummy_mae:.4f}")
print(f"Improvement over dummy: {((dummy_mae - best_result['mean_mae']) / dummy_mae * 100):.2f}%")

# Display best parameters for each model
print(f"\n{'='*60}")
print("BEST PARAMETERS FOR EACH MODEL")
print(f"{'='*60}")

for model_name, result in results.items():
    if model_name != 'DummyRegressor' and result['params']:
        print(f"\n{model_name}:")
        # Find the most frequent best parameters across folds
        from collections import Counter
        
        # Convert parameters to tuples for counting
        param_tuples = [tuple(sorted(params.items())) for params in result['params']]
        param_counts = Counter(param_tuples)
        
        # Get the most common parameter set
        most_common_params_tuple = param_counts.most_common(1)[0][0]
        most_common_params = dict(most_common_params_tuple)
        
        print(f"  Most frequent best parameters:")
        for param, value in most_common_params.items():
            print(f"    {param}: {value}")
        
        # Also show parameter frequency distribution
        print(f"  Parameter frequency: {param_counts.most_common()}")
    elif model_name == 'DummyRegressor':
        print(f"\n{model_name}: No tunable parameters")
    else:
        print(f"\n{model_name}: No hyperparameters tuned")

# Display the best parameters for the winning model specifically
print(f"\n{'='*60}")
print("BEST PARAMETERS FOR WINNING MODEL")
print(f"{'='*60}")
print(f"Model: {best_model_name}")

if best_result['params']:
    from collections import Counter
    
    param_tuples = [tuple(sorted(params.items())) for params in best_result['params']]
    param_counts = Counter(param_tuples)
    
    most_common_params_tuple = param_counts.most_common(1)[0][0]
    most_common_params = dict(most_common_params_tuple)
    
    print("Recommended parameters for final model:")
    for param, value in most_common_params.items():
        print(f"  {param}: {value}")
    
    # Show how consistent the parameter selection was
    consistency = param_counts.most_common(1)[0][1] / len(best_result['params']) * 100
    print(f"Parameter consistency across folds: {consistency:.1f}%")
else:
    print("No hyperparameters to tune for this model")


Evaluating DummyRegressor
DummyRegressor - Fold 1: MAE = 17.0528, Best params: {}
DummyRegressor - Fold 2: MAE = 16.7509, Best params: {}
DummyRegressor - Fold 3: MAE = 17.0202, Best params: {}
DummyRegressor - Fold 4: MAE = 17.3073, Best params: {}
DummyRegressor - Fold 5: MAE = 16.6741, Best params: {}
DummyRegressor - Mean MAE: 16.9611 ± 0.2273

Evaluating SGDRegressor
SGDRegressor - Fold 1: MAE = 12.4010, Best params: {'alpha': 0.01, 'eta0': 0.01, 'l1_ratio': 0.15, 'learning_rate': 'constant', 'max_iter': 1000, 'penalty': 'l1'}
SGDRegressor - Fold 2: MAE = 12.2894, Best params: {'alpha': 0.0001, 'eta0': 0.01, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'max_iter': 1000, 'penalty': 'elasticnet'}
SGDRegressor - Fold 3: MAE = 12.1288, Best params: {'alpha': 0.0001, 'eta0': 0.01, 'l1_ratio': 0.15, 'learning_rate': 'invscaling', 'max_iter': 1000, 'penalty': 'l1'}
SGDRegressor - Fold 4: MAE = 12.1639, Best params: {'alpha': 0.0001, 'eta0': 0.01, 'l1_ratio': 0.15, 'learning_rate': 'inv