In [None]:
'''Imports'''
import numpy as np
import pandas as pd
# scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# regressors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor

from sklearn.model_selection import  KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer
import warnings
import matplotlib.pyplot as plt
import re
# Shared seed: passed as random_state to the outer KFold splitter and to the
# RandomForest / DecisionTree estimators further down.
randomstate = 42
# NOTE(review): this silences every warning category for the whole notebook,
# which can also hide fit-failure or convergence warnings raised during the
# grid search — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
'''Hyperparameters'''
# NOTE(review): none of these hp_* constants is referenced by the cells visible
# in this notebook (the model grids define their own candidate values).
# Presumably left over from manual tuning — confirm before relying on or
# removing them.
hp_nearest_neighbours = 20
hp_alpha = 0.01
hp_learning_rate = 0.001
hp_epochs = 30

In [None]:
'''Pre-Processing'''
# Raw training table; every later frame in this cell is derived from it.
data = pd.read_csv('health_insurance_train.csv')

# Numerical Data
# Columns excluded from the numeric feature frame (two categoricals and the target).
num_clean = ["race", "region", "whrswk"]
# Drop those columns, map the yes/no flags to 1/0 and turn empty strings into NaN.
num_data = (
    data.drop(columns=num_clean)
        .replace({'yes': 1, 'no': 0})
        .replace('', np.nan)
)
def convert_edu_years(age_str):
    """Turn an education label such as ``'12years'`` or ``'9-11years'`` into a float.

    Despite the parameter name, the value is applied to the ``education``
    column by the code that calls this function.

    Args:
        age_str: Raw cell value. Missing values and the empty string are
            treated as unknown; anything else is converted with ``str()``.

    Returns:
        float: The number found directly before ``years``. For a range
        (``'A-Byears'``) the mean of both bounds is returned. ``np.nan`` is
        returned when the value is missing or does not match the pattern.
    """
    # Missing or blank cells carry no information.
    if pd.isna(age_str) or age_str == '':
        return np.nan

    found = re.search(r'(\d+)(?:-(\d+))?years', str(age_str))
    if found is None:
        return np.nan

    low, high = found.groups()
    if not high:
        # Single number format
        return float(low)
    # Range format: use the midpoint of the two bounds.
    return (float(low) + float(high)) / 2
# Convert the textual education labels of the numeric frame to floats.
num_data['education'] = num_data['education'].apply(convert_edu_years)

# Decision Tree
# Feature frame with one-hot encoded categoricals (target column removed).
# NOTE(review): 'race' is excluded from the numeric set above but is neither
# dropped nor encoded here — confirm that the consumers of x_dec can handle it.
dec_data = data.drop("whrswk", axis=1)
desired_order = ['other', 'south', 'west', 'northcentral']
dec_data['region'] = pd.Categorical(dec_data['region'], categories=desired_order, ordered=True)
dec_data = pd.get_dummies(dec_data, columns=['hhi', 'whi', 'hhi2', 'hispanic', 'region'], drop_first=True, dtype=int)
dec_data = dec_data.replace('', np.nan)
dec_data['education'] = dec_data['education'].apply(convert_edu_years)

# Keep only the rows that are complete in BOTH feature frames and in the
# target. Dropping NaNs independently per frame (and taking y from the
# unfiltered table) would leave X and y with different lengths / row order.
complete_rows = (
    num_data.notna().all(axis=1)
    & dec_data.notna().all(axis=1)
    & data["whrswk"].notna()
)
num_data = num_data.loc[complete_rows].to_numpy(dtype=float)
dec_data = dec_data.loc[complete_rows]
y = data.loc[complete_rows, "whrswk"].to_numpy()

stanscaler = StandardScaler()
minmaxscaler = MinMaxScaler()

# The scalers must be fitted before they can transform; calling transform() on
# a fresh scaler raises NotFittedError.
# NOTE: fitting on the full data set before cross-validation lets held-out
# folds influence the scaling; wrap scaler + model in a Pipeline to avoid that.
x_num_stan = stanscaler.fit_transform(num_data)
x_num_minmax = minmaxscaler.fit_transform(num_data)
x_dec = dec_data

  num_data = num_data.replace({'yes': 1, 'no': 0})


In [29]:
def _take_rows(data, idx):
    """Select rows by integer position from a NumPy array or a pandas object."""
    return data.iloc[idx] if hasattr(data, "iloc") else data[idx]


def nested_cv(X, y, model, param_grid, outer_cv=5, inner_cv=5):
    """Estimate a model's MAE with nested cross-validation.

    The outer loop splits the data into train/test folds. On every outer
    training fold a grid search (inner loop) selects hyperparameters by
    mean absolute error, and the refitted best estimator is scored on the
    outer test fold.

    Args:
        X: Feature matrix; a NumPy array or a pandas DataFrame.
        y: Target values; a NumPy array or a pandas Series with as many
            rows as ``X``.
        model: Estimator handed to ``GridSearchCV``.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        outer_cv: Number of outer folds.
        inner_cv: Number of inner folds used for hyperparameter tuning.

    Returns:
        tuple: ``(outer_scores, best_params_list)`` — the MAE on each outer
        test fold and the best parameter dict chosen for that fold.
    """
    outer_scores = []
    best_params_list = []

    # MAE scorer (negated internally so that "greater is better" holds)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # Build both splitters once under their own names. Rebinding the
    # ``inner_cv`` argument to a KFold inside the loop made the second
    # iteration call KFold(n_splits=<KFold object>), which fails.
    outer_splitter = KFold(n_splits=outer_cv, shuffle=True, random_state=randomstate)
    inner_splitter = KFold(n_splits=inner_cv, shuffle=True, random_state=randomstate)

    for fold, (train_idx, test_idx) in enumerate(outer_splitter.split(X, y)):
        # Positional row selection that works for arrays and DataFrames alike.
        X_train_outer, X_test_outer = _take_rows(X, train_idx), _take_rows(X, test_idx)
        y_train_outer, y_test_outer = _take_rows(y, train_idx), _take_rows(y, test_idx)

        # Inner CV for hyperparameter tuning with MAE
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=inner_splitter,
            scoring=mae_scorer,
            n_jobs=-1
        )

        # Fit on outer training fold
        grid_search.fit(X_train_outer, y_train_outer)

        # Evaluate the refitted best estimator on the untouched outer test fold
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test_outer)
        mae_score = mean_absolute_error(y_test_outer, y_pred)

        outer_scores.append(mae_score)
        best_params_list.append(grid_search.best_params_)

        print(f"Fold {fold + 1}: MAE = {mae_score:.4f}, Best params: {grid_search.best_params_}")

    return outer_scores, best_params_list

In [None]:
# Candidate estimators and the hyperparameter grid searched for each one.
# Every entry maps a display name to {'model': estimator, 'param_grid': grid};
# the grid is passed unchanged to GridSearchCV.
models = {
    # Baseline that always predicts the training mean; nothing to tune.
    'DummyRegressor': {
        'model': DummyRegressor(strategy='mean'),
        'param_grid': {}
    },
    'SGDRegressor': {
        # Use the shared seed instead of a second hard-coded 42.
        'model': SGDRegressor(random_state=randomstate),
        # NOTE(review): l1_ratio only affects the 'elasticnet' penalty, so the
        # l1/l2 candidates are evaluated redundantly for each l1_ratio value —
        # consider splitting this into a list of grids to cut the search time.
        'param_grid': {
            'penalty': ['l2', 'l1', 'elasticnet'],
            'alpha': [0.0001, 0.001, 0.01, 0.1],
            'l1_ratio': [0.15, 0.3, 0.5, 0.7, 0.85],
            'learning_rate': ['constant', 'optimal', 'invscaling'],
            'eta0': [0.01, 0.1, 0.5],
            'max_iter': [1000, 2000]
        }
    },
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']
        }
    },
    'RandomForestRegressor': {
        'model': RandomForestRegressor(random_state=randomstate),
        'param_grid': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'DecisionTreeRegressor': {
        'model': DecisionTreeRegressor(random_state=randomstate),
        'param_grid': {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            # "Use all features" is the Python object None; the string 'none'
            # is not an accepted value and makes every such fit fail.
            'max_features': [None, 'sqrt', 'log2']
        }
    }
}

In [None]:
# Feature matrix shared by all models.
# NOTE(review): the original cell referenced an undefined `X`. The standardized
# numeric features are used here (scale-sensitive models need them; tree models
# are unaffected by scaling). Swap in x_num_minmax / x_dec if another feature
# set was intended.
X = x_num_stan

# Per-model outcome: fold scores, chosen parameters and summary statistics.
results = {}

for model_name, model_config in models.items():
    print(f"\n{'='*50}")
    print(f"Evaluating {model_name}")
    print(f"{'='*50}")

    # The helper defined in this notebook is `nested_cv`; it takes no
    # `model_name` argument (the name is already printed above).
    scores, params = nested_cv(
        X, y,
        model_config['model'],
        model_config['param_grid'],
        outer_cv=5,
        inner_cv=3
    )

    mean_mae = np.mean(scores)
    std_mae = np.std(scores)

    results[model_name] = {
        'scores': scores,
        'params': params,
        'mean_mae': mean_mae,
        'std_mae': std_mae
    }

    print(f"{model_name} - Mean MAE: {mean_mae:.4f} ± {std_mae:.4f}")

# Print summary results
print(f"\n{'='*60}")
print("FINAL RESULTS SUMMARY")
print(f"{'='*60}")

for model_name, result in results.items():
    print(f"{model_name:20} | MAE: {result['mean_mae']:.4f} ± {result['std_mae']:.4f}")

# Find best model (excluding dummy): lowest mean MAE wins
real_models = {k: v for k, v in results.items() if k != 'DummyRegressor'}
best_model_name = min(real_models, key=lambda name: real_models[name]['mean_mae'])
best_result = real_models[best_model_name]

print(f"\nBest Real Model: {best_model_name}")
print(f"Best MAE: {best_result['mean_mae']:.4f} ± {best_result['std_mae']:.4f}")

# Compare with dummy baseline
dummy_mae = results['DummyRegressor']['mean_mae']
print(f"\nDummy Regressor Baseline MAE: {dummy_mae:.4f}")
print(f"Improvement over dummy: {((dummy_mae - best_result['mean_mae']) / dummy_mae * 100):.2f}%")

In [None]:
'''Plotter'''
def plot(x, y, title="Line Plot", xlabel="X", ylabel="Y"):
    """Show a single line chart of ``y`` against ``x``.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        title: Figure title.
        xlabel: Label of the horizontal axis.
        ylabel: Label of the vertical axis.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(x, y, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    plt.show()