# Step 1: Set Up Your Environment
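Prepare your workspace: import the libraries, load the Medical Insurance Dataset from OpenML, one-hot encode the categorical features, and hold out 20% of the rows as a test set.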

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.datasets import fetch_openml

# Fetch the Medical Insurance Dataset from OpenML (data_id 45064) and
# separate the feature table from the target column.
dataset = fetch_openml(data_id=45064)
X = dataset.data
y = dataset.target

# One-hot encode the categorical columns; drop_first=True drops the first
# level of each categorical so the dummy columns are not redundant.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Summarise the split and list the encoded feature names.
n_train, n_features = X_train.shape
n_test = X_test.shape[0]

print("\nDataset Information:")
print(f"Training samples: {n_train}")
print(f"Testing samples: {n_test}")
print(f"Features: {n_features}")
print("\nFeatures included:")
print(X.columns.tolist())


Dataset Information:
Training samples: 18838
Testing samples: 4710
Features: 66

Features included:
['Upper_Age', 'Lower_Age', 'Reco_Policy_Premium', 'City_Code_C10', 'City_Code_C11', 'City_Code_C12', 'City_Code_C13', 'City_Code_C14', 'City_Code_C15', 'City_Code_C16', 'City_Code_C17', 'City_Code_C18', 'City_Code_C19', 'City_Code_C2', 'City_Code_C20', 'City_Code_C21', 'City_Code_C22', 'City_Code_C23', 'City_Code_C24', 'City_Code_C25', 'City_Code_C26', 'City_Code_C27', 'City_Code_C28', 'City_Code_C29', 'City_Code_C3', 'City_Code_C30', 'City_Code_C31', 'City_Code_C32', 'City_Code_C33', 'City_Code_C34', 'City_Code_C35', 'City_Code_C36', 'City_Code_C4', 'City_Code_C5', 'City_Code_C6', 'City_Code_C7', 'City_Code_C8', 'City_Code_C9', 'Accomodation_Type_Rented', 'Reco_Insurance_Type_Joint', 'Is_Spouse_Yes', 'Health Indicator_X2', 'Health Indicator_X3', 'Health Indicator_X4', 'Health Indicator_X5', 'Health Indicator_X6', 'Health Indicator_X7', 'Health Indicator_X8', 'Health Indicator_X9', 'Hol

# Step 2: Initialize Your Ensemble Models
Set up the two ensemble classifiers to compare: a Random Forest and a Gradient Boosting model, each with 100 estimators and a fixed random seed.

In [None]:
# Build the two ensemble classifiers that the later cells compare.
# Both use 100 estimators and a fixed seed so results are repeatable.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
gradient_boosting = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

# Keyed by display name; insertion order determines the order of later loops.
models = {
    'Random Forest': random_forest,
    'Gradient Boosting': gradient_boosting,
}

print("\nInitialized Models:")
for label in models.keys():
    print(f"- {label}")


Initialized Models:
- Random Forest
- Gradient Boosting


# Step 3: Perform Cross-Validation
Evaluate both models with shuffled 5-fold cross-validation on the training set only, so the test set stays untouched until the final comparison.

In [None]:
# Set up cross-validation: 5 shuffled folds with a fixed seed so every model
# is scored on exactly the same train/validation partitions.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation for each model.
# Both entries in `models` are classifiers, so they are scored with accuracy
# (the same metric used for the held-out test set) instead of mean squared
# error, which is a regression metric and is not meaningful on class labels.
cv_results = {}
for name, model in models.items():
    scores = cross_val_score(
        model, X_train, y_train,
        cv=kf,
        scoring='accuracy'
    )
    cv_results[name] = scores  # One accuracy value per fold (higher is better)

    print(f"\n{name} Cross-Validation Results:")
    print(f"Mean Accuracy: {np.mean(cv_results[name]):.2f}")
    print(f"Std Accuracy: {np.std(cv_results[name]):.2f}")


Random Forest Cross-Validation Results:
Mean MSE: 0.26
Std MSE: 0.00

Gradient Boosting Cross-Validation Results:
Mean MSE: 0.24
Std MSE: 0.00


# Step 4: Interpret and Decide
Refit both models on the full training set and compare their accuracy on the held-out test set.

In [None]:
# Refit every model on the full training set (Random Forest first, then
# Gradient Boosting, following the dict's insertion order).
for estimator in models.values():
    estimator.fit(X_train, y_train)

# Predict on the held-out test set with each fitted model.
y_hat_test_rf, y_hat_test_gb = (
    models[label].predict(X_test)
    for label in ('Random Forest', 'Gradient Boosting')
)

# Fraction of test rows whose predicted class matches the true class.
rf_acc, gb_acc = (
    accuracy_score(y_test, y_hat)
    for y_hat in (y_hat_test_rf, y_hat_test_gb)
)

print(f"\nRandom Forest Test Accuracy: {rf_acc:.2f}")
print(f"Gradient Boosting Test Accuracy: {gb_acc:.2f}")


Random Forest Test Accuracy: 0.75
Gradient Boosting Test Accuracy: 0.76
