In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

In [4]:
# Load the insurance dataset from a CSV file in the working directory.
df = pd.read_csv("insurance.csv")

In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
# Count missing values in every column (isna is the pandas alias of isnull).
missing_per_column = df.isna().sum()
missing_per_column

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [7]:
# Frequency of each value in the `sex` column.
sex_counts = df['sex'].value_counts()
sex_counts

sex
male      676
female    662
Name: count, dtype: int64

In [8]:
# Frequency of each value in the `region` column.
region_counts = df['region'].value_counts()
region_counts

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [9]:
# Separate the regression target (`charges`) from the predictor columns.
y = df['charges']
X = df.drop(columns='charges')

In [10]:
# Partition the predictor columns by dtype: object-typed columns are handled
# as categorical, every other column as numeric.
categorical_features = list(X.select_dtypes(include='object').columns)
numeric_features = list(X.select_dtypes(exclude='object').columns)

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])

In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in categorical_features:
    X[col] = le.fit_transform(X[col])

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)

In [14]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [15]:
# Baseline regressors with otherwise-default hyperparameters, keyed by the
# display name used in the printed report.
# The three scikit-learn ensemble models are now seeded (random_state=42, the
# same seed already used for XGBoost and the train/test split) so that
# re-running the notebook reproduces the same scores.
models = {
    "Linear Regression": LinearRegression(),
    "Ada boost": AdaBoostRegressor(random_state=42),
    "Gradient boost": GradientBoostingRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "SVR": SVR(),
    "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
}
def evaluate_model(y_true, y_pred):
    """Compute regression error metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error and
        coefficient of determination.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # Take the square root of the MSE explicitly instead of passing
    # `squared=False`: that keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6, where it raises TypeError. This form works on every
    # version and gives the same value for a single-output target.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2
# Fit each baseline model on the training split and print its MAE / RMSE / R2
# for both the training and the test split.
for label, regressor in models.items():
    regressor.fit(X_train, y_train)

    # Predict on both splits, then score each set of predictions.
    fitted_on_train = regressor.predict(X_train)
    fitted_on_test = regressor.predict(X_test)
    train_scores = evaluate_model(y_train, fitted_on_train)
    test_scores = evaluate_model(y_test, fitted_on_test)

    print(f"Model: {label}")
    for heading, (mae, rmse, r2) in (
        ("Training set performance:", train_scores),
        ("Test set performance:", test_scores),
    ):
        print(heading)
        print(f"  MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")
    print("=" * 40)




Model: Linear Regression
Training set performance:
  MAE: 4208.7620, RMSE: 6105.7893, R2: 0.7417
Test set performance:
  MAE: 4186.5089, RMSE: 5799.5871, R2: 0.7833
Model: Ada boost
Training set performance:
  MAE: 4080.1212, RMSE: 5073.6187, R2: 0.8217
Test set performance:
  MAE: 4234.0053, RMSE: 5203.3233, R2: 0.8256




Model: Gradient boost
Training set performance:
  MAE: 2101.3617, RMSE: 3836.0650, R2: 0.8980
Test set performance:
  MAE: 2448.3433, RMSE: 4358.6350, R2: 0.8776




Model: Random Forest
Training set performance:
  MAE: 1043.7739, RMSE: 1903.5861, R2: 0.9749
Test set performance:
  MAE: 2472.7742, RMSE: 4559.6840, R2: 0.8661




Model: SVR
Training set performance:
  MAE: 8253.1570, RMSE: 12580.6259, R2: -0.0966
Test set performance:
  MAE: 8607.9944, RMSE: 12898.6148, R2: -0.0717
Model: XGBoost
Training set performance:
  MAE: 499.3392, RMSE: 919.7656, R2: 0.9941
Test set performance:
  MAE: 2791.8325, RMSE: 4822.9912, R2: 0.8502
Model: Lasso
Training set performance:
  MAE: 4208.9377, RMSE: 6105.7905, R2: 0.7417
Test set performance:
  MAE: 4187.0149, RMSE: 5799.9428, R2: 0.7833
Model: Ridge
Training set performance:
  MAE: 4218.0223, RMSE: 6106.0360, R2: 0.7417
Test set performance:
  MAE: 4197.7039, RMSE: 5803.0863, R2: 0.7831




In [24]:
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor

# --- Gradient Boosting ---
# Candidate values sampled by the randomized search for GradientBoostingRegressor.
param_gb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 4],
    subsample=[0.8, 1.0],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# --- AdaBoost (the weak learner is passed as 'estimator', not 'base_estimator') ---
# Candidate values for AdaBoostRegressor; the weak learner is a decision tree
# of depth 2, 3 or 4.
param_ada = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.05, 0.1, 0.5, 1.0],
    estimator=[
        DecisionTreeRegressor(max_depth=2),
        DecisionTreeRegressor(max_depth=3),
        DecisionTreeRegressor(max_depth=4),
    ],
)

# --- XGBoost ---
# Candidate values sampled by the randomized search for XGBRegressor.
param_xgb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 4, 5],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    reg_lambda=[1, 1.5],
    reg_alpha=[0, 0.1],
)


In [25]:
# --- Model Dictionary ---
# Each entry maps a display name to (unfitted seeded estimator, search space).
randomcv_models = {}
randomcv_models["Gradient Boosting"] = (
    GradientBoostingRegressor(random_state=42),
    param_gb,
)
randomcv_models["AdaBoost"] = (
    AdaBoostRegressor(random_state=42),
    param_ada,
)
randomcv_models["XGBoost"] = (
    XGBRegressor(objective='reg:squarederror', random_state=42),
    param_xgb,
)

In [26]:
# --- Randomized Search ---
from sklearn.model_selection import RandomizedSearchCV

# Keep every fitted search, keyed by model name, so the winning estimator and
# parameters (`.best_estimator_`, `.best_params_`) can be reused from code
# instead of being copied by hand from the printed output. Previously each
# search object was overwritten by the next loop iteration.
tuned_searches = {}

for name, (model, params) in randomcv_models.items():
    # The bullet was stored mis-encoded (UTF-8 bytes decoded as cp1252);
    # restored to the intended character.
    print(f"\n🔹 Tuning {name}...")
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=10,          # number of parameter settings sampled per model
        scoring='r2',
        cv=3,
        random_state=42,    # fixes which settings are sampled
        n_jobs=-1,          # use all available cores
        verbose=1
    )
    random_search.fit(X_train, y_train)
    tuned_searches[name] = random_search
    print(f"Best R2 Score: {random_search.best_score_:.4f}")
    print("Best Params:", random_search.best_params_)


🔹 Tuning Gradient Boosting...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best R2 Score: 0.8516
Best Params: {'subsample': 0.8, 'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 3, 'learning_rate': 0.05}

🔹 Tuning AdaBoost...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best R2 Score: 0.8320
Best Params: {'n_estimators': 50, 'learning_rate': 0.05, 'estimator': DecisionTreeRegressor(max_depth=3)}

🔹 Tuning XGBoost...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best R2 Score: 0.8476
Best Params: {'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.8}


In [27]:
# --- Models with tuned hyperparameters ---
# Hyperparameters are written out explicitly per model and unpacked into the
# constructors; every estimator keeps random_state=42.
gb_settings = {
    'n_estimators': 100,
    'max_depth': 3,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
}
ada_settings = {
    'n_estimators': 50,
    'learning_rate': 0.05,
    'estimator': DecisionTreeRegressor(max_depth=3),
}
xgb_settings = {
    'n_estimators': 200,
    'max_depth': 3,
    'learning_rate': 0.05,
    'colsample_bytree': 0.8,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'objective': 'reg:squarederror',
}

models = {
    "Gradient Boosting": GradientBoostingRegressor(random_state=42, **gb_settings),
    "AdaBoost": AdaBoostRegressor(random_state=42, **ada_settings),
    "XGBoost": XGBRegressor(random_state=42, **xgb_settings),
}

# --- Evaluation loop ---
# Fit each tuned model on the training split and print MAE / RMSE / R2 for
# both the training and the test split.
for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate performance with evaluate_model, which returns (mae, rmse, r2)
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    # Print results. The bullet was stored mis-encoded (UTF-8 bytes decoded as
    # cp1252); restored to the intended character.
    print(f"\n🔹 {name}")
    print('Training set performance:')
    print(f"- MAE: {model_train_mae:.4f}")
    print(f"- RMSE: {model_train_rmse:.4f}")
    print(f"- R2: {model_train_r2:.4f}")

    print('Test set performance:')
    print(f"- MAE: {model_test_mae:.4f}")
    print(f"- RMSE: {model_test_rmse:.4f}")
    print(f"- R2: {model_test_r2:.4f}")
    print('='*50)





🔹 Gradient Boosting
Training set performance:
- MAE: 2302.8152
- RMSE: 4087.5939
- R2: 0.8842
Test set performance:
- MAE: 2500.2447
- RMSE: 4348.0266
- R2: 0.8782





🔹 AdaBoost
Training set performance:
- MAE: 3435.1014
- RMSE: 4629.9106
- R2: 0.8515
Test set performance:
- MAE: 3668.5607
- RMSE: 4925.1315
- R2: 0.8438

🔹 XGBoost
Training set performance:
- MAE: 2181.4841
- RMSE: 3982.9656
- R2: 0.8901
Test set performance:
- MAE: 2411.2004
- RMSE: 4257.2020
- R2: 0.8833




## XGBoost is the best model here:

Highest test R²

Lowest MAE and RMSE

Good generalization (train-test gap small)