In [59]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [60]:
# Load dataset: read the insurance records from a CSV path relative to the working directory
DATA_PATH = 'insurance.csv'
df = pd.read_csv(DATA_PATH)

In [61]:
# Categorize BMI into four weight classes.
# Bins are left-closed (right=False), so the cut-offs behave like strict `<` tests:
#   bmi < 18.5 -> Underweight, < 25 -> Normal weight, < 30 -> Overweight, otherwise Obese.
# A missing BMI stays NaN instead of being mislabelled as 'Obese'.
# NOTE: this overwrites the numeric 'bmi' column in place, so reload the data
# before re-running this cell.
bmi_bins = [-np.inf, 18.5, 25, 30, np.inf]
bmi_labels = ['Underweight', 'Normal weight', 'Overweight', 'Obese']
# Store plain string labels rather than a Categorical
df['bmi'] = pd.cut(df['bmi'], bins=bmi_bins, labels=bmi_labels, right=False).astype(object)

In [62]:
# One-Hot Encoding: expand the frame's categorical columns into indicator columns
df = pd.get_dummies(data=df)

In [63]:
# Features and Target: every column except 'expenses' is a predictor
target_col = 'expenses'
X = df.drop(columns=target_col)
Y = df[target_col]

In [64]:
# Search grid: hold-out fractions 0.1 through 0.5 and split seeds 1 through 49
test_sizes = [tenths / 10 for tenths in range(1, 6)]
random_states = range(1, 50)

In [65]:
# Variables to track the best model by R2.
# Start from -inf rather than -1: R2 has no lower bound, so if every score were
# -1 or lower the trackers would otherwise stay None.
best_r2 = float('-inf')
best_r2_model = None
best_r2_details = None  # set to (test_size, random_state) of the winning split

In [66]:
# Running minimum of the test MSE, with the model and split that produced it
best_mse, best_mse_model, best_mse_details = float('inf'), None, None

In [67]:
# Model training and evaluation
# Fit one LinearRegression per (test_size, random_state) pair and remember the
# fits with the highest R2 and the lowest MSE on their own hold-out split.
# NOTE(review): every fit uses the same estimator and features; only the split
# differs, so the "best" scores reflect the most favourable split and are
# likely optimistic.
split_grid = [(size, seed) for size in test_sizes for seed in random_states]
for test_size, random_state in split_grid:
    # Split data
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Build and train the model (fit() hands back the fitted estimator)
    model = LinearRegression().fit(X_train, Y_train)

    # Predict on the held-out rows and score the predictions
    Y_pred = model.predict(X_test)
    r2, mse = r2_score(Y_test, Y_pred), mean_squared_error(Y_test, Y_pred)

    # Strict comparisons: on a tie the earlier split is kept
    if r2 > best_r2:
        best_r2, best_r2_model, best_r2_details = r2, model, (test_size, random_state)

    if mse < best_mse:
        best_mse, best_mse_model, best_mse_details = mse, model, (test_size, random_state)

In [68]:
# -----------------------------------------------------------------------------
# Show best results: the top R2 and the lowest MSE, each with the
# (test size, random state) of the split that produced it
print("\n ========================== Best Results ==========================")
print(f"Maximum R2 Score: {best_r2:.4f} achieved at Test size = {best_r2_details[0]}, Random State = {best_r2_details[1]}")
print(f"Minimum MSE: {best_mse:.2f} achieved at Test size = {best_mse_details[0]}, Random State = {best_mse_details[1]}")


Maximum R2 Score: 0.8371 acheived at Test size = 0.1, Random State = 47
Minimum MSE: 25552118.79 acheived at Test size = 0.1, Random State = 11


In [69]:
# -----------------------------------------------------------------------------
# Now predict for new person using best models

# Create input data: a single row, each value wrapped in a one-element list
new_data = {'age': [34], 'children': [3]}

# Hand-written one-hot indicator columns (1 = the category applies)
one_hot_flags = (
    ('sex_female', 0),
    ('sex_male', 1),
    ('bmi_Normal weight', 0),
    ('bmi_Obese', 0),
    ('bmi_Overweight', 1),  # BMI=26 --> Overweight
    ('bmi_Underweight', 0),
    ('smoker_no', 0),
    ('smoker_yes', 1),
    ('region_northeast', 0),
    ('region_northwest', 0),
    ('region_southeast', 0),
    ('region_southwest', 1),
)
new_data.update((column, [flag]) for column, flag in one_hot_flags)

In [70]:
# Convert to dataframe: dict keys become the columns, the one-element lists form a single row
new_df = pd.DataFrame.from_dict(new_data)

In [71]:
# Make sure columns match training data: any training column that the
# hand-built row lacks is added and filled with 0
missing_cols = set(X.columns).difference(new_df.columns)
for col in missing_cols:
    new_df[col] = 0

In [72]:
# Reorder columns to match training set (columns not in X are left out)
new_df = new_df.loc[:, X.columns]

In [73]:
# Predict using best R2 model
predicted_charger_r2 = best_r2_model.predict(new_df)
r2_model_charge = predicted_charger_r2[0]  # single input row, so take the first prediction
print("\nPredicted insurance charges using best R2 Model: ${:.2f}".format(r2_model_charge))


Predicted insurance charges using best R2 Model: $29273.45


In [74]:
# Predict using best MSE model
predicted_charge_mse = best_mse_model.predict(new_df)
mse_model_charge = predicted_charge_mse[0]  # single input row, so take the first prediction
print("Predicted Insurance Charge using Best MSE Model: ${:.2f}".format(mse_model_charge))

Predicted Insurance Charge using Best MSE Model: $29128.00
