In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the insurance dataset from the working directory into a DataFrame.
df = pd.read_csv("insurance.csv")

In [3]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Summarise each column's dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [6]:
# (rows, columns) of the loaded dataset.
df.shape

(1338, 7)

In [7]:
# Column labels, for reference when choosing features and the target.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [8]:
# Count missing values per column to decide whether imputation is needed.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [9]:
# Separate the regression target ('charges') from the predictor columns.
y = df['charges']
X = df.drop(columns='charges')

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 30% of the rows for final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# Columns that are one-hot encoded by the preprocessing step.
categorical_features = [
    'sex',
    'smoker',
    'region',
]

# Numeric predictor columns.
numerical_features = [
    'age',
    'bmi',
    'children',
]

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
# Dense one-hot encoder; categories not seen during fit are ignored rather
# than raising an error at transform time.
one_hot_transformer = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
)

In [19]:
from sklearn.compose import ColumnTransformer

In [21]:
# One-hot encode the categorical columns and forward every remaining column
# unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    [('cat', one_hot_transformer, categorical_features)],
    remainder='passthrough',
)

In [23]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [25]:
# NOTE(review): this re-creates an encoder identical to the one defined earlier.
# `preprocessor` was already constructed with that earlier instance, so this
# rebinding does not affect it; the cell looks redundant and can likely be removed.
one_hot_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [27]:
# Fit the preprocessor on the training split only, then apply that same fitted
# transform to the test split so the encoder never learns from test rows.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [29]:
from sklearn.model_selection import GridSearchCV

In [31]:
# Hyper-parameter grid for the decision-tree search:
# 2 * 2 * 4 * 3 * 3 = 144 candidate combinations.
param_grid = dict(
    criterion=["squared_error", "friedman_mse"],
    splitter=["best", "random"],
    max_depth=[5, 10, 15, None],  # None lets the tree grow without a depth cap
    min_samples_split=[2, 10, 50],
    min_samples_leaf=[1, 5, 10],
)

In [33]:
from sklearn.tree import DecisionTreeRegressor 

In [35]:
# Base estimator handed to the grid search; the fixed seed makes tree
# construction repeatable.
dt_regressor = DecisionTreeRegressor(random_state=15)

In [37]:
# Exhaustive 5-fold cross-validated search over param_grid, scored by negative
# mean squared error (higher is better) and run on all available cores.
grid = GridSearchCV(
    estimator=dt_regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [39]:
# Run the cross-validated search on the encoded training data.
grid.fit(X_train_transformed, y_train)

In [40]:
# Hyper-parameter combination with the best mean cross-validation score.
grid.best_params_

{'criterion': 'squared_error',
 'max_depth': 5,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'splitter': 'best'}

In [41]:
# Estimator refit with the winning parameters (GridSearchCV refits by default).
best_dt_model = grid.best_estimator_

In [42]:
# Predict charges for the held-out, already-encoded test split.
y_pred = best_dt_model.predict(X_test_transformed)

In [43]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error 

In [53]:
# Evaluate the tuned tree on the held-out test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r2)

MAE: 2629.9597462512406
MSE: 20530675.395982552
RMSE: 4531.078833565198
R2 Score: 0.8599776824808116
