In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [4]:
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [5]:
df.shape

(1338, 7)

In [6]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.665471,6.098382,16.0,26.3,30.4,34.7,53.1
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
expenses,1338.0,13270.422414,12110.01124,1121.87,4740.2875,9382.03,16639.915,63770.43


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [9]:
df.describe(include='all')

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
count,1338.0,1338,1338.0,1338.0,1338,1338,1338.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,676,,,1064,364,
mean,39.207025,,30.665471,1.094918,,,13270.422414
std,14.04996,,6.098382,1.205493,,,12110.01124
min,18.0,,16.0,0.0,,,1121.87
25%,27.0,,26.3,0.0,,,4740.2875
50%,39.0,,30.4,1.0,,,9382.03
75%,51.0,,34.7,2.0,,,16639.915


In [10]:
df.shape

(1338, 7)

In [12]:
df=df.drop_dublicate()

AttributeError: 'DataFrame' object has no attribute 'drop_dublicate'

In [7]:
df["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [8]:
one_hot_encoded = pd.get_dummies(df,columns=["sex","smoker","region"],drop_first=True).astype(float)
one_hot_encoded.head()

Unnamed: 0,age,bmi,children,expenses,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19.0,27.9,0.0,16884.92,0.0,1.0,0.0,0.0,1.0
1,18.0,33.8,1.0,1725.55,1.0,0.0,0.0,1.0,0.0
2,28.0,33.0,3.0,4449.46,1.0,0.0,0.0,1.0,0.0
3,33.0,22.7,0.0,21984.47,1.0,0.0,1.0,0.0,0.0
4,32.0,28.9,0.0,3866.86,1.0,0.0,1.0,0.0,0.0


In [9]:
from sklearn.model_selection import train_test_split ,GridSearchCV
X = one_hot_encoded.drop(columns=["expenses"],axis=1)
y = one_hot_encoded.expenses

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1)

In [10]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [11]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn_param = {'n_neighbors': [3,5,7,9,11,13,15,17,19,21],
              'weights': ['uniform', 'distance']
            } 
grid_search = GridSearchCV(knn,param_grid=knn_param,cv=5,scoring="neg_mean_squared_error")

grid_search.fit(X_train,y_train)
print(grid_search.best_params_)

{'n_neighbors': 11, 'weights': 'distance'}


In [12]:
knn_tuned = KNeighborsRegressor(n_neighbors=13,weights="distance").fit(X_train,y_train)
y_train_pred = knn_tuned.predict(X_train)
print(f"train mae : {mean_absolute_error(y_train,y_train_pred)}")
print(f"train mse : {mean_squared_error(y_train,y_train_pred)}")
print(f"train r2 score : {r2_score(y_train,y_train_pred)}")

y_test_pred = knn_tuned.predict(X_test)
print(f"\ntest mae : {mean_absolute_error(y_test,y_test_pred)}")
print(f"test mse : {mean_squared_error(y_test,y_test_pred)}")
print(f"test r2 score : {r2_score(y_test,y_test_pred)}")

print("Overfitting !!!!!!!")

train mae : 10.383073089700998
train mse : 64900.54048521596
train r2 score : 0.9995618723764925

test mae : 8266.178218830617
test mse : 122280649.1860246
test r2 score : 0.07150876998092837
Overfitting !!!!!!!


In [13]:
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor().fit(X_train,y_train)

y_pred = rf_reg.predict(X_test)
print(f"\ntest mae : {mean_absolute_error(y_test,y_pred)}")
print(f"test mse : {mean_squared_error(y_test,y_pred)}")
print(f"test r2 score : {r2_score(y_test,y_pred)}")


test mae : 2437.361663432836
test mse : 18945500.32303699
test r2 score : 0.8561446065640141


In [14]:
from sklearn.linear_model import Lasso
lasso = Lasso().fit(X_train,y_train)

y_pred = lasso.predict(X_test)
print(f"\ntest mae : {mean_absolute_error(y_test,y_pred)}")
print(f"test mse : {mean_squared_error(y_test,y_pred)}")
print(f"test r2 score : {r2_score(y_test,y_pred)}")


test mae : 4106.226548030701
test mse : 31459029.883189216
test r2 score : 0.7611279172470453


In [15]:
from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train,y_train)

y_pred = ridge.predict(X_test)
print(f"\ntest mae : {mean_absolute_error(y_test,y_pred)}")
print(f"test mse : {mean_squared_error(y_test,y_pred)}")
print(f"test r2 score : {r2_score(y_test,y_pred)}")


test mae : 4118.535536229591
test mse : 31495265.25829007
test r2 score : 0.7608527778180224


In [16]:
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet().fit(X_train,y_train)

y_pred = elastic_net.predict(X_test)
print(f"\ntest mae : {mean_absolute_error(y_test,y_pred)}")
print(f"test mse : {mean_squared_error(y_test,y_pred)}")
print(f"test r2 score : {r2_score(y_test,y_pred)}")


test mae : 7496.064673555937
test mse : 86938831.27883069
test r2 score : 0.3398633150229651
