In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the insurance data from the Excel workbook into a DataFrame.
df=pd.read_excel('insurance.xlsx')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [3]:
# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(1338, 7)

In [5]:
# Bucket every column by dtype:
#   float64 -> continuous, object -> categorical, anything else -> needs a
#   manual look (e.g. integer columns that may be counts or codes).
continuous=[]
categorical=[]
check=[]

d_types=dict(df.dtypes)
# NOTE: the loop variable is `dtype`, not `type` -- a top-level loop variable
# stays alive in the kernel and `type` would shadow the builtin for every
# later cell.
for name,dtype in d_types.items():
    dtype_name=str(dtype)
    if dtype_name=='float64':
        continuous.append(name)
    elif dtype_name=='object':
        categorical.append(name)
    else:
        check.append(name)

print('continuous features:',continuous)
print('categorical features:',categorical)
print('features to be checked:',check)

continuous features: ['bmi', 'expenses']
categorical features: ['sex', 'smoker', 'region']
features to be checked: ['age', 'children']


In [6]:
# Print the frequency table of every object-typed (categorical) column.
d_types=dict(df.dtypes)
# `dtype` instead of `type`: avoids shadowing the builtin in the kernel namespace.
for name,dtype in d_types.items():
    if str(dtype)=='object':
        print(f'<======={name}=======>')
        print(df[name].value_counts())
    

male      676
female    662
Name: sex, dtype: int64
no     1064
yes     274
Name: smoker, dtype: int64
southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64


In [7]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,expenses
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414
std,14.04996,6.098382,1.205493,12110.01124
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4740.2875
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.7,2.0,16639.915
max,64.0,53.1,5.0,63770.43


In [8]:
# Pairwise correlation of the numeric columns.
# `df` still contains string columns at this point (sex, smoker, region);
# pandas >= 2.0 raises on them unless numeric_only=True is passed explicitly.
df.corr(numeric_only=True)

Unnamed: 0,age,bmi,children,expenses
age,1.0,0.109341,0.042469,0.299008
bmi,0.109341,1.0,0.012645,0.198576
children,0.042469,0.012645,1.0,0.067998
expenses,0.299008,0.198576,0.067998,1.0


In [9]:
# Number of missing values in each column.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [10]:
# Feature selection: remove the 'region' column from the frame.
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent -- re-running it no longer raises KeyError.
df=df.drop(columns='region',errors='ignore')

In [11]:
# Encode the binary categorical columns as integers:
#   sex:    female -> 0, male -> 1
#   smoker: no     -> 0, yes  -> 1
# Assign the result back instead of calling replace(..., inplace=True) on
# df[col]: the chained in-place form operates on an intermediate Series, which
# triggers a FutureWarning in pandas 2.x and leaves `df` unchanged under
# Copy-on-Write. astype('int64') makes the numeric dtype explicit and fails
# loudly if an unmapped label is present.
df['sex']=df['sex'].replace({'female':0,'male':1}).astype('int64')
df['smoker']=df['smoker'].replace({'no':0,'yes':1}).astype('int64')

In [12]:
# Target: the 'expenses' column; features: every remaining column.
y = df['expenses']
x = df.drop(columns='expenses')

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; fixed seed for a reproducible split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=9,
)

In [14]:
from sklearn.linear_model import Lasso

# Baseline: Lasso with default settings, fitted on the training split.
# fit() returns the estimator itself, so the chained form binds the fitted model.
lasso_base = Lasso().fit(x_train, y_train)
lasso_base

Lasso()

In [15]:
# Baseline model predictions for the training and the test split.
train_predictions, test_predictions = (
    lasso_base.predict(split) for split in (x_train, x_test)
)

In [16]:
# R^2 of the baseline model on each split.
print(
    'train r2:',
    lasso_base.score(x_train, y_train),
)
print(
    'test r2:',
    lasso_base.score(x_test, y_test),
)

train r2: 0.7433161675495346
test r2: 0.7757553862206219


In [17]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the baseline model on the full feature matrix:
# print the per-fold scores, then display their mean.
scores = cross_val_score(lasso_base, x, y, cv=5)
print(scores)
scores.mean()

[0.76099933 0.70877445 0.77558184 0.73142165 0.75655517]


0.7466664879428851

In [18]:
#hyperparameter tuning

from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning: search the Lasso regularisation strength `alpha`
# with 5-fold cross-validation on the training split, scored by negative MSE.
estimator = Lasso()
param_grid = {
    'alpha': [0.1, 0.2, 0.5, 0.8, 1, 1.4, 1.7, 2, 10, 30, 50, 100],
}

model_hp = GridSearchCV(
    estimator,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
model_hp.fit(x_train, y_train)

# Display the winning parameter setting.
model_hp.best_params_

{'alpha': 50}

In [19]:
# Refit Lasso with the alpha selected by the grid search and report its fit.
# The alpha comes from `model_hp` rather than a hard-coded literal, so this
# cell stays in sync if the search grid or the data change. `Lasso` is already
# imported by the baseline cell, so the duplicate import is dropped.
best_alpha=model_hp.best_params_['alpha']
lasso_best=Lasso(alpha=best_alpha)
lasso_best.fit(x_train,y_train)

train_predictions=lasso_best.predict(x_train)
test_predictions=lasso_best.predict(x_test)

# .score() of a regressor is R^2, not accuracy -- label it the same way as
# the baseline cell does.
print('train r2:',lasso_best.score(x_train,y_train))
print('test r2:',lasso_best.score(x_test,y_test))

# A coefficient of (-)0 means Lasso removed that feature from the model.
print('intercept:',lasso_best.intercept_)
print('coefficients:',lasso_best.coef_)

# 5-fold cross-validation of the tuned model on the full feature matrix.
scores=cross_val_score(lasso_best,x,y,cv=5)
print(scores)
scores.mean()

train accuracy: 0.743200050042114
test accuracy: 0.7747832866417829
intercept: -11336.591469036648
coefficiant: [  256.68842008    -0.           303.91129506   408.88291866
 23323.3694514 ]
[0.76185274 0.70979909 0.774588   0.73200628 0.75598049]


0.7468453212720682

In [20]:
# Rebuild the feature matrix without 'sex' (the column at position 1 of the
# previous x, whose Lasso coefficient was shrunk to zero).
# Deriving x from df by column name -- instead of x = x.drop(x.columns[[1]]) --
# keeps the cell idempotent: the positional, self-referential form removes a
# further column ('bmi', then 'children', ...) every time the cell is re-run.
x=df.drop(columns=['expenses','sex'])
y=df['expenses']

In [21]:
# Re-split with the same test fraction and seed, now on the updated x.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=9,
)

In [22]:
from sklearn.linear_model import Lasso

# Fit Lasso(alpha=50) on the current training split; fit() returns the
# estimator, so the chained call binds the fitted model.
lasso_best = Lasso(alpha=50).fit(x_train, y_train)

# Predictions on both splits.
train_predictions = lasso_best.predict(x_train)
test_predictions = lasso_best.predict(x_test)

# R^2 on each split.
print(
    'train r2:',
    lasso_best.score(x_train, y_train),
)
print(
    'test r2:',
    lasso_best.score(x_test, y_test),
)

# Mean 5-fold cross-validation score on the full feature matrix.
scores = cross_val_score(lasso_best, x, y, cv=5)
scores.mean()

train r2: 0.743200076877043
test r2: 0.7747836916908193


0.746863545564871

**Prediction on new data**

In [23]:
# One new customer record, using the same raw column names as the dataset.
input_data = {
    'age': 31,
    'sex': 'female',
    'bmi': 25.74,
    'children': 0,
    'smoker': 'no',
    'region': 'northeast',
}

In [29]:
# Wrap the scalar record in a one-row DataFrame (row label 0).
df_test = pd.DataFrame(data=input_data, index=[0])

In [30]:
# Apply the same preprocessing as the training frame: drop 'region' and
# integer-encode 'sex' (female -> 0, male -> 1) and 'smoker' (no -> 0, yes -> 1).
# Results are assigned back instead of using inplace=True: the chained
# df_test[col].replace(..., inplace=True) form mutates an intermediate Series
# (FutureWarning in pandas 2.x, no effect under Copy-on-Write), and the
# in-place drop raises KeyError when the cell is re-run.
df_test=df_test.drop(columns='region',errors='ignore')
df_test['sex']=df_test['sex'].replace({'female':0,'male':1}).astype('int64')
df_test['smoker']=df_test['smoker'].replace({'no':0,'yes':1}).astype('int64')

In [33]:
# Drop 'sex' (the column at position 1) so the row has the same columns as the
# feature matrix the model was fitted on. Dropping by name rather than by
# position keeps this correct if the column order of df_test ever changes.
transformed_data=df_test.drop(columns='sex')

In [34]:
# Predict expenses for the single prepared input row with the fitted Lasso model.
lasso_best.predict(transformed_data)

array([4443.03401097])