## Choose a cluster dataset of your choice and perform the following.

## 1. Clean and prepare the dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the insurance dataset; the path is relative to the notebook's working directory.
df=pd.read_csv("insurance.csv")

In [3]:
# Preview the first rows to sanity-check how the columns were parsed.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Inspect the last rows as well, to check the end of the file was read cleanly.
df.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [5]:
# Show the row index (its range gives the number of rows).
df.index

RangeIndex(start=0, stop=1338, step=1)

In [6]:
# List the column names available for modelling.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [7]:
# Check dtypes and non-null counts; object-dtype columns must be encoded before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [8]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [9]:
# List the distinct region labels before they are encoded.
print(df['region'].unique())

['southwest' 'southeast' 'northwest' 'northeast']


In [10]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder maps each distinct label in a column to an integer code.
le = LabelEncoder()

In [11]:
# Encode the three string columns as integer codes.
# Each column gets its own encoder, kept in `encoders`, so every label <-> code
# mapping stays available (encoders[col].classes_, encoders[col].inverse_transform).
# Refitting one shared encoder would keep only the last column's mapping.
# NOTE(review): the integer codes for `region` imply an ordering that linear models
# will treat as meaningful; one-hot encoding may be more appropriate — confirm.
encoders = {col: LabelEncoder() for col in ('sex', 'smoker', 'region')}
for col, encoder in encoders.items():
    df[col] = encoder.fit_transform(df[col])

## 2. Split the dataset into training and test sets in a 75:25 ratio

In [12]:
# Predictors: every column except the target; target: the billed charges.
X = df[[
    'age',
    'sex',
    'bmi',
    'children',
    'smoker',
    'region',
]]
y = df['charges']

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

In [15]:
# Check train and test data shapes (order: X_train, X_test, y_train, y_test)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(1003, 6)
(335, 6)
(1003,)
(335,)


## 3. Build a linear regression model, a polynomial model, a lasso model and a ridge model

## linear regression model

In [16]:
from sklearn.linear_model import LinearRegression
# Plain linear regression with scikit-learn's default settings.
lr = LinearRegression()

In [17]:
# Fit on the training split only; the test split is held out for scoring.
lr.fit(X_train,y_train)

LinearRegression()

In [18]:
# Predict charges for the held-out test rows.
lr_pred = lr.predict(X_test)

## polynomial model

In [19]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the predictors into degree-2 polynomial and interaction terms.
poly_reg = PolynomialFeatures(degree=2)

# Fit the transformer on the training split only, then reuse that fitted
# transformer on the test split. Calling fit_transform on the test data would
# refit it on held-out rows, a leakage-prone pattern that silently breaks for
# any stateful transformer.
X_poly_train = poly_reg.fit_transform(X_train)
X_poly_test = poly_reg.transform(X_test)

In [20]:
# Linear regression on the expanded polynomial features; fit() returns the
# estimator itself, so creation and fitting can be chained.
poly = LinearRegression().fit(X_poly_train, y_train)
poly

LinearRegression()

In [21]:
# Predict from the polynomial-expanded test features (not the raw X_test).
poly_pred = poly.predict(X_poly_test)

## lasso model

In [23]:
from sklearn.linear_model import Lasso
# L1-regularised linear model, left at scikit-learn's default settings.
l = Lasso()

In [24]:
# Fit on the training split.
# NOTE(review): the visible cells apply no feature scaling before this fit, and an
# L1 penalty is sensitive to feature scale — consider standardising the predictors.
l.fit(X_train,y_train)

Lasso()

In [25]:
# Predict charges for the held-out test rows with the lasso model.
l_pred = l.predict(X_test)

## ridge model

In [26]:
from sklearn.linear_model import Ridge
# L2-regularised linear model, left at scikit-learn's default settings.
r = Ridge()

In [27]:
# Fit on the training split.
# NOTE(review): the visible cells apply no feature scaling before this fit, and an
# L2 penalty is sensitive to feature scale — consider standardising the predictors.
r.fit(X_train,y_train)

Ridge()

In [28]:
# Predict charges for the held-out test rows with the ridge model.
r_pred = r.predict(X_test)

## 4. Print the MSE, RMSE and R² for each model

In [29]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## linear regression model

In [33]:
# Score the linear model on the test split; RMSE is derived from the MSE
# computed once instead of recomputing it.
lr_mse = mean_squared_error(y_test, lr_pred)
print('Mean Squared Error:', lr_mse)
print('Root Mean Squared Error:', np.sqrt(lr_mse))
print('R2:', r2_score(y_test, lr_pred))

Mean Squared Error: 34574354.40383895
Root Mean Squared Error: 5879.996122774142
R2: 0.7570022782190428


## polynomial model

In [34]:
# Score the polynomial model on the test split; RMSE is derived from the MSE
# computed once instead of recomputing it.
poly_mse = mean_squared_error(y_test, poly_pred)
print('Mean Squared Error:', poly_mse)
print('Root Mean Squared Error:', np.sqrt(poly_mse))
print('R2:', r2_score(y_test, poly_pred))

Mean Squared Error: 23053104.035785068
Root Mean Squared Error: 4801.364809695788
R2: 0.837976677879684


## lasso model

In [35]:
# Score the lasso model on the test split; RMSE is derived from the MSE
# computed once instead of recomputing it.
l_mse = mean_squared_error(y_test, l_pred)
print('Mean Squared Error:', l_mse)
print('Root Mean Squared Error:', np.sqrt(l_mse))
print('R2:', r2_score(y_test, l_pred))

Mean Squared Error: 34574670.710925534
Root Mean Squared Error: 5880.0230195914655
R2: 0.7570000551290458


## ridge model

In [36]:
# Score the ridge model on the test split; RMSE is derived from the MSE
# computed once instead of recomputing it.
r_mse = mean_squared_error(y_test, r_pred)
print('Mean Squared Error:', r_mse)
print('Root Mean Squared Error:', np.sqrt(r_mse))
print('R2:', r2_score(y_test, r_pred))

Mean Squared Error: 34641941.44859951
Root Mean Squared Error: 5885.740518286506
R2: 0.7565272585641016
