## Step -1 Business Problem Understanding

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Step -2 Data Understanding

#### Load data & Understand every variable

In [2]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


#### dataset understanding

In [3]:
df.shape

(1338, 7)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
df['sex'].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [6]:
df['children'].value_counts()

children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64

In [7]:
df['smoker'].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [8]:
df['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

#### Exploratory Data Analysis

In [9]:
continuous_features = ['age', 'bmi', 'expenses']
discrete_categorical = ['sex', 'smoker', 'region']
discrete_count = ['children']

In [10]:
df[continuous_features].describe()

Unnamed: 0,age,bmi,expenses
count,1338.0,1338.0,1338.0
mean,39.207025,30.665471,13270.422414
std,14.04996,6.098382,12110.01124
min,18.0,16.0,1121.87
25%,27.0,26.3,4740.2875
50%,39.0,30.4,9382.03
75%,51.0,34.7,16639.915
max,64.0,53.1,63770.43


In [11]:
df[discrete_categorical].describe()

Unnamed: 0,sex,smoker,region
count,1338,1338,1338
unique,2,2,4
top,male,no,southeast
freq,676,1064,364


In [12]:
df[continuous_features].corr()

Unnamed: 0,age,bmi,expenses
age,1.0,0.109341,0.299008
bmi,0.109341,1.0,0.198576
expenses,0.299008,0.198576,1.0


## Step - 3: Data Preprocessing

#### Data Cleaning

In [13]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [14]:
df.duplicated().sum()

1

In [15]:
df.drop_duplicates(inplace=True)

In [16]:
df.shape

(1337, 7)

In [17]:
df.drop('region', axis=1, inplace=True)

#### Encoding

In [18]:
# encoding sex column
df['sex'].replace({'female':0, 'male':1}, inplace=True)

# encoding 'smoker' column
df['smoker'].replace({'no':0, 'yes':1}, inplace=True)

#### X&y

In [19]:
X = df.drop('expenses', axis=1)
y = df['expenses']

## Step - 4: Modeling

### findout the best random_state value 

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

Train = []
Test = []
CV = []

for i in range(0, 100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    model = LinearRegression()
    model.fit(X_train, y_train)

    ypred_train = model.predict(X_train)
    ypred_test = model.predict(X_test)

    Train.append(model.score(X_train, y_train))
    # print('Train R2:', r2_score(y_train, ypred_train))
    Test.append(model.score(X_test, y_test))
    # print('Test R2:', r2_score(y_test, ypred_test))

    CV.append(cross_val_score(model, X_train, y_train, cv=5).mean())


em = pd.DataFrame({'Train':Train, 'Test':Test, 'CV':CV})
gm = em[(abs(em['Train']-em['Test']) <= 0.05) & (abs(em['Test']-em['CV']) <=0.05)]
print('best random state number:', gm[gm['Test']==gm['Test'].max()].index.to_list())

best random state number: [90]


In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90)

# <font color = aqua> Ridge Regresion </font> 

#### Apply Hyperparameter tuning for Ridge Regression

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

estimator = Ridge()
param_grid = {'alpha':list(range(1, 101))}
model_hp = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
model_hp.fit(X_train, y_train)
model_hp.best_params_

{'alpha': 2}

#### Final model
With best hyperparameter with important features

In [30]:
from sklearn.linear_model import Lasso
ridge_best = Ridge(alpha=2)
ridge_best.fit(X_train, y_train)
print('Intercept:', ridge_best.intercept_)
print('coefficient:', ridge_best.coef_)

# Prediction 
ypred_train = ridge_best.predict(X_train)
ypred_test = ridge_best.predict(X_test)

# Evaluation
from sklearn.metrics import r2_score
print('Train R2:', r2_score(y_train, ypred_train))
print('Test R2:', r2_score(y_test, ypred_test))
from sklearn.model_selection import cross_val_score
print('CV Score:', cross_val_score(ridge_best, X_train, y_train, cv=5).mean())

Intercept: -11916.121984017314
coefficient: [  263.60297408  -178.67423843   315.73852203   452.55903549
 23532.37923887]
Train R2: 0.7408031510875615
Test R2: 0.7813654155427349
CV Score: 0.7386430025315851


- here, we can see, we don't have any column that becomes '0' because this is Ridge and in ridge we don't get coefficient as '0'.
- Our above model is the final model because we didn't have any coefficient that is '0'.

### Prediction on a new Data

In [23]:
input_data = {'age':35,
             'sex':'Male',
             'bmi':31.4,
             'children':5,
             'smoker':'yes',
             'region':'southeast'}
df_test = pd.DataFrame(input_data, index=[0])
df_test

Unnamed: 0,age,sex,bmi,children,smoker,region
0,35,Male,31.4,5,yes,southeast


#### Step: 1 preprocessing the data
preprocessing for the future data (these all I performed on actual data so on the future data, I will also perform this
- missing value
- duplicates
- drop region
- replace smoker
- X (variables
- drop sex

In [24]:
df_test.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

In [25]:
df_test.duplicated()

0    False
dtype: bool

In [26]:
df_test.drop('region', axis=1, inplace=True)

In [27]:
df_test['sex'].replace({'female':0, 'Male':1}, inplace=True)
df_test['smoker'].replace({'no':0, 'yes':1}, inplace=True)
X4 = df_test
X4

Unnamed: 0,age,sex,bmi,children,smoker
0,35,1,31.4,5,1


In [28]:
ridge_best.predict(X4)

array([32840.67187814])

### Now, the question is which algorithm is the best!
Between both Lasso and Ridge, lasso is best algorithm because we predicted with only '4' variables because we removed gender column and in ridge we predcited with '5' varaibles because we didn't remove that one column.