In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
class lin_model:
    def __init__(self, degree, regularization = False, lambda_ = 0):
        if regularization:
            self.linear_model = Ridge(alpha = lambda_)
        else:
            self.linear_model = LinearRegression()
        self.poly_regs = PolynomialFeatures(degree, include_bias = False)
        self.scaler = StandardScaler()
    def fit(self, X_train, y_train):
        x_mapped = self.poly_regs.fit_transform(X_train)
        x_mapped_scaled = self.scaler.fit_transform(x_mapped)
        self.linear_model.fit(x_mapped_scaled, y_train)
    def predict(self, X):
        x_mapped = self.poly_regs.transform(X)
        x_mapped_scaled = self.scaler.transform(x_mapped)
        yhat = self.linear_model.predict(x_mapped_scaled)
        return yhat
    def mse(self, y_test, yhat):
        return mean_squared_error(y_test, yhat)
    def r2(self, y_test, yhat):
        return r2_score(y_test, yhat)

In [3]:
def label_encoding(column_name):
  label_encoder = LabelEncoder()
  df[column_name] = label_encoder.fit_transform(df[column_name])

In [4]:
df = pd.read_csv("insurance.csv")

In [5]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [8]:
label_encoding("sex")
label_encoding("smoker")
label_encoding("region")

In [9]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [10]:
X = df.drop('charges',axis=1)
y = df["charges"]

X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,3
1,18,1,33.770,1,0,2
2,28,1,33.000,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.880,0,0,1
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1
1334,18,0,31.920,0,0,0
1335,18,0,36.850,0,0,2
1336,21,0,25.800,0,0,3


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)

X_train = np.array(X_train).reshape(-1,6)
X_test = np.array(X_test).reshape(-1,6)
y_train = np.array(y_train).reshape(-1,1)
y_test = np.array(y_test).reshape(-1,1)

print(X_train.shape)
print(y_train.shape)

(936, 6)
(936, 1)


In [12]:
# here plot the data impossible because data's dimension will be 6 how can plot that???

In [13]:
degree = 10


linear_model = lin_model(degree)
linear_model.fit(X_train, y_train)

# train error

yhat = linear_model.predict(X_train)
err_train_mse = linear_model.mse(y_train, yhat)
err_train_r2 = linear_model.r2(y_train, yhat)

# test error

yhat = linear_model.predict(X_test)
err_test_mse = linear_model.mse(y_test, yhat)
err_test_r2 = linear_model.r2(y_test, yhat)

print(f"Train Error = {err_train_mse} \nTest Error = {err_test_mse}")
print(f"Train Error = {err_train_r2} \nTest Error = {err_test_r2}")

Train Error = 0.006742909692808959 
Test Error = 9.768544934068113e+18
Train Error = 0.9999999999539634 
Test Error = -66622956822.242065


In [14]:
# as you can see test error > train error which means there is a overfit problem

In [15]:
X_train, X_, y_train, y_ = train_test_split(X, y, test_size = 0.4, random_state = 1)
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, test_size = 0.5, random_state = 1)

X_train = np.array(X_train).reshape(-1,6)
X_test = np.array(X_test).reshape(-1,6)
X_cv = np.array(X_cv).reshape(-1,6)
y_train = np.array(y_train).reshape(-1,1)
y_test = np.array(y_test).reshape(-1,1)
y_cv = np.array(y_cv).reshape(-1,1)

In [16]:
# here we try to find best degree of polynomial features

max_degree = 10
err_train_mse = np.zeros(max_degree)
err_cv_mse = np.zeros(max_degree)
err_train_r2 = np.zeros(max_degree)
err_cv_r2 = np.zeros(max_degree)

for degree in range(max_degree):
    linear_model = lin_model(degree + 1)
    linear_model.fit(X_train, y_train)
    yhat = linear_model.predict(X_train)
    err_train_mse[degree] = linear_model.mse(y_train, yhat)
    err_train_r2[degree] = linear_model.r2(y_train, yhat)
    yhat = linear_model.predict(X_cv)
    err_cv_mse[degree] = linear_model.mse(y_cv, yhat)
    err_cv_r2[degree] = linear_model.r2(y_cv, yhat)
    
optimal_degree = np.argmin(err_cv_mse)+1

print(f"Error mse = {err_cv_mse[optimal_degree-1]},Optimal Degree = {optimal_degree}")
print(f"Error r2 = {err_cv_r2[optimal_degree-1]},Optimal Degree = {optimal_degree}")

Error mse = 26852007.11356676,Optimal Degree = 2
Error r2 = 0.828306989227047,Optimal Degree = 2


In [17]:
# we try to find best lambda value for regularization

lambda_range = np.array([0.0, 1e-6, 1e-5, 1e-4,1e-3,1e-2, 1e-1,1,10,100])
num_steps = len(lambda_range)
degree = optimal_degree
err_train_mse = np.zeros(max_degree)
err_cv_mse = np.zeros(max_degree)
err_train_r2 = np.zeros(max_degree)
err_cv_r2 = np.zeros(max_degree)

for i in range(num_steps):
    lambda_= lambda_range[i]
    linear_model = lin_model(degree, regularization=True, lambda_=lambda_)
    linear_model.fit(X_train, y_train)
    yhat = linear_model.predict(X_train)
    err_train_mse[i] = linear_model.mse(y_train, yhat)
    err_train_r2[i] = linear_model.r2(y_train, yhat)
    yhat = linear_model.predict(X_cv)
    err_cv_mse[i] = linear_model.mse(y_cv, yhat)
    err_cv_r2[i] = linear_model.r2(y_cv, yhat)
    

optimal_reg_idx = np.argmin(err_cv_mse) 


print(f"lambda = {lambda_range[optimal_reg_idx]}")    
print(f"Error mse = {err_cv_mse[optimal_reg_idx]},Optimal idx = {optimal_reg_idx}")
print(f"Error r2 = {err_cv_r2[optimal_reg_idx]},Optimal idx = {optimal_reg_idx}")

for i in range(num_steps):
    print(f" err = {err_cv_r2[i]}, idx = {i}, lambda = {lambda_range[i]}")

lambda = 1.0
Error mse = 26608671.37985959,Optimal idx = 7
Error r2 = 0.8298628894832972,Optimal idx = 7
 err = 0.8114819628369947, idx = 0, lambda = 0.0
 err = 0.8283069911257182, idx = 1, lambda = 1e-06
 err = 0.828307008213739, idx = 2, lambda = 1e-05
 err = 0.8283071790905843, idx = 3, lambda = 0.0001
 err = 0.8283088875229583, idx = 4, lambda = 0.001
 err = 0.8283259382719869, idx = 5, lambda = 0.01
 err = 0.8284931213792874, idx = 6, lambda = 0.1
 err = 0.8298628894832972, idx = 7, lambda = 1.0
 err = 0.8293610867531647, idx = 8, lambda = 10.0
 err = 0.7877502192721328, idx = 9, lambda = 100.0


In [18]:
# 

lambda_range = np.array([0.0, 1e-6, 1e-5, 1e-4,1e-3,1e-2, 1e-1,1,10,100])
num_steps = len(lambda_range)
optimal_reg_idx = np.zeros(max_degree)
high_accuracy = np.zeros(max_degree)
high_accuracy_idx = np.zeros(max_degree)

for degree in range(max_degree):
    err_train_mse = np.zeros(max_degree)
    err_cv_mse = np.zeros(max_degree)
    err_train_r2 = np.zeros(max_degree)
    err_cv_r2 = np.zeros(max_degree)
    for i in range(num_steps):
        lambda_= lambda_range[i]
        linear_model = lin_model(degree+1, regularization=True, lambda_=lambda_)
        linear_model.fit(X_train, y_train)
        yhat = linear_model.predict(X_train)
        err_train_mse[i] = linear_model.mse(y_train, yhat)
        err_train_r2[i] = linear_model.r2(y_train, yhat)
        yhat = linear_model.predict(X_cv)
        err_cv_mse[i] = linear_model.mse(y_cv, yhat)
        err_cv_r2[i] = linear_model.r2(y_cv, yhat)
        

    optimal_reg_idx[degree] = np.argmin(err_cv_mse)
    idx = int(optimal_reg_idx[degree])
    
    
    high_accuracy[degree] = err_cv_r2[idx]
    high_accuracy_idx[degree] = degree + 1



for i in range(len(high_accuracy)):
    print(f"R2 = {high_accuracy[i]}, degree = {high_accuracy_idx[i]}")



max_idx = 0
for i in range(len(high_accuracy)):
    if(high_accuracy[i] == max(high_accuracy)):
        max_idx = i
print(f"Max Accuracy = {high_accuracy[max_idx]} in degree = {high_accuracy_idx[max_idx]}")




R2 = 0.7470279727625578, degree = 1.0
R2 = 0.8298628894832972, degree = 2.0
R2 = 0.8255801345599592, degree = 3.0
R2 = 0.8155638065709778, degree = 4.0
R2 = 0.8058081661912976, degree = 5.0
R2 = 0.7868474577905734, degree = 6.0
R2 = 0.7550907622632318, degree = 7.0
R2 = 0.7256430142801011, degree = 8.0
R2 = 0.7029934645206635, degree = 9.0
R2 = 0.6484454089878139, degree = 10.0
Max Accuracy = 0.8298628894832972 in degree = 2.0
