# Multiple Linear Regression with Category Encoders

# Importing the libraries

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from category_encoders import *

# Importing the dataset

In [19]:
# Load the insurance data. Every column except the last is a feature
# (age, sex, bmi, children, smoker, region); the last column (charges)
# is the regression target.
dataset = pd.read_csv("Project/insurance.csv")
x, y = (
    dataset.iloc[:, :-1].values,
    dataset.iloc[:, -1].values,
)

In [20]:
# Preview the first five rows of the raw frame.
dataset.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [21]:
# Show the first five feature rows before any encoding is applied.
print(x[:5])

[[19 'female' 27.9 0 'yes' 'southwest']
 [18 'male' 33.77 1 'no' 'southeast']
 [28 'male' 33.0 3 'no' 'southeast']
 [33 'male' 22.705 0 'no' 'northwest']
 [32 'male' 28.88 0 'no' 'northwest']]


# Encoding categorical data using BinaryEncoder

In [22]:
# Binary-encode the categorical feature columns, addressed by position:
# 1 = sex, 4 = smoker, 5 = region.
#
# The encoder input is taken from `dataset` rather than from `x`, because this
# cell overwrites `x` with its encoded form. Reading `x` here would make a
# second run of the cell encode positions 1, 4 and 5 of the already-encoded
# array and silently corrupt the features.
features_raw = dataset.iloc[:, :-1].values
lb = BinaryEncoder(cols=[1, 4, 5]).fit(features_raw)
x = np.array(lb.transform(features_raw))


In [23]:
# Alternative (disabled): one-hot encode columns 1, 4 and 5 with scikit-learn's
# ColumnTransformer/OneHotEncoder instead of BinaryEncoder.
# NOTE(review): if re-enabled, run it in place of the BinaryEncoder cell, not
# after it -- by that point `x` has already been replaced by its encoded form.
#from sklearn.compose import ColumnTransformer
#from sklearn.preprocessing import OneHotEncoder
#ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1,4,5])], remainder='passthrough')
#x = np.array(ct.fit_transform(x))

In [24]:
# Show the first five rows of the encoded feature matrix.
print(x[:5, :])

[[19 0 1 27.9 0 0 1 0 0 1]
 [18 1 0 33.77 1 1 0 0 1 0]
 [28 1 0 33.0 3 1 0 0 1 0]
 [33 1 0 22.705 0 1 0 0 1 1]
 [32 1 0 28.88 0 1 0 0 1 1]]


# Splitting the dataset into the Training set and Test set

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Training the Multiple Linear Regression model on the Training set

In [26]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training rows only.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

# Predicting the Test set results

In [27]:
# Predict charges for the held-out rows.
y_pred = regressor.predict(x_test)

# Process-wide numpy print setting (applies to float arrays printed later).
np.set_printoptions(precision=2)

# Side-by-side comparison: column 0 = predicted, column 1 = actual.
eva = np.column_stack((y_pred, y_test))

# Round to the nearest integer before casting. A bare astype(int) truncates
# toward zero, which under-reports positive values and over-reports negative
# ones by up to one unit.
eva1 = np.rint(eva).astype(int)
print(eva1)

[[11169  9724]
 [ 9486  8547]
 [38181 45702]
 [16266 12950]
 [ 6914  9644]
 [ 3963  4500]
 [ 1579  2198]
 [14385 11436]
 [ 9012  7537]
 [ 7508  5425]
 [ 4491  6753]
 [10279 10493]
 [ 8801  7337]
 [ 3798  4185]
 [27926 18310]
 [10715 10702]
 [11288 12523]
 [ 6105  3490]
 [ 8241  6457]
 [27144 33475]
 [33644 23967]
 [14355 12643]
 [11737 23045]
 [32137 23065]
 [ 4170  1674]
 [ 9254  4667]
 [ 1084  3732]
 [ 9804  7682]
 [ 3771  3756]
 [10431  8413]
 [ 9009  8059]
 [40074 48970]
 [15688 12979]
 [13879 20630]
 [24759 14571]
 [ 5166  4137]
 [12610  8347]
 [30769 51194]
 [33549 40003]
 [ 3671  1880]
 [ 3975  5458]
 [ 3987  2867]
 [30528 20149]
 [39505 47496]
 [27810 36149]
 [ 5092 26018]
 [10604 19749]
 [ 7829  6940]
 [ 3592  4718]
 [10212 22192]
 [ 5720  2899]
 [ 3426 18838]
 [33021 23568]
 [38473 46255]
 [16053 24227]
 [ 7164  3268]
 [ 5739  2322]
 [ 9454  8827]
 [ 8910 14478]
 [11725 13112]
 [ 1856  1253]
 [38914 46718]
 [14899 13919]
 [11804  9630]
 [14050 10736]
 [14056  9880]
 [25831 32

# Evaluating the Model Performance

In [28]:
from sklearn.metrics import r2_score,mean_squared_error as mse
from math import sqrt

# Compute each metric once and reuse it below.
r2 = r2_score(y_test, y_pred)
mse_value = mse(y_test, y_pred)

# Scale to a percentage first and round afterwards. Multiplying an
# already-rounded value by 100 can reintroduce floating-point noise
# (e.g. 0.57 * 100 -> 56.99999999999999).
# NOTE: R^2 is the share of target variance explained by the model, not a
# classification accuracy; the printed label is kept unchanged so existing
# recorded output still matches.
print('R Square: ',round(r2 * 100, 0),'% Accuracy')
print('MSE: ',round(mse_value,2))
print('RMSE: ',round(sqrt(mse_value),2))

R Square:  80.0 % Accuracy
MSE:  31827950.23
RMSE:  5641.63
