# Importing the libraries

In [1]:
import pandas as pd

import numpy as np

from flask import Flask, render_template, request

import pickle

import matplotlib.pyplot as plt


# Importing the dataset

In [2]:
insurance = pd.read_csv('insurance.csv')


In [3]:
# Assuming `insurance` is your DataFrame
# Define columns to consider for outlier removal
numeric_cols = ['age', 'bmi', 'children', 'charges']

# Calculate quartiles
Q1 = insurance[numeric_cols].quantile(0.25)
Q3 = insurance[numeric_cols].quantile(0.75)

# Calculate IQR
IQR = Q3 - Q1

# Define threshold for outlier detection
threshold = 1.5

# Identify outliers
outliers = ((insurance[numeric_cols] < (Q1 - threshold * IQR)) | (insurance[numeric_cols] > (Q3 + threshold * IQR))).any(axis=1)

# Remove outliers
insurance = insurance[~outliers]

# Display the shape of the cleaned dataset
#print("Original Dataset Shape:", insurance.shape)
#print("Cleaned Dataset Shape:", cleaned_data.shape)

In [4]:
insurance.head(50)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [5]:
X = insurance.iloc[:, :-1].values
y = insurance.iloc[:, -1].values

In [6]:
insurance.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1193 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1193 non-null   int64  
 1   sex       1193 non-null   object 
 2   bmi       1193 non-null   float64
 3   children  1193 non-null   int64  
 4   smoker    1193 non-null   object 
 5   region    1193 non-null   object 
 6   charges   1193 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 74.6+ KB


In [7]:
print(X)

[[19 'female' 27.9 0 'yes' 'southwest']
 [18 'male' 33.77 1 'no' 'southeast']
 [28 'male' 33.0 3 'no' 'southeast']
 ...
 [18 'female' 36.85 0 'no' 'southeast']
 [21 'female' 25.8 0 'no' 'southwest']
 [61 'female' 29.07 0 'yes' 'northwest']]


In [8]:
print(y)

[16884.924   1725.5523  4449.462  ...  1629.8335  2007.945  29141.3603]


# Encoding categorical data

# Encoding the Independent Variable

In [9]:
import sklearn

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1,4,5])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [11]:
print(X)

[[1.0 0.0 0.0 ... 19 27.9 0]
 [0.0 1.0 1.0 ... 18 33.77 1]
 [0.0 1.0 1.0 ... 28 33.0 3]
 ...
 [1.0 0.0 1.0 ... 18 36.85 0]
 [1.0 0.0 1.0 ... 21 25.8 0]
 [1.0 0.0 0.0 ... 61 29.07 0]]


# Encoding the Dependent Variable

In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [13]:
print(y)

[ 999   56  304 ...   31   90 1165]


# Splitting the dataset into the Training set and Test set

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [15]:
print(X_train)

[[0.0 1.0 1.0 ... 19 35.4 0]
 [1.0 0.0 1.0 ... 24 24.225 0]
 [0.0 1.0 1.0 ... 42 35.97 2]
 ...
 [0.0 1.0 1.0 ... 56 34.43 0]
 [0.0 1.0 1.0 ... 45 30.2 1]
 [1.0 0.0 1.0 ... 31 32.775 2]]


In [16]:
print(X_test)

[[0.0 1.0 1.0 ... 43 27.36 3]
 [1.0 0.0 1.0 ... 25 32.23 1]
 [1.0 0.0 1.0 ... 62 31.73 0]
 ...
 [1.0 0.0 0.0 ... 25 30.2 0]
 [0.0 1.0 1.0 ... 62 21.4 0]
 [0.0 1.0 1.0 ... 61 23.655 0]]


In [17]:
print(y_train)

[  17  176  512 1057   98   80  889  193  568  166  611  751  824 1174
 1012  186  292  546   74   49  692  418  242  897  604    8  701  336
 1041 1054  708  485  389  149  506  532  767   87  695  833  826  936
  554  175  744  591   68 1055 1134   58 1087  733  387  150 1179 1132
  409  774  684 1131 1144  363  516  706  734  287  995 1106  742  141
  994  773 1026  612  922  592  206   29  131   76  631  798 1091 1029
  854   43  792  240 1089  382  643 1075 1088  424  304  951  204  686
 1016 1101  707 1107  957  299  770  108  797 1058 1140  245    9  530
  132 1000  993 1141 1155 1102  237  291  620  454  470  588   62  754
  519  983  723 1188 1019  309  582   99  151   45  493 1074 1077  408
   40  765  584  195  885  965  101  763  600  989  419  441 1135  827
  435  290  164  310  339  285  294  190   47  523  977  807  872  249
  375  986  208 1007  746  960  665  662   48  587   38  887   39 1182
 1129 1086  392  289  939 1014  904  978  814  266    3   84  987  705
  155 

In [18]:
print(y_test)

[ 619 1020  944  644  623   11  680 1145  118 1127  844   16  894  921
  369  962  603  793   55  505   50  265  720  852 1052  340  851  764
  159 1156  385  259 1186 1068  438  990 1167  374  996 1001  567  167
  699  839  344  448  782  109  574  917 1096  367  442  906  840  974
  654 1104  320  888  895  785  499  805  327 1082  627 1125  663  275
  406  177  821  124  669  838  169  935   89 1069  716   82   94  796
  248 1120 1148  712  812  503  538  246  342  515  992  698  583  972
  580  738 1147 1105  324 1157  829 1098  760  638 1010  755 1176  120
 1130  898  690 1117  834  181  338  586  678 1184  129   85  674  332
  542 1150  362  471  276  545  533  462  219 1037   86 1142  615  170
  900  616 1128  550  431  905  308  483 1146  119  679   33  356 1166
  460   13  657  429  466  709  687 1062  183  632 1044  646  548  653
   24  749 1171  306 1090  780  488  230 1151 1139  209 1160  256  371
  272  142 1061  837 1123  849  721 1172   65   93  215  778  799  335
  279 

# Feature Scaling

In [19]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [20]:
print(X_train)

[[0.0 1.0 1.0 ... -1.4069207770456922 0.9188587005170387
  -0.8849501820936011]
 [1.0 0.0 1.0 ... -1.0491718508498422 -0.9839407807101465
  -0.8849501820936011]
 [0.0 1.0 1.0 ... 0.2387242834552181 1.0159142445259421 0.780222349893175]
 ...
 [0.0 1.0 1.0 ... 1.2404212768035983 0.753694002817677
  -0.8849501820936011]
 [0.0 1.0 1.0 ... 0.45337363917272816 0.03343970254107772
  -0.05236391610021307]
 [1.0 0.0 1.0 ... -0.548323354175652 0.4718923794234045 0.780222349893175]]


In [21]:
print(X_test)

[[0.0 1.0 1.0 ... 0.3102740686943881 -0.4501352886611779
  1.6128086158865629]
 [1.0 0.0 1.0 ... -0.9776220656106721 0.37909365752015445
  -0.05236391610021307]
 [1.0 0.0 1.0 ... 1.6697199882386184 0.29395721540708186
  -0.8849501820936011]
 ...
 [1.0 0.0 0.0 ... -0.9776220656106721 0.03343970254107772
  -0.8849501820936011]
 [0.0 1.0 1.0 ... 1.6697199882386184 -1.4649616786490105
  -0.8849501820936011]
 [0.0 1.0 1.0 ... 1.5981702029994482 -1.08099632471905
  -0.8849501820936011]]


# Training the Multiple Linear Regression model on the Training set

In [22]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [23]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 673.5   619.  ]
 [ 277.75 1020.  ]
 [ 907.75  944.  ]
 [ 669.75  644.  ]
 [ 652.75  623.  ]
 [  98.75   11.  ]
 [ 720.75  680.  ]
 [1439.25 1145.  ]
 [ 206.75  118.  ]
 [1458.25 1127.  ]
 [ 811.75  844.  ]
 [ 136.25   16.  ]
 [ 832.5   894.  ]
 [ 855.25  921.  ]
 [ 460.25  369.  ]
 [ 896.25  962.  ]
 [ 633.75  603.  ]
 [ 771.25  793.  ]
 [ 137.75   55.  ]
 [ 581.75  505.  ]
 [ 164.25   50.  ]
 [ 388.25  265.  ]
 [ 730.75  720.  ]
 [ 779.25  852.  ]
 [ 323.75 1052.  ]
 [ 393.75  340.  ]
 [ 825.75  851.  ]
 [ 791.25  764.  ]
 [ 285.25  159.  ]
 [ 917.25 1156.  ]
 [ 486.75  385.  ]
 [ 334.75  259.  ]
 [ 856.5  1186.  ]
 [ 830.   1068.  ]
 [ 511.25  438.  ]
 [ 733.75  990.  ]
 [1520.   1167.  ]
 [ 447.25  374.  ]
 [1002.25  996.  ]
 [ 756.75 1001.  ]
 [ 615.25  567.  ]
 [ 290.25  167.  ]
 [ 755.75  699.  ]
 [ 801.75  839.  ]
 [ 394.25  344.  ]
 [ 545.25  448.  ]
 [ 206.75  782.  ]
 [ 186.25  109.  ]
 [ 644.25  574.  ]
 [ 917.25  917.  ]
 [ 331.25 1096.  ]
 [ 422.25  367.  ]
 [ 490.25  4

In [24]:
train_score = regressor.score(X_train, y_train)
test_score = regressor.score(X_test, y_test)

In [25]:
print("Training R^2 Score:", train_score)
print("Test R^2 Score:", test_score)

Training R^2 Score: 0.7288909413248311
Test R^2 Score: 0.6210197989456475


In [26]:
import pandas as pd

# Assuming `insurance` is your DataFrame
# Define columns to consider for outlier removal
numeric_cols = ['age', 'bmi', 'children', 'charges']

# Calculate quartiles
Q1 = insurance[numeric_cols].quantile(0.25)
Q3 = insurance[numeric_cols].quantile(0.75)

# Calculate IQR
IQR = Q3 - Q1

# Define threshold for outlier detection
threshold = 1.5

# Identify outliers
outliers = ((insurance[numeric_cols] < (Q1 - threshold * IQR)) | (insurance[numeric_cols] > (Q3 + threshold * IQR))).any(axis=1)

# Remove outliers
cleaned_data = insurance[~outliers]

# Display the shape of the cleaned dataset
print("Original Dataset Shape:", insurance.shape)
print("Cleaned Dataset Shape:", cleaned_data.shape)


Original Dataset Shape: (1193, 7)
Cleaned Dataset Shape: (1131, 7)


In [27]:
cleaned_data.head(50)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
10,25,male,26.22,0,no,northeast,2721.3208
