In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [42]:
insurance_df = pd.read_csv('./data/insurance.csv')
insurance_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [43]:
insurance_df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [44]:
print("The shape of the dataset is:", insurance_df.shape)
insurance_df.describe()

The shape of the dataset is: (1338, 7)


Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [45]:
categorical_variable_cols = ['sex', 'smoker', 'region']
insurance_df_encoded = pd.get_dummies(insurance_df, columns=categorical_variable_cols, prefix=categorical_variable_cols)
print("The encoded dataframe sample:")
insurance_df_encoded.head()

The encoded dataframe sample:


Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


In [46]:
y = insurance_df_encoded['charges']
X = insurance_df_encoded.loc[:, insurance_df_encoded.columns != 'charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [47]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test)

In [48]:
print('Coefficients: \n')
for coef_idx, coef in enumerate(lr_model.coef_):
    print(X_test.columns[coef_idx], coef)

Coefficients: 

age 260.64734817645643
bmi 328.7854301097502
children 481.13821392630166
sex_female 63.017576761290265
sex_male -63.01757676129078
smoker_no -12030.709760935388
smoker_yes 12030.709760935386
region_northeast 552.0362092747016
region_northwest 37.76316886525212
region_southeast -334.8458900847217
region_southwest -254.95348805523372


In [49]:
print('R2 for the linear regressor:', round(r2_score(y_test, y_pred),4))


R2 for the linear regressor: 0.7424
