In [1]:
# In this notebook, I will be using Scikit-Learn's LinearRegression class to forecast medical costs. 

In [2]:
# Import Statements
import numpy as np
import pandas as pd
import sklearn
from pathlib import Path

In [3]:
# Read the insurance CSV from the local 'archive' folder into a pandas DataFrame
root = Path('archive')
filename = 'insurance.csv'
csv_path = root / filename

insurance_data = pd.read_csv(csv_path)
insurance_data.head()  # preview the first five rows

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Summarize the column dtypes and non-null counts of the loaded frame.
insurance_data.info()  # every column is fully non-null, so no imputing is required

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Disabled exploration step: the code below is wrapped in a triple-quoted string,
# so it is never executed and the cell only echoes the string itself.
# NOTE(review): if this is re-enabled on a recent pandas, DataFrame.corr() may need
# numeric_only=True because sex/smoker/region are object columns -- confirm.
'''
corr_matrix = insurance_data.corr()
corr_matrix['charges'].sort_values(ascending = False)
'''

"\ncorr_matrix = insurance_data.corr()\ncorr_matrix['charges'].sort_values(ascending = False)\n"

In [6]:
# Disabled exploration step: the scatter-matrix plot of the numeric columns below is
# wrapped in a triple-quoted string, so nothing is plotted and the cell only echoes
# the string itself.
'''
from pandas.plotting import scatter_matrix

scatter_matrix(insurance_data[['age','bmi','children','charges']], figsize = (12,8))
'''

"\nfrom pandas.plotting import scatter_matrix\n\nscatter_matrix(insurance_data[['age','bmi','children','charges']], figsize = (12,8))\n"

In [7]:
# Split the frame column-wise: the six predictor columns go to X, the target goes to Y
predictor_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
target_columns = ['charges']  # selecting with a list keeps Y a one-column DataFrame

X = insurance_data[predictor_columns]
Y = insurance_data[target_columns]

# Preview the first rows of the predictors and of the target
print(X.head(), '\n\n', Y.head())

   age     sex     bmi  children smoker     region
0   19  female  27.900         0    yes  southwest
1   18    male  33.770         1     no  southeast
2   28    male  33.000         3     no  southeast
3   33    male  22.705         0     no  northwest
4   32    male  28.880         0     no  northwest 

        charges
0  16884.92400
1   1725.55230
2   4449.46200
3  21984.47061
4   3866.85520


In [8]:
# Shuffle the datapoints and create a train (80%) and test (20%) set.
# A fixed seed makes the split identical on every Restart & Run All; without it
# each run trains and evaluates on different rows, so scores are not comparable.
SPLIT_SEED = 42
shuffled_indices = np.random.RandomState(SPLIT_SEED).permutation(len(X))
train_set_size = int(0.8*len(X))
train_indices = shuffled_indices[:train_set_size]
test_indices = shuffled_indices[train_set_size:]

# The shuffled values are row positions, so select with .iloc. Using .loc would
# treat them as index labels, which is only correct for a default RangeIndex.
X_train = X.iloc[train_indices]
Y_train = Y.iloc[train_indices]
X_test = X.iloc[test_indices]
Y_test = Y.iloc[test_indices]

# Sanity check: the two subsets together cover every row exactly once
assert len(X_train) + len(X_test) == len(X)

print(X_train.head(), '\n\n', Y_train.head())

      age     sex    bmi  children smoker     region
559    19    male  35.53         0     no  northwest
781    18    male  41.14         0     no  southeast
728    18  female  40.28         0     no  northeast
177    54    male  29.20         1     no  southwest
1099   25  female  33.99         1     no  southeast 

          charges
559    1646.4297
781    1146.7966
728    2217.6012
177   10436.0960
1099   3227.1211


In [9]:
# Prepare the data for the Linear Regression model
num_attributes = ['age','bmi','children'] # numerical attributes
cat_attributes = ['sex','smoker','region'] # categorical attributes

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Standardize the numerical attributes
num_pipeline = Pipeline([
    ('scaler_num',StandardScaler()),
])
# Use the OneHotEncoder to expand each categorical attribute into 1 column per
# category (1: category present, 0: category absent).
# handle_unknown='ignore' encodes a category that was absent from the data used
# for fitting as all zeros instead of raising an error at transform time, which
# can otherwise happen when a random split leaves a category out of the train set.
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
# Construct a custom pipeline, performing the scaling on the numerical attributes and the encoding on the categorical attributes
pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes),
])
# Fit the transformers on the training set only, then apply the already-fitted
# transformers to the test set
X_train_prepared = pipeline.fit_transform(X_train)
X_test_prepared = pipeline.transform(X_test)
X_train_prepared[:10]


array([[-1.43276354,  0.77783992, -0.88882573,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-1.50419477,  1.69232646, -0.88882573,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-1.50419477,  1.55213779, -0.88882573,  1.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.0673294 , -0.25401388, -0.05457702,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-1.00417618,  0.5268044 , -0.05457702,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-1.07560741, -0.4952688 , -0.88882573,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.   

In [10]:
# Estimate model performance with 3-fold cross-validation on the training set
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

# The scorer reports negated MSE per fold; flip the sign and take the square
# root to express each fold's error as an RMSE.
neg_mse_per_fold = cross_val_score(
    lin_reg,
    X_train_prepared,
    Y_train,
    scoring='neg_mean_squared_error',
    cv=3,
)
scores = np.sqrt(-neg_mse_per_fold)

print('Scores: ', scores )
print('Mean: ', scores.mean())
print('Standard Deviation: ', scores.std())

Scores:  [5811.18457265 6321.98992996 6102.79028868]
Mean:  6078.654930429944
Standard Deviation:  209.23258971915195


In [11]:
# Fit the linear regression on the prepared training features and their charges
lin_reg.fit(X_train_prepared, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
# Predict on the prepared test data and evaluate the model's performance
from sklearn.metrics import mean_squared_error

Y_test_predictions = lin_reg.predict(X_test_prepared)

# RMSE is the square root of the mean squared error between the true and predicted charges
mse = mean_squared_error(Y_test, Y_test_predictions)
rmse = np.sqrt(mse)
print('RMSE on Test Set: ', rmse)

RMSE on Test Set:  6039.359222915662
