# [9660] Linear Regression 2
Data file:
* https://raw.githubusercontent.com/vjavaly/Baruch-CIS-9660/main/data/insurance_rates.csv

In [None]:
from datetime import datetime

# Explicit, portable spelling of the "%D %T" shorthand. Those two directives are
# not among the format codes Python documents as supported on every platform,
# because strftime() defers to the platform's C library.
RUN_TIME_FORMAT = "%m/%d/%y %H:%M:%S"

# Stamp the notebook with the time of this run
print(f'Run time: {datetime.now().strftime(RUN_TIME_FORMAT)}')

Run time: 09/09/24 12:58:05


### Initialize global variables

In [None]:
# Seed passed to randomized steps so that results are reproducible between runs.
# A `global` statement only has an effect inside a function body; at notebook
# (module) level a plain assignment already creates the global name.
random_state = 42

### Import libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt

### Load data
Columns
* age: age of primary beneficiary
* gender: female=0, male=1
* bmi: body mass index
* children: # of children covered by health insurance
* smoker: yes=1, no=0
* region: the beneficiary's residential area in the US (northeast, southeast, southwest, or northwest)
* charges: individual medical costs billed by health insurance

In [None]:
# Load the insurance rates CSV (hosted on GitHub) into a dataframe
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/vjavaly/Baruch-CIS-9660/main/data/insurance_rates.csv'
)

### Examine data

In [None]:
# Dimensions of the dataframe as (rows, columns)
df.shape

(1338, 7)

In [None]:
# Column labels of the dataframe
df.columns

Index(['age', 'gender', 'smoker', 'bmi', 'children', 'region', 'charges'], dtype='object')

In [None]:
# Data type of each column; non-numeric (object) columns cannot be fed to the model as-is
df.dtypes

Unnamed: 0,0
age,int64
gender,int64
smoker,int64
bmi,float64
children,int64
region,object
charges,float64


In [None]:
# Preview the first rows of the dataframe
df.head()

Unnamed: 0,age,gender,smoker,bmi,children,region,charges
0,19,0,1,27.9,0,southwest,16884.924
1,18,1,0,33.77,1,southeast,1725.5523
2,28,1,0,33.0,3,southeast,4449.462
3,33,1,0,22.705,0,northwest,21984.47061
4,32,1,0,28.88,0,northwest,3866.8552


In [None]:
# Preview the last rows of the dataframe
df.tail()

Unnamed: 0,age,gender,smoker,bmi,children,region,charges
1333,50,1,0,30.97,3,northwest,10600.5483
1334,18,0,0,31.92,0,northeast,2205.9808
1335,18,0,0,36.85,0,southeast,1629.8335
1336,21,0,0,25.8,0,southwest,2007.945
1337,61,0,1,29.07,0,northwest,29141.3603


### Prepare data for model training

In [None]:
# Drop non-numeric columns: region
# Rebinding `df` (instead of dropping in place) together with errors='ignore'
# keeps this cell safe to re-run: once 'region' is gone, running it again is a
# no-op rather than a KeyError.
df = df.drop(columns=['region'], errors='ignore')
df.head()

Unnamed: 0,age,gender,smoker,bmi,children,charges
0,19,0,1,27.9,0,16884.924
1,18,1,0,33.77,1,1725.5523
2,28,1,0,33.0,3,4449.462
3,33,1,0,22.705,0,21984.47061
4,32,1,0,28.88,0,3866.8552


### Separate independent and dependent variables

In [None]:
# Target (dependent variable): the 'charges' column
y = df['charges']

# Features (independent variables): every column other than 'charges'
X = df.drop(columns=['charges'])

In [None]:
# Feature matrix dimensions as (rows, feature columns)
X.shape

(1338, 5)

In [None]:
# Confirm the Python type of the feature container
type(X)

In [None]:
# Preview the first rows of the features ('charges' should no longer be present)
X.head()

Unnamed: 0,age,gender,smoker,bmi,children
0,19,0,1,27.9,0
1,18,1,0,33.77,1
2,28,1,0,33.0,3
3,33,1,0,22.705,0
4,32,1,0,28.88,0


In [None]:
# Target dimensions; the row count should match X
y.shape

(1338,)

In [None]:
# Confirm the Python type of the target container
type(y)

In [None]:
# Preview the first target values
y.head()

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


### Split data into training and test sets

In [None]:
# Hold out 30% of the rows as a test set; the remaining 70% are used for training.
# The shared seed keeps the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_state,
)

X_train: This contains 70% of the feature data used to train the model.

X_test: This contains 30% of the feature data that will be used to test the model after training.

y_train: This contains 70% of the target values (the actual outputs) for training.

y_test: This contains 30% of the target values for testing.

### Train Linear Regression model

In [None]:
# Instantiate LinearRegression model with its default settings (not yet fitted)
model = LinearRegression()

In [None]:
# Show the model's hyperparameters (all defaults, since none were passed above)
model.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [None]:
# Fit LinearRegression model using the training set only; the test set is kept
# aside for evaluation
model.fit(X_train, y_train)

### Calculate model performance for training and test sets

In [None]:
# Evaluate the fitted model on the same data it was trained on
y_train_predict = model.predict(X_train)
mse = mean_squared_error(y_true=y_train, y_pred=y_train_predict)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units

print(
    "Model performance for training set",
    "----------------------------------",
    f"MSE is {round(mse, 2)}",
    f"RMSE is {round(rmse, 2)}",
    sep="\n",
)

Model performance for training set
----------------------------------
MSE is 37878481.86
RMSE is 6154.55


`model.predict(X_train)` asks the model to make predictions for the training data (X_train), i.e. the same features the model learned from.

The result, y_train_predict, contains the predicted values for the target variable based on the training data.

The Root Mean Squared Error (RMSE) is simply the square root of the MSE. It is used because it’s in the same units as the target variable, making it easier to interpret.

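RMSE tells you roughly how far the model’s predictions are from the actual values in a typical case; because errors are squared before averaging, large errors weigh more heavily than small ones.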

In [None]:
# Evaluate the fitted model on the held-out test data
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units

print(
    "Model performance for test set",
    "------------------------------",
    f"MSE is {round(mse, 2)}",
    f"RMSE is {round(rmse, 2)}",
    sep="\n",
)

Model performance for test set
------------------------------
MSE is 34003912.39
RMSE is 5831.29


### Review model intercept and coefficients

In [None]:
# Fitted intercept (constant term) of the regression equation
model.intercept_

-12538.439849853145

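The intercept is the predicted value of the target (y) when all the input features (X) are zero. It is negative here because an all-zero input (for example age 0 and BMI 0) lies far outside the observed data, so the intercept has no practical meaning on its own.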

In [None]:
# Fitted coefficients, one per feature, in the column order of the training
# features (age, gender, smoker, bmi, children)
model.coef_

array([  261.91061673,   136.65119758, 23618.76182167,   333.36099462,
         432.1792927 ])

In [None]:
# Smallest observed value of each feature (lower end of the range the data covers)
min_values = X.min()
min_values

Unnamed: 0,0
age,18.0
gender,0.0
smoker,0.0
bmi,15.96
children,0.0


In [None]:
# Largest observed value of each feature (upper end of the range the data covers)
max_values = X.max()
max_values

Unnamed: 0,0
age,64.0
gender,1.0
smoker,1.0
bmi,53.13
children,5.0


### Execute model with new independent variable values
Values set by professor

In [None]:
# First new beneficiary to score
age_1 = 31
gender_1 = 1
smoker_1 = 1
bmi_1 = 30
children_1 = 2

# Apply the regression equation by hand: intercept + sum(coefficient * value).
# Values are listed in the same order as the model's coefficients
# (age, gender, smoker, bmi, children).
features_1 = (age_1, gender_1, smoker_1, bmi_1, children_1)
predicted_charges = model.intercept_
for coefficient, value in zip(model.coef_, features_1):
    predicted_charges = predicted_charges + coefficient * value
print('Predicted charges =', round(predicted_charges,2))

Predicted charges = 30201.39


In [None]:
# Second new beneficiary to score
age_2 = 46
gender_2 = 0
smoker_2 = 0
bmi_2 = 23
children_2 = 1

# Apply the regression equation by hand: intercept + sum(coefficient * value).
# Values are listed in the same order as the model's coefficients
# (age, gender, smoker, bmi, children).
features_2 = (age_2, gender_2, smoker_2, bmi_2, children_2)
predicted_charges = model.intercept_
for coefficient, value in zip(model.coef_, features_2):
    predicted_charges = predicted_charges + coefficient * value
print('Predicted charges =', round(predicted_charges,2))

Predicted charges = 7608.93
