In [None]:
#Lets predict the medical cost of patients given some input variables

In [43]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler

In [39]:
df = pd.read_csv('medical_cost_prediction.csv')

In [40]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


PREPROCESSING STEPS: 
1. change categorical data to numeric data
2. normalize the data so it falls withing a range
3. split the data into train and test
4. assign features to both x and y variables

In [41]:
#Turning categorical variables into numerical variables
df['sex']= df['sex'].replace({'male':1, 'female':0})
df['smoker']=df['smoker'].replace({'yes':1,'no':0})
df['region'] = df['region'].replace(['southeast', 'southwest', 'northeast','northwest'],[1,2,3,4])

In [42]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,2,16884.92400
1,18,1,33.770,1,0,1,1725.55230
2,28,1,33.000,3,0,1,4449.46200
3,33,1,22.705,0,0,4,21984.47061
4,32,1,28.880,0,0,4,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,4,10600.54830
1334,18,0,31.920,0,0,3,2205.98080
1335,18,0,36.850,0,0,1,1629.83350
1336,21,0,25.800,0,0,2,2007.94500


In [45]:
#using minmax scalar to normalize the data
# scaler = MinMaxScaler(feature_range=(0,1))

# scaled_train = scaler.fit_transform(training_data)
# scaled_test = scaler.transform(testing_data)

x input features are age, sex, bmi, childern, smoker and region
y target variable is charges- which is what we want to predict

In [46]:
x = df.drop(['charges'], axis = 1)
y = df['charges']


In [47]:
x

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,2
1,18,1,33.770,1,0,1
2,28,1,33.000,3,0,1
3,33,1,22.705,0,0,4
4,32,1,28.880,0,0,4
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,4
1334,18,0,31.920,0,0,3
1335,18,0,36.850,0,0,1
1336,21,0,25.800,0,0,2


In [48]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

Next, lets check the spape of x and y variables and ensure none is a rank one variable.

In [49]:
x.shape

(1338, 6)

In [50]:
y.shape

(1338,)

Since y is a rank one variable, lets reshape


In [51]:
y = y.values.reshape(-1, 1)

In [52]:
y.shape #now its a 2D vector which is what we need!

(1338, 1)

In [53]:
#perform train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [54]:
#Lets define the model
model = linear_model.LinearRegression()

In [55]:
model.fit(x_train, y_train)

LinearRegression()

In [56]:
y_pred = model.predict(x_test)

In [62]:
for i in range(y_pred.size):
  print("Actual = ", y_test[i], "\t Predicted = ", y_pred[i], "\t Difference = ", abs(y_test[i]-y_pred[i]))


Actual =  [2719.27975] 	 Predicted =  [3616.97086199] 	 Difference =  [897.69111199]
Actual =  [17878.90068] 	 Predicted =  [7675.25623625] 	 Difference =  [10203.64444375]
Actual =  [23244.7902] 	 Predicted =  [32859.78238918] 	 Difference =  [9614.99218918]
Actual =  [14394.5579] 	 Predicted =  [13147.30675441] 	 Difference =  [1247.25114559]
Actual =  [7261.741] 	 Predicted =  [11000.70445512] 	 Difference =  [3738.96345512]
Actual =  [1719.4363] 	 Predicted =  [2149.91171793] 	 Difference =  [430.47541793]
Actual =  [11743.9341] 	 Predicted =  [14485.371314] 	 Difference =  [2741.437214]
Actual =  [4350.5144] 	 Predicted =  [5266.22336994] 	 Difference =  [915.70896994]
Actual =  [7749.1564] 	 Predicted =  [10348.36283408] 	 Difference =  [2599.20643408]
Actual =  [8827.2099] 	 Predicted =  [9677.35195424] 	 Difference =  [850.14205424]
Actual =  [34166.273] 	 Predicted =  [27541.71135252] 	 Difference =  [6624.56164748]
Actual =  [10231.4999] 	 Predicted =  [12016.99037921] 	 Diff

In [63]:
MSE = mean_squared_error(y_test, y_pred) 
MAE = mean_absolute_error(y_test, y_pred)
r2score = r2_score(y_test, y_pred)

In [64]:
print("MSE = ", MSE, "RMSE = ", MSE**0.5, "MAE = ", MAE, "r2score = ", r2score)

MSE =  35886552.06685599 RMSE =  5990.538545644789 MAE =  4134.940524562701 r2score =  0.7274455251648322
