In [1]:
# MSE - MAE - MAPE - RMSE

In [2]:
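# MSE  (Mean Squared Error):             mean of (y - y_guess)^2
# MAE  (Mean Absolute Error):            mean of |y - y_guess|
# MAPE (Mean Absolute Percentage Error): mean of |y - y_guess| / |y|, reported as a fraction (0.42 = 42%)
# RMSE (Root Mean Squared Error):        square root of the MSE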

In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [4]:
# Load the raw dataset. The path is relative, so the CSV is looked up in the
# kernel's current working directory.
medical_cost = pd.read_csv("medical_cost_personal_datasets.csv")

In [5]:
# Work on a copy so the raw frame in `medical_cost` stays untouched.
df = medical_cost.copy()

In [6]:
# Peek at the first rows to check which columns and value types were loaded.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [7]:
# (rows, columns) of the raw data.
df.shape

(1338, 7)

In [8]:
# Preview a full one-hot encoding (one column per category level).
# The result is only displayed, not assigned, so `df` is unchanged by this cell.
pd.get_dummies(df, columns=["sex", "smoker", "region"])

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,True,False,False,True,False,False,False,True
1,18,33.770,1,1725.55230,False,True,True,False,False,False,True,False
2,28,33.000,3,4449.46200,False,True,True,False,False,False,True,False
3,33,22.705,0,21984.47061,False,True,True,False,False,True,False,False
4,32,28.880,0,3866.85520,False,True,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,False,True,True,False,False,True,False,False
1334,18,31.920,0,2205.98080,True,False,True,False,True,False,False,False
1335,18,36.850,0,1629.83350,True,False,True,False,False,False,True,False
1336,21,25.800,0,2007.94500,True,False,True,False,False,False,False,True


In [9]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each (sex_female, smoker_no, region_northeast) so the dummy columns
# are not perfectly collinear with the model's intercept.
# Encode from the untouched `medical_cost` frame rather than from `df` itself:
# `df` is rebound here, so re-running this cell on the already-encoded `df`
# would raise a KeyError because "sex"/"smoker"/"region" no longer exist.
df = pd.get_dummies(
    medical_cost,
    columns=["sex", "smoker", "region"],
    drop_first=True,
)

In [10]:
# Confirm the encoded layout: the categorical columns are replaced by dummies.
df.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,False,True,False,False,True
1,18,33.77,1,1725.5523,True,False,False,True,False
2,28,33.0,3,4449.462,True,False,False,True,False
3,33,22.705,0,21984.47061,True,False,True,False,False
4,32,28.88,0,3866.8552,True,False,True,False,False


In [11]:
# Features: every column except the target.
x = df.drop(columns="charges")
# Target: kept as a one-column DataFrame (list selector), not a Series.
y = df.loc[:, ["charges"]]

In [12]:
# Ordinary least-squares fit of charges on all features.
# fit() returns the estimator itself, so `model` and `lm` are the same object.
lm = LinearRegression()
model = lm.fit(X=x, y=y)

In [13]:
# R^2 (coefficient of determination) computed on the same data the model was
# fitted on, so this is an in-sample score rather than a held-out estimate.
model.score(x, y)

0.7509130345985207

In [14]:
# Predict the charge for one hand-written profile (the same values as row 0:
# age 19, bmi 27.9, no children, female, smoker, southwest).
# The values are wrapped in a one-row DataFrame labelled with x.columns so each
# number is tied to its feature explicitly; passing a bare list to a model that
# was fitted on a DataFrame makes scikit-learn warn that X has no valid
# feature names and relies purely on positional order.
sample = pd.DataFrame(
    [[19, 27.900, 0, 0, 1, 0, 0, 1]],
    columns=x.columns,
)
model.predict(sample)



array([[25293.7130284]])

In [15]:
# Empty frame that the following cells fill column by column with the actual
# values, the predictions and the per-row error terms.
df_error = pd.DataFrame()

In [16]:
# Actual charges: the single-column target frame `y` becomes column "y".
df_error["y"] = y

In [17]:
# Sanity check: only the actual charges are present so far.
df_error.head()

Unnamed: 0,y
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


In [18]:
# In-sample predictions: the model is applied to the same `x` it was fitted on.
y_guess = model.predict(x)

In [19]:
# Predicted charges. predict() returned a column-shaped 2-D array here because
# the target was a one-column DataFrame, so flatten it to one value per row.
df_error["y_guess"] = y_guess.ravel()

In [20]:
# Actual vs. predicted charges side by side.
df_error.head()

Unnamed: 0,y,y_guess
0,16884.924,25293.713028
1,1725.5523,3448.602834
2,4449.462,6706.988491
3,21984.47061,3754.830163
4,3866.8552,5592.493386


In [21]:
# Signed residual per row (actual minus predicted), built from the columns
# already stored in df_error.
df_error["y_error"] = df_error["y"] - df_error["y_guess"]

In [22]:
# y_error = y - y_guess, so a negative value means the model over-predicted.
df_error.head()

Unnamed: 0,y,y_guess,y_error
0,16884.924,25293.713028,-8408.789028
1,1725.5523,3448.602834,-1723.050534
2,4449.462,6706.988491,-2257.526491
3,21984.47061,3754.830163,18229.640447
4,3866.8552,5592.493386,-1725.638186


In [23]:
# Squared residual per row; its mean over all rows is the MSE.
df_error["squared_error"] = df_error["y_error"].pow(2)

In [24]:
# Squaring removes the sign and weights large misses much more heavily.
df_error.head()

Unnamed: 0,y,y_guess,y_error,squared_error
0,16884.924,25293.713028,-8408.789028,70707730.0
1,1725.5523,3448.602834,-1723.050534,2968903.0
2,4449.462,6706.988491,-2257.526491,5096426.0
3,21984.47061,3754.830163,18229.640447,332319800.0
4,3866.8552,5592.493386,-1725.638186,2977827.0


In [25]:
# Magnitude of each residual; its mean over all rows is the MAE.
df_error["absolute_error"] = df_error["y_error"].abs()

In [26]:
# Absolute error: the size of each miss, in the same units as the charges.
df_error.head()

Unnamed: 0,y,y_guess,y_error,squared_error,absolute_error
0,16884.924,25293.713028,-8408.789028,70707730.0,8408.789028
1,1725.5523,3448.602834,-1723.050534,2968903.0,1723.050534
2,4449.462,6706.988491,-2257.526491,5096426.0,2257.526491
3,21984.47061,3754.830163,18229.640447,332319800.0,18229.640447
4,3866.8552,5592.493386,-1725.638186,2977827.0,1725.638186


In [27]:
# Residual relative to the actual value, as a fraction (not multiplied by 100);
# its mean over all rows is the MAPE.
df_error["absolute_percentage_error"] = (df_error["y_error"] / df_error["y"]).abs()

In [28]:
# Absolute percentage error is a fraction of the actual charge (0.5 = 50%).
df_error.head()

Unnamed: 0,y,y_guess,y_error,squared_error,absolute_error,absolute_percentage_error
0,16884.924,25293.713028,-8408.789028,70707730.0,8408.789028,0.498006
1,1725.5523,3448.602834,-1723.050534,2968903.0,1723.050534,0.99855
2,4449.462,6706.988491,-2257.526491,5096426.0,2257.526491,0.507371
3,21984.47061,3754.830163,18229.640447,332319800.0,18229.640447,0.829205
4,3866.8552,5592.493386,-1725.638186,2977827.0,1725.638186,0.446264


In [29]:
# Column means: the means of squared_error, absolute_error and
# absolute_percentage_error are the hand-computed MSE, MAE and MAPE, which the
# scikit-learn calls in the next cells should reproduce.
df_error.mean()

y                            1.327042e+04
y_guess                      1.327042e+04
y_error                     -9.679525e-13
squared_error                3.650189e+07
absolute_error               4.170887e+03
absolute_percentage_error    4.203527e-01
dtype: float64

In [30]:
# MSE via scikit-learn; compare with the mean of the squared_error column.
mean_squared_error(y, y_guess)

36501893.00741544

In [31]:
# MAE via scikit-learn; compare with the mean of the absolute_error column.
mean_absolute_error(y, y_guess)

4170.886894163588

In [32]:
# MAPE via scikit-learn, returned as a fraction rather than a percentage;
# compare with the mean of the absolute_percentage_error column.
mean_absolute_percentage_error(y, y_guess)

0.4203526847372706