In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
df=pd.read_csv("insurance.csv")
df['smoker']=df['smoker'].map({'yes': 1, 'no': 0})
df['sex']=df['sex'].map({'male': 1, 'female': 0})
df = pd.get_dummies(df, columns=['region'], drop_first=False)
df=df.sort_values(by='charges',ascending=False)
print(df.head())
df.shape

      age  sex     bmi  children  smoker      charges  region_northeast  \
543    54    0  47.410         0       1  63770.42801             False   
1300   45    1  30.360         0       1  62592.87309             False   
1230   52    1  34.485         3       1  60021.39897             False   
577    31    0  38.095         1       1  58571.07448              True   
819    33    0  35.530         0       1  55135.40209             False   

      region_northwest  region_southeast  region_southwest  
543              False              True             False  
1300             False              True             False  
1230              True             False             False  
577              False             False             False  
819               True             False             False  


(1338, 10)

In [5]:
n_entries = len(df)
shuffled_indices = np.random.permutation(n_entries)
train_size = int(n_entries * 0.8)
train_indices = shuffled_indices[:train_size]
test_indices = shuffled_indices[train_size:]
train_df = df.iloc[train_indices]
test_df = df.iloc[test_indices]

In [6]:
train_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
458,56,1,39.6,0,0,10601.412,False,False,False,True
1007,47,1,28.215,3,1,24915.22085,False,True,False,False
1129,19,0,18.6,0,0,1728.897,False,False,False,True
25,59,0,27.72,3,0,14001.1338,False,False,True,False
70,27,0,24.75,0,1,16577.7795,False,False,True,False


In [7]:
# plt.figure(figsize=(10, 10))
# plt.scatter(df['region'], df['charges'])
# plt.title('Medical Charges vs Age')
# plt.xlabel('Age')
# plt.ylabel('Medical Charges')
# plt.grid(True)
# plt.show()

In [8]:
x = train_df.drop(columns='charges').to_numpy(dtype=float)
y = train_df['charges'].to_numpy().reshape(-1, 1).astype(float)


mean = np.mean(x, axis=0)
std = np.std(x, axis=0)
features_scaled = (x - mean) / std

bias = np.ones((features_scaled.shape[0], 1))
x_scaled = np.concatenate([bias, features_scaled], axis=1)

x = x_scaled

print(x.shape)
print(y.shape)
print(x)

(1070, 10)
(1070, 1)
[[ 1.          1.20375552  0.96509478 ... -0.57375304 -0.61263475
   1.80167471]
 [ 1.          0.56394066  0.96509478 ...  1.74291017 -0.61263475
  -0.55503915]
 [ 1.         -1.42659446 -1.03616766 ... -0.57375304 -0.61263475
   1.80167471]
 ...
 [ 1.         -1.42659446 -1.03616766 ...  1.74291017 -0.61263475
  -0.55503915]
 [ 1.         -0.21805528  0.96509478 ...  1.74291017 -0.61263475
  -0.55503915]
 [ 1.         -1.21332284 -1.03616766 ... -0.57375304  1.63229396
  -0.55503915]]


theta=((X'X)^-1)(X'Y)

In [11]:
theta = np.linalg.inv(x.T@x) @ x.T@y
theta

array([[13058.49399663],
       [ 3545.53225531],
       [  -42.59773035],
       [ 2042.891057  ],
       [  551.51206484],
       [ 9642.12363974],
       [  142.61690173],
       [ -110.83798163],
       [   31.61892417],
       [   64.44153997]])

In [12]:
y[:10]

array([[10601.412  ],
       [24915.22085],
       [ 1728.897  ],
       [14001.1338 ],
       [16577.7795 ],
       [11833.7823 ],
       [ 4266.1658 ],
       [ 4433.3877 ],
       [17748.5062 ],
       [ 1720.3537 ]])

In [13]:
yhat=x @ theta
yhat[:10]

array([[15023.2459277 ],
       [33895.90567669],
       [-1238.16614507],
       [13187.90899378],
       [26733.59059085],
       [ 9905.14870293],
       [10850.72931482],
       [ 6095.29157615],
       [25770.13555689],
       [ 2623.50487549]])

In [14]:
yhat = x @ theta
SS_res = np.sum((y - yhat) ** 2)
SS_tot = np.sum((y - np.mean(y)) ** 2)
R_sq = 1 - (SS_res / SS_tot)
print("R² Score:", R_sq)

R² Score: 0.7560132214388482


==========================================================================================================

TESTING

==========================================================================================================

In [15]:
test_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
816,24,0,24.225,0,0,2842.76075,False,True,False,False
441,33,0,33.5,0,1,37079.372,False,False,False,True
1280,48,0,33.33,0,0,8283.6807,False,False,True,False
781,18,1,41.14,0,0,1146.7966,False,False,True,False
145,29,0,38.83,3,0,5138.2567,False,False,True,False


In [16]:
x_test=test_df.drop(columns='charges').to_numpy(dtype=float)
y_test=test_df['charges'].to_numpy().reshape(-1, 1).astype(float)

features_scaled = (x_test - mean) / std

bias = np.ones((features_scaled.shape[0], 1))
x_test_scaled = np.concatenate([bias, features_scaled], axis=1)

x_test = x_test_scaled



print(x_test.shape)
print(y_test.shape)

(268, 10)
(268, 1)


In [17]:
yhat_test=x_test @ theta
yhat_test[:10]

array([[ 1493.99509694],
       [31252.08087092],
       [10915.00504942],
       [ 5879.16257671],
       [ 9340.56399954],
       [ 7661.42324179],
       [ 9655.74562661],
       [32516.03619637],
       [ 4981.42493287],
       [ 2248.36438815]])

In [18]:
y_test[:10]

array([[ 2842.76075],
       [37079.372  ],
       [ 8283.6807 ],
       [ 1146.7966 ],
       [ 5138.2567 ],
       [ 7623.518  ],
       [ 5979.731  ],
       [38711.     ],
       [ 3866.8552 ],
       [ 1815.8759 ]])

In [19]:
SS_res = np.sum((y_test - yhat_test) ** 2)
SS_tot = np.sum((y_test - np.mean(y_test)) ** 2)
R_sq = 1 - (SS_res / SS_tot)
print("R² Score:", R_sq)

R² Score: 0.7224948571148682
