In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df=pd.read_csv("boston.csv")

In [4]:
n_entries = len(df)
shuffled_indices = np.random.permutation(n_entries)
train_size = int(n_entries * 0.8)
train_indices = shuffled_indices[:train_size]
test_indices = shuffled_indices[train_size:]
train_df = df.iloc[train_indices]
test_df = df.iloc[test_indices]
train_df.to_csv("trainingdataset.csv")
test_df.to_csv("testingdataset.csv")

In [17]:
train_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
24,0.75026,0.0,8.14,0,0.538,5.924,94.1,4.3996,4,307.0,21.0,394.33,16.3,15.6
197,0.04666,80.0,1.52,0,0.404,7.107,36.6,7.309,2,329.0,12.6,354.31,8.61,30.3
473,4.64689,0.0,18.1,0,0.614,6.98,67.6,2.5329,24,666.0,20.2,374.68,11.66,29.8
102,0.22876,0.0,8.56,0,0.52,6.405,85.4,2.7147,5,384.0,20.9,70.8,10.63,18.6
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311.0,15.2,396.9,19.15,27.1


In [6]:
x=train_df.drop(columns='MEDV').to_numpy(dtype=float)
y=train_df['MEDV'].values.reshape(-1,1)

mean = np.mean(x, axis=0)
std = np.std(x, axis=0)
features_scaled = (x - mean) / std

bias = np.ones((x.shape[0], 1))
x_scaled = np.concatenate([bias, features_scaled], axis=1)

x = x_scaled

print(x.shape)
print(y.shape)
print(x)

(404, 14)
(404, 1)
[[ 1.         -0.33266261 -0.50408189 ...  1.15506675  0.41882623
   0.49444489]
 [ 1.         -0.40998946  2.8543858  ... -2.74487805 -0.01639277
  -0.56410248]
 [ 1.          0.09558373 -0.50408189 ...  0.78364343  0.20513174
  -0.1442625 ]
 ...
 [ 1.          3.72399008 -0.50408189 ...  0.78364343 -3.66485911
   0.24942352]
 [ 1.          3.79981127 -0.50408189 ...  0.78364343  0.44677507
   2.46149846]
 [ 1.          0.99187731 -0.50408189 ...  0.78364343 -1.25386329
   1.52546177]]


In [7]:
theta = np.linalg.inv(x.T@x) @ (x.T@y)
theta

array([[22.39727723],
       [-1.10018417],
       [ 1.0638435 ],
       [-0.08875959],
       [ 0.58988386],
       [-1.70071326],
       [ 2.68691112],
       [-0.32353847],
       [-3.26726455],
       [ 2.41149875],
       [-1.80059285],
       [-2.20432421],
       [ 0.80088973],
       [-3.58141958]])

In [20]:
yhat=x@theta
yhat[:7]

array([[15.4158866 ],
       [33.12784222],
       [25.38275925],
       [19.71496568],
       [19.61882663],
       [28.81774404],
       [17.70737047]])

In [19]:
y[:7]

array([[15.6],
       [30.3],
       [29.8],
       [18.6],
       [27.1],
       [31.2],
       [17.8]])

===========================================================================================================

TESTING

===========================================================================================================

In [10]:
x_test=test_df.drop(columns="MEDV").to_numpy(dtype=float)
y_test=test_df["MEDV"].values.reshape(-1,1)

mean_test = np.mean(x_test, axis=0)
std_test = np.std(x_test, axis=0)
features_scaled_test = (x_test - mean_test) / std_test

bias = np.ones((x_test.shape[0], 1))
x_test_scaled = np.concatenate([bias, features_scaled_test], axis=1)

x_test = x_test_scaled
print(x_test.shape)
print(y_test.shape)
yhat_test=x_test@theta
yhat_test[:5]

(102, 14)
(102, 1)


array([[15.29250963],
       [20.58506578],
       [19.6867549 ],
       [26.82986864],
       [16.76756153]])

R² Score for datasets without Ridge Regression

In [34]:
SS_res = np.sum((y - yhat) ** 2)
SS_tot = np.sum((y - np.mean(y)) ** 2)
R_sq = 1 - (SS_res / SS_tot)
print("R² Score for Training dataset:", R_sq)
SS_res = np.sum((y_test - yhat_test) ** 2)
SS_tot = np.sum((y_test - np.mean(y_test)) ** 2)
R_sq = 1 - (SS_res / SS_tot)
print("R² Score for Testing dataset:", R_sq)

R² Score for Training dataset: 0.7461381930613034
R² Score for Testing dataset: 0.6988614483458049


Ridge Regression

In [27]:
def ridge(alpha,x,y):
    n_features = x.shape[1]
    I = np.eye(n_features)
    I[0, 0] = 0
    beta = np.linalg.inv(x.T@x + alpha * I) @ x.T@y
    return beta

In [32]:
yhat_ridge=x@ridge(0.01,x,y)
yhat_ridge_test=x_test @ ridge(0.01,x_test,y_test)
print(yhat_ridge[:7],"\n")
print(yhat_ridge_test[:7])

[[15.41625754]
 [33.12789248]
 [25.38244523]
 [19.7152244 ]
 [19.61907665]
 [28.8180207 ]
 [17.70754569]] 

[[16.35459725]
 [25.46199168]
 [21.85525655]
 [25.7797256 ]
 [15.48763657]
 [20.51366158]
 [25.30611436]]


In [31]:
print(y[:7],"\n")
print(y_test[:7])

[[15.6]
 [30.3]
 [29.8]
 [18.6]
 [27.1]
 [31.2]
 [17.8]] 

[[10.2]
 [21.4]
 [23. ]
 [23.3]
 [14.3]
 [17. ]
 [33. ]]


R² Score for datasets using Ridge Regression

In [26]:
SS_res = np.sum((y - yhat_ridge) ** 2)
SS_tot = np.sum((y - np.mean(y)) ** 2)
R_sq = 1 - (SS_res / SS_tot)
print("R² Score for Training dataset:", R_sq)
SS_res = np.sum((y_test - yhat_ridge_test) ** 2)
SS_tot = np.sum((y_test - np.mean(y_test)) ** 2)
R_sq = 1 - (SS_res / SS_tot)
print("R² Score for Testing dataset:", R_sq)

R² Score for Training dataset: 0.7461381914343319
R² Score for Testing dataset: 0.7596035002133408
