In [4]:
import pandas as pd
# Load the housing data set from a CSV file given by a relative path.
df = pd.read_csv("USA_Housing.csv")

In [5]:
# Separate the regression target ('Price') from the remaining predictor columns.
Y = df['Price']
X = df.drop(columns='Price')

# Preview the first rows of each part.
print("Input Features (X):")
display(X.head())
print("\nOutput Variable (y):")
display(Y.head())

Input Features (X):


Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population
0,79545.45857,5.682861,7.009188,4.09,23086.8005
1,79248.64245,6.0029,6.730821,3.09,40173.07217
2,61287.06718,5.86589,8.512727,5.13,36882.1594
3,63345.24005,7.188236,5.586729,3.26,34310.24283
4,59982.19723,5.040555,7.839388,4.23,26354.10947



Output Variable (y):


0    1.059034e+06
1    1.505891e+06
2    1.058988e+06
3    1.260617e+06
4    6.309435e+05
Name: Price, dtype: float64

In [6]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on all rows before any train/test split
# is made, so held-out rows contribute to the scaling statistics - confirm
# this is acceptable for the evaluation that follows.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a DataFrame only for a readable preview.
display(pd.DataFrame(data=X_scaled, columns=X.columns).head())

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population
0,1.02866,-0.296927,0.021274,0.088062,-1.317599
1,1.000808,0.025902,-0.255506,-0.722301,0.403999
2,-0.684629,-0.112303,1.516243,0.93084,0.07241
3,-0.491499,1.221572,-1.393077,-0.58454,-0.186734
4,-0.807073,-0.944834,0.846742,0.201513,-0.988387


In [8]:
from sklearn.model_selection import KFold
# Build five shuffled, reproducible train/test partitions of the scaled data.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
X_folds = []  # one {'train': ..., 'test': ...} dict of feature arrays per fold
Y_folds = []  # matching dicts of target values, in the same fold order
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    # KFold yields positional row numbers, so the target Series must be
    # indexed by position (.iloc). Plain Y[...] is label-based and would pick
    # the wrong rows (or raise) if Y's index were not the default 0..n-1.
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    X_folds.append({'train': X_train, 'test': X_test})
    Y_folds.append({'train': Y_train, 'test': Y_test})

# Sanity-check the partition sizes using the first fold.
print("Shape of X_train:", X_folds[0]['train'].shape)
print("Shape of X_test:", X_folds[0]['test'].shape)
print("Shape of y_train:", Y_folds[0]['train'].shape)
print("Shape of y_test:", Y_folds[0]['test'].shape)
     

Shape of X_train: (4000, 5)
Shape of X_test: (1000, 5)
Shape of y_train: (4000,)
Shape of y_test: (1000,)


In [10]:
import numpy as np
from sklearn.metrics import r2_score
beta_matrices = []     # fitted coefficient vector (intercept first) per fold
predicted_values = []  # predictions on each fold's held-out rows
r2_scores = []         # R2 of those predictions, in fold order

# Walk over the folds that were actually built instead of a hard-coded count,
# so this cell stays correct if the number of splits changes.
for i, (X_fold, Y_fold) in enumerate(zip(X_folds, Y_folds)):
    X_train, X_test = X_fold['train'], X_fold['test']
    Y_train, Y_test = Y_fold['train'], Y_fold['test']

    # Prepend a column of ones so beta[0] acts as the intercept.
    X_train_intercept = np.insert(X_train, 0, 1, axis=1)

    # Ordinary least squares. lstsq solves min ||X beta - y|| directly and
    # yields the same solution as pinv(X^T X) X^T y, without forming X^T X
    # (which squares the condition number of the problem).
    beta, *_ = np.linalg.lstsq(X_train_intercept, np.asarray(Y_train), rcond=None)
    beta_matrices.append(beta)

    X_test_intercept = np.insert(X_test, 0, 1, axis=1)
    Y_pred = X_test_intercept @ beta
    predicted_values.append(Y_pred)

    r2 = r2_score(Y_test, Y_pred)
    r2_scores.append(r2)

    print(f"Iteration {i+1}:")
    print("Beta Matrix :", beta)
    print("R2 Score:", r2)
    print("-" * 30)

Iteration 1:
Beta Matrix : [1232002.6748241   230745.94073479  163243.27314515  120309.77397759
    3011.45976111  151552.63069359]
R2 Score: 0.9179971706985148
------------------------------
Iteration 2:
Beta Matrix : [1232037.85755946  229081.97914235  165882.1605634   121536.57475055
    2092.4478622   150874.99274586]
R2 Score: 0.9145677884802816
------------------------------
Iteration 3:
Beta Matrix : [1231951.92563846  230224.50511001  162766.17455493  121022.77324578
    1247.16258975  150234.77720419]
R2 Score: 0.9116116385364479
------------------------------
Iteration 4:
Beta Matrix : [1232751.46486511  229500.10043209  165212.07110924  122839.9376815
    3063.71699324  150917.88484984]
R2 Score: 0.9193091764960816
------------------------------
Iteration 5:
Beta Matrix : [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]
R2 Score: 0.9243869413350316
------------------------------


In [13]:
from sklearn.model_selection import train_test_split
# Pick the fold whose held-out R2 was highest and reuse its coefficients.
best_r2_index = np.argmax(r2_scores)
best_beta = beta_matrices[best_r2_index]

print(
    f"The best R2 score is {r2_scores[best_r2_index]:.4f} "
    f"from Iteration {best_r2_index + 1}."
)
print("The corresponding Beta Matrix is:", best_beta)

# Fresh 70/30 partition of the scaled data with a fixed seed.
X_train_70, X_test_30, Y_train_70, Y_test_30 = train_test_split(
    X_scaled,
    Y,
    test_size=0.3,
    random_state=42,
)

# Apply the selected coefficients to the 30% part; the leading column of
# ones lines up with the intercept term stored in best_beta[0].
X_test_30_intercept = np.insert(X_test_30, 0, 1, axis=1)
Y_pred_30 = X_test_30_intercept.dot(best_beta)

print("\nShape of 70% training set ", X_train_70.shape)
print("Shape of 30% testing set ", X_test_30.shape)
print("Shape of predicted values for 30% test set ", Y_pred_30.shape)
     

The best R2 score is 0.9244 from Iteration 5.
The corresponding Beta Matrix is: [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]

Shape of 70% training set  (3500, 5)
Shape of 30% testing set  (1500, 5)
Shape of predicted values for 30% test set  (1500,)


In [14]:
# Design matrix: a leading column of ones (intercept) followed by the features.
# It is rebuilt from `df` rather than from `X` itself so the cell is
# idempotent; the self-referential form prepended one more column of ones
# every time the cell was re-run.
# NOTE(review): these are the raw, unscaled feature columns (not X_scaled);
# gradient descent is sensitive to feature scale - confirm this is intended.
X = np.c_[np.ones((len(df), 1)), df.drop('Price', axis=1)]

# First carve off 56% for training, then divide the remaining 44% into
# validation and test in a 14:30 ratio, giving a 56/14/30 split overall.
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, train_size=0.56, random_state=42)
val_size = 14 / (14 + 30)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, train_size=val_size, random_state=42)
print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
     

Shapes: (2800, 6) (700, 6) (1500, 6)


In [15]:
def gradient_descent(X,Y,lr,n=1000):
    """Fit linear-regression coefficients by batch gradient descent.

    Minimizes the mean squared error ``1/m * ||X @ beta - Y||^2`` starting
    from an all-zero coefficient vector.

    Parameters
    ----------
    X : array-like of shape (m, p)
        Design matrix. Include a column of ones if an intercept is wanted.
    Y : array-like with m values
        Targets, given either as a 1-D sequence or as an (m, 1) column.
    lr : float
        Learning rate (step size) applied to each gradient step.
    n : int, default 1000
        Number of gradient steps to perform.

    Returns
    -------
    numpy.ndarray of shape (p, 1)
        The coefficient column vector after ``n`` updates.
    """
    X = np.asarray(X, dtype=float)
    # Force a column vector: subtracting a 1-D target from the (m, 1)
    # prediction would broadcast to an (m, m) matrix instead of a residual.
    Y = np.asarray(Y, dtype=float).reshape(-1, 1)

    # Keep the feature count under its own name so it cannot overwrite the
    # iteration-count argument `n`.
    m, n_features = X.shape
    beta = np.zeros((n_features, 1))
    for _ in range(n):
        # Gradient of the mean squared error with respect to beta.
        gradients = 2 / m * X.T @ (X @ beta - Y)
        beta = beta - lr * gradients
    return beta