# Assignment 3
- Name : Deepinder Singh Saini
- Roll No. : 102303673

Q1: K-Fold Cross Validation for Multiple Linear Regression (Least Square Error Fit)  
Download the dataset regarding USA House Price Prediction from the following link:  
https://drive.google.com/file/d/1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX/view?usp=sharing  
Load the dataset and Implement 5- fold cross validation for multiple linear regression 
(using least square error fit).  
Steps:  
a) Divide the dataset into input features (all columns except price) and output variable  
(price)  
b) Scale the values of input features.  
c) Divide input and output features into five folds.  
d) Run five iterations, in each iteration consider one-fold as test set and remaining 
four sets as training set. Find the beta (𝛽) matrix, predicted values, and R2_score 
for each iteration using least square error fit.  
e) Use the best value of (𝛽) matrix (for which R2_score is maximum), to train the regressor for 70% of data and test the performance for remaining 30% data.

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("datasetML3/USA_Housing.csv")
X = data.drop("Price", axis=1).values
y = data["Price"].values

In [3]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score


In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle=True, random_state=1)

In [5]:
best_r2 = -999
best_beta = None
fold = 1

In [6]:
for train_idx, test_idx in kf.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # add bias column of ones
    X_train_b = np.c_[np.ones(len(X_train)), X_train]
    X_test_b = np.c_[np.ones(len(X_test)), X_test]

    # beta = (X^T X)^-1 X^T y
    beta = np.linalg.inv(X_train_b.T @ X_train_b) @ (X_train_b.T @ y_train)

    y_pred = X_test_b @ beta
    r2 = r2_score(y_test, y_pred)

    print("Fold", fold, "R2:", r2)
    fold += 1

    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print("\nBest R2:", best_r2)
print("Best Beta:", best_beta)

Fold 1 R2: 0.9215935236979075
Fold 2 R2: 0.9103750966275798
Fold 3 R2: 0.9122196024515719
Fold 4 R2: 0.9189354745915842
Fold 5 R2: 0.9245545959223683

Best R2: 0.9245545959223683
Best Beta: [1231877.38800101  229760.39069988  163970.88379553  121802.91672949
    1457.42287297  151113.18179475]


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)
X_train_b = np.c_[np.ones(len(X_train)), X_train]
X_test_b = np.c_[np.ones(len(X_test)), X_test]

beta_final = np.linalg.inv(X_train_b.T @ X_train_b) @ (X_train_b.T @ y_train)
y_pred_final = X_test_b @ beta_final
print("Final 70-30 R2:", r2_score(y_test, y_pred_final))

Final 70-30 R2: 0.91669122715985


Q2: Validation Set with Gradient Descent

In [28]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=1)

In [29]:
X_train_b = np.c_[np.ones(len(X_train)), X_train]
X_val_b = np.c_[np.ones(len(X_val)), X_val]
X_test_b = np.c_[np.ones(len(X_test)), X_test]

In [30]:
def gradient_descent(X, y, lr, iters):
    m, n = X.shape
    beta = np.zeros(n)
    for _ in range(iters):
        grad = -(2/m) * (X.T @ (y - X @ beta))
        beta -= lr * grad
    return beta

In [31]:
rates = [0.001, 0.01, 0.1, 1]
best_beta = None
best_val_r2 = -999

for lr in rates:
    beta = gradient_descent(X_train_b, y_train.ravel(), lr, 1000)  # flatten y
    r2_val = r2_score(y_val.ravel(), X_val_b @ beta)
    r2_test = r2_score(y_test.ravel(), X_test_b @ beta)

    print("LR:", lr, "Val R2:", r2_val, "Test R2:", r2_test)

    if r2_val > best_val_r2:
        best_val_r2 = r2_val
        best_beta = beta

print("\nBest Beta:", best_beta)

LR: 0.001 Val R2: 0.677259725055287 Test R2: 0.6885406194155927
LR: 0.01 Val R2: 0.9219863212945743 Test R2: 0.9167072179386627
LR: 0.1 Val R2: 0.9219864090633332 Test R2: 0.9167072348076073
LR: 1 Val R2: -inf Test R2: -inf

Best Beta: [1232488.80073994  230880.08572505  163167.26704817  120997.04242009
    2965.33799873  150238.15145298]


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


Q3: Car Price Prediction Preprocessing + Regression

In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

# load data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
cols = ["symboling","normalized_losses","make","fuel_type","aspiration","num_doors",
        "body_style","drive_wheels","engine_location","wheel_base","length","width",
        "height","curb_weight","engine_type","num_cylinders","engine_size","fuel_system",
        "bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg",
        "highway_mpg","price"]
df = pd.read_csv(url, names=cols, na_values="?")

In [33]:
for c in df.columns:
    if df[c].dtype == "object":
        df[c].fillna(df[c].mode()[0], inplace=True)
    else:
        df[c].fillna(df[c].mean(), inplace=True)

df = df.dropna(subset=["price"])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].mode()[0], inplace=True)


In [34]:
rep = {"two":2,"three":3,"four":4,"five":5,"six":6,"eight":8,"twelve":12}
df["num_doors"] = df["num_doors"].replace(rep)
df["num_cylinders"] = df["num_cylinders"].replace(rep)

df = pd.get_dummies(df, columns=["body_style","drive_wheels"], drop_first=True)

for c in ["make","aspiration","engine_location","fuel_type"]:
    df[c] = LabelEncoder().fit_transform(df[c])

df["fuel_system"] = df["fuel_system"].apply(lambda x: 1 if "pfi" in str(x) else 0)
df["engine_type"] = df["engine_type"].apply(lambda x: 1 if "ohc" in str(x) else 0)

  df["num_doors"] = df["num_doors"].replace(rep)
  df["num_cylinders"] = df["num_cylinders"].replace(rep)


In [35]:
X = df.drop("price", axis=1)
y = df["price"].astype(float)

X = StandardScaler().fit_transform(X)

# regression
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)
model = LinearRegression()
model.fit(X_train, y_train)
print("R2 without PCA:", r2_score(y_test, model.predict(X_test)))

R2 without PCA: 0.7965829228348336


In [36]:
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, train_size=0.7, random_state=1)
model2 = LinearRegression()
model2.fit(X_train, y_train)
print("R2 with PCA:", r2_score(y_test, model2.predict(X_test)))

R2 with PCA: 0.8106448760373239
