In [1]:
# DS4400 HW 2
# Problem 5: Gradient Descent
import csv
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler

house_data = pd.read_csv("kc_house_data.csv")
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [2]:
# gradient descent based on algorithm from class
def gradient_descent(X, y, alpha, iterations):
    ''' function that implements a gradient descent for training linear regression'''
    N, d = X.shape
    theta = np.zeros(d)

    for _ in range(iterations):
        y_pred = X @ theta
        gradient = (2 / N) * X.T @ (y_pred - y)
        theta = theta - alpha * gradient
    return theta

In [3]:
# vary the value of the learning rate (at least 3 different values {0.01,0.1,0.5} and report the value of the model 
# parameter theta after different number of iterations (10, 50, and 100)
# set constants
learning_rates = [0.01, 0.1, 0.5]
num_iterations = [10, 50, 100]

# get the training and testing data from the csv files, and drop features mentioned in the homework document
house_features = house_data.drop(columns = ["id", "date", "zipcode", "price"])
house_price = house_data["price"].div(1000)

X_train = train_data.drop(columns = ["zipcode", "price"])
y_train = train_data["price"].div(1000)

X_test = test_data.drop(columns = ["id", "date", "zipcode", "price"])
y_test = test_data["price"].div(1000)

X_train = X_train.values
X_test = X_test.values

X_train = np.c_[np.ones(X_train.shape[0]), X_train]
X_test = np.c_[np.ones(X_test.shape[0]), X_test]

y_train = y_train.values
y_test = y_test.values

# scale the data to get rid of the error messages
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[:, 1:])
X_test_scaled = scaler.transform(X_test[:, 1:])

X_train_scaled = np.c_[np.ones(X_train_scaled.shape[0]), X_train_scaled]
X_test_scaled = np.c_[np.ones(X_test_scaled.shape[0]), X_test_scaled]

# initialize empty list
gd_results = []

# iterate through the different learning rates with the different number of iterations
for alpha in learning_rates:
    for iterations in num_iterations:
        theta = gradient_descent(X_train_scaled, y_train, alpha, iterations)

        y_train_pred = X_train @ theta
        y_test_pred = X_test @ theta

        # get the MSE and r squared metrics for training and testing
        train_mse = round(mean_squared_error(y_train, y_train_pred), 2)
        train_r2 = round(r2_score(y_train, y_train_pred), 2)

        test_mse = round(mean_squared_error(y_test, y_test_pred), 2)
        test_r2 = round(r2_score(y_test, y_test_pred), 2)

        gd_results.append([alpha, iterations, theta, train_mse, train_r2, test_mse, test_r2])

metric_table = pd.DataFrame(
    gd_results,
    columns = ["Learning Rate (Alpha)", "Iterations", "Theta", "Train MSE", "Train R Squared", "Test MSE", "Test R Squared"]
)
print(metric_table)

   Learning Rate (Alpha)  Iterations  \
0                   0.01          10   
1                   0.01          50   
2                   0.01         100   
3                   0.10          10   
4                   0.10          50   
5                   0.10         100   
6                   0.50          10   
7                   0.50          50   
8                   0.50         100   

                                               Theta      Train MSE  \
0  [95.19802483770326, -0.7597598995880689, 11.92...   2.064963e+11   
1  [330.8955303896301, 2.150484670225977, 6.00795...   1.617032e+11   
2  [451.3976498338788, 5.4157369361358505, -3.679...   9.027097e+10   
3  [464.5357166904187, 5.767519594948174, -4.6630...   8.642122e+10   
4  [520.4074063912901, 8.356439804763589, -12.719...   6.180966e+10   
5  [520.4148338939906, 8.445651779086882, -12.803...   7.117774e+10   
6  [520.4148342410479, 2726836.357778585, -406506...   3.492605e+24   
7  [2.2900625685483e+18, 2.5630

Write some observations about the behavior of the algorithm: How do the metrics change with different learning rates; How many iterations are needed; Does the algorithm converge to the optimal solution, etc.

With a learning rate of 0.01, as we increase the number of iterations, the training and testing MSE slightly decreases, while the $R^2$ increases. Next with a learning rate of 0.1, as we increase the number of iterations the the traning and testing MSE also continue to decrease, with $R^2$ showing similar patterns. However when we increase the learning rate to 0.5, all the metrics become large, not following the previous pattern. Therefore with this algorithm it is not able to converge to the optimal solution, and in order to do this we would need to investigate with different learning rates.  