In [49]:
# DS4400 HW 2
# Problem 2: Implementing closed-form solution for linear regression
import csv
import pandas as pd
import numpy as np
from sklearn.metrics import *

house_data = pd.read_csv("kc_house_data.csv")
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [50]:
# closed form solution for multiple linear regression: theta = (X^T * X)^-1*X^Ty
# function to predict the response on a new testing point
def lin_regress_train(X, y):
    ''' function that implements the closed form solution for multiple linear regression and returns theta
        uses np.linalg.pinv instead of inv because of the lecture slides '''
    theta = np.linalg.pinv(X.T @ X) @ X.T @ y
    return theta

def predict_response(X, theta):
    ''' function that returns the predicted responses from the multiple linear regression '''
    response = X @ theta
    return response

In [51]:
# get the training and testing data from the csv files, and drop features mentioned in the homework document
house_features = house_data.drop(columns = ["id", "date", "zipcode", "price"])
house_price = house_data["price"].div(1000)

X_train = train_data.drop(columns = ["zipcode", "price"])
y_train = train_data["price"].div(1000)

X_test = test_data.drop(columns = ["id", "date", "zipcode", "price"])
y_test = test_data["price"].div(1000)

# convert the data into matricies to implement the closed form solution
X_train = X_train.values
y_train = y_train.values.reshape(-1, 1)

X_test = X_test.values
y_test = y_test.values.reshape(-1, 1)

# add a column of intercepts becuase sklearn automatically does this
X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])


In [52]:
# use lin_regress_traina nd predict_response to run closed form linear regression on training set
theta = lin_regress_train(X_train, y_train)
y_train_pred = predict_response(X_train, theta)

# report MSE and R^2
train_mse = round(mean_squared_error(y_train, y_train_pred), 2)
train_r2 = round(r2_score(y_train, y_train_pred), 2)

print("The MSE metric for the training set is:", train_mse)
print("The R Squared Metric for the training set it is:", train_r2)

# ask about the values being slightly off, what is casuing this?

The MSE metric for the training set is: 31800.45
The R Squared Metric for the training set it is: 0.72


In [53]:
# use lin_regress_traina nd predict_response to run closed form linear regression on testing set
theta = lin_regress_train(X_test, y_test)
y_test_pred = predict_response(X_test, theta)

# report MSE and R^2
test_mse = round(mean_squared_error(y_test, y_test_pred), 2)
test_r2 = round(r2_score(y_test, y_test_pred), 2)

print("The MSE metric for the testng set is:", test_mse)
print("The R Squared Metric for the testing set it is:", test_r2)

The MSE metric for the testng set is: 54797.58
The R Squared Metric for the testing set it is: 0.67


Report the MSE and $R^2$ metrics for the models you implemented on both training and testing sets and compare these metrics to the ones given by the package implementation from Problem 2. Discuss if the results of your implementation are similar to those of the package.

The MSE for the training set is 31,800.45 and the $R^2$ is 0.72. The MSE for the testing set is 54,797.58 and $R^2$ is 0.67. Compared to the metrics from the package, these metrics have a slightly lower $R^2$, and the training set has a higher MSE, while the testing set has a lower MSE. Overall they are similar to the ones in the package, but there are some differences in the MSE, that may be important to note.