In [2]:
# DS4400 HW 2
# Problem 4: Polynomial Regression
import csv
import pandas as pd
import numpy as np
from sklearn.metrics import *

house_data = pd.read_csv("kc_house_data.csv")
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [3]:
# implementation copied over from problem 3
def lin_regress_train(X, y):
    ''' function that implements the closed form solution for multiple linear regression and returns theta
        uses np.linalg.pinv instead of inv because of the lecture slides '''
    theta = np.linalg.pinv(X.T @ X) @ X.T @ y
    return theta

def predict_response(X, theta):
    ''' function that returns the predicted responses from the multiple linear regression '''
    response = X @ theta
    return response

In [4]:
# consider a feature X, a response variable Y, and N samples of training data
def polynomial_features(X, p):
    ''' function that takes in the number of features to return polynomial feature matrix '''
    N = X.shape[0]
    X_poly = np.ones((N, p + 1))

    for i in range(1, p + 1):
        X_poly[:, i] = X[:, 0] ** i

    return X_poly

In [6]:
# considering the problem with X = sqft_living

# get training and testing data for column sqft_living
X_train = train_data[['sqft_living']].values
y_train = train_data['price'].div(1000).values.reshape(-1, 1)

X_test = test_data[['sqft_living']].values
y_test = test_data['price'].div(1000).values.reshape(-1, 1)

# intialize empty list for the results
metric_results = []

# choose three different values for p >= 5 (5, 10, 15)
for p in [1, 5, 10, 15]:
    # create the polynomial matrices
    X_train_poly = polynomial_features(X_train, p)
    X_test_poly = polynomial_features(X_test, p)

    # train the data using lin_regress_train function
    theta = lin_regress_train(X_train_poly, y_train)

    y_train_pred = predict_response(X_train_poly, theta)
    y_test_pred = predict_response(X_test_poly, theta)

    # get the MSE and r squared metrics for training and testing
    train_mse = round(mean_squared_error(y_train, y_train_pred), 2)
    train_r2 = round(r2_score(y_train, y_train_pred), 2)

    test_mse = round(mean_squared_error(y_test, y_test_pred), 2)
    test_r2 = round(r2_score(y_test, y_test_pred), 2)

    metric_results.append([p, train_mse, train_r2, test_mse, test_r2])

metric_table = pd.DataFrame(
    metric_results,
    columns = ["Degree (p)", "Train MSE", "Train R Squared", "Test MSE", "Test R Squared"]
)
print(metric_table)

   Degree (p)  Train MSE  Train R Squared     Test MSE  Test R Squared
0           1   57947.53             0.50     88575.98            0.47
1           5  129385.49            -0.12  13690251.07          -81.11
2          10  125593.52            -0.09  13435896.67          -79.59
3          15  123654.46            -0.07  13415428.70          -79.46


Discuss your observations on how the MSE and $R^2$ metrics change with the degree of the polynomial.

As the degree of the polynomial increases for both the training and testing MSE the value increases from  p = 1 to p = 5, but after 5 the values begin to decrease. For $R^2$ the values decrease from p = 1 to p = 5, but after they began to slowly increase, following an opposite pattern then the MSE.