In [65]:
import numpy as np
import pandas as pd
import math

In [66]:
# Directory prefix for the input CSV files. It is concatenated directly with
# the file name when the data is loaded, so the trailing slash is required.
PATH = 'data/'

In [67]:
# Training split. Column dtypes are pinned explicitly rather than inferred.
train = pd.read_csv(
    f'{PATH}kc_house_train_data.csv',
    dtype={
        'bathrooms': float,
        'waterfront': int,
        'sqft_above': int,
        'sqft_living15': float,
        'grade': int,
        'yr_renovated': int,
        'price': float,
        'bedrooms': float,
        'zipcode': str,
        'long': float,
        'sqft_lot15': float,
        'sqft_living': float,
        'floors': str,
        'condition': int,
        'lat': float,
        'date': str,
        'sqft_basement': int,
        'yr_built': int,
        'id': str,
        'sqft_lot': int,
        'view': int,
    },
)

In [68]:
# Test split. Uses the same explicit column dtypes as the training split.
test = pd.read_csv(
    f'{PATH}kc_house_test_data.csv',
    dtype={
        'bathrooms': float,
        'waterfront': int,
        'sqft_above': int,
        'sqft_living15': float,
        'grade': int,
        'yr_renovated': int,
        'price': float,
        'bedrooms': float,
        'zipcode': str,
        'long': float,
        'sqft_lot15': float,
        'sqft_living': float,
        'floors': str,
        'condition': int,
        'lat': float,
        'date': str,
        'sqft_basement': int,
        'yr_built': int,
        'id': str,
        'sqft_lot': int,
        'view': int,
    },
)

In [69]:
def get_numpy_data(df,features,output):
    """Turn a DataFrame into a (feature matrix, output array) pair of NumPy arrays.

    The feature matrix has a leading column named 'constant' filled with 1
    (the intercept term), followed by the requested feature columns in the
    order given.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is NOT modified: the constant column is added to a
        copy, so calling this function repeatedly on the same frame is safe.
    features : sequence of str
        Names of the feature columns to extract.
    output : str
        Name of the target column.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The 2-D feature matrix (one row per DataFrame row, one column per
        feature plus the constant) and the 1-D array of target values.
    """
    columns = ['constant'] + list(features)
    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain a 'constant' column as a side effect.
    features_matrix = df.assign(constant=1)[columns].to_numpy()
    output_array = df[output].to_numpy()
    return features_matrix,output_array

In [70]:
def predict_outcome(features_matrix,weights):
    """Return the linear-model predictions for every row of the feature matrix.

    The result is the dot product of ``features_matrix`` with ``weights``.
    """
    return np.dot(features_matrix, weights)

In [71]:
def feature_derivative(errors,feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    The gradient-descent routine uses this as the derivative with respect to
    the weight that belongs to ``feature``.
    """
    inner_product = np.dot(errors, feature)
    return 2 * inner_product

In [72]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, max_iterations=None):
    """Fit linear-regression weights by batch gradient descent.

    Every iteration computes ``errors = predictions - output``, forms the
    gradient ``2 * feature_matrix.T . errors`` (one partial derivative per
    weight), and moves all weights by ``-step_size * gradient``. The loop
    stops after the first step whose gradient has a Euclidean norm below
    ``tolerance``.

    Parameters
    ----------
    feature_matrix : array-like, shape (n_rows, n_weights)
        One column per weight; column ``i`` belongs to ``weights[i]``.
    output : array-like, shape (n_rows,)
        Target values.
    initial_weights : array-like, shape (n_weights,)
        Starting point. It is copied, so the caller's array is not modified.
    step_size : float
        Multiplier applied to the gradient on each update.
    tolerance : float
        Convergence threshold on the gradient norm.
    max_iterations : int or None, optional
        Upper bound on the number of updates. ``None`` (the default) means
        iterate until convergence.

    Returns
    -------
    numpy.ndarray
        The fitted weights as a float array.

    Raises
    ------
    FloatingPointError
        If the gradient becomes infinite or NaN (the step size is too large),
        since the convergence test could then never succeed.
    """
    feature_matrix = np.asarray(feature_matrix, dtype=float)
    output = np.asarray(output, dtype=float)
    # Force a float copy: with integer initial weights, an integer array
    # would silently truncate every update.
    weights = np.array(initial_weights, dtype=float)

    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        errors = np.dot(feature_matrix, weights) - output
        # All partial derivatives at once. The errors are fixed for the whole
        # iteration, so this matches updating the weights one by one.
        gradient = 2 * np.dot(feature_matrix.T, errors)
        weights -= step_size * gradient
        iterations += 1

        # The norm must accumulate the squared derivative of EVERY weight.
        # Looking at only one of them can report convergence far too early.
        gradient_magnitude = math.sqrt(np.dot(gradient, gradient))
        if not math.isfinite(gradient_magnitude):
            raise FloatingPointError(
                "gradient descent diverged (non-finite gradient); try a smaller step_size"
            )
        if gradient_magnitude < tolerance:
            break
    return weights


In [73]:
# Simple model: a single feature (sqft_living) plus the constant column that
# get_numpy_data prepends to the feature matrix.
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(train, simple_features, my_output)
# One starting weight per matrix column, in the order [constant, sqft_living].
initial_weights = np.array([-47000., 1.])
# Step size and convergence threshold passed to regression_gradient_descent.
step_size = 7e-12
tolerance = 2.5e7

In [74]:
# Fit the simple model's weights on the training data.
simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size,tolerance)

In [75]:
# Fitted weights of the simple model, ordered [constant, sqft_living].
simple_weights #Ans1

array([-46999.88716555,    281.91211918])

In [76]:
# Simple-model prediction for the first test row, computed by hand as
# constant weight + sqft_living weight * sqft_living.
simple_weights[0] + simple_weights[1] * test.iloc[0]['sqft_living'] #Ans2

356134.4432550024

In [77]:
# Build the test-set feature matrix and output array for the simple model.
(test_feature_matrix, test_output) = get_numpy_data(test, simple_features, my_output)

In [78]:
# Predict every test row with the simple model's fitted weights.
test_predictions = predict_outcome(test_feature_matrix, simple_weights)

In [79]:
# Display the simple model's test-set predictions.
test_predictions

array([356134.443255  , 784640.86440132, 435069.83662406, ...,
       663418.65315598, 604217.10812919, 240550.47439317])

In [80]:
def rss(x,y):
    """Return the residual sum of squares: the sum of squared differences between x and y."""
    residuals = x - y
    return (residuals ** 2).sum()

In [81]:
# Residual sum of squares of the simple model's predictions on the test set.
rss(test_predictions, test_output)

275400044902128.3

In [82]:
# Two-feature model: sqft_living and sqft_living15 plus the constant column
# that get_numpy_data prepends to the feature matrix.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train, model_features,my_output)
# One starting weight per matrix column: [constant, sqft_living, sqft_living15].
# NOTE: initial_weights, step_size and tolerance rebind the names used for the
# simple model above.
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [84]:
# Fit the two-feature model's weights on the training data.
complex_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

In [86]:
# Fitted weights of the two-feature model, ordered [constant, sqft_living, sqft_living15].
complex_weights

array([-9.99999688e+04,  2.45035463e+02,  6.53198571e+01])

In [87]:
# Rebuild the test-set arrays with the two-feature column set. This overwrites
# the test_feature_matrix / test_output created for the simple model.
(test_feature_matrix, test_output) = get_numpy_data(test, model_features, my_output)

In [88]:
# Predict every test row with the two-feature model. This overwrites the
# simple model's test_predictions.
test_predictions = predict_outcome(test_feature_matrix, complex_weights)

In [89]:
# Display the two-feature model's test-set predictions.
test_predictions #Ans3

array([366670.08864306, 762639.14060956, 386290.12537701, ...,
       682095.43732627, 585567.28873248, 216562.4574978 ])

In [90]:
# Actual value of the 'price' column for the first test row, for comparison
# with the two models' predictions for that row.
test.iloc[0]['price'] #Ans4

310000.0

In [91]:
# Residual sum of squares of the two-feature model's predictions on the test set.
rss(test_predictions, test_output) #Ans5

270261660020645.3