In [2]:
'''
@author: Gaio
@summary: Multiple Regression - Predicting House Prices with graphlab & gradient descent
'''

'\n@author: Gaio\n@summary: Multiple Regression - Predicting House Prices with graphlab & gradient descent\n'

In [3]:
#imports
import graphlab
import numpy as np
import math

In [4]:
#import data
sales = graphlab.SFrame('kc_house_data.gl/')

This non-commercial license of GraphLab Create for academic use is assigned to claudio.gaiaschi@gmail.com and will expire on January 04, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1487570739.log


In [5]:
#split data into train & test sets
train_data,test_data = sales.random_split(.8,seed=0)

In [6]:
#apply multiple regression on feature set
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
example_model = graphlab.linear_regression.create(train_data, target = 'price', features = example_features, 
                                                  validation_set = None)

In [7]:
#print weigths
example_weight_summary = example_model.get("coefficients")
print example_weight_summary

+-------------+-------+----------------+---------------+
|     name    | index |     value      |     stderr    |
+-------------+-------+----------------+---------------+
| (intercept) |  None | 87910.0724924  |  7873.3381434 |
| sqft_living |  None | 315.403440552  | 3.45570032585 |
|   bedrooms  |  None | -65080.2155528 | 2717.45685442 |
|  bathrooms  |  None | 6944.02019265  | 3923.11493144 |
+-------------+-------+----------------+---------------+
[4 rows x 4 columns]



In [8]:
#apply model to train data
example_predictions = example_model.predict(train_data)
print example_predictions[0] #271789.505878

271789.505878


In [9]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    features_sframe=data_sframe[features]
    print features_sframe
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = features_sframe.to_numpy()
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’
    output_sarray= data_sframe[output]
    # this will convert the SArray into a numpy array:
    output_array = output_sarray.to_numpy() # GraphLab Create>= 1.7!!
    return(features_matrix, output_array)

In [10]:
#test function
output_array = get_numpy_data(sales,example_features, 'price')

+----------+-------------+----------+-----------+
| constant | sqft_living | bedrooms | bathrooms |
+----------+-------------+----------+-----------+
|    1     |    1180.0   |   3.0    |    1.0    |
|    1     |    2570.0   |   3.0    |    2.25   |
|    1     |    770.0    |   2.0    |    1.0    |
|    1     |    1960.0   |   4.0    |    3.0    |
|    1     |    1680.0   |   3.0    |    2.0    |
|    1     |    5420.0   |   4.0    |    4.5    |
|    1     |    1715.0   |   3.0    |    2.25   |
|    1     |    1060.0   |   3.0    |    1.5    |
|    1     |    1780.0   |   3.0    |    1.0    |
|    1     |    1890.0   |   3.0    |    2.5    |
+----------+-------------+----------+-----------+
[21613 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [11]:
#predicted output = matrix mult of predicted weights (feature_matrix) & predicted output (output_array)
def predict_outcome(feature_matrix, weights):
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

In [12]:
#derivative of regression cost function = 2 dot product of feature & error predictions
def feature_derivative(errors, feature):
    derivative = 2 * np.dot(errors,feature)
    return(derivative)

In [13]:
#gradient descent function
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = np.array(initial_weights)
    while not converged:
        # compute the predictions based on feature_matrix and weights:
        predictions = predict_outcome(feature_matrix, weights)
        # compute the errors as predictions - output:
        errors = predictions - output
        gradient_sum_squares = 0 # initialize the gradient
        # while not converged, update each weight individually:
        for i in range(len(weights)):
            # compute the derivative for weight[i]:
            derivative = feature_derivative(errors,feature_matrix[:, i])
            # add the squared derivative to the gradient magnitude
            gradient_sum_squares += derivative**2
            # update the weight based on step size and derivative:
            #each feature weight by subtracting the step size times the derivative for that feature given the current weights
            weights[i] -= step_size * derivative 
            
        gradient_magnitude = math.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)

In [14]:
#test function input
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

+----------+-------------+
| constant | sqft_living |
+----------+-------------+
|    1     |    1180.0   |
|    1     |    2570.0   |
|    1     |    770.0    |
|    1     |    1960.0   |
|    1     |    1680.0   |
|    1     |    5420.0   |
|    1     |    1715.0   |
|    1     |    1060.0   |
|    1     |    1780.0   |
|    1     |    1890.0   |
+----------+-------------+
[17384 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [15]:
#run test
simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size,tolerance)

In [16]:
print simple_weights

[-46999.88716555    281.91211912]


In [17]:
print simple_feature_matrix

[[  1.00000000e+00   1.18000000e+03]
 [  1.00000000e+00   2.57000000e+03]
 [  1.00000000e+00   7.70000000e+02]
 ..., 
 [  1.00000000e+00   1.53000000e+03]
 [  1.00000000e+00   1.60000000e+03]
 [  1.00000000e+00   1.02000000e+03]]


In [18]:
#use the weights on the test data
simple_features = ['sqft_living']
my_output= 'price'
(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)

+----------+-------------+
| constant | sqft_living |
+----------+-------------+
|    1     |    1430.0   |
|    1     |    2950.0   |
|    1     |    1710.0   |
|    1     |    2320.0   |
|    1     |    1090.0   |
|    1     |    2620.0   |
|    1     |    4220.0   |
|    1     |    2250.0   |
|    1     |    1260.0   |
|    1     |    2750.0   |
+----------+-------------+
[4229 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [19]:
np.ma.round(a=281.91211912, decimals=1)

281.89999999999998

In [21]:
predictions = predict_outcome(test_simple_feature_matrix,simple_weights)

In [22]:
#predicted price for the 1st house
print predictions[0]

356134.443171


In [23]:
def compute_RSS(predictions, output):
     
    #residual
    residual = output - predictions

    # square up
    residual_squared = residual **2
    
    #sum of squared residuals
    RSS = residual_squared.sum()

    return(RSS)

In [24]:
RSS = compute_RSS(predictions,test_output)

In [25]:
print RSS

2.75400047593e+14


In [26]:
#2nd model
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features,my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

+----------+-------------+---------------+
| constant | sqft_living | sqft_living15 |
+----------+-------------+---------------+
|    1     |    1180.0   |     1340.0    |
|    1     |    2570.0   |     1690.0    |
|    1     |    770.0    |     2720.0    |
|    1     |    1960.0   |     1360.0    |
|    1     |    1680.0   |     1800.0    |
|    1     |    5420.0   |     4760.0    |
|    1     |    1715.0   |     2238.0    |
|    1     |    1060.0   |     1650.0    |
|    1     |    1780.0   |     1780.0    |
|    1     |    1890.0   |     2390.0    |
+----------+-------------+---------------+
[17384 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [28]:
#run test
model_2_weights = regression_gradient_descent(feature_matrix, output,initial_weights, step_size,tolerance)

In [29]:
print model_2_weights

[ -9.99999688e+04   2.45072603e+02   6.52795277e+01]


In [30]:
(test_model_feature_matrix, test__model_output) = get_numpy_data(test_data, model_features, my_output)

+----------+-------------+---------------+
| constant | sqft_living | sqft_living15 |
+----------+-------------+---------------+
|    1     |    1430.0   |     1780.0    |
|    1     |    2950.0   |     2140.0    |
|    1     |    1710.0   |     1030.0    |
|    1     |    2320.0   |     2580.0    |
|    1     |    1090.0   |     1570.0    |
|    1     |    2620.0   |     2620.0    |
|    1     |    4220.0   |     2410.0    |
|    1     |    2250.0   |     2250.0    |
|    1     |    1260.0   |     1290.0    |
|    1     |    2750.0   |     1510.0    |
+----------+-------------+---------------+
[4229 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [31]:
predictions = predict_outcome(test_model_feature_matrix,model_2_weights)

In [33]:
print predictions[0]

366651.412037


In [34]:
RSS = compute_RSS(predictions,test__model_output)

In [35]:
print RSS

2.70263446465e+14


In [36]:
print test_data[0]['price']

310000.0


In [None]:
2.75400047593e+14