In [2]:
import graphlab

In [3]:
# Load the house-sales data stored at the relative path 'kc_house_data.gl/' into an SFrame.
sales = graphlab.SFrame('kc_house_data.gl/')

2016-04-25 08:13:44,129 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: /tmp/graphlab_server_1461597222.log


This non-commercial license of GraphLab Create is assigned to davidshimer@yahoo.com and will expire on March 11, 2017. For commercial licensing options, visit https://dato.com/buy/.


In [4]:
from math import log
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
def get_numpy_data(data_sframe, features, output):
    """Extract a feature matrix and an output vector as numpy objects.

    The columns named in ``features`` are selected from ``data_sframe`` and
    converted with ``.to_numpy()``; the single ``output`` column is converted
    the same way. No constant (intercept) column is added to the features.

    Parameters
    ----------
    data_sframe : tabular object indexable by a column name or a list of
        column names, whose selections provide ``.to_numpy()``
        (presumably a graphlab SFrame -- confirm against callers).
    features : list of column names to place in the feature matrix.
    output : name of the column holding the target values.

    Returns
    -------
    tuple
        ``(feature_matrix, output_array)``.
    """
    # Features are selected first, then the output column.
    design_matrix = data_sframe[features].to_numpy()
    target_values = data_sframe[output].to_numpy()
    return design_matrix, target_values

In [6]:
def polynomial_sframe(feature, degree):
    """Build an SFrame whose columns are successive powers of ``feature``.

    Column ``power_1`` is ``feature`` itself. For every integer p from 2 up
    to ``degree`` a column ``power_<p>`` holds each element of ``feature``
    raised to the p-th power.

    Parameters
    ----------
    feature : column object that can be assigned into an SFrame and that
        provides ``.apply`` (presumably a graphlab SArray -- confirm
        against callers).
    degree : highest power to generate; assumed to be >= 1. When it is not
        greater than 1 only ``power_1`` is produced.

    Returns
    -------
    graphlab.SFrame
        Columns ``power_1`` .. ``power_<degree>``.
    """
    powers = graphlab.SFrame()
    powers['power_1'] = feature
    # Nothing beyond the first power is requested: return early.
    if not degree > 1:
        return powers
    # range stops one short of its end point, hence degree + 1.
    for exponent in range(2, degree + 1):
        column = 'power_{}'.format(exponent)
        powers[column] = feature.apply(lambda value: value ** exponent)
    return powers

In [7]:
# Two-stage random split of `sales`, using a fixed seed (1) in both calls:
# first separate out `testing`, then split what remains into `training` and
# `validation`.
# NOTE(review): 0.9 is presumably the fraction of rows sent to the first
# returned frame (about 81% / 9% / 10% overall) -- confirm against
# SFrame.random_split.
training_and_validation,testing = sales.random_split(0.9,seed=1)
training,validation = training_and_validation.random_split(0.9,seed=1)

In [14]:
# Model selection over polynomial degree.
# For each degree 1..15, fit an ordinary least-squares model of price on the
# powers of sqft_living using the training split, then record the residual sum
# of squares (RSS) of its predictions on the validation split.
# RSS maps degree -> validation RSS.
RSS = {}

# The targets do not depend on the degree, so convert them once up front.
nump_training_output = training['price'].to_numpy()
nump_validation_output = validation['price'].to_numpy()
outcome = nump_validation_output

for degree in range(1, 15 + 1):
    # Training design matrix with columns power_1 .. power_<degree>.
    poly_data_training = polynomial_sframe(training['sqft_living'], degree)
    my_features = poly_data_training.column_names()  # names of the feature columns (not used for fitting here)
    nump_training_input = poly_data_training.to_numpy()

    # Fit with scikit-learn.
    # NOTE(review): the raw powers are fed in unscaled; high degrees may be
    # numerically ill-conditioned -- verify the coefficients before relying on them.
    regr = linear_model.LinearRegression()
    regr.fit(nump_training_input, nump_training_output)

    # Build each message as ONE string. Passing several arguments to
    # print(...) under Python 2 (without the print function) prints the
    # repr of a tuple, with a literal "\n", instead of the intended text.
    print('Intercept: \n{}'.format(regr.intercept_))
    print('Coefficients: \n{}'.format(regr.coef_))
    print('------------------\n')

    # Same polynomial expansion for the validation inputs.
    poly_data_validation = polynomial_sframe(validation['sqft_living'], degree)
    nump_validation_input = poly_data_validation.to_numpy()
    predictions = regr.predict(nump_validation_input)

    # Residuals, then squared and summed.
    residuals = outcome - predictions
    RSS[degree] = (residuals * residuals).sum()

# Parenthesised single-argument form prints identically on Python 2 and 3.
print(RSS)

('Intercept: \n', -43648.437076913775)
('Coefficients: \n', array([ 280.34257309]))
------------------

('Intercept: \n', 199969.60757991346)
('Coefficients: \n', array([  6.79163942e+01,   3.82958833e-02]))
------------------

('Intercept: \n', 341727.7549447451)
('Coefficients: \n', array([ -9.39567250e+01,   8.72560813e-02,  -3.77574108e-06]))
------------------

('Intercept: \n', 153716.16264044226)
('Coefficients: \n', array([  1.81569860e+02,  -3.56241665e-02,   1.58511533e-05,
        -9.30235527e-10]))
------------------

('Intercept: \n', 232910.59797942854)
('Coefficients: \n', array([  5.19814759e-05,   9.47042578e-02,  -2.14340680e-05,
         3.37749018e-09,  -1.64007696e-13]))
------------------

('Intercept: \n', 294086.43232705421)
('Coefficients: \n', array([  7.77215999e-12,   6.37651942e-08,   4.05896508e-05,
        -1.11676818e-08,   1.22866796e-12,  -4.58430642e-17]))
------------------

('Intercept: \n', 356648.80875012686)
('Coefficients: \n', array([  5.784694

In [9]:
# Smallest RSS among all entries of the RSS dict (the value only; this does
# not report which key/degree achieved it).
min(RSS.values())

126499021517032.84