In [1]:
import graphlab as gl

In [2]:
sales = gl.SFrame('kc_house_data.gl/')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1471132385.log


This non-commercial license of GraphLab Create for academic use is assigned to minggong@thoughtworks.com and will expire on July 06, 2017.


In [53]:
train_data, test_data = sales.random_split(.8, seed=0)

In [54]:
import numpy as numpy


In [55]:
def simple_linear_regression(input_feature, target):
    input_feature = gl.numpy.array(input_feature)
    target = gl.numpy.array(target)
    sum_x = numpy.sum(input_feature)
    sum_y = numpy.sum(target)
    sum_xy= numpy.sum(numpy.multiply(input_feature, target))
    sum_xx = numpy.sum(numpy.square(input_feature))
    N = len(input_feature)
    slope = (sum_xy - sum_x * sum_y * 1.0 / N) / (sum_xx - sum_x * sum_x * 1.0 / N)
    intercept = (sum_y - slope * sum_x) * 1.0 / N
    return (slope, intercept)

(slope, intercept) = simple_linear_regression(train_data['sqft_living'], train_data['price'])

In [56]:
def get_regression_predictions(input_feature, intercept, slope):
    predicted_output = []
    for sqft in input_feature:
        predicted_output.append(intercept + slope * sqft )
    return gl.SArray(data=predicted_output, dtype=float)


In [57]:
slope * 2650 + intercept

700074.84562945808

In [58]:
def get_residual_sum_of_squares(input_feature, output, intecept, slope):
    predicted_output = get_regression_predictions(input_feature, intercept, slope)
    predicted_output = gl.numpy.array(predicted_output)
    output = gl.numpy.array(output)
    print(predicted_output)
    print(output)
    return numpy.sum(numpy.square(numpy.subtract(output, predicted_output)))

get_residual_sum_of_squares(train_data['sqft_living'], train_data['price'], intercept, slope)

[ 285595.35293494  677518.13854404  169992.22912219 ...,  384280.94643364
  404018.06513338  240481.93876411]
[ 221900.  538000.  180000. ...,  360000.  400000.  325000.]


1201918356321967.5

In [76]:
def inverse_regression_predictions(output, intercept, slope):
    return (output - intercept) * 1.0 / slope

inverse_regression_predictions(800000, intercept, slope)

3004.396247615945

# When using bedroom as features

In [60]:
(slope_room, intercept_room) = simple_linear_regression(train_data['bedrooms'], train_data['price'])

In [69]:
get_residual_sum_of_squares(test_data['sqft_living'], test_data['price'], intercept, slope)

[ 356085.06257687  784662.49719977  435033.53737582 ...,  663420.19661566
  604208.84051644  240481.93876411]
[ 310000.  650000.  233000. ...,  610685.  400000.  402101.]


275402936247141.38

In [62]:
get_residual_sum_of_squares(test_data['bedrooms'], test_data['price'], intercept_room, slope_room)

[ 335650.77994881  463239.7321234   335650.77994881 ...,  463239.7321234
  463239.7321234   208061.82777423]
[ 310000.  650000.  233000. ...,  610685.  400000.  402101.]


600666940264759.38

# Check the right answer

In [51]:
def check_closed_form(input_feature, target):
    sum_x = numpy.sum(input_feature)
    sum_y = numpy.sum(target)
    sum_xy= numpy.sum(numpy.multiply(input_feature, target))
    sum_xx = numpy.sum(numpy.square(input_feature))
    N = len(input_feature)
    slope = (sum_xy - sum_x * sum_y * 1.0 / N) / (sum_xx - sum_x * sum_x * 1.0 / N)
    intercept = (sum_y - slope * sum_x) * 1.0 / N
    return (slope, intercept)

In [52]:
check_closed_form([0, 1, 2, 3, 4], [1, 3, 7, 13, 21])

(5.0, -1.0)

In [63]:
import matplotlib.pyplot as plt
%matplotlib inline

In [65]:
standard_model = gl.linear_regression.create(train_data, target='price', features=['sqft_living'], validation_set=None, verbose=False)

In [66]:
standard_model.get('coefficients')

name,index,value,stderr
(intercept),,-47114.0206702,4923.34437753
sqft_living,,281.957850166,2.16405465323


In [67]:
print(intercept)

-47116.0765749


In [68]:
print(slope)

281.958838568


In [71]:
281.957850166 * 2650 - 47114.0206702

700074.2822697001

In [75]:
(800000 - intercept) * 1.0 / slope

3004.396247615945

In [73]:
inverse_regression_predictions(8000000, intercept, slope)

28540.038388060155

In [None]:
(output - intercept) * 1.0 / slope