In [1]:
import turicreate as tc
import math

In [2]:
# Load the housing data; the SFrame file lives in a sibling folder left over
# from a previous course.
houses = tc.SFrame('../ml-foundations/data/home_data.sframe')

2. Split data into 80% and 20% using a random seed of 0.

In [3]:
# Roughly 80% of the rows go to `train` and the rest to `test`; the fixed seed
# keeps the split reproducible across runs.
train, test = houses.random_split(0.8, seed=0)

3. Write a generic function that receives an SArray of an input X and an SArray of labels Y and returns the optimal intercept and slope for the simple linear regression model, using the Closed Form Solution.

In [4]:
# Closed Form version
def simple_linear_regression(x_vector, y_vector):
    """Fit y = intercept + slope * x by closed-form ordinary least squares.

    Parameters
    ----------
    x_vector : sequence of numbers (e.g. list or SArray)
        Input feature values; must support ``len`` and iteration.
    y_vector : sequence of numbers
        Target values, one per entry of ``x_vector``.

    Returns
    -------
    tuple
        ``(intercept, slope)`` of the least-squares line.

    Raises
    ------
    ValueError
        If the two vectors differ in length, if they are empty, or if every
        x value is identical (the slope is undefined in that case).
    """
    N = len(x_vector)
    if len(y_vector) != N:
        raise ValueError(
            "x_vector and y_vector must have the same length "
            "(got {} and {})".format(N, len(y_vector))
        )
    if N == 0:
        raise ValueError("cannot fit a regression on empty input")

    sumY = sum(y_vector)
    sumX = sum(x_vector)
    sumX2 = sum(x * x for x in x_vector)

    # Walk both vectors in lockstep; this avoids one indexed lookup per
    # element and keeps the same left-to-right accumulation order.
    sumXY = 0
    for x, y in zip(x_vector, y_vector):
        sumXY += x * y

    # N times the variance of x; zero means all x values coincide and no
    # unique line exists.
    denominator = sumX2 - (sumX * sumX) / N
    if denominator == 0:
        raise ValueError("all x values are identical; slope is undefined")

    slope = (sumXY - (sumY * sumX) / N) / denominator
    intercept = sumY / N - slope * (sumX / N)

    return (intercept, slope)

In [5]:
# Sanity check on a tiny hand-computed example: the closed-form fit of these
# five points is intercept = -1.0 and slope = 5.0.
x = [0, 1, 2, 3, 4]
y = [1, 3, 7, 13, 21]

w0, w1 = simple_linear_regression(x, y)

# Fail loudly if the fit is wrong instead of relying on a comment.
assert math.isclose(w0, -1.0) and math.isclose(w1, 5.0), (w0, w1)

4. With your function, calculate the slope and intercept parameters for input `sqft_living` and target `price`

In [6]:
# Feature and target columns, both taken from the training split only.
x_vector = train['sqft_living']
y_vector = train['price']

In [8]:
# Fit price against sqft_living on the training split, then show the
# resulting parameters.
intercept, slope = simple_linear_regression(x_vector, y_vector)
print(intercept, slope)

-47116.07657494 281.9588385676974


5. Write a function that receives an input feature column and returns a column of predictions, one for each entry in the input.

In [19]:
def get_regression_prediction(input_vector, w0, w1):
    """Apply the line ``w0 + w1 * x`` to every entry of ``input_vector``.

    Parameters
    ----------
    input_vector : iterable of numbers
        Input feature values.
    w0 : number
        Intercept of the fitted line.
    w1 : number
        Slope of the fitted line.

    Returns
    -------
    tc.SArray
        One prediction per input entry, in input order.
    """
    predictions = []
    for value in input_vector:
        predictions.append(w0 + w1 * value)
    return tc.SArray(predictions)

6. What is the predicted price for a house with 2,650 square feet?

In [42]:
# The single value is wrapped in a list because the helper iterates over its input.
r = get_regression_prediction([2650], intercept, slope)
print(r)

[700074.8456294581]


7. Write a function that receives an input vector, an output vector, and the intercept and slope of a simple linear regression, and returns the Residual Sum of Squares (RSS).

In [24]:

def get_RSS(x_vector, y_vector, w0, w1):
    """Residual sum of squares of the line ``w0 + w1 * x`` against the data.

    Parameters
    ----------
    x_vector : sequence of numbers
        Input feature values.
    y_vector : sequence of numbers
        Observed target values, one per entry of ``x_vector``.
    w0 : number
        Intercept of the line.
    w1 : number
        Slope of the line.

    Returns
    -------
    float
        Sum over all points of ``(y - (w0 + w1 * x)) ** 2``.

    Raises
    ------
    ValueError
        If the two vectors differ in length.
    """
    n_points = len(x_vector)
    if n_points != len(y_vector):
        raise ValueError

    total = 0.0
    for observed_x, observed_y in zip(x_vector, y_vector):
        predicted_y = w0 + w1 * observed_x
        residual = observed_y - predicted_y
        total += math.pow(residual, 2)

    return total
    
    

8. What is the RSS of the simple linear regression that uses square feet to predict the prices of our TRAINING dataset?

In [44]:
# RSS of the sqft_living model, evaluated on the same split it was fitted to.
train_rss = get_RSS(train['sqft_living'], train['price'], intercept, slope)

# Scientific notation keeps the very large value readable.
print("Training RSS:", "{:e}".format(train_rss))

Training RSS: 1.201918e+15


9. Inverting the regression: obtain an estimate of X given an input of Y.

Write a function that accepts an output vector and returns the estimated input vector.

In [28]:
def inverse_regression_prediction(y_vector, w0, w1):
    """Invert the line ``y = w0 + w1 * x`` to estimate x for each output.

    Parameters
    ----------
    y_vector : iterable of numbers
        Output (target) values.
    w0 : number
        Intercept of the fitted line.
    w1 : number
        Slope of the fitted line; used as the divisor.

    Returns
    -------
    tc.SArray
        Estimated input ``(y - w0) / w1`` for each entry, in input order.
    """
    estimates = []
    for target in y_vector:
        estimates.append((target - w0) / w1)

    return tc.SArray(estimates)

10. According to this function and the `intercept` and `slope` obtained from the training data, what is the estimated area of a house that costs $800,000?

In [31]:
# Invert the fitted line to estimate the living area for a price of 800,000.
e = inverse_regression_prediction([800000], intercept, slope)
print(e)

[3004.396247615945]


11. Now use `bedrooms` to calculate the parameters of a simple linear regression that uses this feature, and save these to another two variables.

In [32]:
# Same closed-form fit on the training split, with bedrooms as the single input feature.
bed_intercept, bed_slope = simple_linear_regression(train['bedrooms'], train['price'])

In [33]:
# Intercept of the bedrooms model.
bed_intercept

109473.18046928808

In [34]:
# Slope of the bedrooms model: change in predicted price per additional bedroom.
bed_slope

127588.95217458377

12. Finally, compute the RSS for both models, `sqft_living` and `bedrooms`, using the TEST data.

In [35]:
# Score both fitted models on the held-out test split, so neither is judged on
# the rows it was trained on.
sqft_rss = get_RSS(test['sqft_living'], test['price'], intercept, slope)
bed_rss = get_RSS(test['bedrooms'], test['price'], bed_intercept, bed_slope)

In [36]:
# Test-split RSS of the sqft_living model.
sqft_rss

275402936247141.8

In [37]:
# Test-split RSS of the bedrooms model.
bed_rss

493364582868287.4

In [39]:
sqft_rss < bed_rss    # True means sqft_living has the lower RSS, i.e. it predicts price better than the number of bedrooms

True