# Linear Regression - Gaio Scikit version

In [1]:
import numpy as np
import pandas as pd

In [2]:
#define the input data dictionary
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 
              'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int,
              'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [3]:
#import the data
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [4]:
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [5]:
#generic function that accepts a column of data 
#‘input_feature’ and another column ‘output’ and returns 
#the Simple Linear Regression parameters ‘intercept’ and ‘slope’. 
def simple_linear_regression(input_feature, output):
    # compute the sum of input_feature and output
    num_inputs = len(input_feature)
    sum_input_feature = input_feature.sum()    
    sum_output = output.sum()    
    # compute the product of the output and the input_feature and its sum
    prod_I_O = input_feature * output
    sum_prodIO = prod_I_O.sum()        
    # compute the squared value of the input_feature and its sum
    sqrd_I = input_feature*input_feature
    sum_sqrd_I = sqrd_I.sum()  
    
    #numerator = (sum of X*Y) - (1/N)*((sum of X) * (sum of Y))
    numerator = sum_prodIO - (1/num_inputs) * (sum_input_feature *sum_output)
    print "numerator:", numerator 
    #denominator = (sum of X^2) - (1/N)*((sum of X) * (sum of X)) 
    denominator = sum_sqrd_I - (1/num_inputs) * (sum_input_feature * sum_input_feature)
    print "denominator: ", denominator
    # slope
    slope = numerator/denominator 
    #intercept = (mean of Y) - slope * (mean of X)
    intercept = output.mean() - slope * input_feature.mean()
    
    return(intercept, slope)


In [7]:
test_feature = np.array(range(5))
test_output = np.array(1 + 1*test_feature)
(test_intercept, test_slope) =  simple_linear_regression(test_feature, test_output)
print "Intercept: " + str(test_intercept)
print "Slope: " + str(test_slope)

numerator: 40
denominator:  30
Intercept: 1.0
Slope: 1


In [8]:
input_feature = train_data['sqft_living']
output = train_data['price' ]

In [9]:
(squarfeet_intercept, squarfeet_slope) =  simple_linear_regression(input_feature, output)
print "Intercept: " + str(squarfeet_intercept)
print "Slope: " + str(squarfeet_slope)

numerator: 2.36662568532e+13
denominator:  89977452623.0
Intercept: -7731.68623741
Slope: 263.024304015


In [10]:
def get_regression_predictions(input_feature, intercept, slope):
    # y = mx + q
    predicted_values = input_feature * slope + intercept
    
    return predicted_values

In [11]:
#use the function to predict house prices
my_house_sqft = 2650
estimated_price = get_regression_predictions(my_house_sqft, squarfeet_intercept, squarfeet_slope)
print "The estimated price for a house with %d squarefeet is $%.2f" % (my_house_sqft, estimated_price)

The estimated price for a house with 2650 squarefeet is $689282.72


In [12]:
#compute RSS
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    # predictions
    fitted_line = input_feature * slope + intercept
    # then compute the residuals , square them and add them up
    residuals = fitted_line - output
    sqrd_residuals = residuals * residuals
    RSS = sqrd_residuals.sum()
    return(RSS)

In [13]:
print get_residual_sum_of_squares(test_feature, test_output, test_intercept, test_slope) # should be 0.0

0.0


In [14]:
rss_prices_on_sqft = get_residual_sum_of_squares(train_data['sqft_living'], 
                                                 train_data['price'], squarfeet_intercept, squarfeet_slope)
print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft)

The RSS of predicting Prices based on Square Feet is : 1.20721191722e+15


In [15]:
def inverse_regression_predictions(output, intercept, slope):
    estimated_feature= (output - intercept)/slope
    return estimated_feature

In [16]:
my_house_price = 800000
estimated_squarefeet = inverse_regression_predictions(my_house_price, squarfeet_intercept, squarfeet_slope)
print "The estimated squarefeet for a house worth $%.2f is %d" % (my_house_price, estimated_squarefeet)

The estimated squarefeet for a house worth $800000.00 is 3070


In [17]:
input_feature_b = train_data['bedrooms']
output = train_data['price']
(squarfeet_intercept_b, squarfeet_slope_b) =  simple_linear_regression(input_feature_b, output)
print "Intercept: " + str(squarfeet_intercept_b)
print "Slope: " + str(squarfeet_slope_b)

numerator: 33414715665.0
denominator:  211637.0
Intercept: 7388.31444995
Slope: 157886.927451


In [18]:
rss_prices_on_sqft = get_residual_sum_of_squares(train_data['bedrooms'], 
                                                 train_data['price'], squarfeet_intercept_b, squarfeet_slope_b)
print 'The RSS of predicting Prices based on Bedrooms is : ' + str(rss_prices_on_sqft)

The RSS of predicting Prices based on Bedrooms is : 2.15635612061e+15
