In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 
              'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 
              'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
kc_house=pd.read_csv('kc_house_data.csv',dtype=dtype_dict)
kc_house_train=pd.read_csv('kc_house_train_data.csv',dtype=dtype_dict)
kc_house_test=pd.read_csv('kc_house_test_data.csv',dtype=dtype_dict)
kc_house_train.head(1)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0


In [2]:
#Function to convert a frame to matrix 
def get_numpy_data(df,features,output):
    X=pd.DataFrame() # A new empty dataframe
    X=df[features].copy() # Copy all the data for the selected features in the new dataframe
    X['constant']=1 # A new column for the constant/intercept term
    cols=X.columns.tolist() # Converting the column names to a list
    cols=cols[-1:]+cols[:-1] # Changing the order of columns making the constant column as the first column followed by other features
    X=X[cols]
    features_matrix=np.matrix(X) # Creating a matrix for the features
    output_array=np.array(df[output]) # Array for the ouptut/target
    return features_matrix, output_array

In [3]:
def predict_outcome(f, w):
    # This fucntion will multiply the features by its weights
    # So if there are D features, a weight is assigned to each feature
    pred_out=f*w
    return pred_out

def feature_derivative(errors, n):
    # The multiplication of error calculated as (predictions-output) 
    # and the column for the feature whose weight is being updated
    derivative=n.transpose()*errors
    return 2*derivative # as per the formula of gradient descent

def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = np.array(initial_weights)
    weights=weights.reshape(len(weights),1) # As sklearn accepts an array of the shape(D,1)
    z=feature_matrix.copy()
    t=0
    # While the magnitude of the Gradient descent sum square root is not smaller than the tolerance specified
    while not converged:
        predictions=predict_outcome(feature_matrix,weights) # Predict the outcomes
        error=predictions-output # Calculate the error
        gradient_sum_squares=0
        for i in range(len(weights)): # For each feature update the weights
            derivative=feature_derivative(error,z[:,i]) # The partial derivative
            gradient_sum_squares+=np.sum(np.square(derivative)) 
            weights[i]-=step_size*np.sum(derivative) # Update the weights
        gradient_magnitude = np.sqrt(gradient_sum_squares) #Update the magnitude
        if gradient_magnitude < tolerance: # Check if the solution has converged
            converged = True
            t+=1
    return(weights)

In [4]:
#Sample Linear Regression using a sinngle feature as sqft_living and target as Price

simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(kc_house_train, simple_features, my_output)
output=output.reshape(len(output),1)
initial_weights = np.array([-47000., 1.])

step_size = 7e-12
tolerance = 2.5e7
simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size,tolerance)

In [6]:
#Converting the test data to a numpy matrix
test_simple_feature_matrix,output_test=get_numpy_data(kc_house_test, simple_features, my_output)

In [8]:
# Predicting the price of the houses based on the weights calaculated using the simple weights
# Price= constant + weight for sqft_living * values
test_price=simple_weights[0]+test_simple_feature_matrix[:,1]*simple_weights[1]
# The Residual Sum of squares
print "Residual Sum of Square is", np.sum(np.square(np.array(test_price)-np.array(kc_house_test['price'])))

Residual Sum of Square is 3.43636577505e+18


In [10]:
#Predicted price of the first house in the test set
kc_house_test[0:1]['sqft_living'][0]*simple_weights[1]+simple_weights[0]

array([ 356134.443255])

In [11]:
simple_weights[1] # Weights/ coefficients for the sqft living featutre

array([ 281.91211918])

In [14]:
kc_house_test[0:1] # Real values of the test data house 1


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,114101516,20140528T000000,310000.0,3.0,1.0,1430.0,19901,1.5,0,0,...,7,1430,0,1927,0,98028,47.7558,-122.229,1780.0,12697.0


In [15]:
#Sample Linear Regression using multiple features: sqft_living and sqft_living15  and target as Price
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, my_output) = get_numpy_data(kc_house_train, model_features,my_output)
my_output=my_output.reshape(len(my_output),1)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9
simple_weights = regression_gradient_descent(feature_matrix, my_output,initial_weights, step_size,tolerance)

In [17]:
print "The co-efficient of the sqft_living15 feature is",simple_weights[2] # Weight for the sqft_living15 feature

The co-efficient of the sqft_living15 feature is [ 65.2795267]


In [18]:
##Predicted price of the first house in the test set using the multiple features model
kc_house_test[0:1]['sqft_living15']*simple_weights[2]+kc_house_test[0:1]['sqft_living'][0]*simple_weights[1]+simple_weights[0]

0    366651.411629
Name: sqft_living15, dtype: float64

In [19]:
# Real values of the test data house 1
kc_house_test[0:1]['price']

0    310000.0
Name: price, dtype: float64

In [20]:
test_simple_feature_matrix,output_test=get_numpy_data(kc_house_test, model_features, 'price')

# Predicting the price of the houses based on the weights calaculated using the weights
# Price= constant + weight for sqft_living * data[sqft_living] +weights or sqft_living15 * data[sqft_living15]

test_price=simple_weights[0]+test_simple_feature_matrix[:,1]*simple_weights[1]+test_simple_feature_matrix[:,2]*simple_weights[2]
print "Residual Sum of Square is", np.sum(np.square(np.array(test_price)-np.array(kc_house_test['price'])))

Residual Sum of Square is 3.45411823284e+18


## Thank You!