In [1]:
import pandas as pd
import numpy as np
from math import log
# import regression tools
import sys
sys.path.append(r'../')
import RegressionTools as reg

In [2]:
# Column name -> Python type, passed to pd.read_csv so both CSV files are
# parsed with the same column types.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
# Read the training and test splits, forcing the column types in dtype_dict.
kc_house_train_data, kc_house_test_data = [
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
]

In [4]:
# Sanity check of the regression helper: the target is built as an exact
# linear combination of three features (weights 3, 2, 1), so the fitted
# coefficients can be compared against those known weights.
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
kc_house_train_data['example_price'] = (
    3.0 * kc_house_train_data['sqft_living']
    + 2.0 * kc_house_train_data['bedrooms']
    + 1.0 * kc_house_train_data['bathrooms']
)
example_coeffs = reg.multiple_linear_regression(
    kc_house_train_data, example_features, 'example_price'
)
example_coeffs

array([ 3.,  2.,  1.])

In [5]:
# RSS of the synthetic model above; displayed as the cell output.
example_RSS = reg.get_residual_sum_of_squares_multiple_models(
    kc_house_train_data,
    example_features,
    'example_price',
    example_coeffs,
)
example_RSS

1.9307037854432102e-19

In [6]:
# Fit the same three features against the real sale price.
sample_coeffs = reg.multiple_linear_regression(
    kc_house_train_data,
    example_features,
    'price',
)
sample_coeffs

array([   308.93552416, -44362.93724549,  19283.82752088])

In [7]:
# given example: predict the price of the first training house from the
# three example features.
sample_coeffs = reg.multiple_linear_regression(kc_house_train_data, example_features, 'price')
# Take the first row's feature values as a flat vector so the dot product with
# the coefficients is a true scalar. Formatting a shape-(1,) array with '%f'
# relies on size-1-array-to-scalar conversion, deprecated since NumPy 1.25.
first_house_features = np.asarray(kc_house_train_data[example_features].iloc[0], dtype=float)
predicted_house_price = float(first_house_features.dot(sample_coeffs))
print('predicted house price is %.6f ' % predicted_house_price)

NameError: name 'multiple_linear_regression' is not defined

## Programming assignment 1: Multiple Regression Models

### Create new features

In [None]:
# Engineered features for the training split. Squaring and the logarithm are
# computed as vectorized column operations instead of per-row Python lambdas.
kc_house_train_data['bedrooms_squared'] = kc_house_train_data['bedrooms'] ** 2
kc_house_train_data['bed_bath_rooms']   = kc_house_train_data['bedrooms'] * kc_house_train_data['bathrooms']
kc_house_train_data['log_sqft_living']  = np.log(kc_house_train_data['sqft_living'])
kc_house_train_data['lat_plus_long']    = kc_house_train_data['lat'] + kc_house_train_data['long']

In [None]:
# Same engineered features for the test split. Squaring and the logarithm are
# computed as vectorized column operations instead of per-row Python lambdas.
kc_house_test_data['bedrooms_squared'] = kc_house_test_data['bedrooms'] ** 2
kc_house_test_data['bed_bath_rooms']   = kc_house_test_data['bedrooms'] * kc_house_test_data['bathrooms']
kc_house_test_data['log_sqft_living']  = np.log(kc_house_test_data['sqft_living'])
kc_house_test_data['lat_plus_long']    = kc_house_test_data['lat'] + kc_house_test_data['long']

In [None]:
# Print the test-split mean of each engineered feature, one line per feature.
for line_format, column in (
    ('bedrooms_squared mean: %.2f', 'bedrooms_squared'),
    ('bed_bath_rooms mean:   %.2f', 'bed_bath_rooms'),
    ('log_sqft_living mean:  %.2f', 'log_sqft_living'),
    ('lat_plus_long mean:    %.2f', 'lat_plus_long'),
):
    print(line_format % kc_house_test_data[column].mean())

### Learning multiple models

In [None]:
# Three nested feature sets: each model extends the previous one.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_features = [*model_1_features, 'bed_bath_rooms']
model_3_features = [
    *model_2_features,
    'bedrooms_squared',
    'log_sqft_living',
    'lat_plus_long',
]

In [None]:
# Fit model 1 on the training split and show its coefficients labelled by feature.
model_1_coeffs = reg.multiple_linear_regression(
    kc_house_train_data, model_1_features, 'price'
)
pd.DataFrame(data=model_1_coeffs, index=model_1_features)

In [None]:
# Fit model 2 on the training split and show its coefficients labelled by feature.
model_2_coeffs = reg.multiple_linear_regression(
    kc_house_train_data, model_2_features, 'price'
)
pd.DataFrame(data=model_2_coeffs, index=model_2_features)

In [None]:
# Fit model 3 on the training split and show its coefficients labelled by feature.
model_3_coeffs = reg.multiple_linear_regression(
    kc_house_train_data, model_3_features, 'price'
)
pd.DataFrame(data=model_3_coeffs, index=model_3_features)

### Comparing multiple models

In [None]:
# RSS of each fitted model, evaluated on the training split.
model_1_RSS = reg.get_residual_sum_of_squares_multiple_models(
    kc_house_train_data, model_1_features, 'price', model_1_coeffs
)
model_2_RSS = reg.get_residual_sum_of_squares_multiple_models(
    kc_house_train_data, model_2_features, 'price', model_2_coeffs
)
model_3_RSS = reg.get_residual_sum_of_squares_multiple_models(
    kc_house_train_data, model_3_features, 'price', model_3_coeffs
)
rss_per_model = (model_1_RSS, model_2_RSS, model_3_RSS)
print('RSS results: \nmodel 1 = %.4E, model 2 = %.4E, model 3 = %.4E' % rss_per_model)

In [None]:
# on test data: RSS of each model (fitted on the training split) evaluated on
# the held-out test split.
# NOTE: this reuses the names model_1_RSS..model_3_RSS, so the training-split
# values computed in the previous cell are overwritten.
model_1_RSS = reg.get_residual_sum_of_squares_multiple_models(kc_house_test_data, model_1_features, 'price', model_1_coeffs)
model_2_RSS = reg.get_residual_sum_of_squares_multiple_models(kc_house_test_data, model_2_features, 'price', model_2_coeffs)
model_3_RSS = reg.get_residual_sum_of_squares_multiple_models(kc_house_test_data, model_3_features, 'price', model_3_coeffs)
print('RSS results: \nmodel 1 = %.4E, model 2 = %.4E, model 3 = %.4E' % (model_1_RSS, model_2_RSS, model_3_RSS))

## Programming assignment 2: Gradient Descent

In [None]:
def get_numpy_data(data, input_features, ouput_feature):
    """Convert DataFrame columns into a feature matrix and an output vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. A 'constant' column of ones is added to it in place.
    input_features : sequence of str
        Names of the columns to use as features, in order.
    ouput_feature : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_vector)
        features_matrix is a float array of shape (N, 1 + len(input_features))
        whose first column is the constant 1.0; output_vector is the target
        column as a NumPy array.
    """
    # Deliberate in-place side effect: other cells in this notebook pass
    # 'constant' as a column name of the same DataFrame after calling this.
    data['constant'] = 1.0
    columns = ['constant'] + list(input_features)
    # np.asarray on the column selection replaces the old Series.reshape/hstack
    # construction; Series.reshape is not available in current pandas.
    features_matrix = np.asarray(data[columns], dtype=float)
    output_vector = np.array(data[ouput_feature])
    return features_matrix, output_vector

def predict_outcome(feature_matrix, weights):
    """Return the linear-model prediction for the given features.

    If any dimension of ``feature_matrix`` is larger than 1, the prediction is
    ``feature_matrix.dot(weights)``; otherwise the two are multiplied
    elementwise.
    """
    if np.any(np.array(feature_matrix.shape) > 1):
        return feature_matrix.dot(weights)
    return feature_matrix * weights


def regression_gradient_descent(features_matrix, output, initial_weights, step_size, tolerance, max_iter, verbose=True):
    """Minimise the residual sum of squares by batch gradient descent.

    Parameters
    ----------
    features_matrix : numpy.ndarray
        Feature matrix with one row per observation.
    output : numpy.ndarray
        Target values, one per observation.
    initial_weights : array-like
        Starting weights, one per column of ``features_matrix``.
    step_size : float
        Multiplier applied to the gradient at every update.
    tolerance : float
        Stop once the gradient magnitude (L2 norm) falls below this value.
    max_iter : int
        Maximum number of weight updates to perform.
    verbose : bool, optional
        When True (default) print the gradient magnitude every 10th update
        and once more when the loop ends.

    Returns
    -------
    numpy.ndarray
        The weights after the last update.
    """
    weights = np.array(initial_weights, dtype=float)
    # Stays at inf when max_iter is 0 and no gradient is ever computed.
    gradient_magnitude = np.inf
    updates_done = 0
    # Bound the loop by max_iter itself: checking `iterator > max_iter` after
    # the increment allowed max_iter + 1 updates.
    while updates_done < max_iter:
        residuals = output - predict_outcome(features_matrix, weights)
        gradient_rss = -2.0 * features_matrix.T.dot(residuals)
        gradient_magnitude = np.sqrt((gradient_rss ** 2).sum())
        weights = weights - step_size * gradient_rss
        if verbose and updates_done % 10 == 0:
            print('%4d gradient magnitude is: %.5E' % (updates_done, gradient_magnitude))
        updates_done += 1
        # The magnitude is tested after the update, so the step computed from
        # a below-tolerance gradient is still applied before stopping.
        if gradient_magnitude < tolerance:
            break

    if verbose:
        print('%4d gradient magnitude is: %.5E' % (updates_done, gradient_magnitude))
    return weights

#### Simple 1D Model

In [None]:
# Single-feature model: price explained by a constant term plus sqft_living.
my_output = 'price'
simple_features = ['sqft_living']
simple_feature_matrix, output = get_numpy_data(
    kc_house_train_data, simple_features, my_output
)

# Gradient-descent settings for this model.
initial_weights = np.array([-47000., 1.])
step_size, tolerance, max_iter = 7e-12, 2.5e7, 500

In [None]:
# Run gradient descent for the single-feature model.
simple_weights = regression_gradient_descent(
    simple_feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
    max_iter,
)

In [None]:
# Index 1 holds the sqft_living weight; index 0 belongs to the constant column.
sqft_living_weight = simple_weights[1]
print('The value of the weight for sqft_living is %.1f' % sqft_living_weight)

In [None]:
# Build the test-split feature matrix and predict the price of its first row.
test_simple_feature_matrix, test_output = get_numpy_data(
    kc_house_test_data, simple_features, my_output
)
first_test_house = test_simple_feature_matrix[0]
predicted_price = predict_outcome(first_test_house, simple_weights)
print('The predicted price for the 1st house in the Test data set for model 1 is %d' % predicted_price)

In [None]:
# RSS of the single-feature model on the test split. The 'constant' column
# used here is the one get_numpy_data adds to kc_house_test_data.
test_1_RSS = reg.get_residual_sum_of_squares_multiple_models(
    kc_house_test_data,
    ['constant', 'sqft_living'],
    'price',
    simple_weights,
)
print('RSS on all test data for this model is %.4E' % test_1_RSS)

#### 2D Model

In [None]:
# Two-feature model: price explained by a constant, sqft_living and sqft_living15.
my_output = 'price'
model_features = ['sqft_living', 'sqft_living15']
feature_matrix, output = get_numpy_data(
    kc_house_train_data, model_features, my_output
)

# Gradient-descent settings for this model.
initial_weights = np.array([-100000., 1., 1.])
step_size, tolerance, max_iter = 4e-12, 1e9, 500

In [None]:
# Run gradient descent for the two-feature model.
feature_weights = regression_gradient_descent(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
    max_iter,
)

In [None]:
# Show the learned weights labelled by the column each one multiplies.
pd.DataFrame(data=feature_weights, index=['constant'] + model_features)

In [None]:
# Predict the first test house with the two-feature model and compare the
# prediction with that house's actual price.
test_multiple_feature_matrix, test_output = get_numpy_data(kc_house_test_data, model_features, my_output)
predicted_price = predict_outcome(test_multiple_feature_matrix[0], feature_weights)
# This is the second (two-feature) model; the message previously said "model 1".
print('The predicted price for the 1st house in the Test data set for model 2 is %d' % predicted_price)
# Read the actual price positionally from the output vector so it refers to
# the same row as the feature row used for the prediction.
print('The actual price for the 1st house in the Test data set is %d' % test_output[0])

In [None]:
# RSS of the two-feature model on the test split. The 'constant' column used
# here is the one get_numpy_data adds to kc_house_test_data.
test_2_RSS = reg.get_residual_sum_of_squares_multiple_models(
    kc_house_test_data,
    ['constant', 'sqft_living', 'sqft_living15'],
    'price',
    feature_weights,
)
print('RSS on all test data for this model is %.4E' % test_2_RSS)