In [1]:
import pandas as pd
import numpy as np
from math import log
# import regression tools
import sys
sys.path.append(r'../')
import RegressionTools as reg

In [2]:
# Explicit dtype for every CSV column, keyed by column name.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
# Read the train and test splits, parsing both with the dtypes in dtype_dict.
kc_house_train_data, kc_house_test_data = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

In [4]:
# Sanity check of the fitting routine: the target is an exact linear
# combination (3, 2, 1) of the three features, so the fit should recover
# those coefficients.
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
kc_house_train_data['example_price'] = (
    3.0 * kc_house_train_data['sqft_living']
    + 2.0 * kc_house_train_data['bedrooms']
    + 1.0 * kc_house_train_data['bathrooms']
)
example_coeffs = reg.multiple_linear_regression(
    kc_house_train_data,
    example_features,
    'example_price',
)
example_coeffs

array([ 3.,  2.,  1.])

In [5]:
# Residual sum of squares of the sanity-check fit on the synthetic target.
example_RSS = reg.get_residual_sum_of_squares_multiple_models(
    kc_house_train_data,
    example_features,
    'example_price',
    example_coeffs,
)
example_RSS

1.9307037854432102e-19

In [6]:
# Fit the same three features against the real sale price.
sample_coeffs = reg.multiple_linear_regression(
    kc_house_train_data,
    example_features,
    'price',
)
sample_coeffs

array([   308.93552416, -44362.93724549,  19283.82752088])

In [7]:
# given example: predicted price of the training row labelled 0
sample_coeffs = reg.multiple_linear_regression(kc_house_train_data, example_features, 'price')
# One .loc lookup replaces the three chained-index + vstack steps and works for
# any number of features; the result is still a (n_features, 1) float column.
sample_feature_matrix = np.asarray(
    kc_house_train_data.loc[0, example_features], dtype=float
).reshape(-1, 1)
# The product is a 1-element array; .item() extracts the scalar explicitly
# instead of relying on implicit array-to-float conversion inside '%.6f'
# (deprecated for ndim > 0 arrays since NumPy 1.25).
sample_prediction = sample_feature_matrix.T.dot(sample_coeffs).item()
print('predicted house price is %.6f ' % sample_prediction)

predicted house price is 250738.934291 


## Programming assignment 1: Multiple Regression Models

### Create new features

In [8]:
# Engineered features on the training frame, computed with vectorised column
# arithmetic rather than per-element Python lambdas.
kc_house_train_data['bedrooms_squared'] = kc_house_train_data['bedrooms'] ** 2
kc_house_train_data['bed_bath_rooms']   = kc_house_train_data['bedrooms'] * kc_house_train_data['bathrooms']
# np.log yields -inf (with a RuntimeWarning) for a zero area, where math.log
# would raise ValueError.
kc_house_train_data['log_sqft_living']  = np.log(kc_house_train_data['sqft_living'])
kc_house_train_data['lat_plus_long']    = kc_house_train_data['lat'] + kc_house_train_data['long']

In [9]:
# Engineered features on the test frame, computed with vectorised column
# arithmetic rather than per-element Python lambdas.
kc_house_test_data['bedrooms_squared'] = kc_house_test_data['bedrooms'] ** 2
kc_house_test_data['bed_bath_rooms']   = kc_house_test_data['bedrooms'] * kc_house_test_data['bathrooms']
# np.log yields -inf (with a RuntimeWarning) for a zero area, where math.log
# would raise ValueError.
kc_house_test_data['log_sqft_living']  = np.log(kc_house_test_data['sqft_living'])
kc_house_test_data['lat_plus_long']    = kc_house_test_data['lat'] + kc_house_test_data['long']

In [10]:
# Report the test-set mean of each engineered feature; every format string
# carries its own padding so the values line up.
for template, column in (
    ('bedrooms_squared mean: %.2f', 'bedrooms_squared'),
    ('bed_bath_rooms mean:   %.2f', 'bed_bath_rooms'),
    ('log_sqft_living mean:  %.2f', 'log_sqft_living'),
    ('lat_plus_long mean:    %.2f', 'lat_plus_long'),
):
    print(template % kc_house_test_data[column].mean())

bedrooms_squared mean: 12.45
bed_bath_rooms mean:   7.50
log_sqft_living mean:  7.55
lat_plus_long mean:    -74.65


### Learning multiple models

In [11]:
# Three nested feature sets: each model extends the previous one.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']

model_2_features = list(model_1_features)
model_2_features.append('bed_bath_rooms')

model_3_features = list(model_2_features)
model_3_features.extend(['bedrooms_squared', 'log_sqft_living', 'lat_plus_long'])

In [12]:
# Fit model 1 on the training data and show each coefficient beside its feature.
model_1_coeffs = reg.multiple_linear_regression(
    kc_house_train_data,
    model_1_features,
    'price',
)
pd.DataFrame(model_1_coeffs, index=model_1_features)

Unnamed: 0,0
sqft_living,300.963659
bedrooms,-59554.887988
bathrooms,5321.077302
lat,532596.984132
long,206418.679603


In [13]:
# Fit model 2 on the training data and show each coefficient beside its feature.
model_2_coeffs = reg.multiple_linear_regression(
    kc_house_train_data,
    model_2_features,
    'price',
)
pd.DataFrame(model_2_coeffs, index=model_2_features)

Unnamed: 0,0
sqft_living,293.999642
bedrooms,-130492.300057
bathrooms,-109046.39602
lat,532930.482043
long,204561.517539
bed_bath_rooms,33689.673259


In [14]:
# Fit model 3 on the training data and show each coefficient beside its feature.
model_3_coeffs = reg.multiple_linear_regression(
    kc_house_train_data,
    model_3_features,
    'price',
)
pd.DataFrame(model_3_coeffs, index=model_3_features)

Unnamed: 0,0
sqft_living,646.075415
bedrooms,-30922.613674
bathrooms,140385.970983
lat,455709.54081
long,95594.4595
bed_bath_rooms,-11454.291836
bedrooms_squared,-2309.203673
log_sqft_living,-633873.612071
lat_plus_long,81126.689341


### Comparing multiple models

In [15]:
# on training data
# NOTE(review): model 3 contains every feature of model 2, so a least-squares
# fit should not give it a larger training RSS, yet the recorded output does.
# lat_plus_long is exactly lat + long, so the model-3 columns are linearly
# dependent -- presumably the solver in RegressionTools does not cope with
# that; confirm.
model_1_RSS, model_2_RSS, model_3_RSS = (
    reg.get_residual_sum_of_squares_multiple_models(kc_house_train_data, features, 'price', coeffs)
    for features, coeffs in (
        (model_1_features, model_1_coeffs),
        (model_2_features, model_2_coeffs),
        (model_3_features, model_3_coeffs),
    )
)
print('RSS results: \nmodel 1 = %.4E, model 2 = %.4E, model 3 = %.4E' % (model_1_RSS, model_2_RSS, model_3_RSS))

RSS results: 
model 1 = 1.0658E+15, model 2 = 1.0493E+15, model 3 = 1.2315E+15


In [16]:
# on test data: the coefficients were fitted on the training frame and are
# evaluated here on kc_house_test_data.
# These assignments overwrite the model_*_RSS values from the training-data cell.
model_1_RSS, model_2_RSS, model_3_RSS = (
    reg.get_residual_sum_of_squares_multiple_models(kc_house_test_data, features, 'price', coeffs)
    for features, coeffs in (
        (model_1_features, model_1_coeffs),
        (model_2_features, model_2_coeffs),
        (model_3_features, model_3_coeffs),
    )
)
print('RSS results: \nmodel 1 = %.4E, model 2 = %.4E, model 3 = %.4E' % (model_1_RSS, model_2_RSS, model_3_RSS))

RSS results: 
model 1 = 2.5046E+14, model 2 = 2.4657E+14, model 3 = 3.0765E+14


##  Programming assignment 2: Gradient Descent

In [17]:
def get_numpy_data(data, input_features, ouput_feature):
    """Build a design matrix (with intercept column) and a target vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. A column named 'constant', filled with ones, is written
        into it in place. Other cells of this notebook look that column up by
        name on the same frame, so the side effect is kept deliberately.
    input_features : list of str
        Column names to use as features, in order. May be empty, in which
        case only the constant column is returned.
    ouput_feature : str
        Name of the target column. The misspelt parameter name is kept so
        existing keyword callers keep working.

    Returns
    -------
    (features_matrix, output_vector) : tuple of numpy.ndarray
        features_matrix has shape (n_rows, 1 + len(input_features)) with the
        constant in column 0; output_vector has shape (n_rows,).
    """
    data['constant'] = np.ones(len(data))  # add a constant column
    column_names = ['constant'] + list(input_features)
    # Series.reshape is not available in current pandas, so each column is
    # converted to an ndarray first and the 1-D arrays are stacked as columns.
    features_matrix = np.column_stack([np.asarray(data[name]) for name in column_names])
    output_vector = np.array(data[ouput_feature])
    return features_matrix, output_vector

def predict_outcome(feature_matrix, weights):
    """Return the model prediction for the given features and weights.

    If any axis of ``feature_matrix`` is longer than 1 the result is
    ``feature_matrix.dot(weights)``; otherwise (every axis has length <= 1,
    or the array is 0-d) it is the element-wise product.
    """
    has_long_axis = any(axis_len > 1 for axis_len in feature_matrix.shape)
    if not has_long_axis:
        return feature_matrix * weights
    return feature_matrix.dot(weights)
    
def regression_gradient_descent(features_matrix, output, initial_weights, step_size, tolerance, max_iter, verbose=True):
    """Minimise the residual sum of squares by batch gradient descent.

    Parameters
    ----------
    features_matrix : numpy.ndarray
        Design matrix, one row per observation.
    output : numpy.ndarray
        Target vector.
    initial_weights : array-like
        Starting weights; copied, so the caller's array is not modified.
    step_size : float
        Multiplier applied to the gradient at every update.
    tolerance : float
        The loop stops once the gradient magnitude measured *before* the
        latest update is below this value.
    max_iter : int
        Maximum number of weight updates. At least one update is always
        performed, even when max_iter <= 0.
    verbose : bool, optional
        When True (default) print the gradient magnitude every 10 iterations
        and once more on exit.

    Returns
    -------
    numpy.ndarray
        The weights after the final update.
    """
    weights = np.array(initial_weights)
    iterator = 0
    while True:
        residuals = output - predict_outcome(features_matrix, weights)
        gradient_rss = -2.0 * features_matrix.T.dot(residuals)
        gradient_magnitude = np.sqrt((gradient_rss ** 2).sum())
        weights = weights - step_size * gradient_rss
        if verbose and iterator % 10 == 0:
            print('%4d gradient magnitude is: %.5E' % (iterator, gradient_magnitude))
        iterator = iterator + 1
        # '>=' caps the loop at exactly max_iter updates; the former
        # 'iterator > max_iter' test allowed max_iter + 1 of them.
        if gradient_magnitude < tolerance or iterator >= max_iter:
            break

    if verbose:
        print('%4d gradient magnitude is: %.5E' % (iterator, gradient_magnitude))
    return weights

#### Simple 1D Model

In [18]:
# Gradient-descent settings for the one-feature model
# (weights are ordered: constant, sqft_living).
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7
max_iter = 500

# Training design matrix and target.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(
    kc_house_train_data, simple_features, my_output
)

In [19]:
# Fit the one-feature model by gradient descent.
simple_weights = regression_gradient_descent(
    features_matrix=simple_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
    max_iter=max_iter,
)

   0 gradient magnitude is: 5.05515E+13
  10 gradient magnitude is: 7.05041E+07
  12 gradient magnitude is: 1.83200E+07


In [20]:
# Index 0 is the weight of the constant column, index 1 that of sqft_living.
sqft_living_weight = simple_weights[1]
print('The value of the weight for sqft_living is %.1f' % sqft_living_weight)

The value of the weight for sqft_living is 281.9


In [21]:
# Build the test design matrix and predict the price of its first row.
test_simple_feature_matrix, test_output = get_numpy_data(
    kc_house_test_data, simple_features, my_output
)
first_test_row = test_simple_feature_matrix[0]
predicted_price = predict_outcome(first_test_row, simple_weights)
print('The predicted price for the 1st house in the Test data set for model 1 is %d' % predicted_price)

The predicted price for the 1st house in the Test data set for model 1 is 356134


In [22]:
# The 'constant' column is present in kc_house_test_data only because
# get_numpy_data wrote it there when the test design matrix was built, so this
# cell must run after that call.
simple_model_columns = ['constant', 'sqft_living']
test_1_RSS = reg.get_residual_sum_of_squares_multiple_models(
    kc_house_test_data,
    simple_model_columns,
    'price',
    simple_weights,
)
print('RSS on all test data for this model is %.4E' % test_1_RSS)

RSS on all test data for this model is 2.7540E+14


#### 2D Model

In [23]:
# Gradient-descent settings for the two-feature model
# (weights are ordered: constant, sqft_living, sqft_living15).
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9
max_iter = 500

# Training design matrix and target.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(
    kc_house_train_data, model_features, my_output
)

In [24]:
# Fit the two-feature model by gradient descent.
feature_weights = regression_gradient_descent(
    features_matrix=feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
    max_iter=max_iter,
)

   0 gradient magnitude is: 7.30720E+13
  10 gradient magnitude is: 5.51539E+11
  20 gradient magnitude is: 4.33802E+11
  30 gradient magnitude is: 3.41199E+11
  40 gradient magnitude is: 2.68364E+11
  50 gradient magnitude is: 2.11076E+11
  60 gradient magnitude is: 1.66018E+11
  70 gradient magnitude is: 1.30578E+11
  80 gradient magnitude is: 1.02704E+11
  90 gradient magnitude is: 8.07799E+10
 100 gradient magnitude is: 6.35359E+10
 110 gradient magnitude is: 4.99729E+10
 120 gradient magnitude is: 3.93053E+10
 130 gradient magnitude is: 3.09148E+10
 140 gradient magnitude is: 2.43155E+10
 150 gradient magnitude is: 1.91249E+10
 160 gradient magnitude is: 1.50423E+10
 170 gradient magnitude is: 1.18312E+10
 180 gradient magnitude is: 9.30564E+09
 190 gradient magnitude is: 7.31918E+09
 200 gradient magnitude is: 5.75676E+09
 210 gradient magnitude is: 4.52787E+09
 220 gradient magnitude is: 3.56131E+09
 230 gradient magnitude is: 2.80108E+09
 240 gradient magnitude is: 2.20314E+09


In [25]:
# Label each fitted weight with its column name; the constant comes first,
# matching the column order produced by get_numpy_data.
weight_labels = ['constant'] + model_features
pd.DataFrame(feature_weights, index=weight_labels)

Unnamed: 0,0
constant,-99999.968849
sqft_living,245.072603
sqft_living15,65.279527


In [26]:
# Build the test design matrix for the two-feature model and compare its
# prediction for the first row with the recorded sale price.
test_multiple_feature_matrix, test_output = get_numpy_data(kc_house_test_data, model_features, my_output)
predicted_price = predict_outcome(test_multiple_feature_matrix[0], feature_weights)
# This is the second (2D) model; the message previously said "model 1",
# copied from the one-feature section.
print('The predicted price for the 1st house in the Test data set for model 2 is %d' % predicted_price)
# test_output[0] is the target of the same positional row as the features above.
print('The actual price for the 1st house in the Test data set is %d' % test_output[0])

The predicted price for the 1st house in the Test data set for model 1 is 366651
The actual price for the 1st house in the Test data set is 310000


In [27]:
# The 'constant' column is present in kc_house_test_data only because
# get_numpy_data wrote it there when the test design matrix was built, so this
# cell must run after that call.
two_feature_model_columns = ['constant', 'sqft_living', 'sqft_living15']
test_2_RSS = reg.get_residual_sum_of_squares_multiple_models(
    kc_house_test_data,
    two_feature_model_columns,
    'price',
    feature_weights,
)
print('RSS on all test data for this model is %.4E' % test_2_RSS)

RSS on all test data for this model is 2.7026E+14
