# Predicting house prices using k-nearest neighbors regression

In this notebook, you will implement k-nearest neighbors regression. You will:

Find the k-nearest neighbors of a given query input
Predict the output for the query input using the k-nearest neighbors
Choose the best value of k using a validation set

In [91]:
import pandas as pd 
import numpy as np

In [92]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('kc_house_data_small.csv', dtype=dtype_dict)
train = pd.read_csv('kc_house_data_small_train.csv', dtype=dtype_dict)
test = pd.read_csv('kc_house_data_small_test.csv', dtype=dtype_dict)
validation = pd.read_csv('kc_house_data_validation.csv', dtype=dtype_dict)

In [93]:
def get_numpy_data(data_frame, features, output):

    features_matrix = np.array([np.ones(len(data_frame))] + [data_frame[feature] for feature in features])

    output_array = np.array(data_frame[output])
    
    return features_matrix.T, output_array

def normalize_features(features):
    
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features / norms
    
    return normalized_features, norms

In [94]:
feature_list = ['bedrooms',  
                'bathrooms',  
                'sqft_living',  
                'sqft_lot',  
                'floors',
                'waterfront',  
                'view',  
                'condition',  
                'grade',  
                'sqft_above',  
                'sqft_basement',
                'yr_built',  
                'yr_renovated',  
                'lat',  
                'long',  
                'sqft_living15',  
                'sqft_lot15']
features_train, output_train = get_numpy_data(train, feature_list, 'price')
features_test, output_test = get_numpy_data(test, feature_list, 'price')
features_valid, output_valid = get_numpy_data(validation, feature_list, 'price')

In [95]:
features_train, norms = normalize_features(features_train) # normalize training set features (columns)
features_test = features_test / norms # normalize test set by training set norms
features_valid = features_valid / norms # normalize validation set by training set norms

In [96]:
query = features_test[0]
print(query)
print(features_train[9])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [97]:
def euclideanDistance(v1, v2):
    return np.sqrt(np.sum((v1 - v2) ** 2))

# 1. What is the Euclidean distance between the query house and the 10th house of the training set? Enter your answer in American-style decimals (e.g. 0.044) rounded to 3 decimal places.

In [101]:
euclideanDistance(query, features_train[9])

0.059723593713980783

# 2. Among the first 10 training houses, which house is the closest to the query house? Enter the 0-based index of the closest house.

In [106]:
nearest = []
for i, v in enumerate(features_train[0:10]):
    nearest.append(euclideanDistance(query, v))

np.argmin(np.array(nearest))

8

In [109]:
def euclideanDistanceMatrix(v1, v2):
    return np.sqrt(np.sum((v1 - v2) ** 2, axis=1))

In [110]:
def compute_distances(features_instances, features_query):
    return euclideanDistanceMatrix(features_query, features_instances)

# 3. Quiz Question: Take the query house to be third house of the test set (features_test[2]). What is the index of the house in the training set that is closest to this query house?

In [112]:
distances = compute_distances(features_train, features_test[2])
np.argmin(distances)

382

# 4. Take the query house to be third house of the test set (features_test[2]). What is the predicted value of the query house based on 1-nearest neighbor regression? Enter your answer in simple decimals without comma separators (e.g. 300000), rounded to nearest whole number.

In [114]:
train['price'][np.argmin(distances)]

249000.0

In [115]:
def k_nearest_neighbors(k, feature_train, features_query):
    return np.argsort(compute_distances(feature_train, features_query))[0:k]

# 5. Take the query house to be third house of the test set (features_test[2]). Which of the following is NOT part of the 4 training houses closest to the query house? (Note that all indices are 0-based.)



In [116]:
k_nearest_neighbors(4, features_train, features_test[2])

array([ 382, 1149, 4087, 3142])

In [117]:
def predict_output_of_query(k, features_train, output_train, features_query):
    neighbours = k_nearest_neighbors(k, features_train, features_query)
    return np.mean(output_train[neighbours])

# 6. Take the query house to be third house of the test set (features_test[2]). Predict the value of the query house by the simple averaging method. Enter your answer in simple decimals without comma separators (e.g. 241242), rounded to nearest whole number.


In [78]:
predict_output_of_query(4, features_train, output_train, features_test[2])

413987.5

In [79]:
def predict_output(k, features_train, output_train, features_query):
    predictions = []
    for query in features_query:
        predictions.append(predict_output_of_query(k, features_train, output_train, query))
    
    return predictions

# 7. What is the predicted value of the house in this query set that has the lowest predicted value? Enter your answer in simple decimals without comma separators (e.g. 312000), rounded to nearest whole number.

In [122]:
predictions = predict_output(4, features_train, output_train, features_test[0:10])
print(np.argsort(predictions))

[6 2 1 8 9 3 7 5 4 0]


In [123]:
output_train[6]

229500.0

In [89]:
for k in range(1,16):
    predictions = predict_output(k, features_train, output_train, features_valid)
    print(k, rss(output_valid, predictions))

1 1.05453830252e+14
2 8.3445073504e+13
3 7.26920960192e+13
4 7.19467216521e+13
5 6.98465174197e+13
6 6.88995443532e+13
7 6.83419734501e+13
8 6.73616787355e+13
9 6.8372727959e+13
10 6.93350486686e+13
11 6.95238552156e+13
12 6.90499695872e+13
13 7.00112545083e+13
14 7.0908698869e+13
15 7.11069283859e+13


In [87]:
rss = lambda y, y_hat: np.dot((y - y_hat).T, (y - y_hat))

In [90]:
predictions = predict_output(8, features_train, output_train, features_test)
print(rss(output_test, predictions))

1.33118823552e+14
