In [131]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import log, sqrt
%matplotlib inline
from sklearn import linear_model # using scikit-learn
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int,
'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float,
'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float,
'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str,
'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('kc_house_data_small.csv',dtype=dtype_dict)
X_train = pd.read_csv('kc_house_data_small_train.csv', dtype=dtype_dict)
X_test = pd.read_csv('kc_house_data_small_test.csv',dtype=dtype_dict)
X_val = pd.read_csv('kc_house_data_validation.csv',dtype=dtype_dict)

X_train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [132]:
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',  
            'floors', 'waterfront', 'view',  
            'condition', 'grade', 'sqft_above', 'sqft_basement',
            'yr_built', 'yr_renovated', 'lat', 'long',  
            'sqft_living15', 'sqft_lot15']

In [133]:
def get_numpy_data(df, features, output):
    df['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    features_matrix = df[features].as_matrix()
    output_array = df[output].as_matrix()
    return(features_matrix, output_array)


In [134]:
def normalize_features(feature_matrix):
    norms = np.linalg.norm(feature_matrix, axis=0)
    normalize_features = feature_matrix / norms
    return normalize_features, norms

In [186]:
feature_train,y_train= get_numpy_data(X_train,features,'price')
feature_test,y_test= get_numpy_data(X_test,features,'price')
feature_val,y_val= get_numpy_data(X_val,features,'price')

norm_train, norms = normalize_features(feature_train)
norm_test = feature_test / norms
norm_val = feature_val/ norms

In [136]:
query_house = norm_test[0]
print(norm_train[9])

[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [137]:
def distance_calc (array_1, array_2):
    distance = np.sqrt(np.sum((array_1 - array_2)**2))
    return distance

In [138]:
# Quiz #1
distance_calc(norm_train[9],norm_test[0])

0.059723593713980783

In [139]:
for i in range(3):
    print(norm_train[i]-norm_test[0])


[  0.00000000e+00  -3.87821276e-03  -1.20498190e-02  -1.05552733e-02
   2.08673616e-04  -8.52950206e-03   0.00000000e+00  -5.10236549e-02
   0.00000000e+00  -3.47633726e-03  -5.50336860e-03  -2.48168183e-02
  -1.63756198e-04   0.00000000e+00  -1.70254220e-05   1.29876855e-05
  -5.14364795e-03   6.69281453e-04]
[  0.00000000e+00  -3.87821276e-03  -4.51868214e-03  -2.26610387e-03
   7.19763456e-04   0.00000000e+00   0.00000000e+00  -5.10236549e-02
   0.00000000e+00  -3.47633726e-03   1.30705004e-03  -1.45830788e-02
  -1.91048898e-04   6.65082271e-02   4.23090220e-05   6.16364736e-06
  -2.89330197e-03   1.47606982e-03]
[  0.00000000e+00  -7.75642553e-03  -1.20498190e-02  -1.30002801e-02
   1.60518166e-03  -8.52950206e-03   0.00000000e+00  -5.10236549e-02
   0.00000000e+00  -5.21450589e-03  -8.32384500e-03  -2.48168183e-02
  -3.13866046e-04   0.00000000e+00   4.70885840e-05   1.56292487e-05
   3.72914476e-03   1.64764925e-03]


In [140]:
print(norm_train[0:3] - norm_test[0])
# verify that vectorization works
results = norm_train[0:3] - norm_test[0]
print(results[0] - (norm_train[0]-norm_test[0]))
# should print all 0's if results[0] == (features_train[0]-features_test[0])
print(results[1] - (norm_train[1]-norm_test[0]))
# should print all 0's if results[1] == (features_train[1]-features_test[0])
print(results[2] - (norm_train[2]-norm_test[0]))
# should print all 0's if results[2] == (features_train[2]-features_test[0])

[[  0.00000000e+00  -3.87821276e-03  -1.20498190e-02  -1.05552733e-02
    2.08673616e-04  -8.52950206e-03   0.00000000e+00  -5.10236549e-02
    0.00000000e+00  -3.47633726e-03  -5.50336860e-03  -2.48168183e-02
   -1.63756198e-04   0.00000000e+00  -1.70254220e-05   1.29876855e-05
   -5.14364795e-03   6.69281453e-04]
 [  0.00000000e+00  -3.87821276e-03  -4.51868214e-03  -2.26610387e-03
    7.19763456e-04   0.00000000e+00   0.00000000e+00  -5.10236549e-02
    0.00000000e+00  -3.47633726e-03   1.30705004e-03  -1.45830788e-02
   -1.91048898e-04   6.65082271e-02   4.23090220e-05   6.16364736e-06
   -2.89330197e-03   1.47606982e-03]
 [  0.00000000e+00  -7.75642553e-03  -1.20498190e-02  -1.30002801e-02
    1.60518166e-03  -8.52950206e-03   0.00000000e+00  -5.10236549e-02
    0.00000000e+00  -5.21450589e-03  -8.32384500e-03  -2.48168183e-02
   -3.13866046e-04   0.00000000e+00   4.70885840e-05   1.56292487e-05
    3.72914476e-03   1.64764925e-03]]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.

# Perform 1-nearest neighbor regression
Now that we have the element-wise differences, it is not too hard to compute the Euclidean distances between our query house and all of the training houses. First, write a single-line expression to define a variable diff such that diff[i] gives the element-wise difference between the features of the query house and the i-th training house.

In [141]:
diff = norm_train - query_house

In [142]:
# To test your code, print diff[-1].sum(), which should be -0.0934339605842.
diff[-1].sum()


-0.093433998746546426

### 13. The next step in computing the Euclidean distances is to take these feature-by-feature differences in ‘diff’, square each, and take the sum over feature indices. That is, compute the sum of squared feature differences for each training house (row in ‘diff’).

By default, ‘np.sum’ sums up everything in the matrix and returns a single number. To instead sum only over a row or column, we need to specifiy the ‘axis’ parameter described in the np.sum documentation. In particular, ‘axis=1’ computes the sum across each row.



In [143]:
np.sum(diff**2,axis =1)

array([ 0.00363304,  0.00730492,  0.00378218, ...,  0.0032681 ,
        0.00325555,  0.00341846])

### 14. With this result in mind, write a single-line expression to compute the Euclidean distances from the query to all the instances. Assign the result to variable distances.

Hint: don't forget to take the square root of the sum of squares.

Hint: distances[100] should contain 0.0237082324496.

In [144]:
distance = np.sqrt(np.sum(diff**2, axis=1))
distance[100]

0.023708232416678195

### 15. Now you are ready to write a function that computes the distances from a query house to all training houses. The function should take two parameters: (i) the matrix of training features and (ii) the single feature vector associated with the query.

In [145]:
def compute_distance (feature_instances, feature_query):
    distance = np.sqrt(np.sum((feature_instances - feature_query)**2,axis =1))
    return distance

In [208]:
# Quiz
result = compute_distance(norm_train[0:10],norm_test[0])
result.argmin(),result.min()

(8, 0.052383627840220305)

### 16. Quiz Question: Take the query house to be third house of the test set (features_test[2]). What is the index of the house in the training set that is closest to this query house?
<br>
   <font color=red>Index 382</font>

### 17. Quiz Question: What is the predicted value of the query house based on 1-nearest neighbor regression?
<br>
   <font color=red>249000</font>

In [146]:
result = compute_distance(norm_train,norm_test[2])
result.argmin(),result.min(),y_train[382]

(382, 0.0028604955575117085, 249000.0)

## 18. Using the functions above, implement a function that takes in

- the value of k;
- the feature matrix for the instances; and
- the feature of the query

and returns the indices of the k closest training houses. For instance, with 2-nearest neighbor, a return value of [5, 10] would indicate that the 6th and 11th training houses are closest to the query house.

In [151]:
def k_nearest_neighbors(k, feature_train, feature_query):
    distance= compute_distance(feature_train,feature_query)
    return distance.argsort(axis =0)[0:k]

# Hint: you can extract multiple items from a numpy array using a list of indices. 
# For instance, output_train[[6,10]] returns the output values (prices) of the 7th and 11th instances.

### 19. Quiz Question: Take the query house to be third house of the test set (features_test[2]). What are the indices of the 4 training houses closest to the query house?

In [152]:
k_nearest_neighbors(4,norm_train,norm_test[2])

array([ 382, 1149, 4087, 3142])

## 20. Now that we know how to find the k-nearest neighbors, write a function that predicts the value of a given query house. For simplicity, take the average of the prices of the k nearest neighbors in the training set. The function should have the following parameters:

- the value of k;
- the feature matrix for the instances;
- the output values (prices) of the instances; and
- the feature of the query, whose price we’re predicting.
- The function should return a predicted value of the query house.

In [153]:
def predict_output_of_query(k, features_train, output_train, features_query):
    k_neighbors = k_nearest_neighbors(k,features_train,features_query)
    prediction = output_train[k_neighbors].mean()
#     prediction =np.mean(output_train[distance])
    return prediction

### 21. Quiz Question: Again taking the query house to be third house of the test set (features_test[2]), predict the value of the query house using k-nearest neighbors with k=4 and the simple averaging method described and implemented above.

In [154]:
predict_output_of_query(4,norm_train,y_train,norm_test[2])

413987.5

In [155]:
norm_test[2:10].shape

(8, 18)

## 22. Finally, write a function to predict the value of each and every house in a query set. (The query set can be any subset of the dataset, be it the test set or validation set.) The idea is to have a loop where we take each house in the query set as the query house and make a prediction for that specific house. The new function should take the following parameters:

- the value of k;
- the feature matrix for the training set;
- the output values (prices) of the training houses; and
- the feature matrix for the query set.
- The function should return a set of predicted values, one for each house in the query set.

In [169]:
def predict_output(k, features_train, output_train, features_query):
    result = []
    for i in range(features_query.shape[0]):
        predict_price = predict_output_of_query(k, features_train, output_train, features_query[i])
        predictions = (features_query[i], predict_price)
        result.append(predict_price)

    return result

[413987.5,
 552750.0,
 869625.0,
 683237.5,
 332830.0,
 577500.0,
 436250.0,
 454975.0]

### 23. Quiz Question: Make predictions for the first 10 houses in the test set, using k=10. What is the index of the house in this query set that has the lowest predicted value? What is the predicted value of this house?

In [202]:
result = predict_output(10,norm_train,y_train,norm_test[0:10])
print(result)
print('Index: ',np.argmin(result),' Predicted Price: ', np.min(result))

[881300.0, 431860.0, 460595.0, 430200.0, 766750.0, 667420.0, 350032.0, 512800.70000000001, 484000.0, 457235.0]
Index:  6  Predicted Price:  350032.0


Choosing the best value of k using a validation set

## 24. There remains a question of choosing the value of k to use in making predictions. Here, we use a validation set to choose this value. Write a loop that does the following:

For k in [1, 2, … 15]:

Make predictions for the VALIDATION data using the k-nearest neighbors from the TRAINING data.
Compute the RSS on VALIDATION data
Report which k produced the lowest RSS on validation data.

<font color=red> k = 8</font>

In [201]:
result =[]
for k in range(1,16):
    prediction = predict_output(k,norm_train,y_train,norm_val)
    RSS = np.sum((prediction - y_val)**2)
    answer = (k, RSS)
    result.append(answer)             
result

[(1, 105453830251561.0),
 (2, 83445073504025.5),
 (3, 72692096019202.562),
 (4, 71946721652091.688),
 (5, 69846517419718.602),
 (6, 68899544353180.836),
 (7, 68341973450051.094),
 (8, 67361678735491.5),
 (9, 68372727958976.094),
 (10, 69335048668556.742),
 (11, 69523855215598.828),
 (12, 69049969587246.172),
 (13, 70011254508263.688),
 (14, 70908698869034.344),
 (15, 71106928385945.156)]

In [199]:
plt(result[0],result[1],'-')

TypeError: 'module' object is not callable

### 25. Quiz Question: What is the RSS on the TEST data using the value of k found above? To be clear, sum over all houses in the TEST set.

In [204]:
prediction = predict_output(8,norm_train,y_train,norm_test)
RSS = np.sum((prediction - y_test)**2)
RSS

133118823551516.81