In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model

In [None]:
# Every CSV is parsed with the same settings: comma-separated fields, field
# names taken from the header row, and per-column dtypes inferred by NumPy
# (dtype=None).
_genfromtxt_options = dict(dtype=None, delimiter=',', names=True)

# Complete data set, followed by the small train / validation / small test files.
data_all = np.genfromtxt('../data/kc_house_data.csv', **_genfromtxt_options)
data = np.genfromtxt('../data/kc_house_data_small_train.csv', **_genfromtxt_options)
d_val = np.genfromtxt('../data/kc_house_data_validation.csv', **_genfromtxt_options)
d_test = np.genfromtxt('../data/kc_house_data_small_test.csv', **_genfromtxt_options)

In [None]:
def tmp_format_in_out(data):
    """Build the feature matrix and the target vector from a loaded table.

    Parameters
    ----------
    data : object indexable by column name
        For example the structured array returned by
        ``np.genfromtxt(..., names=True)``. It must provide 'floors', 'price'
        and the sixteen feature columns listed below.

    Returns
    -------
    inp : ndarray
        One row per record. Column 0 is a constant column of ones, column 1 is
        'floors', and the remaining columns follow the order of
        ``feature_names``.
    output : the 'price' column of ``data``.

    Raises
    ------
    AssertionError
        If any of the sixteen listed feature columns contains a NaN
        ('floors' itself is not checked).

    Note: an earlier revision converted 'floors' by hand because its values
    were encoded as "[num]" strings; this version uses the column as loaded.
    """
    feature_names = (
        'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
        'waterfront', 'view', 'condition', 'grade', 'sqft_above',
        'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long',
        'sqft_living15', 'sqft_lot15',
    )

    floors = data['floors']
    # Constant column first, then 'floors', then the remaining features.
    columns = [np.ones_like(floors), floors]
    for name in feature_names:
        column = data[name]
        assert not np.isnan(column).any()
        columns.append(column)

    # The columns were stacked as rows; transpose to one row per record.
    return np.array(columns).T, data['price']

def normalize_features(X):
    """Scale every column of ``X`` by its Euclidean (L2) norm.

    (Copied from module 5.)

    Parameters
    ----------
    X : 2-D array, one column per feature.

    Returns
    -------
    (X_normalized, norms) : ``X`` divided column-wise by ``norms``, and the
        per-column norms themselves so other data can be scaled identically.
    """
    column_norms = np.linalg.norm(X, axis=0)
    return X / column_norms, column_norms

In [None]:
# Training split: build the feature matrix, scale each column to unit norm and
# keep the scaling factors.
inp, output = tmp_format_in_out(data)
inp_n, norms = normalize_features(inp)

# Validation split, scaled with the norms computed on the training split.
inp_v, output_v = tmp_format_in_out(d_val)
inp_n_v = np.divide(inp_v, norms)

# Test split, scaled with the same training norms.
inp_t, output_t = tmp_format_in_out(d_test)
inp_n_t = np.divide(inp_t, norms)

In [None]:
# print(inp_n_t[0])
# print(inp_n[9])

In [None]:
# Euclidean distance between the first test house and the training house at
# index 9, as mentioned in the instructions.
query = inp_n_t[0]
en = np.sqrt(np.sum((query - inp_n[9]) ** 2))
print(en)

# Find which of the first 10 training houses is closest to the query house.
# The running minimum starts at +inf instead of a hard-coded cap, so the search
# does not depend on the scale of the distances.
min_dist = float('inf')
min_dist_h = -1
for i in range(10):
    en = np.sqrt(np.sum((query - inp_n[i]) ** 2))
    if en < min_dist:
        min_dist = en
        min_dist_h = i
# -1 would mean that no candidate was ever accepted.
assert min_dist_h >= 0
print(min_dist_h)

In [None]:
# Sanity check of broadcasting: the last row of the broadcast difference and
# the difference computed directly for that single row should sum to the same
# value.
diff = inp_n - inp_n_t[0]
# Call-style print, as used in the rest of the notebook; it prints the same
# value under Python 2 and is required under Python 3.
print((diff[-1]).sum())
print(np.sum(inp_n[-1] - inp_n_t[0]))

In [None]:
# Distances from the test house at index i to every training house, computed
# in one vectorised step; show the entry for the training house at index 100.
i = 100
diff = inp_n - inp_n_t[i]
distances = np.sqrt((diff ** 2).sum(axis=1))
print(distances[100])

In [None]:
def one_nearest_neighbor_regression(inp_n, inp_n_t_i):
    """Return the Euclidean distance from a query point to every row of ``inp_n``.

    Despite its name, this function does not predict anything: the caller is
    expected to pick the nearest row from the returned distances.

    Parameters
    ----------
    inp_n : 2-D array, one row per training sample.
    inp_n_t_i : query point, broadcastable against the rows of ``inp_n``.

    Returns
    -------
    1-D array with one distance per row of ``inp_n``.
    """
    offsets = inp_n - inp_n_t_i
    return np.sqrt((offsets ** 2).sum(axis=1))

In [None]:
# 1-nearest-neighbour for the test house at index 2 (i.e. the third test
# house): rank the training houses by distance and use the price of the
# closest one as the prediction.
cc = one_nearest_neighbor_regression(inp_n, inp_n_t[2])
cc_arg = cc.argsort()
print('closest feats: {}, predicted value: {}'.format(cc_arg, output[cc_arg[0]]))

In [None]:
def k_nearest_neighbor_regression(inp_n, inp_n_t_i, k):
    """Find the ``k`` rows of ``inp_n`` closest (Euclidean) to a query point.

    Parameters
    ----------
    inp_n : 2-D array, one row per training sample.
    inp_n_t_i : query point, broadcastable against the rows of ``inp_n``.
    k : number of neighbours to return; must be positive.

    Returns
    -------
    (k_closest_idx, distances) : the indices of the ``k`` nearest rows ordered
        from nearest to farthest, and the distance to every row of ``inp_n``.

    Raises
    ------
    AssertionError
        If ``k`` is not positive.
    """
    assert k > 0
    offsets = inp_n - inp_n_t_i
    distances = np.sqrt((offsets ** 2).sum(axis=1))
    ranking = np.argsort(distances)
    return ranking[:k], distances

In [None]:
def predict_with_k_nn_regresion(inp_n, inp_n_t_i, output, k, verbose=False):
    """Predict the target of a query point as the mean target of its k nearest neighbours.

    Parameters
    ----------
    inp_n : 2-D array, one row per training sample.
    inp_n_t_i : query point, broadcastable against the rows of ``inp_n``.
    output : training targets, aligned with the rows of ``inp_n``.
    k : number of neighbours to average over; must be positive.
    verbose : when true, print the indices of the selected neighbours.

    Returns
    -------
    The arithmetic mean of the neighbours' targets, as a floating-point value.

    Notes
    -----
    The mean is taken with ``np.mean``, which always performs a true
    (floating-point) division. The previous ``avg /= k`` would floor-divide
    under Python 2 when the targets are integers. If ``k`` exceeds the number
    of training rows, the mean is taken over all available rows instead of
    raising an IndexError.
    """
    k_closest_idx, _ = k_nearest_neighbor_regression(inp_n, inp_n_t_i, k)
    if verbose:
        print(k_closest_idx)

    # np.asarray keeps plain sequences usable with the index array.
    return np.mean(np.asarray(output)[k_closest_idx])

In [None]:
# 4-nearest-neighbour prediction for the test house at index 2; verbose mode
# also prints the indices of the selected neighbours.
pred_val_4 = predict_with_k_nn_regresion(inp_n, inp_n_t[2], output, k=4, verbose=True)
print('predicted value for 4-knn: {}'.format(pred_val_4))

In [None]:
# Predict the values of the first ten test houses with k = 10 and find the
# house with the lowest predicted value.
# The running minimum starts at +inf so that any real prediction replaces it.
min_pred_value = float('inf')
idx = -1
for i in range(10):
    pred_val_10 = predict_with_k_nn_regresion(inp_n, inp_n_t[i], output, 10)
    if pred_val_10 < min_pred_value:
        idx = i
        min_pred_value = pred_val_10
# Index 0 is a legitimate answer (the first house); only the initial -1 means
# that no prediction was ever accepted.
assert idx >= 0
print('idx = {}, min_pred_value: {}'.format(idx, min_pred_value))

# Choose k (validation)

In [None]:
def compute_rss(y_pred, y_true):
    """Return the residual sum of squares between predictions and true values.

    Parameters
    ----------
    y_pred : predicted values.
    y_true : observed values, compatible with ``y_pred`` for subtraction.

    Returns
    -------
    The sum over all elements of ``(y_pred - y_true) ** 2``.
    """
    residuals = y_pred - y_true
    return np.sum(np.square(residuals))

In [None]:
# Model selection: for every candidate k, predict each validation house from
# the training set and record the residual sum of squares on the validation
# targets.
pool_k = range(1, 16)
rss_total = []
for k in pool_k:
    print('K-NN regression with k={}'.format(k))
    pred_k_nn = np.empty(len(inp_n_v))
    for i, house in enumerate(inp_n_v):
        pred_k_nn[i] = predict_with_k_nn_regresion(inp_n, house, output, k)
    rss = compute_rss(pred_k_nn, output_v)
    rss_total.append(rss)
assert len(rss_total) == len(pool_k)
# idx ranks the candidates by validation RSS; idx[0] is the best one.
idx = np.argsort(rss_total)
print('Min rss wth k={}'.format(pool_k[idx[0]]))

In [None]:
# Final evaluation: use the k with the lowest validation RSS to predict every
# test house, then report the residual sum of squares on the test targets.
k = pool_k[idx[0]]
pred_k_nn = np.empty(len(inp_n_t))
for i, house in enumerate(inp_n_t):
    pred_k_nn[i] = predict_with_k_nn_regresion(inp_n, house, output, k)
rss = compute_rss(pred_k_nn, output_t)
print('RSS on test data: {}'.format(rss))