In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from math import log, sqrt

In [8]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 
              'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 
              'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 
              'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 
              'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('kc_house_data_small.csv', dtype=dtype_dict, parse_dates=['date'])
train = pd.read_csv('kc_house_data_small_train.csv', dtype=dtype_dict, 
                    parse_dates=['date'])
validation = pd.read_csv('kc_house_data_validation.csv', dtype=dtype_dict, 
                    parse_dates=['date'])
test = pd.read_csv('kc_house_data_small_test.csv', dtype=dtype_dict, 
                    parse_dates=['date'])

In [3]:
def get_X_y(df, features, output):
    X = np.c_[np.ones(df.shape[0]), df[features].values]
    y = df[output].values
    return X, y

def predict_output(feature_matrix, weights):
    pred = np.matmul(feature_matrix, weights.T)    
    return pred
    
def normalize_features(X):
    norms = np.linalg.norm(X, axis=0)
    X_normalized = X / norms
    return X_normalized, norms

In [13]:
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront',
'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
'lat', 'long', 'sqft_living15', 'sqft_lot15']
len(features)

17

In [15]:
X_train, y_train = get_X_y(train, features, 'price')
X_validation, y_validation = get_X_y(validation, features, 'price')
X_test, y_test = get_X_y(test, features, 'price')

X_train, norms = normalize_features(X_train)
X_test = X_test / norms
X_validation = X_validation / norms

In [22]:
from scipy.spatial import distance
distance.euclidean(X_test[0], X_train[9])

0.059723593713980776

In [38]:
dists = []
for i in range(10):
    dists.append(distance.euclidean(X_test[0], X_train[i]))
np.argmin(dists)

8

In [44]:
diff = X_train - X_test[0]
diff[-1].sum()
distance = np.sqrt(np.sum(diff**2, axis=1))
distance[100]

0.023708232416678195

In [45]:
def compute_distances(features_instances, features_query):
    diff = features_instances - features_query
    distances = np.sqrt(np.sum(diff**2, axis=1))
    return distances

In [47]:
tmp_dist = compute_distances(X_test[2], X_train)

In [49]:
np.argmin(tmp_dist)

382

In [51]:
y_train[np.argmin(tmp_dist)]

249000.0

In [70]:
def k_nearest_neighbors(k, features_train, features_query):
    distances = compute_distances(features_train, features_query)
    neighbors = np.argsort(distances)[:k]
    return neighbors

In [71]:
k_nearest_neighbors(4, X_train, X_test[2])

array([ 382, 1149, 4087, 3142])

In [72]:
def predict_output_of_query(k, features_train, output_train, features_query):
    neighbors = k_nearest_neighbors(k, features_train, features_query)
    prediction = np.mean(output_train[neighbors])
    return prediction

In [73]:
predict_output_of_query(4, X_train, y_train, X_test[2])

413987.5

In [74]:
def predict_output(k, features_train, output_train, features_query):
    predictions = []
    for i in range(features_query.shape[0]):
        predictions.append(predict_output_of_query
                           (k, features_train, output_train, features_query[i]))
    return predictions

In [77]:
predict_output(10, X_train, y_train, X_test[:10])

[881300.0,
 431860.0,
 460595.0,
 430200.0,
 766750.0,
 667420.0,
 350032.0,
 512800.70000000001,
 484000.0,
 457235.0]

In [78]:
rsses = []
for k in range(1, 16):
    tmp_predictions = predict_output(k, X_train, y_train, X_validation)
    rss = np.sum(np.square(tmp_predictions - y_validation))
    rsses.append(rss)

In [80]:
np.argmin(rsses)

7

In [82]:
tmp_predictions = predict_output(7, X_train, y_train, X_test)
print('{:e}'.format(np.sum(np.square(tmp_predictions - y_test))))

1.317571e+14
