In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Directory prefix for the kc_house_data_small_* CSV files. It is concatenated
# directly with the file names below, so it must end with a path separator.
PATH = 'data/'

In [3]:
# Column name -> dtype handed to pd.read_csv for every data split.
# The identifier-like columns (zipcode, date, id) are kept as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [5]:
def get_numpy_data(df, features, output):
    """Build a feature matrix (with an intercept column) and a target array.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and the output column.
    features : sequence of str
        Names of the feature columns, in the order they should appear
        after the leading constant column.
    output : str
        Name of the target column.

    Returns
    -------
    feature_matrix : numpy.ndarray
        2-D array whose column 0 is all ones, followed by one column per
        entry of ``features``.
    output_array : numpy.ndarray
        1-D array with the values of ``df[output]``.

    Notes
    -----
    ``df`` is not modified: the constant column is added to a copy, so the
    caller's frame does not gain a ``'constant'`` column and the function
    can be re-run safely on the same frame.
    """
    # list() lets callers pass any sequence (tuple, Index, ...), not only a list.
    columns = ['constant'] + list(features)
    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    features_frame = df.assign(constant=1)[columns]
    feature_matrix = features_frame.to_numpy()
    output_array = df[output].to_numpy()
    return feature_matrix, output_array

In [6]:
def normalize_features(features_matrix):
    """Scale every column of a feature matrix to unit Euclidean (L2) norm.

    Parameters
    ----------
    features_matrix : numpy.ndarray
        2-D array with one row per observation and one column per feature.

    Returns
    -------
    normalized_features : numpy.ndarray
        ``features_matrix`` with each column divided by its 2-norm. A column
        whose norm is zero (every entry is 0) is returned as zeros rather
        than NaN.
    norms : numpy.ndarray
        The per-column 2-norms, reported as computed (zero-norm columns are
        reported as 0). Callers that reuse ``norms`` to scale other data
        should guard against zero entries themselves.
    """
    norms = np.linalg.norm(features_matrix, axis=0)
    # An all-zero column has norm 0; dividing by it would produce 0/0 = NaN.
    # Dividing by 1 instead leaves such a column as zeros.
    safe_norms = np.where(norms == 0, 1.0, norms)
    normalized_features = features_matrix / safe_norms
    return normalized_features, norms

In [7]:
# Load the three pre-split data sets (train / validation / test) from PATH,
# parsing every column with the dtypes declared in dtype_dict.
train = pd.read_csv(f'{PATH}kc_house_data_small_train.csv',dtype=dtype_dict)
validation = pd.read_csv(f'{PATH}kc_house_data_small_validation.csv',dtype=dtype_dict)
test = pd.read_csv(f'{PATH}kc_house_data_small_test.csv',dtype=dtype_dict)

In [8]:
# The 17 input columns used for the nearest-neighbour model, in matrix order.
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
    'lat', 'long',
    'sqft_living15', 'sqft_lot15',
]

In [9]:
# Convert each split into a (feature matrix, price array) pair. Column 0 of
# every matrix is the constant feature, followed by the columns in feature_list.
train_feature_matrix, train_output_array = get_numpy_data(train,feature_list,'price')
validation_feature_matrix, validation_output_array = get_numpy_data(validation,feature_list,'price')
test_feature_matrix, test_output_array = get_numpy_data(test,feature_list,'price')

In [10]:
# Normalize the training columns and keep the training norms for reuse below.
# NOTE: the variable name is misspelled ("noramalized"); the rest of the notebook
# refers to it under this spelling, so any rename must be applied everywhere at once.
train_noramalized_features,train_norms = normalize_features(train_feature_matrix)

In [11]:
# Scale validation and test features with the norms computed on the TRAINING
# set (not their own norms) so that all three sets share the same feature scale.
validation_normalized_features = validation_feature_matrix / train_norms
test_normalized_features = test_feature_matrix / train_norms

In [14]:
# Show the query point (first test row) and the training row at index 9
# (the 10th training row); each is an 18-entry normalized feature vector.
print(test_normalized_features[0])
print(train_noramalized_features[9])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059
  0.          0.05102365  0.0116321   0.01564352  0.01362084  0.02481682
  0.01350306  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [23]:
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    The element-wise difference ``x - y`` is squared, all entries are summed,
    and the square root of that total is returned.
    """
    delta = x - y
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

In [24]:
# Distance between the first test row and the training row at index 9.
euclidean_distance(test_normalized_features[0],train_noramalized_features[9])

0.05972359371398078

In [28]:
# Distance from the first test row to each of the first 10 training rows.
# The printed label is i+1 (1-based), so label n refers to training row index n-1.
for i in range(10):
    print(euclidean_distance(test_normalized_features[0],train_noramalized_features[i]), '--', i+1)

0.06027470916295592 -- 1
0.08546881147643746 -- 2
0.06149946435279315 -- 3
0.05340273979294363 -- 4
0.05844484060170442 -- 5
0.059879215098128345 -- 6
0.05463140496775461 -- 7
0.055431083236146074 -- 8
0.052383627840220305 -- 9
0.05972359371398078 -- 10


In [29]:
# Per-feature difference between each of the first 3 training rows and the
# first test row, computed one row at a time.
for i in range(3):
    print(train_noramalized_features[i]-test_normalized_features[0])
    # should print 3 vectors of length 18 (1 constant column + 17 features)

[ 0.00000000e+00 -3.87821276e-03 -1.20498190e-02 -1.05552733e-02
  2.08673616e-04 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -3.47633726e-03 -5.50336860e-03 -2.48168183e-02
 -1.63756198e-04  0.00000000e+00 -1.70254220e-05  1.29876855e-05
 -5.14364795e-03  6.69281453e-04]
[ 0.00000000e+00 -3.87821276e-03 -4.51868214e-03 -2.26610387e-03
  7.19763456e-04  0.00000000e+00  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -3.47633726e-03  1.30705004e-03 -1.45830788e-02
 -1.91048898e-04  6.65082271e-02  4.23090220e-05  6.16364736e-06
 -2.89330197e-03  1.47606982e-03]
[ 0.00000000e+00 -7.75642553e-03 -1.20498190e-02 -1.30002801e-02
  1.60518166e-03 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -5.21450589e-03 -8.32384500e-03 -2.48168183e-02
 -3.13866046e-04  0.00000000e+00  4.70885840e-05  1.56292487e-05
  3.72914476e-03  1.64764925e-03]


In [30]:
# Same three difference vectors in a single expression: the 1-D query row is
# broadcast across the rows of the 2-D training slice.
print(train_noramalized_features[0:3]-test_normalized_features[0])

[[ 0.00000000e+00 -3.87821276e-03 -1.20498190e-02 -1.05552733e-02
   2.08673616e-04 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
   0.00000000e+00 -3.47633726e-03 -5.50336860e-03 -2.48168183e-02
  -1.63756198e-04  0.00000000e+00 -1.70254220e-05  1.29876855e-05
  -5.14364795e-03  6.69281453e-04]
 [ 0.00000000e+00 -3.87821276e-03 -4.51868214e-03 -2.26610387e-03
   7.19763456e-04  0.00000000e+00  0.00000000e+00 -5.10236549e-02
   0.00000000e+00 -3.47633726e-03  1.30705004e-03 -1.45830788e-02
  -1.91048898e-04  6.65082271e-02  4.23090220e-05  6.16364736e-06
  -2.89330197e-03  1.47606982e-03]
 [ 0.00000000e+00 -7.75642553e-03 -1.20498190e-02 -1.30002801e-02
   1.60518166e-03 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
   0.00000000e+00 -5.21450589e-03 -8.32384500e-03 -2.48168183e-02
  -3.13866046e-04  0.00000000e+00  4.70885840e-05  1.56292487e-05
   3.72914476e-03  1.64764925e-03]]


In [35]:
# Difference between EVERY training row and the first test row (via broadcasting).
diff = train_noramalized_features - test_normalized_features[0]

In [36]:
# Sum of the signed per-feature differences for the last training row.
diff[-1].sum()

-0.09343399874654643

In [43]:
# Squared distance for training row 15, read from the row-wise (axis=1) sum
# over all training rows.
np.sum(diff**2,axis=1)[15]

0.0033070590284564457

In [45]:
# The same squared distance computed from row 15 alone; it agrees with the
# axis=1 result up to floating-point rounding in the last digits.
np.sum(diff[15]**2)

0.0033070590284564453

In [46]:
# Squared distance from the first test row to every training row (one value per row).
np.sum(diff**2,axis=1)

array([0.00363304, 0.00730492, 0.00378218, ..., 0.0032681 , 0.00325555,
       0.00341846])

In [48]:
# Euclidean distance from the first test row to every training row.
distances = np.sqrt(np.sum(diff**2,axis=1))

In [49]:
# Distance between the first test row and the training row at index 100.
distances[100]

0.023708232416678195

In [50]:
def compute_distances(features_instances, features_query):
    """Return the Euclidean distance from a query point to every instance.

    ``features_query`` is subtracted from ``features_instances`` (one
    instance per row), the differences are squared and summed along axis 1,
    and the square root of each row total is returned, giving one distance
    per instance row.
    """
    offsets = features_instances - features_query
    squared_norms = np.square(offsets).sum(axis=1)
    return np.sqrt(squared_norms)

In [52]:
# Recompute `distances`, now for the test row at index 2 (the third test row).
# This overwrites the earlier `distances` that was based on test row 0.
distances = compute_distances(train_noramalized_features, test_normalized_features[2])

In [53]:
# Display the distance from the test row at index 2 to each training row.
distances

array([0.01954476, 0.06861035, 0.02165079, ..., 0.02433478, 0.02622734,
       0.02637942])

In [57]:
# Training-row indices ordered from nearest to farthest from the query.
np.argsort(distances)

array([ 382, 1149, 4087, ..., 1107, 5226, 2486], dtype=int64)

In [61]:
# Training prices re-ordered from the nearest neighbour to the farthest one.
train_output_array[np.argsort(distances)]

array([ 249000.,  477000.,  499950., ...,  790000., 1600000.,  937500.])

In [54]:
# Index of the single closest training row (the 1-nearest neighbour).
np.argmin(distances)

382

In [56]:
# Price of the nearest neighbour. 382 is copied by hand from the argmin output,
# so it has to be updated if the query row or the data changes.
train_output_array[382]

249000.0

In [58]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Return the indices of the ``k`` training rows closest to the query.

    Distances from ``features_query`` to every row of ``feature_train`` are
    computed with ``compute_distances``; the row indices are then ordered by
    ascending distance and the first ``k`` of them are returned.
    """
    ranked = np.argsort(compute_distances(feature_train, features_query))
    return ranked[:k]

In [60]:
# Indices of the 4 training rows nearest to the test row at index 2.
k_nearest_neighbors(4,train_noramalized_features,test_normalized_features[2])

array([ 382, 1149, 4087, 3142], dtype=int64)

In [62]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query as the mean over its k nearest neighbours.

    The indices of the ``k`` nearest training rows are looked up with
    ``k_nearest_neighbors``; the matching entries of ``output_train`` are
    selected and their mean is returned.
    """
    neighbor_idx = k_nearest_neighbors(k, features_train, features_query)
    neighbor_outputs = output_train[neighbor_idx]
    return neighbor_outputs.mean()

In [63]:
# Predicted price for the test row at index 2: the mean price of its 4 nearest training rows.
predict_output_of_query(4,train_noramalized_features,train_output_array,test_normalized_features[2])

413987.5

In [73]:
def predict_output(k, features_train, output_train, features_query):
    """Predict an output for every row of ``features_query``.

    Each row, taken in index order, is passed to ``predict_output_of_query``
    together with ``k`` and the training data. The predictions are returned
    as a plain Python list, one entry per query row.
    """
    return [
        predict_output_of_query(k, features_train, output_train, features_query[row])
        for row in range(len(features_query))
    ]

In [74]:
# 10-nearest-neighbour price predictions for the first 10 test rows.
predict_output(10,train_noramalized_features, train_output_array, test_normalized_features[:10,:])

[881300.0,
 431860.0,
 460595.0,
 430200.0,
 766750.0,
 667420.0,
 350032.0,
 512800.7,
 484000.0,
 457235.0]

In [75]:
def rss(x, y):
    """Return the residual sum of squares between ``x`` and ``y``.

    This is the sum over all entries of ``(x - y) ** 2``.
    """
    residuals = x - y
    return np.sum(np.square(residuals))

In [83]:
# Sweep k = 1..15: predict every validation row with k nearest neighbours and
# print the validation RSS for each k, so the k with the lowest RSS can be picked.
for k in range(1, 16):
    predictions_validation = predict_output(
        k,
        train_noramalized_features,
        train_output_array,
        validation_normalized_features,
    )
    validation_rss = rss(predictions_validation, validation_output_array)
    print("{:e}".format(validation_rss),'---',k)
    print('-------------')

1.054538e+14 --- 1
-------------
8.344507e+13 --- 2
-------------
7.269210e+13 --- 3
-------------
7.194672e+13 --- 4
-------------
6.984652e+13 --- 5
-------------
6.889954e+13 --- 6
-------------
6.834197e+13 --- 7
-------------
6.736168e+13 --- 8
-------------
6.837273e+13 --- 9
-------------
6.933505e+13 --- 10
-------------
6.952386e+13 --- 11
-------------
6.904997e+13 --- 12
-------------
7.001125e+13 --- 13
-------------
7.090870e+13 --- 14
-------------
7.110693e+13 --- 15
-------------


In [79]:
# Predict every test row with k=8, the value that gave the lowest validation RSS
# in the k = 1..15 sweep. The 8 is hard-coded: update it if the sweep is re-run
# and a different k wins.
predictions_test = predict_output(8,train_noramalized_features, train_output_array, test_normalized_features)

In [81]:
# Report the RSS of the k=8 predictions on the held-out test set, in scientific notation.
print("{:e}".format(rss(predictions_test,test_output_array)))

1.331188e+14
