# Nearest Neighbour

The purpose of this sectionn is to use nearest neighbour algorithm to predict house price. First, import and load source data.

In [193]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.decomposition import PCA


In [194]:
#load data and store in X, y
path = './archive'

kc_data = pd.read_csv(path + '/kc_preprocessed.csv')
kc_data = kc_data.drop(['Unnamed: 0'], axis = 1)
columns = kc_data.columns

y = kc_data['price']
X = kc_data.drop(['price'], axis = 1)

Split data into train, test, final test data by 60/20/20 ratio.

In [195]:
# split the train, test and final test data
X_train, X_final_test, y_train, y_final_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.25, random_state=42) 


#-------------above: data clean and split--------------------------------------------

Then transform input data by standard normal distribution.

In [196]:
#standard normalization, using training data mean and sigma for test data as well for consistency with model
def X_norm(X):   
        mean = np.average(X_train, axis=0)
        sigma = np.var(X_train, axis=0)**0.5
        X_new = (X - mean )/sigma
        return X_new
    
def y_norm(y):
        mean = np.average(y_train, axis=0)
        sigma = np.var(y_train, axis=0)**0.5
        y_new = (y - mean )/sigma
        return y_new

In [197]:
X_train_norm = X_norm(X_train)
X_test_norm = X_norm(X_test)
y_train_norm = y_norm(y_train)
y_test_norm = y_norm(y_test)
print(X_train_norm)
print(X_train_norm.iloc[0])
print(X_train_norm.loc[3086])

           date  bedrooms  bathrooms  sqft_living  sqft_lot    floors  \
295   -0.988310  0.700497   0.498111     1.400390  0.167790  0.918016   
3086   1.603723 -1.527280  -1.454672    -1.050993 -0.238216 -0.923555   
6962  -1.430636 -0.413391   1.474502     1.761531 -0.191261  0.918016   
4249  -0.298281  0.700497   0.823574     1.214347 -0.234412  0.918016   
8903  -0.802533  0.700497   0.498111     0.853206 -0.230366  0.918016   
...         ...       ...        ...          ...       ...       ...   
15611  1.462178  0.700497  -0.478281    -0.438147  0.102630 -0.923555   
20027  0.621758  1.814386   1.799966     2.013236 -0.240624  0.918016   
6189  -0.448672  1.814386   0.823574     1.400390 -0.143102  0.918016   
5106  -0.059424 -0.413391  -0.478281    -0.569471 -0.190177 -0.002770   
20857 -0.254048 -1.527280  -0.803744    -0.700796 -0.347176  0.918016   

       waterfront      view  condition     grade  sqft_above  yr_built  \
295     -0.086819 -0.307305  -0.621478  1.992210 

Then fit to nearest neighbour model. Select KD tree algorithm for the model as it is good for large number of samples with dimension < 20.

In [198]:
for k in range(1,21):
    neigh = KNeighborsRegressor(n_neighbors=k, weights='distance', algorithm='kd_tree')
    neigh.fit(X_train_norm, y_train)
    print("R2 for k="+str(k)+": "+str(neigh.score(X_test_norm, y_test)))

R2 for k=1: 0.6901725527613745
R2 for k=2: 0.7552395717608498
R2 for k=3: 0.7693189773944258
R2 for k=4: 0.7759246717797785
R2 for k=5: 0.7803945007115815
R2 for k=6: 0.7839421589933107
R2 for k=7: 0.7866082334798119
R2 for k=8: 0.7845751254923023
R2 for k=9: 0.7846116691054192
R2 for k=10: 0.7854834641973512
R2 for k=11: 0.7850799431953828
R2 for k=12: 0.7828208559838777
R2 for k=13: 0.7827660117381431
R2 for k=14: 0.7819475686496142
R2 for k=15: 0.7791680562322523
R2 for k=16: 0.7783465335627127
R2 for k=17: 0.7768733453369342
R2 for k=18: 0.7752940156055215
R2 for k=19: 0.7742979435658556
R2 for k=20: 0.7738430762965935


From above, k = 10 produces the best score when all features are included.

# First Trial Combining with Linear Regression

In [199]:
X_test_neigh = neigh.kneighbors(X_test_norm, return_distance=False)
print(np.max(X_test_neigh)) #double check it is the max index 12966-1

12965


Above line is to get the neighbour points. What if we combine and use linear regression for neighbour points instead of just an average over these points? Let's have first trial using 20 neighbours of the first test point.

In [200]:
X_in = X_train_norm.iloc[X_test_neigh[0]]
y_in = y_train_norm.iloc[X_test_neigh[0]]
reg=LinearRegression().fit(X_in, y_in)
print("intercept: "+str(reg.intercept_)+"; coef: "+str(reg.coef_))

print('y_norm predicted by KNN alone:'+str(y_norm(neigh.predict(X_test_norm.iloc[0].to_numpy().reshape((1,-1))))))
print('y_norm predicted by linear + KNN: '+str(reg.predict(X_test_norm.iloc[0].to_numpy().reshape((1,-1)))))
print('y_norm actual:'+str(y_test_norm.iloc[0]))

intercept: 0.20313282329456933; coef: [ 4.61245431e-02 -1.17885819e-01 -5.04784083e-02  3.41904828e-01
 -7.54991813e-01 -7.09857436e-02  1.16573418e-15 -3.88578059e-16
 -9.03429331e-02  3.33069979e-02 -1.49710253e-01  2.30326492e-01
  2.35264825e-01 -1.12263323e-02  3.17616496e-01 -2.81281856e-01
 -1.57877913e-01  8.00451797e-01]
y_norm predicted by KNN alone:[-0.47087935]
y_norm predicted by linear + KNN: [-0.81977132]
y_norm actual:-0.7531056036958254




In [201]:
reg_train=LinearRegression().fit(X_train_norm, stats.zscore(y_train))
print("intercept: "+str(reg_train.intercept_)+"; coef: "+str(reg_train.coef_))

intercept: -1.23103168411376e-14; coef: [ 0.03728965 -0.09960807  0.10405474  0.40608736  0.00487039  0.01636217
  0.14432443  0.10642428  0.04362966  0.29500055  0.05261841 -0.28107073
  0.05530448 -0.08098474  0.22564524 -0.06738594  0.0305894  -0.02123068]


The frist trial (using the first test record) of combining with linear regreesion has better performance than KNN alone!

Let's examine further. Small coefficients may imply insignificance of corresponding variables. As nearest neighbour method does not give different weight to each dimension, including feature that is not important may hurt the influence of other important features and overall performance. Try again by dropping sqft_lot, floors, yr_renovated, sqft_living15, sqft_lot15, or try each one generally

In [202]:
#X_train_drop = X_train.drop(['sqft_lot','floors','yr_renovated','sqft_living15','sqft_lot15'], axis=1)
#X_test_drop = X_test.drop(['sqft_lot','floors','yr_renovated','sqft_living15','sqft_lot15'], axis=1)

max_score={}
max_score_k={}
for feature in X_train.columns:  
    max_score[feature]=0
    max_score_k[feature]=0
    for k in range(1,21):
        X_train_drop = X_train.drop([feature], axis=1)
        X_test_drop = X_test.drop([feature], axis=1)
        neigh = KNeighborsRegressor(n_neighbors=k, weights='distance', algorithm='kd_tree')
        neigh.fit(stats.zscore(X_train_drop), y_train)
        temp = neigh.score(stats.zscore(X_test_drop), y_test)
        if temp > max_score[feature]:
            max_score[feature]=temp
            max_score_k[feature]=k
    print("R2 for k="+str(max_score_k[feature])+", drop "+feature+": "+str(max_score[feature]))

R2 for k=5, drop date: 0.7978495996251538
R2 for k=10, drop bedrooms: 0.796076051095866
R2 for k=9, drop bathrooms: 0.7972616504005778
R2 for k=7, drop sqft_living: 0.7774798069400735
R2 for k=8, drop sqft_lot: 0.7897353815265381
R2 for k=8, drop floors: 0.794808184876425
R2 for k=13, drop waterfront: 0.780622613063982
R2 for k=6, drop view: 0.8018300845945656
R2 for k=9, drop condition: 0.794459191216299
R2 for k=10, drop grade: 0.7664373373593112
R2 for k=6, drop sqft_above: 0.7883099714806779
R2 for k=7, drop yr_built: 0.787043954983182
R2 for k=7, drop yr_renovated: 0.7880850410562545
R2 for k=9, drop zipcode: 0.7838195605558239
R2 for k=9, drop lat: 0.7215526717465148
R2 for k=9, drop long: 0.7676557193865654
R2 for k=9, drop sqft_living15: 0.7828951851314768
R2 for k=8, drop sqft_lot15: 0.7915952573328573


From the above, best performace occurs around k=9, and generally droping one feature may not improve the score much, and it may hurt performance for dropping some important feature, notably such as lat, long, sqft_living, grade. Let's see what if only lat, long, sqft_living, grade are included?

In [203]:
X_train_drop = X_train[['lat','long']]
X_test_drop = X_test[['lat', 'long']]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(neigh.score(stats.zscore(X_test_drop), y_test))

0.5383854974562687


In [204]:
X_train_drop = X_train[['sqft_living','lat','long']]
X_test_drop = X_test[['sqft_living','lat', 'long']]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(neigh.score(stats.zscore(X_test_drop), y_test))

0.7863773086534392


In [205]:
X_train_drop = X_train[['sqft_living','grade','lat','long']]
X_test_drop = X_test[['sqft_living','grade','lat', 'long']]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(neigh.score(stats.zscore(X_test_drop), y_test))

0.81609345393923


We should note that only lat, long, sqft_living already gives the same performance vs all the features! 

And it may help if only add some more features selectively.

In [206]:
print("R2 score after adding one feature at a time to sqft_living, lat, long")
print("----------------")
for feature in set(X_train.columns) - {'sqft_living','lat','long'}:
    X_train_drop = X_train[['sqft_living','lat','long',feature]]
    X_test_drop = X_test[['sqft_living','lat', 'long',feature]]
    neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
    neigh.fit(stats.zscore(X_train_drop), y_train)
    print(feature+": "+str(neigh.score(stats.zscore(X_test_drop), y_test)))

R2 score after adding one feature at a time to sqft_living, lat, long
----------------
sqft_lot: 0.8021303644257914
yr_renovated: 0.7971630178396623
zipcode: 0.7849425736206364
sqft_lot15: 0.7992307690090501
floors: 0.7855792138437818
bathrooms: 0.7674644490826882
bedrooms: 0.7790101739960544
sqft_above: 0.7843465828076341
yr_built: 0.796953104249974
condition: 0.7803071710924393
grade: 0.81609345393923
sqft_living15: 0.8010770074400841
date: 0.7685053526953152
waterfront: 0.82151286900326
view: 0.8048157669161944


Unexpected, sparsely variable waterfront is the best! Let's further add selectively according to the above scores.

In [207]:
try_feature = ['sqft_living','lat','long','waterfront', 'grade']
X_train_drop = X_train[try_feature]
X_test_drop = X_test[try_feature]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(str(try_feature)+" R2 score: "+str(neigh.score(stats.zscore(X_test_drop), y_test)))

['sqft_living', 'lat', 'long', 'waterfront', 'grade'] R2 score: 0.8482072082067893


In [208]:
try_feature = ['sqft_living','lat','long','waterfront', 'grade','view']
X_train_drop = X_train[try_feature]
X_test_drop = X_test[try_feature]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(str(try_feature)+" R2 score: "+str(neigh.score(stats.zscore(X_test_drop), y_test)))

['sqft_living', 'lat', 'long', 'waterfront', 'grade', 'view'] R2 score: 0.8461563078069392


In [209]:
try_feature = ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']
X_train_drop = X_train[try_feature]
X_test_drop = X_test[try_feature]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(str(try_feature)+" R2 score: "+str(neigh.score(stats.zscore(X_test_drop), y_test)))

['sqft_living', 'lat', 'long', 'waterfront', 'grade', 'sqft_lot'] R2 score: 0.8551000169819332


In [210]:
def metrics(y_predict, y_true):  #given predicted and true values, return dictionary of R2, RMSE, RMSE/mean, average abs % error
    metrics={}
    residual = y_predict - y_true
    average = np.average(y_true)
    abs_percent_error = np.absolute(np.divide(residual,y_true))*100
    metrics['R2'] = 1 - (np.sum(residual**2)/len(residual))/np.var(y_true)
    metrics['RMSE']= (np.sum(residual**2)/len(residual))**0.5
    metrics['RMSE/mean'] = metrics['RMSE'] / average
    metrics['average absolute % error'] = np.average(abs_percent_error)
    metrics['median absolute % error'] = np.median(abs_percent_error)
    return metrics


## The best features are ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']
The median absolute % error is below 10%.

In [211]:
metrics(neigh.predict(X_test_norm[['sqft_living','lat','long','waterfront', 'grade','sqft_lot']]), y_test)

{'R2': 0.8538285027338658,
 'RMSE': 135207.70547762708,
 'RMSE/mean': 0.2527154921702753,
 'average absolute % error': 14.554465885766795,
 'median absolute % error': 9.79801363924772}

# Trying combination with Linear Regression again

In [212]:
y_linear_knn =[]
knn_features = ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']
linear_features = ['sqft_living','grade','sqft_lot'] #exclude map-related info
X_test_neigh = neigh.kneighbors(X_test_norm[knn_features], return_distance=False)

for i in range(len(X_test)):
    X_in = X_train_norm[linear_features].iloc[X_test_neigh[i]]
    y_in = y_train_norm.iloc[X_test_neigh[i]]
    reg=LinearRegression().fit(X_in, y_in)
    y_linear_knn.append(reg.predict(X_test_norm[linear_features].iloc[i].to_numpy().reshape((1,-1))))























































In [213]:
y_linear_knn_unnorm = np.squeeze(y_linear_knn)* (np.var(y_train)**0.5) + np.average(y_train)
metrics(y_linear_knn_unnorm, y_test)

{'R2': 0.7903768665118113,
 'RMSE': 161915.9749625127,
 'RMSE/mean': 0.3026356756690336,
 'average absolute % error': 16.258761249036244,
 'median absolute % error': 10.787623591335468}

In [214]:
y_linear_knn =[]
knn_features = ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']
linear_features = ['sqft_living','grade'] #exclude map-related info and further exclude 'sqft_lot' which is less important
X_test_neigh = neigh.kneighbors(X_test_norm[knn_features], return_distance=False)

for i in range(len(X_test)):
    X_in = X_train_norm[linear_features].iloc[X_test_neigh[i]]
    y_in = y_train_norm.iloc[X_test_neigh[i]]
    reg=LinearRegression().fit(X_in, y_in)
    y_linear_knn.append(reg.predict(X_test_norm[linear_features].iloc[i].to_numpy().reshape((1,-1))))





















































In [215]:
y_linear_knn_unnorm = np.squeeze(y_linear_knn)* (np.var(y_train)**0.5) + np.average(y_train)
metrics(y_linear_knn_unnorm, y_test)

{'R2': 0.8177654787693934,
 'RMSE': 150968.18020041625,
 'RMSE/mean': 0.2821732520837138,
 'average absolute % error': 15.177657259421196,
 'median absolute % error': 10.44626825595106}

In [216]:
y_linear_knn =[]
knn_features = ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']
linear_features = ['sqft_living'] #test by only one feature
X_test_neigh = neigh.kneighbors(X_test_norm[knn_features], return_distance=False)

for i in range(len(X_test)):
    X_in = X_train_norm[linear_features].iloc[X_test_neigh[i]]
    y_in = y_train_norm.iloc[X_test_neigh[i]]
    reg=LinearRegression().fit(X_in, y_in)
    y_linear_knn.append(reg.predict(X_test_norm[linear_features].iloc[i].to_numpy().reshape((1,-1))))







































































































In [217]:
y_linear_knn_unnorm = np.squeeze(y_linear_knn)* (np.var(y_train)**0.5) + np.average(y_train)
metrics(y_linear_knn_unnorm, y_test)

{'R2': 0.8353706373758195,
 'RMSE': 143490.696396135,
 'RMSE/mean': 0.26819715513628883,
 'average absolute % error': 15.086716188353362,
 'median absolute % error': 10.460427900770604}

Hence, while combining linear regression with KNN improves the performance before the KNN is tuned by selectively dropping features, it doesn't improve the performance when we have chosen the optimal combination of features for KNN. The reason is that linear regression itself also suffers from high dimensionality, and KNN can produce average weighted (inversely) by distance resulting from selectively important features, so combining the linear regression doesn't help improve the performance.

# Nearest Neighbour by Fixed Radius

Next we try to explore if we can further imporve the performance by fixed radius. The rationale is to exclude samples that are not quite relevant.

In [218]:
neigh_r = RadiusNeighborsRegressor(radius=0.1)
neigh_r.fit(X_train_norm[['sqft_living','lat','long','waterfront', 'grade','sqft_lot']], y_train)
metrics(neigh_r.predict(X_test_norm[['sqft_living','lat','long','waterfront', 'grade','sqft_lot']]), y_test)



{'R2': 0.9655371586439017,
 'RMSE': 65651.64846282924,
 'RMSE/mean': 0.12270889883431343,
 'average absolute % error': nan,
 'median absolute % error': nan}

R2 improves a lot, but % error cannot be computed due to some points having no neighbours to calculate - this may overstate the performance as some outliers are excluded inappropriately. So let's try larger radius.

In [219]:
neigh_r = RadiusNeighborsRegressor(radius=0.5)
neigh_r.fit(X_train_norm[['sqft_living','lat','long','waterfront', 'grade','sqft_lot']], y_train)
metrics(neigh_r.predict(X_test_norm[['sqft_living','lat','long','waterfront', 'grade','sqft_lot']]), y_test)



{'R2': 0.8583537792694218,
 'RMSE': 133098.3252092217,
 'RMSE/mean': 0.2487728687020252,
 'average absolute % error': nan,
 'median absolute % error': nan}

Now R2 is similar to tuned KNN but still there are some points having no neighbours. 
So, it seems not quite practical to achieve better performance for all points with defined fixed radius, at least in our case.

# Reframing the target question

Recall that when we have only lat, long and sqft_living as features for KNN, the performance is the same as having the all set of features. This inspires us to have an even more compact structure of KNN - using lat, long only, but with price incorporated into target, i.e. price per sqft_living. This is also more intuitive - it becomes real nearest neighbour finding. The predicted price per sqft_living is then multiplied by input sqft_living to get predicted price. Then we try to explore tuning the new KNN by similar approach as above. 

In [220]:
y_= y/X['sqft_living']
y_train_=y_train/X_train['sqft_living']
print('max sqft price: '+str(np.max(y_))+'; min sqft price: '+str(np.min(y_)))
print('max sqft price normalized: '+str(np.max(stats.zscore(y_)))+'; min sqft price normalized: '+str(np.min(stats.zscore(y_))))


max sqft price: 810.1388888888889; min sqft price: 87.58823529411765
max sqft price normalized: 4.960951208658341; min sqft price normalized: -1.6042777444527474


For fair comparison, we keep using k=9

In [221]:
neigh_ = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh_.fit(stats.zscore(X_train[['lat','long']]), y_train_)
metrics(neigh_.predict(stats.zscore(X_test[['lat','long']]))*X_test['sqft_living'],y_test)

{'R2': 0.7758846953963403,
 'RMSE': 167419.4274917466,
 'RMSE/mean': 0.31292212872027064,
 'average absolute % error': 20.13169440818158,
 'median absolute % error': 14.912354526709768}

The result is much better than orginal KNN with lat long only, but similar to the KNN with lat long and sqft_living.
Let's try the same optimal features.

In [222]:
neigh_ = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh_.fit(stats.zscore(X_train[['lat','long','waterfront', 'grade','sqft_lot','sqft_living']]), y_train_)
metrics(neigh_.predict(stats.zscore(X_test[['lat','long','waterfront', 'grade','sqft_lot','sqft_living']]))*X_test['sqft_living'],y_test)

{'R2': 0.8488619687498906,
 'RMSE': 137485.52465535633,
 'RMSE/mean': 0.25697294327146264,
 'average absolute % error': 14.48540751742097,
 'median absolute % error': 10.156478044485052}

The performance vs original tuned KNN (R2: 85.38%, average % error: 14.55%, median % error: 9.80% ) is almost the same.

Hence, we will keep using the original tuned KNN for simplicity.

# PCA and KNN

As KNN weights each dimension equally and perform poorly with too many dimensions, we may explore using PCA to reduce dimensionality and combine with KNN.

First, excluding selected optimal features:

In [259]:
X_train_norm_drop6 = X_train_norm.drop(['lat','long','waterfront', 'grade','sqft_lot','sqft_living'],axis=1)
X_test_norm_drop6 = X_test_norm.drop(['lat','long','waterfront', 'grade','sqft_lot','sqft_living'],axis=1)
X_train_norm_drop6

Unnamed: 0,date,bedrooms,bathrooms,floors,view,condition,sqft_above,yr_built,yr_renovated,zipcode,sqft_living15,sqft_lot15
295,-0.988310,0.700497,0.498111,0.918016,-0.307305,-0.621478,1.905406,0.774262,0.704702,-0.051389,1.706462,-0.052935
3086,1.603723,-1.527280,-1.454672,-0.923555,-0.307305,-0.621478,-1.019159,-1.129306,-1.239679,2.276914,-0.699568,-0.262784
6962,-1.430636,-0.413391,1.474502,0.918016,-0.307305,-0.621478,2.305865,1.216161,1.156076,-0.840008,-0.156744,-0.195964
4249,-0.298281,0.700497,0.823574,0.918016,-0.307305,-0.621478,1.699109,1.148177,1.086634,-0.051389,1.721133,-0.244863
8903,-0.802533,0.700497,0.498111,0.918016,-0.307305,-0.621478,1.298650,1.114184,1.051913,-0.933891,0.444763,-0.077162
...,...,...,...,...,...,...,...,...,...,...,...,...
15611,1.462178,0.700497,-0.478281,-0.923555,-0.307305,0.914859,-0.278917,-0.415468,-0.510536,-0.426922,0.019307,-0.000644
20027,0.621758,1.814386,1.799966,0.918016,-0.307305,-0.621478,1.250109,1.114184,1.051913,-0.483251,1.735804,-0.253190
6189,-0.448672,1.814386,0.823574,0.918016,-0.307305,-0.621478,1.905406,1.454107,1.399124,-1.253094,2.014552,-0.109852
5106,-0.059424,-0.413391,-0.478281,-0.002770,-0.307305,-0.621478,-0.278917,-0.313491,0.392212,-0.483251,0.342067,-0.151349


In [254]:
pca = PCA(n_components=12)
pca.fit(X_train_norm_drop6)
print(pca.components_)

[[-0.01966195  0.25040818  0.40780114  0.3267158   0.06847438 -0.16883551
   0.41035035  0.38007186  0.38890082 -0.19720867  0.34824495  0.08011441]
 [-0.05638414  0.36648186  0.17000351 -0.16338849  0.39770305  0.42014431
   0.23841715 -0.37645719 -0.36771866  0.03704205  0.3418368   0.14220286]
 [ 0.00862925 -0.04185226 -0.13223777 -0.31884831 -0.30398956  0.21409766
  -0.02446198  0.08325407  0.03509205 -0.66644721  0.0465462   0.53450527]
 [ 0.6022394  -0.29086804 -0.08843417 -0.03458636  0.43775032 -0.30390852
   0.00640404 -0.00191453  0.01594748  0.15186583  0.09898406  0.47596309]
 [-0.77978605 -0.33370683 -0.06832302  0.03563028  0.33569878 -0.09930832
  -0.03904522  0.05289598  0.0764382   0.12133618  0.01804092  0.35618373]
 [-0.05307125  0.25703965  0.05330807  0.20785099 -0.54555981 -0.19208234
   0.16163185 -0.18861076 -0.16857829  0.42844712 -0.10350618  0.51588872]
 [ 0.14980149 -0.25386982  0.12978671  0.43210275  0.00986202  0.74446349
  -0.03466046  0.16002887  0.132

In [255]:
print(pca.explained_variance_ratio_)

[0.33986571 0.13624469 0.09528488 0.08677608 0.08188697 0.06680403
 0.05783427 0.04838149 0.03832667 0.02436951 0.01662061 0.00760507]


In [260]:
pca0 = pd.Series(np.dot(X_train_norm_drop6, pca.components_[0]))
pca0_test = pd.Series(np.dot(X_test_norm_drop6, pca.components_[0]))

In [262]:
X_pca = X_train_norm[['lat','long','waterfront', 'grade','sqft_lot','sqft_living']].assign(pca0=pca0.values)
X_pca_test = X_test_norm[['lat','long','waterfront', 'grade','sqft_lot','sqft_living']].assign(pca0=pca0_test.values)

In [258]:
X_pca

Unnamed: 0,lat,long,waterfront,grade,sqft_lot,sqft_living,pca0
295,0.251687,1.388670,-0.086819,1.992210,0.167790,1.400390,2.732166
3086,0.645342,-1.266843,-0.086819,-0.563543,-0.238216,-1.050993,-3.268290
6962,0.875757,0.092779,-0.086819,2.844128,-0.191261,1.761531,2.863144
4249,0.217017,1.594029,-0.086819,1.140292,-0.234412,1.214347,3.047051
8903,1.334419,-0.077174,-0.086819,0.288375,-0.230366,0.853206,2.476473
...,...,...,...,...,...,...,...
15611,-0.758814,0.078617,-0.086819,-0.563543,0.102630,-0.438147,-0.905670
20027,0.747909,0.914218,-0.086819,1.140292,-0.240624,2.013236,3.585000
6189,1.553999,0.283976,-0.086819,1.140292,-0.143102,1.400390,4.001376
5106,0.914039,0.666370,-0.086819,-0.563543,-0.190177,-0.569471,-0.093183


In [265]:
neigh_pca = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh_pca.fit(X_pca, y_train)
metrics(neigh_pca.predict(X_pca_test), y_test)

{'R2': 0.8295529785609571,
 'RMSE': 146004.01647730486,
 'RMSE/mean': 0.272894778833478,
 'average absolute % error': 15.425847525433278,
 'median absolute % error': 10.515899951948736}

Even by adding the first principal component cannot improve the performance of KNN, so we don't need to explore other components.

# Conclusion



In [279]:
optimal_feature = ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']
for k in range(1,21):
    neigh = KNeighborsRegressor(n_neighbors=k, weights='distance', algorithm='kd_tree')
    neigh.fit(X_train_norm[optimal_feature], y_train)
    print("R2 for k="+str(k)+": "+str(neigh.score(X_test_norm[optimal_feature], y_test)))

R2 for k=1: 0.7923473407221475
R2 for k=2: 0.8351576149005633
R2 for k=3: 0.8442640091002146
R2 for k=4: 0.8472058057180906
R2 for k=5: 0.8512656390163132
R2 for k=6: 0.8547685242563368
R2 for k=7: 0.8533181584408867
R2 for k=8: 0.8526769674876146
R2 for k=9: 0.8538285027338658
R2 for k=10: 0.8530367348432438
R2 for k=11: 0.8530565016511261
R2 for k=12: 0.8527873566244337
R2 for k=13: 0.8519175976311263
R2 for k=14: 0.8523686141959868
R2 for k=15: 0.8514197326034112
R2 for k=16: 0.8507779701000642
R2 for k=17: 0.8506875449502542
R2 for k=18: 0.850641048264928
R2 for k=19: 0.8498594899969905
R2 for k=20: 0.8493249959978584


In [280]:
neigh = KNeighborsRegressor(n_neighbors=6, weights='distance', algorithm='kd_tree')
neigh.fit(X_train_norm[optimal_feature], y_train)
metrics(neigh.predict(X_test_norm[optimal_feature]),y_test)

{'R2': 0.8547685242563368,
 'RMSE': 134772.24728061008,
 'RMSE/mean': 0.2519015812160883,
 'average absolute % error': 14.63256595739251,
 'median absolute % error': 10.16644789507823}

In [281]:
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(X_train_norm[optimal_feature], y_train)
metrics(neigh.predict(X_test_norm[optimal_feature]),y_test)

{'R2': 0.8538285027338658,
 'RMSE': 135207.70547762708,
 'RMSE/mean': 0.2527154921702753,
 'average absolute % error': 14.554465885766795,
 'median absolute % error': 9.79801363924772}

By checking other k values again, we find that k=6 is marginally surpassing k=9 by R2 score. After further checking %error, we confirm that 

## optimal KNN occurs at k=9, for optimal features: ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']