# Nearest Neighbour

The purpose of this sectionn is to use nearest neighbour algorithm to predict house price. First, import and load source data.

In [41]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor

path = './archive/'

kc_data = pd.read_csv(path + '/kc_house_data.csv')

Clean data. Remove obviously wrong record with 33 bedrooms, detected in "Outlier checking.ipynb". Then split data into train, test, final test data by 60/20/20 ratio.

In [42]:
data = kc_data.drop(['id','date'],axis = 1)
#remove obviously wrong record with 33 bedrooms, detected in "Outlier checking.ipynb"
data = data.drop(labels=15870, axis=0) 

y = data['price']
X = data.drop('price', axis=1)

# split the train, test and final test data
X_train, X_final_test, y_train, y_final_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.25, random_state=42) 


#-------------above: data clean and split--------------------------------------------

Then transform input data by standard normal distribution.

In [43]:
#standard normalization, using training data mean and sigma for test data as well for consistency with model
def X_norm(X):   
        mean = np.average(X_train, axis=0)
        sigma = np.var(X_train, axis=0)**0.5
        X_new = (X - mean )/sigma
        return X_new
    
def y_norm(y):
        mean = np.average(y_train, axis=0)
        sigma = np.var(y_train, axis=0)**0.5
        y_new = (y - mean )/sigma
        return y_new

In [44]:
X_train_norm = X_norm(X_train)
X_test_norm = X_norm(X_test)
y_train_norm = y_norm(y_train)
y_test_norm = y_norm(y_test)
print(X_train_norm)
print(X_train_norm.iloc[0])
print(X_train_norm.loc[3086])

       bedrooms  bathrooms  sqft_living  sqft_lot    floors  waterfront  \
295    0.700497   0.498111     1.400390  0.167790  0.918016   -0.086819   
3086  -1.527280  -1.454672    -1.050993 -0.238216 -0.923555   -0.086819   
6962  -0.413391   1.474502     1.761531 -0.191261  0.918016   -0.086819   
4249   0.700497   0.823574     1.214347 -0.234412  0.918016   -0.086819   
8903   0.700497   0.498111     0.853206 -0.230366  0.918016   -0.086819   
...         ...        ...          ...       ...       ...         ...   
15611  0.700497  -0.478281    -0.438147  0.102630 -0.923555   -0.086819   
20028  1.814386   1.799966     2.013236 -0.240624  0.918016   -0.086819   
6189   1.814386   0.823574     1.400390 -0.143102  0.918016   -0.086819   
5106  -0.413391  -0.478281    -0.569471 -0.190177 -0.002770   -0.086819   
20858 -1.527280  -0.803744    -0.700796 -0.347176  0.918016   -0.086819   

           view  condition     grade  sqft_above  sqft_basement  yr_built  \
295   -0.307305  -0.62

Then fit to nearest neighbour model. Select KD tree algorithm for the model as it is good for large number of samples with dimension < 20.

In [45]:
for k in range(1,21):
    neigh = KNeighborsRegressor(n_neighbors=k, weights='distance', algorithm='kd_tree')
    neigh.fit(X_train_norm, y_train)
    print("R2 for k="+str(k)+": "+str(neigh.score(X_test_norm, y_test)))

R2 for k=1: 0.7036034876524693
R2 for k=2: 0.7541954230078757
R2 for k=3: 0.7715965868733653
R2 for k=4: 0.7763551069565278
R2 for k=5: 0.7803302014802322
R2 for k=6: 0.7797418730917955
R2 for k=7: 0.7780279535437731
R2 for k=8: 0.7807700857597751
R2 for k=9: 0.7811268254840067
R2 for k=10: 0.780669022131758
R2 for k=11: 0.7791182513845333
R2 for k=12: 0.779538408741077
R2 for k=13: 0.7783377099310291
R2 for k=14: 0.7773049532013959
R2 for k=15: 0.777643231802753
R2 for k=16: 0.777003751960934
R2 for k=17: 0.7749450015587371
R2 for k=18: 0.7742275624080939
R2 for k=19: 0.773788276174459
R2 for k=20: 0.7724723534417792


From above, the best score is 78.44% for k=7

In [46]:
X_test_neigh = neigh.kneighbors(X_test_norm, return_distance=False)
print(np.max(X_test_neigh)) #double check it is the max index 12966-1

12965


Above line is to get the neighbour points. What if we combine and use linear regression for neighbour points instead of just an average over these points? Let's have first trial.

In [47]:
X_in = X_train_norm.iloc[X_test_neigh[0]]
y_in = y_train_norm.iloc[X_test_neigh[0]]
reg=LinearRegression().fit(X_in, y_in)
print("intercept: "+str(reg.intercept_)+"; coef: "+str(reg.coef_))

print('y_norm predicted by KNN alone:'+str(y_norm(neigh.predict(X_test_norm.iloc[0].to_numpy().reshape((1,-1))))))
print('y_norm predicted by linear + KNN: '+str(reg.predict(X_test_norm.iloc[0].to_numpy().reshape((1,-1)))))
print('y_norm actual:'+str(y_test_norm.iloc[0]))

intercept: -0.06320524965063956; coef: [-9.86664162e-02  2.70220885e-02  9.96544271e-02  5.64127835e-01
 -7.37888845e-02 -4.99600361e-16 -5.55111512e-17  2.84451961e-02
  1.24673912e-01  1.10503928e-01 -5.55111512e-17  4.72215751e-01
  0.00000000e+00 -1.00762706e-03  3.30713309e-01 -2.40771475e-01
 -1.69463814e-01  8.29540581e-03]
y_norm predicted by KNN alone:[-0.4908857]
y_norm predicted by linear + KNN: [-0.86998597]
y_norm actual:-0.7531056036958254




In [48]:
reg_train=LinearRegression().fit(X_train_norm, stats.zscore(y_train))
print("intercept: "+str(reg_train.intercept_)+"; coef: "+str(reg_train.coef_))

intercept: 0.0001215915301839336; coef: [-1.01847780e-01  1.04787785e-01 -7.09464302e+11  5.35214915e-03
  1.65785753e-02  1.43477223e-01  1.06118200e-01  4.09608880e-02
  2.94091413e-01  6.39807651e+11  3.44174786e+11 -2.25749758e-01
  2.54216954e-02 -8.11319716e-02  2.24921366e-01 -6.79209795e-02
  3.06303446e-02 -2.12286962e-02]


The frist trial (using the first test record) of combining with linear regreesion has better performance than KNN alone!

Let's examine further. Small coefficients may imply insignificance of corresponding variables. As nearest neighbour method does not give different weight to each dimension, including feature that is not important may hurt the influence of other important features and overall performance. Try again by dropping sqft_lot, floors, yr_renovated, sqft_living15, sqft_lot15, or try each one generally

In [49]:
#X_train_drop = X_train.drop(['sqft_lot','floors','yr_renovated','sqft_living15','sqft_lot15'], axis=1)
#X_test_drop = X_test.drop(['sqft_lot','floors','yr_renovated','sqft_living15','sqft_lot15'], axis=1)

max_score={}
max_score_k={}
for feature in X_train.columns:  
    max_score[feature]=0
    max_score_k[feature]=0
    for k in range(1,21):
        X_train_drop = X_train.drop([feature], axis=1)
        X_test_drop = X_test.drop([feature], axis=1)
        neigh = KNeighborsRegressor(n_neighbors=k, weights='distance', algorithm='kd_tree')
        neigh.fit(stats.zscore(X_train_drop), y_train)
        temp = neigh.score(stats.zscore(X_test_drop), y_test)
        if temp > max_score[feature]:
            max_score[feature]=temp
            max_score_k[feature]=k
    print("R2 for k="+str(max_score_k[feature])+", drop "+feature+": "+str(max_score[feature]))

R2 for k=6, drop bedrooms: 0.7975488030016479
R2 for k=10, drop bathrooms: 0.7953977706514064
R2 for k=9, drop sqft_living: 0.7762346881663562
R2 for k=14, drop sqft_lot: 0.7833575901557553
R2 for k=8, drop floors: 0.7899235568700329
R2 for k=8, drop waterfront: 0.7838030258638968
R2 for k=6, drop view: 0.7974293862026502
R2 for k=6, drop condition: 0.7937168459899799
R2 for k=11, drop grade: 0.762062289397084
R2 for k=6, drop sqft_above: 0.7795204093309424
R2 for k=8, drop sqft_basement: 0.796507183569725
R2 for k=9, drop yr_built: 0.7824579734833068
R2 for k=5, drop yr_renovated: 0.7824411003719742
R2 for k=5, drop zipcode: 0.780012775185584
R2 for k=10, drop lat: 0.7253222683521564
R2 for k=11, drop long: 0.7678442100061611
R2 for k=9, drop sqft_living15: 0.7807638965257118
R2 for k=7, drop sqft_lot15: 0.7846138683863939


From above, generally droping one feature may not improve the score much, and it may hurt performance for dropping some important feature, notably such as lat, long, sqft_living, grade. Let's see what if only lat, long, sqft_living, grade are included?

In [50]:
X_train_drop = X_train[['lat','long']]
X_test_drop = X_test[['lat', 'long']]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(neigh.score(stats.zscore(X_test_drop), y_test))

0.5383854974562687


In [51]:
X_train_drop = X_train[['sqft_living','lat','long']]
X_test_drop = X_test[['sqft_living','lat', 'long']]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(neigh.score(stats.zscore(X_test_drop), y_test))

0.7863773086534392


In [52]:
X_train_drop = X_train[['sqft_living','grade','lat','long']]
X_test_drop = X_test[['sqft_living','grade','lat', 'long']]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(neigh.score(stats.zscore(X_test_drop), y_test))

0.81609345393923


We should note that only lat, long already gives the same performance vs all the features! 

And it may help if only add some more features selectively.

In [53]:
print("R2 score after adding one feature at a time to sqft_living, lat, long")
print("----------------")
for feature in set(X_train.columns) - {'sqft_living','lat','long'}:
    X_train_drop = X_train[['sqft_living','lat','long',feature]]
    X_test_drop = X_test[['sqft_living','lat', 'long',feature]]
    neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
    neigh.fit(stats.zscore(X_train_drop), y_train)
    print(feature+": "+str(neigh.score(stats.zscore(X_test_drop), y_test)))

R2 score after adding one feature at a time to sqft_living, lat, long
----------------
sqft_lot: 0.8021303644257914
yr_renovated: 0.7878858176938117
zipcode: 0.7849425736206364
sqft_lot15: 0.7992307690090501
floors: 0.7855792138437818
bathrooms: 0.7674644490826882
bedrooms: 0.7790101739960544
sqft_basement: 0.7719492347429023
sqft_above: 0.7843465828076341
yr_built: 0.796953104249974
condition: 0.7803071710924393
grade: 0.81609345393923
sqft_living15: 0.8010770074400841
waterfront: 0.82151286900326
view: 0.8048157669161944


Unexpected, sparsely variable waterfront is the best! Let's further add selectively according to the above scores.

In [54]:
try_feature = ['sqft_living','lat','long','waterfront', 'grade']
X_train_drop = X_train[try_feature]
X_test_drop = X_test[try_feature]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(str(try_feature)+" R2 score: "+str(neigh.score(stats.zscore(X_test_drop), y_test)))

['sqft_living', 'lat', 'long', 'waterfront', 'grade'] R2 score: 0.8482072082067893


In [55]:
try_feature = ['sqft_living','lat','long','waterfront', 'grade','view']
X_train_drop = X_train[try_feature]
X_test_drop = X_test[try_feature]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(str(try_feature)+" R2 score: "+str(neigh.score(stats.zscore(X_test_drop), y_test)))

['sqft_living', 'lat', 'long', 'waterfront', 'grade', 'view'] R2 score: 0.8461563078069392


In [56]:
try_feature = ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']
X_train_drop = X_train[try_feature]
X_test_drop = X_test[try_feature]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(str(try_feature)+" R2 score: "+str(neigh.score(stats.zscore(X_test_drop), y_test)))

['sqft_living', 'lat', 'long', 'waterfront', 'grade', 'sqft_lot'] R2 score: 0.8551000169819332


In [57]:
def metrics(y_predict, y_true):  #given predicted and true values, return dictionary of R2, RMSE, RMSE/mean, average abs % error
    metrics={}
    residual = y_predict - y_true
    average = np.average(y_true)
    abs_percent_error = np.absolute(np.divide(residual,y_true))*100
    metrics['R2'] = 1 - (np.sum(residual**2)/len(residual))/np.var(y_true)
    metrics['RMSE']= (np.sum(residual**2)/len(residual))**0.5
    metrics['RMSE/mean'] = metrics['RMSE'] / average
    metrics['average absolute % error'] = np.average(abs_percent_error)
    metrics['median absolute % error'] = np.median(abs_percent_error)
    return metrics


### The best features are ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']

In [58]:
metrics(neigh.predict(X_test_norm[['sqft_living','lat','long','waterfront', 'grade','sqft_lot']]), y_test)

{'R2': 0.8538285027338658,
 'RMSE': 135207.70547762708,
 'RMSE/mean': 0.2527154921702753,
 'average absolute % error': 14.554465885766795,
 'median absolute % error': 9.79801363924772}

In [68]:
y_linear_knn =[]
knn_features = ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']
linear_features = ['sqft_living','grade','sqft_lot'] #exclude map-related info
X_test_neigh = neigh.kneighbors(X_test_norm[knn_features], return_distance=False)

for i in range(len(X_test)):
    X_in = X_train_norm[linear_features].iloc[X_test_neigh[i]]
    y_in = y_train_norm.iloc[X_test_neigh[i]]
    reg=LinearRegression().fit(X_in, y_in)
    y_linear_knn.append(reg.predict(X_test_norm[linear_features].iloc[i].to_numpy().reshape((1,-1))))

























































In [69]:
y_linear_knn_unnorm = np.squeeze(y_linear_knn)* (np.var(y_train)**0.5) + np.average(y_train)
metrics(y_linear_knn_unnorm, y_test)

{'R2': 0.7903768665118113,
 'RMSE': 161915.9749625127,
 'RMSE/mean': 0.3026356756690336,
 'average absolute % error': 16.258761249036244,
 'median absolute % error': 10.787623591335468}

In [70]:
y_linear_knn =[]
knn_features = ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']
linear_features = ['sqft_living','grade'] #exclude map-related info and further exclude 'sqft_lot' which is less important
X_test_neigh = neigh.kneighbors(X_test_norm[knn_features], return_distance=False)

for i in range(len(X_test)):
    X_in = X_train_norm[linear_features].iloc[X_test_neigh[i]]
    y_in = y_train_norm.iloc[X_test_neigh[i]]
    reg=LinearRegression().fit(X_in, y_in)
    y_linear_knn.append(reg.predict(X_test_norm[linear_features].iloc[i].to_numpy().reshape((1,-1))))























































In [71]:
y_linear_knn_unnorm = np.squeeze(y_linear_knn)* (np.var(y_train)**0.5) + np.average(y_train)
metrics(y_linear_knn_unnorm, y_test)

{'R2': 0.8177654787693934,
 'RMSE': 150968.18020041625,
 'RMSE/mean': 0.2821732520837138,
 'average absolute % error': 15.177657259421196,
 'median absolute % error': 10.44626825595106}

In [72]:
y_linear_knn =[]
knn_features = ['sqft_living','lat','long','waterfront', 'grade','sqft_lot']
linear_features = ['sqft_living'] #test by only one feature
X_test_neigh = neigh.kneighbors(X_test_norm[knn_features], return_distance=False)

for i in range(len(X_test)):
    X_in = X_train_norm[linear_features].iloc[X_test_neigh[i]]
    y_in = y_train_norm.iloc[X_test_neigh[i]]
    reg=LinearRegression().fit(X_in, y_in)
    y_linear_knn.append(reg.predict(X_test_norm[linear_features].iloc[i].to_numpy().reshape((1,-1))))



















































In [73]:
y_linear_knn_unnorm = np.squeeze(y_linear_knn)* (np.var(y_train)**0.5) + np.average(y_train)
metrics(y_linear_knn_unnorm, y_test)

{'R2': 0.8353706373758195,
 'RMSE': 143490.696396135,
 'RMSE/mean': 0.26819715513628883,
 'average absolute % error': 15.086716188353362,
 'median absolute % error': 10.460427900770604}