# Nearest Neighbour

The purpose of this sectionn is to use nearest neighbour algorithm to predict house price. First, import and load source data.

In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor

path = './archive/'

kc_data = pd.read_csv(path + '/kc_house_data.csv')

Clean data. Remove obviously wrong record with 33 bedrooms, detected in "Outlier checking.ipynb". Then split data into train, test, final test data by 60/20/20 ratio.

In [57]:
data = kc_data.drop(['id','date'],axis = 1)
#remove obviously wrong record with 33 bedrooms, detected in "Outlier checking.ipynb"
data = data.drop(labels=15870, axis=0) 

y = data['price']
X = data.drop('price', axis=1)

# split the train, test and final test data
X_train, X_final_test, y_train, y_final_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.25, random_state=42) 


#-------------above: data clean and split--------------------------------------------

Then transform input data by standard normal distribution.

In [58]:
def norm(X):
        X_new = stats.zscore(X)
        X_mean = np.average(X, axis=0)
        X_sigma = np.var(X, axis=0)**0.5
        return X_new, X_mean, X_sigma

In [59]:
X_train_norm, X_train_mean, X_train_sigma = norm(X_train)
X_test_norm, X_test_mean, X_test_sigma = norm(X_test)


Then fit to nearest neighbour model. Select KD tree algorithm for the model as it is good for large number of samples with dimension < 20.

In [90]:
for k in range(1,21):
    neigh = KNeighborsRegressor(n_neighbors=k, weights='distance', algorithm='kd_tree')
    neigh.fit(X_train_norm, y_train)
    print("R2 for k="+str(k)+": "+str(neigh.score(X_test_norm, y_test)))

R2 for k=1: 0.7005855495690386
R2 for k=2: 0.7592114295539327
R2 for k=3: 0.7774861182320841
R2 for k=4: 0.7806656321890751
R2 for k=5: 0.7836042908088774
R2 for k=6: 0.7824438412445217
R2 for k=7: 0.7843952394509026
R2 for k=8: 0.7837599077635617
R2 for k=9: 0.7830045977629589
R2 for k=10: 0.7835484054437615
R2 for k=11: 0.7843376108619008
R2 for k=12: 0.7825429123491603
R2 for k=13: 0.7816034498764851
R2 for k=14: 0.7819417775555662
R2 for k=15: 0.7811160239055794
R2 for k=16: 0.7784664752036163
R2 for k=17: 0.778367886448687
R2 for k=18: 0.77619155250444
R2 for k=19: 0.775395468224563
R2 for k=20: 0.7746720561652046


From above, the best score is 78.44% for k=7

In [61]:
X_test_neigh = neigh.kneighbors(X_test_norm, return_distance=False)

Above line is to get the neighbour points. What if we combine and use linear regression for neighbour points instead of just an average over these points? Let's have first trial.

In [73]:
y_np = (y_train).to_numpy()

X_in = X_train_norm[X_test_neigh[0]]
y_in = y_np[X_test_neigh[0]]
print(X_in)
print(y_in)
reg=LinearRegression().fit(X_in, y_in)
print("intercept: "+str(reg.intercept_)+"; coef: "+str(reg.coef_))

reg.predict(X_test_norm[0].reshape(1, -1))

[[ 0.70049729 -0.15281684  0.92981188 -0.05545212 -0.0027696  -0.08681874
  -0.30730473  0.91485881  0.28837471  1.38359559 -0.65538417 -0.21151453
  -0.21205709  0.26781398 -1.91088736  0.18483703 -0.0393768  -0.06849211]
 [ 1.81438591  0.82357436  0.90792454  0.03051201 -0.0027696  -0.08681874
  -0.30730473  0.91485881  0.28837471  1.35932534 -0.65538417 -0.14352996
  -0.21205709 -0.37059166 -0.86860437  0.48225448 -0.01003497  0.17411628]
 [ 0.70049729  0.17264689  0.06526164 -0.13883973  0.9180156  -0.08681874
  -0.30730473  0.91485881  0.28837471  0.42492069 -0.65538417  0.50232341
  -0.21205709  0.26781398 -2.13335664 -0.03468538  0.06331961 -0.17382802]
 [ 1.81438591  0.49811062  0.99547393 -0.18904567  0.9180156  -0.08681874
  -0.30730473  0.91485881  0.28837471  1.45640634 -0.65538417  0.63829254
  -0.21205709 -0.67101785 -1.44355743  0.86464835  0.23937061 -0.16204026]
 [ 1.81438591  0.49811062  0.60150166 -0.16067992 -0.0027696  -0.08681874
  -0.30730473  0.91485881  0.28837

array([-1.24710544e+16])

In [75]:
reg_train=LinearRegression().fit(X_train_norm, stats.zscore(y_train))
print("intercept: "+str(reg_train.intercept_)+"; coef: "+str(reg_train.coef_))

intercept: -1.2085540425412828e-14; coef: [-0.09930007  0.10377816  0.23060216  0.00517823  0.01613386  0.14345411
  0.10632847  0.04102515  0.29434998  0.21008498  0.08481163 -0.22573833
  0.02536397 -0.08093204  0.22477773 -0.06779121  0.03083645 -0.0212118 ]


The frist trial of combining with linear regreesion has poor performance. Let's examine further.
Small coefficients may imply insignificance of corresponding variables. As nearest neighbour method does not give different weight to each dimension, including feature that is not important may hurt the influence of other important features and overall performance. Try again by dropping sqft_lot, floors, yr_renovated, sqft_living15, sqft_lot15, or try each one generally

In [94]:
#X_train_drop = X_train.drop(['sqft_lot','floors','yr_renovated','sqft_living15','sqft_lot15'], axis=1)
#X_test_drop = X_test.drop(['sqft_lot','floors','yr_renovated','sqft_living15','sqft_lot15'], axis=1)

max_score={}
max_score_k={}
for feature in X_train.columns:  
    max_score[feature]=0
    max_score_k[feature]=0
    for k in range(1,21):
        X_train_drop = X_train.drop([feature], axis=1)
        X_test_drop = X_test.drop([feature], axis=1)
        neigh = KNeighborsRegressor(n_neighbors=k, weights='distance', algorithm='kd_tree')
        neigh.fit(stats.zscore(X_train_drop), y_train)
        temp = neigh.score(stats.zscore(X_test_drop), y_test)
        if temp > max_score[feature]:
            max_score[feature]=temp
            max_score_k[feature]=k
    print("R2 for k="+str(max_score_k[feature])+", drop "+feature+": "+str(max_score[feature]))

R2 for k=6, drop bedrooms: 0.7975429546890449
R2 for k=10, drop bathrooms: 0.7954220854144363
R2 for k=9, drop sqft_living: 0.7762594473811026
R2 for k=14, drop sqft_lot: 0.7833599265897042
R2 for k=8, drop floors: 0.7899579497307077
R2 for k=8, drop waterfront: 0.7838341129683302
R2 for k=6, drop view: 0.7974347523050592
R2 for k=6, drop condition: 0.7937152075021774
R2 for k=11, drop grade: 0.762063498760535
R2 for k=6, drop sqft_above: 0.7795195212009083
R2 for k=8, drop sqft_basement: 0.7964918854811512
R2 for k=9, drop yr_built: 0.7824780945597906
R2 for k=5, drop yr_renovated: 0.7824276983253649
R2 for k=5, drop zipcode: 0.779982255648137
R2 for k=10, drop lat: 0.7253785710594872
R2 for k=11, drop long: 0.7678089112264762
R2 for k=9, drop sqft_living15: 0.7807885550096539
R2 for k=7, drop sqft_lot15: 0.7846214974084792


From above, generally droping one feature may not improve the score much, and it may hurt performance for dropping some important feature, notably such as lat, long, sqft_living, grade. Let's see what if only lat, long, sqft_living, grade are included?

In [100]:
X_train_drop = X_train[['lat','long']]
X_test_drop = X_test[['lat', 'long']]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(neigh.score(stats.zscore(X_test_drop), y_test))

0.5385006155130563


In [98]:
X_train_drop = X_train[['sqft_living','lat','long']]
X_test_drop = X_test[['sqft_living','lat', 'long']]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(neigh.score(stats.zscore(X_test_drop), y_test))

0.7863816045518219


In [99]:
X_train_drop = X_train[['sqft_living','grade','lat','long']]
X_test_drop = X_test[['sqft_living','grade','lat', 'long']]
neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
neigh.fit(stats.zscore(X_train_drop), y_train)
print(neigh.score(stats.zscore(X_test_drop), y_test))

0.8161225057554446


We should note that only lat, long already gives the same performance vs all the features! 

And it may help if only add some more features selectively.

In [102]:
for feature in set(X_train.columns) - {'sqft_living','lat','long'}:
    X_train_drop = X_train[['sqft_living','lat','long',feature]]
    X_test_drop = X_test[['sqft_living','lat', 'long',feature]]
    neigh = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='kd_tree')
    neigh.fit(stats.zscore(X_train_drop), y_train)
    print(feature+" added to sqft_living, lat, long. R2 score: "+str(neigh.score(stats.zscore(X_test_drop), y_test)))

waterfront added to sqft_living, lat, long. R2 score: 0.821509437622671
yr_built added to sqft_living, lat, long. R2 score: 0.796920115328263
sqft_basement added to sqft_living, lat, long. R2 score: 0.7719446241452717
view added to sqft_living, lat, long. R2 score: 0.8047982179218391
sqft_lot added to sqft_living, lat, long. R2 score: 0.8021446280084724
sqft_above added to sqft_living, lat, long. R2 score: 0.7843511287039105
yr_renovated added to sqft_living, lat, long. R2 score: 0.7878778863105822
zipcode added to sqft_living, lat, long. R2 score: 0.7849201260325311
sqft_lot15 added to sqft_living, lat, long. R2 score: 0.7992146886212602
bedrooms added to sqft_living, lat, long. R2 score: 0.7790371290613699
sqft_living15 added to sqft_living, lat, long. R2 score: 0.8010795758186134
floors added to sqft_living, lat, long. R2 score: 0.7855795667937342
bathrooms added to sqft_living, lat, long. R2 score: 0.7674873434583119
grade added to sqft_living, lat, long. R2 score: 0.81612250575544