In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column name -> dtype used when parsing the house-sales CSV files.
# Key order is significant: the feature list is later built by iterating
# over this dict, so the feature-matrix columns follow this order.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
# Directory that holds the house-sales CSV files. Every file below is read
# from this single location, so pointing the notebook at another copy of the
# data only requires changing this one value.
DATA_DIR = r"D:\regression\Project 7\data"

# All four files are parsed with the same column dtypes (dtype_dict).
sales = pd.read_csv(DATA_DIR + "\\kc_house_data_small.csv", dtype=dtype_dict)
train_data = pd.read_csv(DATA_DIR + "\\kc_house_data_small_train.csv", dtype=dtype_dict)
test_data = pd.read_csv(DATA_DIR + "\\kc_house_data_small_test.csv", dtype=dtype_dict)
validation_data = pd.read_csv(DATA_DIR + "\\kc_house_data_validation.csv", dtype=dtype_dict)

In [4]:
def get_numpy_data(df,features,output):
    '''
    Build the feature matrix and target vector used by the regression code.

    i/p: df: data table in data frame format (it is not modified)
         features: list of input column names. A 2D numpy array is built from
                   these columns, preceded by a constant column of 1s that
                   serves as the intercept term
         output: single target column name, converted into a numpy vector
    o/p: (feature_matrix, y_vector) in numpy array format. Column 0 of
         feature_matrix is the constant, followed by `features` in the
         order given.
    '''
    # Add the intercept column on a copy so the caller's frame does not
    # silently gain a 'constant' column.
    df_with_constant = df.assign(constant=1)
    columns = ['constant'] + list(features)
    # DataFrame.as_matrix() was removed in pandas 1.0; selecting the columns
    # and calling to_numpy() is the supported equivalent.
    feature_matrix = df_with_constant[columns].to_numpy()
    y_vector = df[output].values
    return feature_matrix, y_vector

In [5]:
def normalize_features(feature_matrix):
    '''
    Scale every column of feature_matrix to unit Euclidean (L2) norm.

    i/p: feature_matrix: 2D numpy array, one column per feature
    o/p: (normalized_feature_matrix, norms) where norms holds the divisor
         used for each column, so other matrices can be scaled the same way
         with `other_matrix / norms`.

    A column that is entirely zero has norm 0. Dividing by it would produce
    NaN, and dividing other data by it would produce inf. Such a column is
    divided by 1 instead (and 1 is reported in norms), which leaves it as
    zeros.
    '''
    norms = np.linalg.norm(feature_matrix, axis=0)
    safe_norms = np.where(norms == 0, 1.0, norms)
    normalized_feature_matrix = feature_matrix / safe_norms
    return (normalized_feature_matrix, safe_norms)

In [6]:
# Keep every column whose dtype in train_data is not 'object', in dtype_dict
# order, then drop the target column so only model inputs remain.
features = [column for column in dtype_dict.keys()
            if train_data[column].dtypes != 'object']
features.remove('price')
features

['bathrooms',
 'waterfront',
 'sqft_above',
 'sqft_living15',
 'grade',
 'yr_renovated',
 'bedrooms',
 'long',
 'sqft_lot15',
 'sqft_living',
 'floors',
 'condition',
 'lat',
 'sqft_basement',
 'yr_built',
 'sqft_lot',
 'view']

In [7]:
# Convert each data split into a (feature matrix, price vector) pair.
train_feature_matrix, training_output = get_numpy_data(
    train_data, features, 'price')
test_feature_matrix, test_output = get_numpy_data(
    test_data, features, 'price')
validation_feature_matrix, validation_output = get_numpy_data(
    validation_data, features, 'price')

In computing distances, it is crucial to normalize features. Otherwise, for example, the ‘sqft_living’ feature (typically on the order of thousands) would exert a much larger influence on distance than the ‘bedrooms’ feature (typically on the order of ones).

In [8]:
# The norms come from the training matrix only; the validation and test
# matrices are divided by those same norms so all three share one scale.
train_norm_feature_matrix, norms = normalize_features(train_feature_matrix)
validation_norm_feature_matrix = validation_feature_matrix / norms
test_norm_feature_matrix = test_feature_matrix / norms

## Compute a single distance

In [9]:
# Query house: the first row of the normalized test feature matrix.
print(test_norm_feature_matrix[0])

[ 0.01345102  0.01807473  0.          0.01362084  0.01375926  0.01564352
  0.          0.01551285 -0.01346922  0.0016225   0.01759212  0.017059
  0.0116321   0.01345387  0.02481682  0.01350306  0.00160518  0.05102365]


In [10]:
# 10th house in the training data set (0-based row index 9).
print(train_norm_feature_matrix[9])

[ 0.01345102  0.00602491  0.          0.0096309   0.01195898  0.01390535
  0.          0.01163464 -0.01346251  0.00156612  0.0083488   0.01279425
  0.01938684  0.01346821  0.          0.01302544  0.00050756  0.        ]


In [11]:
# Euclidean distance = square root of the summed squared feature differences
# between the query house (test row 0) and training row 9.
print("Euclidean distance b/w above 2 houses is",
      np.sqrt(np.sum((test_norm_feature_matrix[0] - train_norm_feature_matrix[9]) ** 2)))

Euclidean distance b/w above 2 houses is 0.05972359371398078


Compute the Euclidean distance from the query house to each of the first 10 houses in the training set.

In [12]:
# One (distance, training-row index) tuple for each of the first 10 training
# houses, measured from the query house (test row 0).
distance = [
    (np.sqrt(np.sum((test_norm_feature_matrix[0] - train_norm_feature_matrix[i]) ** 2)), i)
    for i in range(10)
]

In [13]:
# Each entry of `distance` is a (distance, index) tuple, so min() selects the
# smallest distance (ties fall back to the smaller index).
closest_distance, closest_index = min(distance)
# The value reported is a 0-based row index, so it is labelled as an index
# rather than as an ordinal position.
print('Among the first 10 training houses, the house at index %d is the closest to the query house and the distance b/w them is %e'%(closest_index, closest_distance))

Among the first 10 training houses, 8th house is the closest to the query house and the distance b/w them is 5.238363e-02


In [14]:
# Vectorization check: subtracting the query row from several training rows
# at once must match subtracting it from each row individually.

# Row-by-row differences: prints 3 vectors, one per training house.
for i in range(3):
    print(train_norm_feature_matrix[i] - test_norm_feature_matrix[0])
print("")

# The same differences from a single broadcast subtraction: prints a 3-row matrix.
results = train_norm_feature_matrix[0:3] - test_norm_feature_matrix[0]
print(results)

# Each vector printed below is all 0's when results[i] equals
# train_norm_feature_matrix[i] - test_norm_feature_matrix[0].
for i in range(3):
    print(results[i] - (train_norm_feature_matrix[i] - test_norm_feature_matrix[0]))

[ 0.00000000e+00 -1.20498190e-02  0.00000000e+00 -5.50336860e-03
 -5.14364795e-03 -3.47633726e-03  0.00000000e+00 -3.87821276e-03
  1.29876855e-05  6.69281453e-04 -1.05552733e-02 -8.52950206e-03
  0.00000000e+00 -1.70254220e-05 -2.48168183e-02 -1.63756198e-04
  2.08673616e-04 -5.10236549e-02]
[ 0.00000000e+00 -4.51868214e-03  0.00000000e+00  1.30705004e-03
 -2.89330197e-03 -3.47633726e-03  6.65082271e-02 -3.87821276e-03
  6.16364736e-06  1.47606982e-03 -2.26610387e-03  0.00000000e+00
  0.00000000e+00  4.23090220e-05 -1.45830788e-02 -1.91048898e-04
  7.19763456e-04 -5.10236549e-02]
[ 0.00000000e+00 -1.20498190e-02  0.00000000e+00 -8.32384500e-03
  3.72914476e-03 -5.21450589e-03  0.00000000e+00 -7.75642553e-03
  1.56292487e-05  1.64764925e-03 -1.30002801e-02 -8.52950206e-03
  0.00000000e+00  4.70885840e-05 -2.48168183e-02 -3.13866046e-04
  1.60518166e-03 -5.10236549e-02]

[[ 0.00000000e+00 -1.20498190e-02  0.00000000e+00 -5.50336860e-03
  -5.14364795e-03 -3.47633726e-03  0.00000000e+00 -

## Nearest neighbor regression

In [15]:
# Difference between every training row and the query house (test row 0);
# the single query row is subtracted from each row of the training matrix.
diff=train_norm_feature_matrix-test_norm_feature_matrix[0]

In [16]:
# Sanity check: the row-wise sum of squares (axis=1) at row 15 should match the
# sum of squares computed directly from row 15. The two sums may accumulate in
# a different order, so compare with a floating-point tolerance, not `==`.
np.isclose(np.sum(diff**2, axis=1)[15], np.sum(diff[15]**2))

True

In [17]:
# Squared Euclidean distance from the query house to every training row
# (one value per row of diff).
np.sum(diff**2, axis=1)

array([0.00363304, 0.00730492, 0.00378218, ..., 0.0032681 , 0.00325555,
       0.00341846])

In [18]:
# Euclidean distance from the query house to every training row.
# NOTE(review): this rebinds `distance`, which earlier in the notebook held a
# list of (distance, index) tuples.
distance = np.sqrt((diff ** 2).sum(axis=1))
distance[100]

0.023708232416678198

In [19]:
def compute_distances(features_instances, features_query):
    """Euclidean distance from a query point to every row of a matrix.

    features_instances: 2D numpy array, one instance per row.
    features_query: 1D numpy array holding a single query point.
    Returns a 1D array with one distance per row of features_instances.
    """
    # The query vector is subtracted from every row of the instance matrix.
    deltas = features_instances - features_query
    squared_lengths = np.square(deltas).sum(axis=1)
    return np.sqrt(squared_lengths)

In [20]:
# argmin over the distances gives the row index of the nearest training house
# to the 3rd test house (test row 2).
print("3rd house in the test data is closest to the house in the training data with index:",
      np.argmin(compute_distances(train_norm_feature_matrix, test_norm_feature_matrix[2])))

3rd house in the test data is closest to the house in the training data with index: 382


In [21]:
# 1-nearest-neighbor prediction: the price of the closest training house.
# The index is recomputed here instead of being copied from an earlier output,
# so this cell stays correct if the data or the features change.
nearest_index = compute_distances(train_norm_feature_matrix, test_norm_feature_matrix[2]).argmin()
print("predicted value of the query house based on 1-nearest neighbor regression is",training_output[nearest_index])

predicted value of the query house based on 1-nearest neighbor regression is 249000.0


## Perform k-nearest neighbor regression

In [22]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Return the row indices of the k training points nearest to the query.

    k: number of neighbors to return.
    feature_train: 2D numpy array of training features, one row per point.
    features_query: 1D numpy array holding the query point.
    Indices are ordered from nearest to farthest.
    """
    # argsort yields row indices ordered by increasing distance.
    nearest_first = np.argsort(compute_distances(feature_train, features_query))
    return nearest_first[:k]

In [23]:
# Indices of the 4 nearest training houses to test row 2, nearest first.
print(
    "4 training houses that are closest to the 3rd house in the test set are",
    k_nearest_neighbors(4, train_norm_feature_matrix, test_norm_feature_matrix[2]),
)

4 training houses that are closest to the 3rd house in the test set are [ 382 1149 4087 3142]


In [24]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query point by k-nearest-neighbor averaging.

    k: number of neighbors to average over.
    features_train: 2D numpy array of training features.
    output_train: numpy vector of training outputs, aligned with features_train rows.
    features_query: 1D numpy array holding the query point.
    Returns the mean output of the k nearest training points.
    """
    nearest = k_nearest_neighbors(k, features_train, features_query)
    return output_train[nearest].mean()

In [25]:
# Average price of the 4 nearest training houses to test row 2.
print(
    "predicted value of 3rd house of the test set using k equal to 4 is",
    predict_output_of_query(4, train_norm_feature_matrix, training_output, test_norm_feature_matrix[2]),
)

predicted value of 3rd house of the test set using k equal to 4 is 413987.5


In [26]:
def predict_output(k, features_train, output_train, features_query):
    """Predict the output for every query point with k-nearest neighbors.

    k: number of neighbors to average over.
    features_train: 2D numpy array of training features.
    output_train: numpy vector of training outputs.
    features_query: iterable of query points (e.g. rows of a 2D numpy array).
    Returns a numpy array with one prediction per query point, in order.
    """
    predictions = []
    for query_house in features_query:
        predictions.append(
            predict_output_of_query(k, features_train, output_train, query_house))
    return np.array(predictions)

In [27]:
# k = 10 predictions for the first 10 rows of the normalized test matrix.
predictions=predict_output(10, train_norm_feature_matrix, training_output, test_norm_feature_matrix[:10])

In [28]:
print("predictions for the first 10 houses in the test set, using k=10.")
print(predictions)
# argmin() returns a 0-based position, so it is reported as an index rather
# than as an ordinal position.
print("house at index %d in this query set has the lowest predicted value."%predictions.argmin())

predictions for the first 10 houses in the test set, using k=10.
[881300.  431860.  460595.  430200.  766750.  667420.  350032.  512800.7
 484000.  457235. ]
6th house in this query set that has the lowest predicted value.


In [29]:
# Look the lowest prediction up from the data instead of using an index copied
# from an earlier output, so the cell stays correct if the predictions change.
print("The predicted value of this house is",predictions[predictions.argmin()])

The predicted value of this house is 350032.0


## Choosing the best value of k using a validation set

In [30]:
# Search for the k with the lowest residual sum of squares (RSS) on the
# validation set. rss_min is a (best RSS so far, k that achieved it) tuple.
# NOTE(review): range(1, 15) evaluates k = 1..14; use range(1, 16) if k = 15
# is meant to be included.
rss_min = float('inf'), 0
for k in range(1, 15):
    # A dedicated name keeps the `predictions` array from the earlier cells
    # intact, so those cells still display the right values when re-run.
    validation_predictions = predict_output(
        k, train_norm_feature_matrix, training_output, validation_norm_feature_matrix)
    rss = np.sum((validation_output - validation_predictions) ** 2)
    if rss < rss_min[0]:
        rss_min = rss, k

In [31]:
# rss_min is an (rss, k) tuple, so it fills both format fields in order.
print("minimum validation error is %e for k equal to %d"%rss_min)

minimum validation error is 6.736168e+13 for k equal to 8


In [32]:
# Predict the whole test set with the k selected on the validation set.
# rss_min[1] holds that k, so no value has to be copied from an earlier output.
best_k = rss_min[1]
predictions_test=predict_output(best_k,train_norm_feature_matrix,training_output,test_norm_feature_matrix)

In [33]:
# Residual sum of squares between the actual test outputs and the predictions.
rss_test=np.sum((test_output-predictions_test)**2)

In [34]:
# Report the test-set RSS in scientific notation.
print("rss for test data is %e"%rss_test)

rss for test data is 1.331188e+14
