In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [2]:
# Read both splits and one-hot encode the categorical 'sex' column
def _load_features_and_target(csv_path):
    """Read one abalone CSV and split it into (features, target).

    The 'sex' column is replaced by one indicator column per category found in
    the file, and 'age' (the regression target) is removed from the features.
    """
    raw = pd.read_csv(csv_path)
    features = pd.concat((raw, pd.get_dummies(raw['sex'])), axis=1)
    features = features.drop(['sex', 'age'], axis=1)
    return features, raw['age']


trainData, y_train = _load_features_and_target('abaloneTrain700.csv')
testData, y_test = _load_features_and_target('abaloneTest700.csv')

# get_dummies only creates columns for the categories present in the frame it
# is given, so the two splits are encoded independently. Force the test frame
# onto the training columns (same names, same order): a category missing from
# the test split becomes an all-zero column, and a category seen only in the
# test split is dropped because nothing fitted on the training data can use it.
testData = testData.reindex(columns=trainData.columns, fill_value=0)

In [3]:
# Standardise the features; the scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
from sklearn.preprocessing import StandardScaler
Scaler = StandardScaler().fit(trainData)
trainData_scaled, testData_scaled = (
    Scaler.transform(split) for split in (trainData, testData)
)

In [4]:
# My own knn regression
def knnPredict(featuresNewInstance, trainDataset, y_train, k=1):
    """Predict a target as the mean target of the k nearest training rows.

    Nearness is the Euclidean distance between ``featuresNewInstance`` and
    every row of ``trainDataset``.

    Parameters
    ----------
    featuresNewInstance : array-like, shape (n_features,)
        Feature vector of the instance to predict.
    trainDataset : array-like, shape (n_samples, n_features)
        Training feature matrix, on the same scale as ``featuresNewInstance``.
    y_train : array-like, shape (n_samples,)
        Training targets, aligned by position with the rows of
        ``trainDataset``. A pandas Series, NumPy array or plain list all work;
        a Series is read by position, its index labels are ignored.
    k : int, default 1
        Number of neighbours to average. A value larger than ``n_samples``
        simply averages over every training row.

    Returns
    -------
    float
        Mean of the targets of the k nearest neighbours.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if ``trainDataset`` and ``y_train``
        do not contain the same number of rows.
    """
    if k < 1:
        # A zero or negative k would otherwise slice the neighbour list to an
        # empty or wrong selection and return NaN / a misleading mean.
        raise ValueError("k must be at least 1, got {!r}".format(k))

    query = np.asarray(featuresNewInstance, dtype=float)
    train = np.asarray(trainDataset, dtype=float)
    targets = np.asarray(y_train)
    if train.shape[0] != targets.shape[0]:
        raise ValueError(
            "trainDataset has {} rows but y_train has {} entries".format(
                train.shape[0], targets.shape[0]))

    # Euclidean distance to every training row at once (broadcast over rows)
    dist = np.sqrt(((query - train) ** 2).sum(axis=1))

    # argsort returns row indices ordered by increasing distance, so the first
    # k entries are exactly the indices of the k nearest neighbours.
    nn = dist.argsort()[:k]

    return np.mean(targets[nn])     # average of the nearest neighbours

# Predicting every test row with the hand-written function is very slow,
# so only the first 10 test instances are predicted here.
my_predictions = []
for row_idx in range(10):
    query_row = testData_scaled[row_idx, :]
    my_predictions.append(knnPredict(query_row, trainData_scaled, y_train, k=3))
print(my_predictions)

[12.333333333333334, 13.333333333333334, 13.333333333333334, 7.0, 9.0, 10.333333333333334, 11.0, 6.666666666666667, 13.666666666666666, 7.666666666666667]


In [5]:
# Reference model: scikit-learn's k-nearest-neighbours regressor, configured
# like the hand-written version (3 neighbours, Euclidean distance).
from sklearn import neighbors

knn = neighbors.KNeighborsRegressor(metric='euclidean', n_neighbors=3).fit(
    trainData_scaled, y_train)
# Predict on both splits so the training and test errors can be compared.
y_pred_train = knn.predict(trainData_scaled)
y_pred = knn.predict(testData_scaled)

In [6]:
# Report the sklearn model's error on both splits.
# mean_squared_error is documented as (y_true, y_pred): the ground truth goes
# first. Plain MSE is symmetric, so the printed numbers are unchanged, but the
# calls now stay correct if a non-symmetric metric or option is swapped in.
print("MSE for test data:", mean_squared_error(y_test, y_pred))
print("MSE for training data:", mean_squared_error(y_train, y_pred_train))

MSE for test data: 6.7280952380952375
MSE for training data: 3.5119047619047614


In [7]:
# Side-by-side check of the hand-written predictions against sklearn's.
# The number of compared rows follows my_predictions instead of repeating a
# hard-coded count, so this cell stays in sync with however many instances
# were predicted by hand.
n_compared = len(my_predictions)
y_true_head = y_test.iloc[:n_compared]   # explicit positional slice of the Series

print("Sklearn\t\tMy implementation")
for sklearn_value, my_value in zip(y_pred[:n_compared], my_predictions):
    print("{:.2f}\t\t{:.2f}".format(sklearn_value, my_value))
# mean_squared_error expects (y_true, y_pred): ground truth first.
print("MSE for sklearn: {:.2f}".format(mean_squared_error(y_true_head, y_pred[:n_compared])))
print("MSE for my implementation: {:.2f}".format(mean_squared_error(y_true_head, my_predictions)))

Sklearn		My implementation
12.33		12.33
13.33		13.33
13.33		13.33
7.00		7.00
9.00		9.00
10.33		10.33
11.00		11.00
6.67		6.67
13.67		13.67
7.67		7.67
MSE for sklearn: 8.01
MSE for my implementation: 8.01
