This notebook performs k-NN linear regression on our combined honey production and air quality dataset, which is loaded in the second code cell.<br>
Nearest neighbors are found using only the geographical features (latitude and longitude), so this is not k-NN in the classical sense.<br>
For each query, a linear model is fitted to its nearest neighbors using the closed-form least-squares solution, which minimizes the MSE.<br>
Cross-validation is performed and the resulting MSE is printed near the end. Everything is written from scratch with NumPy.

In [None]:
import pandas as pd
from scipy.stats import norm
import numpy as np
import math
np.set_printoptions(suppress=True) # disable scientific notation when printing

In [None]:
data = pd.read_csv("data/completeFeatureVectors.csv")

# Column order matters downstream: after the bias column is prepended, the
# regression features occupy columns 0-9 and latitude/longitude columns 10-11.
feature_columns = ['o3','co','so2','no2','pm25_frm', 'pressure', 'temperature', 'wind', 'year', 'latitude', 'longitude']

# subtract 1998 from the year so that it starts at zero.
# The shift is applied by column name on the DataFrame rather than by writing
# into the array returned by to_numpy(): that array can be read-only (pandas
# Copy-on-Write), and a positional index would silently shift the wrong
# column if feature_columns were ever reordered.
features = data[feature_columns].assign(year=data['year'] - 1998)

# Prepend a column of ones to X for the bias term
X = np.hstack([np.ones((len(features), 1)), features.to_numpy()])

# Responses are kept as a column vector of shape (n_samples, 1)
y = data[['yield_per_col']].to_numpy()

The `predict` function makes a prediction for `query` by fitting a linear regression to its `k` geographically nearest neighbors in the training data `X` with responses `y`.

In [None]:
def predict(query, X, y, k, n_features=10):
    """Predict the response for `query` with a locally fitted linear regression.

    The `k` training rows geographically closest to `query` are selected, an
    ordinary least-squares model is fitted on their regression features, and
    that model is evaluated at `query`.

    Parameters
    ----------
    query : 1-D array
        A single feature vector laid out like one row of `X`.
    X : 2-D array, shape (n_samples, n_columns)
        Training rows. Columns ``[0, n_features)`` are the regression
        features (bias column first); columns ``[n_features, n_columns)`` are
        the coordinates used only for the neighbor search.
    y : array, shape (n_samples,) or (n_samples, 1)
        Training responses aligned with the rows of `X`.
    k : int
        Number of nearest neighbors to fit on. Clipped to ``n_samples``.
    n_features : int, default 10
        Number of leading columns used as regression features. The default
        matches the notebook layout (bias + 9 features, then latitude and
        longitude in columns 10 and 11).

    Returns
    -------
    The predicted response: a scalar when `y` is 1-D, a length-1 array when
    `y` is a column vector.

    Raises
    ------
    ValueError
        If `X` has no rows or `k` is smaller than 1.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    query = np.asarray(query, dtype=float)

    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError("predict needs at least one training row")
    if k < 1:
        raise ValueError("k must be at least 1")

    # Euclidean distance in coordinate space only (vectorized over all rows)
    distances = np.linalg.norm(X[:, n_features:] - query[n_features:], axis=1)

    # Take exactly the k closest rows. Sorting indices (instead of comparing
    # against a cutoff distance) still yields k neighbors when many rows share
    # the same location; the stable sort makes tie-breaking deterministic.
    nearest = np.argsort(distances, kind="stable")[:min(k, n_samples)]
    X_neighbors = X[nearest, :n_features]
    y_neighbors = y[nearest]

    # Least-squares fit. lstsq solves the same MSE minimization as the normal
    # equations but does not fail when X_neighbors.T @ X_neighbors is singular
    # (it returns the minimum-norm solution in that case).
    theta, *_ = np.linalg.lstsq(X_neighbors, y_neighbors, rcond=None)

    # return prediction found via theta
    return np.matmul(query[:n_features], theta)

Perform k-fold cross validation to find our mean squared error (MSE)

In [None]:
k = 10            # number of cross-validation folds
n_neighbors = 10  # neighbors used for each local regression inside predict

# the size of the testing set for each fold
chunk_size = X.shape[0] // k
if chunk_size == 0:
    raise ValueError("need at least as many rows as folds for cross validation")

# Xy_shuffled holds every column of X followed by the response as last column
n_feature_cols = X.shape[1]

# shuffle X and y together; the generator is seeded so that
# Restart Kernel -> Run All reproduces the same folds and the same MSE
rng = np.random.default_rng(0)
Xy_shuffled = np.append(X, y, axis=1)
rng.shuffle(Xy_shuffled)  # shuffles rows in place

sq_errors = []

# iterate through k folds
for fold in range(k):
    test_start = chunk_size * fold
    test_stop = chunk_size * (fold + 1)

    # split out testing and training data: the fold's chunk is held out and
    # every other row is used for training. Rows past chunk_size*k (the
    # remainder when the row count is not divisible by k) are never tested
    # but are part of the training set of every fold.
    Xy_test = Xy_shuffled[test_start:test_stop]
    Xy_train = np.concatenate([Xy_shuffled[:test_start], Xy_shuffled[test_stop:]], axis=0)

    # keep all feature columns (including latitude/longitude, which predict
    # needs for the neighbor search); the response is the last column
    X_k_test, y_k_test = Xy_test[:, :n_feature_cols], Xy_test[:, n_feature_cols]
    X_k_train, y_k_train = Xy_train[:, :n_feature_cols], Xy_train[:, n_feature_cols]

    for row in range(X_k_test.shape[0]):
        y_pred = predict(X_k_test[row, :], X_k_train, y_k_train, n_neighbors)
        y_actual = y_k_test[row]
        #print("predicted:", y_pred, "actual", y_actual)
        sq_errors.append((y_pred - y_actual)**2)

mean_sq_error = np.mean(sq_errors)
print(mean_sq_error)

    