In [2]:
import argparse
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

def pca_95(xTrain, xTest):
    pca = sklearnPCA(.95)
    pca.fit(xTrain)
    xTrain_pca = pca.transform(xTrain)
    xTest_pca = pca.transform(xTest)
    return xTrain_pca, xTest_pca

def PCA_knn(xTrain, yTrain, xTest, yTest):
    #yTrain = np.ravel(yTrain)
    knn = KNeighborsRegressor(n_neighbors=12, weights='uniform')
    knn.fit(xTrain, yTrain)

    yHat = knn.predict(xTest)
    print(mean_squared_error(yTest, yHat, squared=False))
    print(r2_score(yTest, yHat))
    print(knn.score(xTest, yTest))

def Param_search(xTrain, yTrain, xTest, yTest):
    clf = GridSearchCV(
        estimator=KNeighborsRegressor(),
        param_grid=[{'n_neighbors': range(1, 20, 1), 'weights': ('uniform', 'distance')}], cv=3, scoring='r2')
    clf.fit(xTrain, yTrain)
    yHat = clf.predict(xTest)

    means = clf.cv_results_['mean_test_score']
    for mean, params in zip(means, clf.cv_results_['params']):
        print("%0.3f for %r" % (mean, params))
    print()
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)


def file_to_numpy(filename):
    """
    Read an input file and convert it to numpy
    """
    df = pd.read_csv(filename)
    return df.to_numpy()

In [4]:
"""
Main file to run from the command line.
"""
#xTrain = pd.read_csv('xTrain.csv')
#yTrain = pd.read_csv('yTrain.csv')
file = pd.read_csv("train.csv")
y = file.iloc[:, 1:2]
x = pd.read_csv("x_transformed.csv")
x = x.iloc[:, 1:]

xHatNames = x.head(0)
xHatNames = xHatNames.T
x = x.to_numpy()
y = y.to_numpy()
y = y.T[0]
# print(y)
# print(len(x[0]))
# print(xHatNames)

xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.3, random_state=40)
xTrain, xTest = pca_95(xTrain, xTest)
xTrain, xTest = pca_95(xTrain, xTest)

PCA_knn(xTrain, yTrain, xTest, yTest)
Param_search(xTrain, yTrain, xTest, yTest)

10.38591881709924
0.37694032882417305
0.37694032882417305
0.103 for {'n_neighbors': 1, 'weights': 'uniform'}
0.103 for {'n_neighbors': 1, 'weights': 'distance'}
0.338 for {'n_neighbors': 2, 'weights': 'uniform'}
0.327 for {'n_neighbors': 2, 'weights': 'distance'}
0.402 for {'n_neighbors': 3, 'weights': 'uniform'}
0.391 for {'n_neighbors': 3, 'weights': 'distance'}
0.445 for {'n_neighbors': 4, 'weights': 'uniform'}
0.428 for {'n_neighbors': 4, 'weights': 'distance'}
0.462 for {'n_neighbors': 5, 'weights': 'uniform'}
0.444 for {'n_neighbors': 5, 'weights': 'distance'}
0.470 for {'n_neighbors': 6, 'weights': 'uniform'}
0.451 for {'n_neighbors': 6, 'weights': 'distance'}
0.471 for {'n_neighbors': 7, 'weights': 'uniform'}
0.453 for {'n_neighbors': 7, 'weights': 'distance'}
0.475 for {'n_neighbors': 8, 'weights': 'uniform'}
0.456 for {'n_neighbors': 8, 'weights': 'distance'}
0.475 for {'n_neighbors': 9, 'weights': 'uniform'}
0.457 for {'n_neighbors': 9, 'weights': 'distance'}
0.474 for {'n_n