In [1]:
import math
import numpy.random
import pandas
# Load nba_2013.csv into the `nba` DataFrame. The file is opened in text
# mode and closed automatically once pandas has finished parsing it.
with open("nba_2013.csv", mode="r") as stats_file:
    nba = pandas.read_csv(stats_file)

In [2]:
def euclidean_distance(row, reference=None, columns=None):
    """Return the Euclidean distance between ``row`` and a reference row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must support ``row[k]`` for every ``k`` in ``columns``.
    reference : mapping-like, optional
        Row to measure against. Defaults to the notebook-level
        ``selected_player`` when omitted.
    columns : iterable of keys, optional
        Keys that take part in the distance. Defaults to the notebook-level
        ``distance_columns`` when omitted.

    Returns
    -------
    float
        Square root of the summed squared differences over ``columns``
        (``0.0`` when ``columns`` is empty).
    """
    # Fall back to the notebook globals only when the caller did not supply
    # explicit values, so existing one-argument calls keep working.
    if reference is None:
        reference = selected_player
    if columns is None:
        columns = distance_columns
    return math.sqrt(sum((row[k] - reference[k]) ** 2 for k in columns))

In [3]:
# Reference row: the first row of `nba` whose "player" value is "LeBron James".
selected_player = nba.loc[nba["player"] == "LeBron James"].iloc[0]

# Columns that take part in the distance computations.
distance_columns = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.',
    'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf',
    'pts',
]

In [4]:
# Keep only the distance columns, then standardize each one: subtract the
# column mean and divide by the column standard deviation.
nba_numeric = nba.loc[:, distance_columns]
nba_normalized = nba_numeric.sub(nba_numeric.mean()).div(nba_numeric.std())

In [5]:
from scipy.spatial import distance
# Missing normalized values become 0 (the column mean on the normalized
# scale); the frame is modified in place.
nba_normalized.fillna(value=0, inplace=True)
# Normalized row(s) for LeBron James, selected with a boolean mask built on
# `nba`; the result is a DataFrame, not a Series.
lebron_normalized = nba_normalized.loc[nba["player"] == "LeBron James"]

In [6]:
# scipy's distance.euclidean requires 1-D vectors, while `lebron_normalized`
# is a one-row DataFrame (2-D). Pull out the row as a 1-D Series first; if it
# is already 1-D it is used unchanged.
lebron_vector = lebron_normalized.iloc[0] if lebron_normalized.ndim > 1 else lebron_normalized
# Distance from every normalized row to LeBron's normalized row.
euclidean_distances = nba_normalized.apply(
    lambda row: distance.euclidean(row, lebron_vector), axis=1
)

In [7]:
# Pair each distance with its row label and order the pairs by ascending
# distance.
df = pandas.DataFrame({"dist": euclidean_distances, "idx": euclidean_distances.index})
df.sort_values(by="dist", inplace=True)
# LeBron's own row has distance 0 to itself, so the row at position 1 is
# taken as the closest other player. The label comes back as a float because
# it is read from a mixed row, hence the int() cast below.
second_smallest = df.iloc[1].loc["idx"]
most_similar_to_lebron = nba.loc[int(second_smallest), "player"]

In [8]:
from numpy.random import permutation
# Shuffle the row labels and hold out the first third (rounded down) as the
# test set; the remaining labels form the training set.
# NOTE(review): the permutation is unseeded, so the split (and every number
# derived from it) differs between runs.
random_indices = permutation(nba.index)
test_cutoff = math.floor(len(nba) / 3)
# Both slices meet at `test_cutoff` and the test slice starts at position 0,
# so every shuffled label lands in exactly one of the two sets.
test = nba.loc[random_indices[:test_cutoff]]
train = nba.loc[random_indices[test_cutoff:]]

In [9]:
# Predictor columns fed to the model.
x_columns = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.',
    'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf',
]
# Target column, kept as a one-element list so that selecting with it
# yields a single-column DataFrame rather than a Series.
y_column = ["pts"]

In [10]:
# Missing values in both splits are replaced by 0; each frame is modified
# in place.
test.fillna(value=0, inplace=True)
train.fillna(value=0, inplace=True)

In [11]:
from sklearn.neighbors import KNeighborsRegressor
# Fit a 5-nearest-neighbours regressor on the training predictors and
# target (fit() returns the estimator itself), then predict the target for
# the held-out rows.
knn = KNeighborsRegressor(n_neighbors=5).fit(train[x_columns], train[y_column])
predictions = knn.predict(test[x_columns])
print(predictions)

[[1.4142e+03]
 [6.0640e+02]
 [2.7400e+02]
 [5.2700e+02]
 [5.3600e+01]
 [1.0000e+00]
 [7.0360e+02]
 [3.7480e+02]
 [3.5600e+01]
 [1.2420e+02]
 [2.9860e+02]
 [3.0420e+02]
 [2.6820e+02]
 [9.4800e+01]
 [1.3456e+03]
 [1.1038e+03]
 [3.2800e+02]
 [9.7360e+02]
 [7.8400e+01]
 [1.6280e+02]
 [6.2900e+02]
 [4.2780e+02]
 [1.0882e+03]
 [3.5740e+02]
 [1.1868e+03]
 [5.8640e+02]
 [9.5500e+02]
 [1.0976e+03]
 [7.7820e+02]
 [2.9800e+01]
 [2.0000e+00]
 [9.6360e+02]
 [1.0760e+02]
 [1.1506e+03]
 [7.7540e+02]
 [8.3000e+01]
 [3.5580e+02]
 [7.3000e+01]
 [1.8400e+03]
 [2.7740e+02]
 [1.2000e+02]
 [3.8720e+02]
 [1.0700e+02]
 [9.3800e+01]
 [1.0000e+00]
 [1.3200e+01]
 [1.3554e+03]
 [7.1400e+01]
 [5.2560e+02]
 [9.2180e+02]
 [8.0960e+02]
 [3.5120e+02]
 [7.7480e+02]
 [4.1940e+02]
 [4.2240e+02]
 [5.6200e+02]
 [5.7600e+01]
 [6.0600e+01]
 [2.1660e+02]
 [1.0200e+01]
 [8.4760e+02]
 [6.0740e+02]
 [1.5172e+03]
 [1.2000e+02]
 [2.8240e+02]
 [3.2520e+02]
 [2.0040e+02]
 [1.5468e+03]
 [2.0600e+01]
 [1.0580e+03]
 [9.1080e+02]
 [6.24

In [12]:
# True target values for the held-out rows. `y_column` is a list, so this is
# a 2-D array with one column.
actual = test.loc[:, y_column].values
print(actual)

[[1289]
 [ 638]
 [ 329]
 [ 461]
 [  71]
 [   5]
 [ 660]
 [ 339]
 [  36]
 [ 159]
 [ 286]
 [ 315]
 [ 339]
 [ 136]
 [1372]
 [ 987]
 [ 341]
 [1053]
 [ 107]
 [ 137]
 [ 696]
 [ 503]
 [1113]
 [ 350]
 [1119]
 [ 625]
 [ 911]
 [1090]
 [ 831]
 [  38]
 [   2]
 [1041]
 [ 143]
 [1131]
 [ 850]
 [  92]
 [ 384]
 [ 102]
 [1930]
 [ 273]
 [ 115]
 [ 350]
 [  79]
 [  98]
 [   0]
 [   6]
 [1248]
 [  84]
 [ 511]
 [ 988]
 [ 890]
 [ 352]
 [ 821]
 [ 548]
 [ 352]
 [ 525]
 [  73]
 [  62]
 [ 249]
 [   6]
 [ 844]
 [ 618]
 [1603]
 [ 145]
 [ 261]
 [ 306]
 [ 200]
 [1695]
 [  25]
 [ 990]
 [ 859]
 [ 772]
 [  42]
 [ 140]
 [ 200]
 [ 514]
 [  94]
 [ 383]
 [ 761]
 [ 252]
 [1007]
 [ 759]
 [1298]
 [ 194]
 [ 846]
 [ 603]
 [1068]
 [ 915]
 [ 144]
 [   9]
 [ 701]
 [1873]
 [1583]
 [1002]
 [  83]
 [   0]
 [ 579]
 [  68]
 [ 480]
 [ 558]
 [ 779]
 [ 298]
 [ 810]
 [   6]
 [ 425]
 [ 301]
 [1042]
 [ 273]
 [ 754]
 [ 485]
 [   6]
 [   0]
 [ 607]
 [ 738]
 [ 784]
 [1226]
 [1144]
 [  44]
 [   1]
 [1042]
 [ 970]
 [  89]
 [ 525]
 [ 303]
 [   9]


In [13]:
# Mean squared error: the summed squared difference between actual and
# predicted values, divided by the number of predictions.
mse = ((actual - predictions) ** 2).sum() / predictions.shape[0]
print(mse)

4027.1783647798748
