# Libraries and Parse Data

In [71]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

def spread_df_rgn(normalize=False):
  data_df = pd.read_csv('combined_out.csv')
  data_df = data_df.drop('AwayTeam', axis=1)\
                   .drop('HomeTeam', axis=1)\
                   .drop('Date', axis=1)
  data_df = data_df.dropna(axis=0)

  real_cols = data_df.select_dtypes(include=['number']).columns

  if normalize:
    for col in real_cols:
      min_val = data_df[col].min()
      max_val = data_df[col].max()
      data_df[col] = (data_df[col] - min_val) / (max_val - min_val)

  data_np = data_df.to_numpy()
  data_np = data_np[:, 5:]

  data_np = data_np.astype(float)

  return data_np[:, 1:-1], data_np[:, :1]

def spread_df_cls(normalize=False):
  data_df = pd.read_csv('combined_out.csv')
  data_df = data_df.drop('AwayTeam', axis=1)\
                   .drop('HomeTeam', axis=1)\
                   .drop('Date', axis=1)
  data_df = data_df.dropna(axis=0)

  real_cols = data_df.select_dtypes(include=['number']).columns

  if normalize:
    for col in real_cols:
      min_val = data_df[col].min()
      max_val = data_df[col].max()
      data_df[col] = (data_df[col] - min_val) / (max_val - min_val)

  data_np = data_df.to_numpy()
  data_np = data_np[:, 5:]

  data_np = data_np.astype(float)

  return data_np[:, 1:-1], data_np[:, -1:]

(array([[ 1.42452423e+03,  1.68693788e+03,  2.39600000e+02, ...,
         1.86000000e+01, -1.70000000e+01,  5.60000000e+00],
       [ 1.56916316e+03,  1.47330176e+03,  2.40500000e+02, ...,
         2.49000000e+01, -2.40000000e+00, -5.50000000e+00],
       [ 1.58203631e+03,  1.54225772e+03,  2.39700000e+02, ...,
         2.95000000e+01,  1.10000000e+00, -1.31000000e+01],
       ...,
       [ 1.50086384e+03,  1.50258666e+03,  2.39100000e+02, ...,
         2.09000000e+01,  1.10000000e+00,  1.30000000e+00],
       [ 1.51366683e+03,  1.64375774e+03,  2.39300000e+02, ...,
         1.88000000e+01, -1.81000000e+01,  1.94000000e+01],
       [ 1.30508901e+03,  1.47892752e+03,  2.39400000e+02, ...,
         1.83000000e+01, -1.65000000e+01,  8.00000000e-01]]), array([[1.],
       [0.],
       [1.],
       ...,
       [1.],
       [0.],
       [0.]]))


# K-Neighbors Regressor
We are trying to calculate the true point differential of an NBA game.

## No Normalization and No Weighting

In [55]:
X, y = spread_df_rgn()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
acc_train = knn.score(X_train, y_train)
acc_test = knn.score(X_test, y_test)

mae_train = mean_absolute_error(knn.predict(X_train), y_train)
mae_test = mean_absolute_error(knn.predict(X_test), y_test)

## Normalization without Weighting

In [56]:
X, y = spread_df_rgn(normalize=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
acc_train = knn.score(X_train, y_train)
acc_test = knn.score(X_test, y_test)

mae_train = mean_absolute_error(knn.predict(X_train), y_train)
mae_test = mean_absolute_error(knn.predict(X_test), y_test)

## Normalization and Weighting

In [69]:
X, y = spread_df_rgn(normalize=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

knn = KNeighborsRegressor(n_neighbors=5, weights='distance').fit(X_train, y_train)
acc_train = knn.score(X_train, y_train)
acc_test = knn.score(X_test, y_test)

mae_train = mean_absolute_error(knn.predict(X_train), y_train)
mae_test = mean_absolute_error(knn.predict(X_test), y_test)

As k increases, MAE for test training set is around 0.056. This means that the correct score is only missed by that many points on average.
(TBD: add charts, but we are essentially in a state of completion.)

# K-Neighbors Classifier
We are trying to estimate the point differential and compare it against the spread.

## Without Normalization

In [77]:
X, y = spread_df_cls()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

knn = KNeighborsClassifier(p=3).fit(X_train, y_train)
acc_train = knn.score(X_train, y_train)
acc_test = knn.score(X_test, y_test)

print(f"Train {acc_train*100:.1f}%\nTest {acc_test*100:.1f}%")

  return self._fit(X, y)


Train 78.4%
Test 65.1%


## With Normalization

In [78]:
X, y = spread_df_cls(normalize=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

knn = KNeighborsClassifier(p=3).fit(X_train, y_train)
acc_train = knn.score(X_train, y_train)
acc_test = knn.score(X_test, y_test)

print(f"Train {acc_train*100:.1f}%\nTest {acc_test*100:.1f}%")

  return self._fit(X, y)
