In [None]:
import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn as sns

def f(row):
    if pandas.isna(row['POSTSEASON']) == True:
        val = 0
    else:
        val = 1
    return val

data = pandas.read_csv('cbb.csv')
data['TOURNEY'] = data.apply(f, axis=1)
data[numpy.logical_and(data['YEAR']==2017,data['TOURNEY']==1)].sort_values('W',ascending=False)

In [None]:
##### LOOK AT A SUBSET OF PLAYLISTS AND VIEW THEIR TRACK RESULTS #####
ax = plt.subplot(111)

play_x = data[data['YEAR']==2019]['3P_D']
play_y = data[data['YEAR']==2019]['WAB']
play_classes = data[data['YEAR']==2019]['TOURNEY']
    
play_scatter = sns.scatterplot(x=play_x, y=play_y, hue=play_classes, alpha=0.6)

box = ax.get_position()
ax.legend(loc='center left',bbox_to_anchor=(1,0.5))

plt.show()

In [98]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

#initial testing data
model_data = data

X = model_data[['W', 'ADJOE', 'ADJDE', 'EFG_O', 'EFG_D',
       'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P_O', '2P_D', '3P_O',
       '3P_D', 'ADJ_T', 'WAB']].values
Y = model_data['TOURNEY'].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20)

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#new testing data
new_model_data = data[data['YEAR']==2019]

new_X = new_model_data[['W', 'ADJOE', 'ADJDE', 'EFG_O', 'EFG_D',
       'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P_O', '2P_D', '3P_O',
       '3P_D', 'ADJ_T', 'WAB']].values
new_Y = new_model_data['TOURNEY'].values

scaler.fit(new_X)

new_X_train = scaler.transform(new_X)
new_X_test = scaler.transform(new_X)


In [None]:
Lin_X_train, Lin_X_test, Lin_Y_train, Lin_Y_test = train_test_split(X, Y, test_size=0.20,random_state=42)
regressor = LinearRegression()
regressor.fit(Lin_X_train,Lin_Y_train)
coef_df = pandas.DataFrame(regressor.coef_)
coef_df.sort_values(0,ascending = False)

In [None]:
import random 
random.seed(42)
k_iterations = []

best_classifier = None
best_classifier_score = 0


for i in range(1,50):
    classifier = KNeighborsClassifier(n_neighbors=i)
    classifier.fit(X_train,Y_train)
    Y_pred = classifier.predict(X_test)
    if accuracy_score(Y_test, Y_pred) > best_classifier_score:
        best_classifier = classifier
        best_classifier_score = accuracy_score(Y_test, Y_pred)
    k_iterations.append(accuracy_score(Y_test, Y_pred))

k_iterations = pandas.DataFrame(k_iterations)
k_iterations.columns = ['score']
print(best_classifier_score)
k_iterations.sort_values('score',ascending=False)

#best score was k = 25


In [99]:
new_classifier = KNeighborsClassifier(n_neighbors=25)
new_knn_model = new_classifier.fit(X_train,Y_train)
new_Y_pred = new_classifier.predict(new_X_test)
new_Y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,