In [1]:
# Import necessary packages

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
from operator import itemgetter
from sklearn.model_selection import cross_val_score

In [2]:
# Import the datasets

dfHistorical = pd.read_csv('historical-all-nba.csv')
dfCurrent = pd.read_csv('current-all-nba.csv')

In [3]:
dfHistorical.head()

Unnamed: 0,Player,Tm,G,Team Wins,Overall Seed,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,VORP,BPM,All-NBA,All-Star
0,George Gervin,SAS,78.0,41.0,10,37.6,33.1,5.2,2.6,1.4,1.0,0.528,0.314,0.852,10.6,0.173,3.1,2.2,1,1
1,Eddie Johnson,ATL,79.0,50.0,6,33.2,18.5,2.5,4.7,1.5,0.3,0.487,0.385,0.828,8.2,0.15,2.8,2.2,0,1
2,Moses Malone,HOU,82.0,41.0,9,38.3,25.8,14.5,1.8,1.0,1.3,0.502,0.0,0.719,11.9,0.183,3.8,2.8,1,1
3,Julius Erving,PHI,78.0,59.0,3,36.1,26.9,7.4,4.6,2.2,1.8,0.519,0.2,0.787,12.5,0.213,6.5,7.2,1,1
4,John Drew,ATL,80.0,50.0,6,28.8,19.5,5.9,1.3,1.1,0.3,0.453,0.0,0.757,6.4,0.134,0.8,-0.7,0,1


# Create models

In [4]:
train, test = train_test_split(dfHistorical, test_size = 0.25, random_state = 36)

xtrain = train[['Team Wins', 'Overall Seed', 'PTS', 'TRB', 'AST', 'VORP', 'WS', 'All-Star']]
ytrain = train[['All-NBA']]

xtest = test[['Team Wins', 'Overall Seed', 'PTS', 'TRB', 'AST', 'VORP', 'WS', 'All-Star']]
ytest = test[['All-NBA']]

print("Training set size: %.0f" % len(xtrain))
print("Testing set size: %.0f" % len(xtest))

Training set size: 717
Testing set size: 239


In [5]:
# Create function that gives accuracy scores for each model

def scores(model):
    
    model.fit(xtrain, ytrain.values.ravel())
    y_pred = model.predict(xtest)
    
    print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_pred))
    print("Recall: %.3f" % metrics.recall_score(ytest, y_pred))
    print("Precision: %.3f" % metrics.precision_score(ytest, y_pred))
    print("F1: %.3f" % metrics.f1_score(ytest, y_pred))
    
    proba = model.predict_proba(xtest)
    print("Log loss: %.3f" % metrics.log_loss(ytest, proba))

    pos_prob = proba[:, 1]
    print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, pos_prob))
    
    cv = cross_val_score(model, xtest, ytest.values.ravel(), cv = 3, scoring = 'accuracy')
    print("Accuracy (cross validation score): %0.3f (+/- %0.3f)" % (cv.mean(), cv.std() * 2))
    
    return y_pred

In [6]:
svc = SVC(kernel = 'rbf', gamma = 1e-3, C = 100, probability = True)

y_svc = scores(svc)

Accuracy score: 0.837
Recall: 0.850
Precision: 0.844
F1: 0.847
Log loss: 0.416
Area under ROC curve: 0.887
Accuracy (cross validation score): 0.766 (+/- 0.081)


In [7]:
rf = RandomForestClassifier(random_state = 999, n_estimators = 100, criterion = 'gini')

y_rf = scores(rf)

Accuracy score: 0.808
Recall: 0.819
Precision: 0.819
F1: 0.819
Log loss: 0.416
Area under ROC curve: 0.889
Accuracy (cross validation score): 0.766 (+/- 0.100)


In [8]:
knn = neighbors.KNeighborsClassifier(n_neighbors = 12, weights = 'uniform')

y_knn = scores(knn)

Accuracy score: 0.803
Recall: 0.772
Precision: 0.845
F1: 0.807
Log loss: 0.403
Area under ROC curve: 0.897
Accuracy (cross validation score): 0.783 (+/- 0.075)


In [9]:
dnn = MLPClassifier(solver = 'lbfgs', hidden_layer_sizes = 100, random_state = 999, activation = 'relu')

y_dnn = scores(dnn)

Accuracy score: 0.808
Recall: 0.819
Precision: 0.819
F1: 0.819
Log loss: 0.430
Area under ROC curve: 0.885
Accuracy (cross validation score): 0.728 (+/- 0.027)


# Prediction

In [10]:
dfCurrentNames = dfCurrent.iloc[:, 0]
dfCurrentPredict = dfCurrent[['Team Wins', 'Overall Seed', 'PTS', 'TRB', 'AST', 'VORP', 'WS', 'All-Star']]

dfCurrent.head()

Unnamed: 0,Player,Tm,G,Team Wins,Overall Seed,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,VORP,BPM,All-Star
0,LaMarcus Aldridge,SAS,81,48,13,33.2,21.3,9.2,2.4,0.5,1.3,0.519,0.238,0.847,9.3,0.167,2.5,1.6,1
1,Karl-Anthony Towns,MIN,77,36,21,33.1,24.4,12.4,3.4,0.9,1.6,0.518,0.4,0.836,10.4,0.197,5.7,6.8,1
2,Klay Thompson,GSW,78,57,3,34.0,21.5,3.8,2.4,1.1,0.6,0.467,0.402,0.816,5.3,0.095,0.8,-0.8,1
3,Ben Simmons,PHI,79,51,7,34.2,16.9,8.8,7.7,1.4,0.8,0.563,0.0,0.6,8.2,0.146,4.1,4.1,1
4,Damian Lillard,POR,80,53,6,35.5,25.8,4.6,6.9,1.1,0.4,0.444,0.369,0.912,12.1,0.205,5.4,5.5,1


In [11]:
def make_pred(model):

    proba = model.predict_proba(dfCurrentPredict)
    pos_prob = proba[:, 1]
    
    combined_list = [[i, j] for i, j in zip(dfCurrentNames, pos_prob)]
    combined_list = sorted(combined_list, key = itemgetter(1), reverse = True)
    
    for i in combined_list:
        print(i)
        
    return pos_prob

In [12]:
svc_prob = make_pred(svc)

['James Harden', 0.9999999712764461]
['Giannis Antetokounmpo', 0.9999959635849688]
['Joel Embiid', 0.9846661772518505]
['Damian Lillard', 0.9845093212567015]
['Kevin Durant', 0.9784098861809469]
['Stephen Curry', 0.971435858447692]
['Paul George', 0.9650866119554697]
['Rudy Gobert', 0.947590300844725]
['Russell Westbrook', 0.9464892476640893]
['Nikola Jokic', 0.9447560620572224]
['Kawhi Leonard', 0.9148493492505126]
['Kyrie Irving', 0.8132831029129209]
['LeBron James', 0.7040294627617077]
['Nikola Vucevic', 0.5161954592406948]
['LaMarcus Aldridge', 0.4863984759479861]
['Anthony Davis', 0.482635681575305]
['Karl-Anthony Towns', 0.44416844454913923]
['Blake Griffin', 0.42054548685553395]
['Ben Simmons', 0.40064477630149276]
['Kemba Walker', 0.3672153524768562]
['Bradley Beal', 0.2636032575246083]
["D'Angelo Russell", 0.18643906460823045]
['Kyle Lowry', 0.1290951767382533]
['Klay Thompson', 0.12554481441770976]
['Khris Middleton', 0.11004472244026219]


In [13]:
rf_prob = make_pred(rf)

['Giannis Antetokounmpo', 1.0]
['James Harden', 0.99]
['Kevin Durant', 0.98]
['Paul George', 0.93]
['Damian Lillard', 0.92]
['Rudy Gobert', 0.88]
['Stephen Curry', 0.87]
['Nikola Jokic', 0.86]
['Kawhi Leonard', 0.77]
['Joel Embiid', 0.75]
['Russell Westbrook', 0.7]
['Kyrie Irving', 0.61]
['LeBron James', 0.6]
['Karl-Anthony Towns', 0.5]
['Nikola Vucevic', 0.45]
['Kemba Walker', 0.44]
['Blake Griffin', 0.42]
['LaMarcus Aldridge', 0.36]
['Anthony Davis', 0.34]
['Ben Simmons', 0.33]
['Bradley Beal', 0.29]
["D'Angelo Russell", 0.22]
['Klay Thompson', 0.2]
['Khris Middleton', 0.15]
['Kyle Lowry', 0.07]


In [14]:
knn_prob = make_pred(knn)

['James Harden', 1.0]
['Kevin Durant', 1.0]
['Giannis Antetokounmpo', 1.0]
['Paul George', 1.0]
['Stephen Curry', 1.0]
['Damian Lillard', 0.9166666666666666]
['Kawhi Leonard', 0.9166666666666666]
['Joel Embiid', 0.9166666666666666]
['Nikola Jokic', 0.8333333333333334]
['Russell Westbrook', 0.75]
['LaMarcus Aldridge', 0.6666666666666666]
['Kyrie Irving', 0.6666666666666666]
['Rudy Gobert', 0.5833333333333334]
['Anthony Davis', 0.5]
['Kemba Walker', 0.5]
['Karl-Anthony Towns', 0.4166666666666667]
['Ben Simmons', 0.4166666666666667]
['Blake Griffin', 0.4166666666666667]
['Nikola Vucevic', 0.3333333333333333]
['Kyle Lowry', 0.3333333333333333]
['Klay Thompson', 0.25]
['Khris Middleton', 0.25]
['LeBron James', 0.16666666666666666]
['Bradley Beal', 0.16666666666666666]
["D'Angelo Russell", 0.16666666666666666]


In [15]:
dnn_prob = make_pred(dnn)

['James Harden', 0.9998332309193404]
['Rudy Gobert', 0.9995252940215662]
['Giannis Antetokounmpo', 0.998646373677687]
['Kevin Durant', 0.9725301266589811]
['Damian Lillard', 0.9666298469151513]
['Paul George', 0.9539410943233264]
['Nikola Jokic', 0.9445866435208123]
['Joel Embiid', 0.9426192667649202]
['Stephen Curry', 0.9424047129514649]
['Kawhi Leonard', 0.9250944681456795]
['Russell Westbrook', 0.8422757805731262]
['Kyrie Irving', 0.7533462945659583]
['LeBron James', 0.7279419694084948]
['Blake Griffin', 0.4319273235065642]
['Ben Simmons', 0.43003995028206754]
['Nikola Vucevic', 0.40897429974091376]
['Kemba Walker', 0.3712629437182315]
['LaMarcus Aldridge', 0.2728607500340925]
['Anthony Davis', 0.21430241275190956]
['Khris Middleton', 0.19074759699002844]
['Karl-Anthony Towns', 0.1780705702410632]
['Kyle Lowry', 0.14935142955523745]
['Bradley Beal', 0.1489040897749628]
['Klay Thompson', 0.13015307629722225]
["D'Angelo Russell", 0.10294923514344725]


In [16]:
avg_prob = []

for i, j, k, l in zip(svc_prob, rf_prob, knn_prob, dnn_prob):
    avg_prob.append((i + j + k + l) / 4)
    
avg_list = [[i, j] for i, j in zip(dfCurrentNames, avg_prob)]
avg_list = sorted(avg_list, key = itemgetter(1), reverse = True)

for i in avg_list:
    print(i)

['Giannis Antetokounmpo', 0.999660584315664]
['James Harden', 0.9974583005489466]
['Kevin Durant', 0.982735003209982]
['Paul George', 0.9622569265696991]
['Damian Lillard', 0.9469514587096299]
['Stephen Curry', 0.9459601428497892]
['Joel Embiid', 0.8984880276708593]
['Nikola Jokic', 0.895669009727842]
['Kawhi Leonard', 0.8816526210157147]
['Rudy Gobert', 0.8526122320499061]
['Russell Westbrook', 0.8096912570593038]
['Kyrie Irving', 0.7108240160363865]
['LeBron James', 0.5496595247092173]
['LaMarcus Aldridge', 0.4464814731621863]
['Nikola Vucevic', 0.42712577307873545]
['Blake Griffin', 0.4222848692571912]
['Kemba Walker', 0.4196195740487719]
['Ben Simmons', 0.39433784831255675]
['Karl-Anthony Towns', 0.3847264203642173]
['Anthony Davis', 0.38423452358180366]
['Bradley Beal', 0.21729350349155943]
['Klay Thompson', 0.176424472678733]
['Khris Middleton', 0.17519807985757266]
['Kyle Lowry', 0.170444984906706]
["D'Angelo Russell", 0.16901374160458607]
