## Import Dependencies 

In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import lxml.html as lh
from sklearn.decomposition import PCA
from sklearn import preprocessing
%matplotlib inline
plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', None)
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
from operator import itemgetter
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


# Get Data

List of NBA Teams

In [2]:
TEAMS = ['ATL', 'BOS' ,'CHA' ,'CHI', 'CLE', 'DAL' ,'DEN' ,'DET' ,'GS', 'HOU' ,'IND', 'LAC',
 'LAL', 'MEM', 'MIA', 'MIL' ,'MIN' ,'NJ', 'NY', 'ORL' ,'PHI' ,'PHX', 'POR', 'SAC',
 'SA', 'SEA', 'TOR' ,'UTAH' ,'WSH']


##Team Season Data

In [4]:
d = os.path.join("/content/team_season.txt")
team_season = np.loadtxt(d, delimiter=',', dtype='str')
# convert team_season into a csv file
# get header
header = team_season[0]
# remove first row, header
team_season = team_season[1:]
# convert it to an CSV file
pd.DataFrame(team_season).to_csv("team_stats.csv", header = header,index=False)

In [5]:
col_list = ["team", "year",'o_ftm','o_reb' ,'o_asts' ,'o_pf' ,'o_stl' ,'o_to' ,'o_blk', 'o_3pm','o_pts',
 'd_ftm' ,'d_reb', 'd_asts' ,'d_pf',
 'd_stl', 'd_to', 'd_blk','d_3pm','d_pts','won','lost']

team_stats = pd.read_csv("team_stats.csv",usecols=col_list)

#get team stats for 2001 
stats_df = team_stats.loc[team_stats["year"] == 2001]
stats_df.head()

Unnamed: 0,team,year,o_ftm,o_reb,o_asts,o_pf,o_stl,o_to,o_blk,o_3pm,o_pts,d_ftm,d_reb,d_asts,d_pf,d_stl,d_to,d_blk,d_3pm,d_pts,won,lost
1070,ATL,2001,1486,3400,1656,1702,667,1203,350,423,7711,1456,3525,1856,1647,703,1177,508,472,8058,33,49
1071,BOS,2001,1498,3461,1722,1775,793,1082,293,699,7901,1612,3760,1802,1733,623,1284,474,514,7720,49,33
1072,CHA,2001,1568,3564,1759,1747,653,1108,456,346,7700,1488,3362,1639,1702,625,1074,426,433,7621,44,38
1073,CHI,2001,1413,3283,1817,1834,633,1169,361,300,7335,1560,3497,2066,1716,681,1122,399,421,8035,21,61
1074,CLE,2001,1529,3451,1891,1752,572,1129,470,387,7812,1506,3310,1989,1638,602,1018,459,473,8085,29,53


In [91]:
TeamStats_2001 = np.array(stats_df)
#skip first 2 columns (team name and year)
team_stats = TeamStats_2001[:,2 :] 

## Game Data

In [136]:
col_list = ['home_team','home_points','away_team','away_points']

games = pd.read_csv("nba_games_2002_2003.csv",usecols=col_list)
games = games.loc[games["home_team"] != 'NO']
games.head()
games = games.loc[games["away_team"] != 'NO']
games.head()

Unnamed: 0,home_team,home_points,away_team,away_points
0,LAL,81,SA,87
1,ORL,95,PHI,88
2,SAC,94,CLE,67
3,BOS,96,CHI,99
5,DET,86,NY,77


In [137]:
# get outcome results from the game data ie. 0 if away_points are higher than home_points and 1 if home_points are higher than away_points
game_data =np.array(games)
game = []
for i in range(len(game_data)):
  x = []
  x.append(game_data[i][0])
  x.append(game_data[i][2])
  outcome = game_data[i][1]-game_data[i][3]
  if outcome < 0:
    x.append(0)
  else:
    x.append(1)
  game.append(x)
game_data = np.array(game)

# Create the Feature Vectors

In [149]:
def createDataSet(game_data,usePCA,isNormalized):
  pca = PCA(n_components=18)
  normalizer = preprocessing.MinMaxScaler()
  data = []
  for i in range(len(game_data)):

    
    # find location home_team stats
    index = TEAMS.index(game_data[i][0])
    # get home_team stats
    home_side = team_stats[index]
 
    # get away_team stats
    index = TEAMS.index(game_data[i][1])
    away_side =  team_stats[index]
    
  
    # create feature vector concatenate home_team stats and away_team stats
    feature_vector = np.concatenate((home_side, away_side), axis=None)

    data.append(feature_vector)

  #NORMALIZE
  if (isNormalized):
    data = normalizer.fit_transform(data)

  #reduce feature vector with PCA
  if (usePCA):
    data = pca.fit_transform(data)

  ndata = []

  for z in range(len(game_data)):
    result = []
    result.append(game_data[z][2])

    # create feature vector by appending the outcome labled result of the game  
    feature_vector = np.concatenate((data[z],result), axis=None)
    ndata.append(feature_vector)

  data = np.array(ndata)
  print("feature vector size", data.shape[1])
  
  return data

# ML Techniques

Scoring matrix 

In [139]:
def scores(model):
    
    model.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    
    print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_pred))
    print("Recall: %.3f" % metrics.recall_score(ytest, y_pred))
    print("Precision: %.3f" % metrics.precision_score(ytest, y_pred))
    print("F1: %.3f" % metrics.f1_score(ytest, y_pred))
    
    proba = model.predict_proba(xtest)
    print("Log loss: %.3f" % metrics.log_loss(ytest, proba))

    pos_prob = proba[:, 1]
    print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, pos_prob))
    
    cv = cross_val_score(model, xtest, ytest, cv = 10, scoring = 'accuracy')
    print("Accuracy (cross validation score): %0.3f (+/- %0.3f)" % (cv.mean(), cv.std() * 2))

    print("confusion matrix")
    print( metrics.confusion_matrix(ytest, y_pred) )
    
    return y_pred

## Training and Testing our Models

Get the different Data Sets

In [None]:
#Original data
dA = createDataSet(game_data,False,False)
#PCA with Normalized Data
dB = createDataSet(game_data,True,True)
#PCA Data
dC = createDataSet(game_data,True,False)
#Normalized Data
dD = createDataSet(game_data,False,True)

**Results using Original Data (NO PCA or Normalization)**

In [152]:
trainA, testA = train_test_split( dA, test_size = 0.25,random_state = 20)

xtrain = trainA[:, :-1].astype(float)# for all but last column
ytrain = trainA[:, -1].astype(int) # for last column (result label )

xtest = testA[:, :-1].astype(float) 
ytest = testA[:, -1].astype(int) 

print("Training set size: %.0f" % len(xtrain))
print("Testing set size: %.0f" % len(xtest))
print()

print("SVM STATS")
#svc = SVC(probability=True)
svc = SVC(kernel='rbf',C=100,probability=True)
y_svc = scores(svc)
print()

print("knn STATS")
knn = neighbors.KNeighborsClassifier(n_neighbors = 12, weights = 'uniform')
y_knn = scores(knn)
print()

print("logreg STATS")
logreg = LogisticRegression(max_iter=10000)
y_log = scores(logreg)

Training set size: 830
Testing set size: 277

SVM STATS
Accuracy score: 0.693
Recall: 0.921
Precision: 0.697
F1: 0.793
Log loss: 0.592
Area under ROC curve: 0.714
Accuracy (cross validation score): 0.650 (+/- 0.103)
confusion matrix
[[ 29  71]
 [ 14 163]]

knn STATS
Accuracy score: 0.708
Recall: 0.814
Precision: 0.750
F1: 0.780
Log loss: 0.808
Area under ROC curve: 0.718
Accuracy (cross validation score): 0.621 (+/- 0.154)
confusion matrix
[[ 52  48]
 [ 33 144]]

logreg STATS
Accuracy score: 0.668
Recall: 0.785
Precision: 0.720
F1: 0.751
Log loss: 0.602
Area under ROC curve: 0.689
Accuracy (cross validation score): 0.635 (+/- 0.177)
confusion matrix
[[ 46  54]
 [ 38 139]]


**Results using PCA Data**

In [154]:
trainA, testA = train_test_split( dC, test_size = 0.25,random_state = 20)

xtrain = trainA[:, :-1].astype(float)# for all but last column
ytrain = trainA[:, -1].astype(int) # for last column (result label )

xtest = testA[:, :-1].astype(float) 
ytest = testA[:, -1].astype(int) 



print("SVM STATS")
#svc = SVC(probability=True)
svc = SVC(kernel='rbf',probability=True)
y_svc = scores(svc)
print()

print("knn STATS")
knn = neighbors.KNeighborsClassifier(n_neighbors = 18, weights = 'uniform')
y_knn = scores(knn)
print()

print("logreg STATS")
logreg = LogisticRegression(max_iter=10000)
y_log = scores(logreg)

SVM STATS
Accuracy score: 0.704
Recall: 0.876
Precision: 0.721
F1: 0.791
Log loss: 0.591
Area under ROC curve: 0.711
Accuracy (cross validation score): 0.660 (+/- 0.119)
confusion matrix
[[ 40  60]
 [ 22 155]]

knn STATS
Accuracy score: 0.690
Recall: 0.831
Precision: 0.724
F1: 0.774
Log loss: 0.809
Area under ROC curve: 0.721
Accuracy (cross validation score): 0.642 (+/- 0.161)
confusion matrix
[[ 44  56]
 [ 30 147]]

logreg STATS
Accuracy score: 0.715
Recall: 0.898
Precision: 0.723
F1: 0.801
Log loss: 0.589
Area under ROC curve: 0.707
Accuracy (cross validation score): 0.690 (+/- 0.126)
confusion matrix
[[ 39  61]
 [ 18 159]]


**Results using Normalized Data**

In [155]:
trainA, testA = train_test_split( dD, test_size = 0.25,random_state = 20)

xtrain = trainA[:, :-1].astype(float)# for all but last column
ytrain = trainA[:, -1].astype(int) # for last column (result label )

xtest = testA[:, :-1].astype(float) 
ytest = testA[:, -1].astype(int) 


print("SVM STATS")
#svc = SVC(probability=True)
svc = SVC(kernel='rbf',C=100,probability=True)
y_svc = scores(svc)
print()

print("knn STATS")
knn = neighbors.KNeighborsClassifier()
y_knn = scores(knn)
print()

print("logreg STATS")
logreg = LogisticRegression(max_iter=10000)
y_log = scores(logreg)

SVM STATS
Accuracy score: 0.599
Recall: 0.695
Precision: 0.683
F1: 0.689
Log loss: 0.633
Area under ROC curve: 0.633
Accuracy (cross validation score): 0.563 (+/- 0.186)
confusion matrix
[[ 43  57]
 [ 54 123]]

knn STATS
Accuracy score: 0.679
Recall: 0.825
Precision: 0.716
F1: 0.766
Log loss: 1.893
Area under ROC curve: 0.670
Accuracy (cross validation score): 0.624 (+/- 0.152)
confusion matrix
[[ 42  58]
 [ 31 146]]

logreg STATS
Accuracy score: 0.704
Recall: 0.870
Precision: 0.723
F1: 0.790
Log loss: 0.592
Area under ROC curve: 0.702
Accuracy (cross validation score): 0.690 (+/- 0.140)
confusion matrix
[[ 41  59]
 [ 23 154]]


**Results using PCA with Normalized Data**

In [None]:
trainA, testA = train_test_split( dB, test_size = 0.25,random_state = 20)

xtrain = trainA[:, :-1].astype(float)# for all but last column
ytrain = trainA[:, -1].astype(int) # for last column (result label )

xtest = testA[:, :-1].astype(float) 
ytest = testA[:, -1].astype(int) 



print("SVM STATS")
#svc = SVC(probability=True)
svc = SVC(kernel='rbf',probability=True)
y_svc = scores(svc)
print()

print("knn STATS")
knn = neighbors.KNeighborsClassifier(n_neighbors = 12, weights = 'uniform')
y_knn = scores(knn)
print()

print("logreg STATS")
logreg = LogisticRegression(max_iter=10000)
y_log = scores(logreg)

SVM STATS
Accuracy score: 0.697
Recall: 0.864
Precision: 0.718
F1: 0.785
Log loss: 0.597
Area under ROC curve: 0.697
Accuracy (cross validation score): 0.621 (+/- 0.114)
confusion matrix
[[ 40  60]
 [ 24 153]]

knn STATS
Accuracy score: 0.661
Recall: 0.768
Precision: 0.720
F1: 0.743
Log loss: 0.943
Area under ROC curve: 0.698
Accuracy (cross validation score): 0.606 (+/- 0.145)
confusion matrix
[[ 47  53]
 [ 41 136]]

logreg STATS
Accuracy score: 0.675
Recall: 0.859
Precision: 0.700
F1: 0.772
Log loss: 0.596
Area under ROC curve: 0.698
Accuracy (cross validation score): 0.668 (+/- 0.126)
confusion matrix
[[ 35  65]
 [ 25 152]]
