# Predicting NBA Success Using Machine Learning
## Created by: Emanuel Azcona
## Course: Introduction to Machine Learning & Senior Thesis

Import the required libraries for handling dataframes and arrays, plotting, and model fitting, along with the helper function that generates the dataframes.

In [1]:
from gen_data import gen_data

import numpy as np
import pandas as pd

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
# Plotly credentials are read from the environment so the API key is not stored in the notebook.
# PLOTLY_API_KEY must be set before running; PLOTLY_USERNAME is optional.
# NOTE(review): the key that used to be hard-coded on this line has been exposed and should be revoked.
import os
py.sign_in(username=os.environ.get('PLOTLY_USERNAME', 'emanuelazcona'),
           api_key=os.environ['PLOTLY_API_KEY'])

from copy import deepcopy

from operator import itemgetter

from collections import OrderedDict

from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn import linear_model, preprocessing
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from statsmodels.stats.weightstats import ztest

## Create Cumulative Player Databases

First, we use our user-created function to develop two separate dictionaries:

- A dictionary of players by NBA Season
- A dictionary of teams by NBA Season

Each dictionary entry (in both dictionaries) is a Pandas dataframe

In [2]:
# first and last NBA season (by start year, counted from 2000) that we'd like to store
start, end = 1, 16

# gen_data returns two dictionaries:
# 1. players: one dataframe of all NBA players per season,
#       keyed by the season's start year, ex: 2001 = 1, 2003 = 3
# 2. teams: the per-season team dataframes, keyed the same way
players, teams = gen_data(start, end)

# the season keys we actually got back, as a list
play_years = list(players)

## Create Dictionary of NBA Seasons with NBA teams

Next, we create a dictionary of NBA seasons with each key referencing a dictionary of NBA teams for that respective season. Each entry of the teams dictionary has a dataframe of players on that team, in that year essentially.

- NBA Season
    - Team
        - Players
        
Example:

- '15-16'
    - Oklahoma City Thunder (okl)
        - Total Team Stats.
        - Players & Player Stats.
            - durant,kevin
                - GP
                - FGM
                - etc..
            - westbrook,russell
                - GP
                - FGM
                - etc..
            - etc..

In [6]:
# create basketball team class which stores players for team and total team statistics
class basketBallTeam:
    """Holds one team's player table and its aggregated team statistics."""

    def __init__(self, players):
        # player table for this team, stored exactly as passed in
        self.players = players
        # aggregated team statistics; starts empty and is expected to be assigned
        # by the caller once computed. It must live on the instance: as a plain
        # local it would be discarded when __init__ returns.
        self.totalStats = []

# function for computing the overall team statistics of a team
def computeTeamStats(currTeam, i, seasonTeams=None):
    """Collapse one team's player rows into a single list of team statistics.

    Parameters
    ----------
    currTeam : 2-D NumPy array with one row per player. Columns are read by
        position (5-17, 19 and 23). The stat names used below follow the
        order of the returned list; presumably they match the player table's
        column layout -- TODO confirm against gen_data.
    i : row position of this team inside the season's team table.
    seasonTeams : optional team table for the season (anything np.array()
        accepts). Column 21 of row ``i`` is read as the playoff indicator.
        When omitted, the module-level ``teams[year]`` is used, so existing
        two-argument calls behave as before.

    Returns
    -------
    list
        [playoff flag, FGM, FGM/FGA, TM, TM/TA, FTM, FTM/FTA, OR, OR/TR,
         AS, ST, TO, BK, PF, PTS, MVP]
    """
    if seasonTeams is None:
        # legacy behaviour: fall back to the notebook's global season table
        seasonTeams = teams[year]

    def colSum(col):
        # total of one stat column over every player on the team
        return np.sum(currTeam[:, col])

    FGM, FGA = colSum(5), colSum(6)
    TM, TA = colSum(7), colSum(8)
    FTM, FTA = colSum(9), colSum(10)
    OR, TR = colSum(11), colSum(12)
    AS = colSum(13)
    ST = colSum(14)
    TO = colSum(15)
    BK = colSum(16)
    PF = colSum(17)
    PTS = colSum(19)

    # the team is flagged 1 as soon as any player row carries a true value in column 23
    MVP = 1 if True in currTeam[:, 23] else 0

    playoff = bool(np.array(seasonTeams)[i, 21])

    return [playoff, FGM,
            float(FGM) / FGA, TM, float(TM) / TA, FTM,
            float(FTM) / FTA, OR, float(OR) / TR,
            AS, ST, TO, BK, PF, PTS, MVP]

# create dictionary of NBA teams
# column labels for the one-row team statistics frame, in the order computeTeamStats returns them
columns = ['Playoff', 'FGM', 'FG%',
           '3M', '3PT%', 'FTM',
           'FT%', 'OR', 'REB%',
           'AS', 'ST', 'TO',
           'BK', 'PF', 'PTS', 'MVP']

# every roster frame is labelled with the first season's player columns
playerColumns = players[play_years[0]].columns.tolist()

nbaSeasons = {}
# NOTE: the loop variable has to stay named `year` -- computeTeamStats reads
# `teams[year]` from module scope when called with two arguments.
for year in play_years:
    teamsDict = {}
    # convert the season's player table once, not once per team
    currPlaySeason = np.array( players[year] )
    for i, team in enumerate(list( teams[year]['key'] )):
        # keep the player rows whose second column equals this team's key
        currTeam = currPlaySeason[currPlaySeason[:, 1] == team]

        teamsDict[team] = basketBallTeam(pd.DataFrame(currTeam, columns = playerColumns))

        # i is the team's row position in teams[year]
        currTeamStats = computeTeamStats(currTeam, i)
        teamsDict[team].totalStats = pd.DataFrame(np.array([currTeamStats]), columns = columns)

    nbaSeasons[year] = teamsDict

## Create Pandas DataFrames of All NBA Teams & All NBA Players

Parsing through the nbaSeasons dictionary, we create a dataframe of every NBA player that's ever played in any NBA season. The same process is done for the teams as well.

In [4]:
# Gather every team's one-row stats array and roster array first, then stack each
# list in a single call (stacking inside the loop re-copies the growing array on
# every iteration).
teamBlocks = []
playerBlocks = []
for year in play_years:
    for team in list( teams[year]['key'] ):
        teamBlocks.append( np.array( nbaSeasons[year][team].totalStats ) )
        playerBlocks.append( np.array( nbaSeasons[year][team].players ) )

allTeamHist = np.vstack(teamBlocks)
allPlayHist = np.vstack(playerBlocks)

# label the stacked arrays and drop any row that contains a missing value
columnsTeam = columns
allTeamHist = pd.DataFrame(allTeamHist, columns = columnsTeam).dropna()
allPlayHist = pd.DataFrame(allPlayHist, columns = players[play_years[0]].columns.tolist()).dropna()

## Analyze NBA Team Data
### Task I: Overall NBA Team Statistics

Let's take a deeper look into the NBA Team data and analyze relationships across seasons. First, we compile a dataframe of total integer stats and average fractional stats for each individual NBA team's franchise history.

In [5]:
# one column vector of team keys per season, stacked once into an (N, 1) array
keyColumns = []
for year in play_years:
    keyColumns.append( np.transpose( np.array( [teams[year]['key']] ) ) )
teamKeys = np.vstack(keyColumns)

# work on a deep copy so allTeamHist itself is never modified; rows are indexed by team key
X = pd.DataFrame(np.array(deepcopy(allTeamHist)), columns = columns, index = teamKeys.ravel())

# the playoff and MVP indicators are not plotted as team statistics
for dropped in ('Playoff', 'MVP'):
    del X[dropped]

In [6]:
# function for computing overall sum of stats for each team (not by season but for entire NBA track starting in 2001)
def computeOverallTeamStats(X, team, statNames):
    """Sum each requested stat over every row of X labelled `team`.

    X is a dataframe indexed by team key, `team` is the key to select and
    `statNames` lists the columns to total. Returns a 1-D NumPy array with
    one sum per name, in the order given.
    """
    teamRows = X[X.index == team]
    return np.array([teamRows[stat].sum(axis = 0) for stat in statNames])

# every team column except the first and last ('Playoff' and 'MVP')
allStatNames = columnsTeam[1:-1]

# distinct team keys in order of first appearance in X's index
teamsAlreadyChecked = list(OrderedDict.fromkeys(X.index))

# one row of summed stats per team, stacked in a single call so the result is
# always 2-D, even when there is only one team
# NOTE(review): the percentage columns are summed across seasons here as well, not averaged
overTeamStats = np.vstack(
    [computeOverallTeamStats(X, team, allStatNames) for team in teamsAlreadyChecked]
)

overTeamStatsDF = pd.DataFrame(overTeamStats, columns = allStatNames, index = teamsAlreadyChecked)

Next, we plot all of the integer stats to analyze each team's total sum of franchise stats. Using the Python library plotly, we obtain an organized (and visually aesthetic) multi-bar graph.

- Integer Stats
    - Field Goals Made (FGM)
    - Three Pointers Made (3M)
    - Free Throws Made (FTM)
    - Offensive Rebounds (OR)
    - Assists (AS)
    - Steals (ST)
    - Turnovers (TO)
    - Blocks (BK)
    - Personal Fouls (PF)
    - Points Made (PTS)

In [7]:
# obtain integer stat labels: every column that is not one of the percentage stats
removeFields = ['FG%', '3PT%', 'FT%', 'REB%']
intStats = [stat for stat in overTeamStatsDF.columns.tolist() if stat not in removeFields]

# extract all integer data and create multicolumn barcharts of overall integer stats
data = []
for team in list(overTeamStatsDF.index):
    # Pick the integer columns by name. Removing entries with list.remove(value)
    # deletes the first element *equal* to that value, which can drop the wrong
    # stat whenever an earlier entry holds the same number.
    teamStats = np.array(overTeamStatsDF[overTeamStatsDF.index == team][intStats]).ravel().tolist()

    trace = go.Bar(
        x = intStats,
        y = teamStats,
        name = team
    )
    data.append(trace)

layout = go.Layout(
    barmode = 'group',
    title = 'Overall Total Sum of Team Integer Statistics (2001-2016)',
    xaxis = dict(
        title = 'Integer Statistics'
    ),
    yaxis = dict(
        title = 'Integer Score'
    )
)

fig = go.Figure(data = data, layout = layout)
fig.layout.width = 1500  # width in pixels
fig.layout.height = 800
py.iplot(fig, filename = 'overall team integer features')

Afterwards, we plot the fractional stats in the same manner as the integer stats.

- Fractional Stats
    - Field Goal Percentage (FG%)
    - Three Point Percentage (3PT%)
    - Free Throw Percentage (FT%)
    - Offensive Rebound Percentage (REB%)

In [8]:
# fractional stat labels: start from every column and strip out the integer stats
removeFields = ['FGM', '3M', 'FTM', 'OR', 'AS', 'ST', 'TO', 'BK', 'PF', 'PTS']
fracStats = overTeamStatsDF.columns.tolist()
for field in removeFields:
    fracStats.remove(field)

# positions of the fractional entries inside a flattened team row
fracPositions = (1, 3, 5, 7)

# one bar trace per team holding only its fractional values
data = []
for team in list(overTeamStatsDF.index):
    teamRow = np.array(overTeamStatsDF[overTeamStatsDF.index == team]).ravel().tolist()
    teamStats = tuple(teamRow[pos] for pos in fracPositions)
    data.append(go.Bar(x = fracStats, y = teamStats, name = team))

layout = go.Layout(
    barmode = 'group',
    title = 'Overall Average of Team Fraction Statistics (2001-2016)',
    xaxis = {'title': 'Fractional Statistics'},
    yaxis = {'title': 'Percentage Score'}
)

fig = go.Figure(data = data, layout = layout)
# figure size in pixels
fig.layout.width = 1500
fig.layout.height = 750
py.iplot(fig, filename = 'overall team fractional features')

## Analyze NBA Team Data
### Task II: Analyze Individual NBA Team Statistics By Year
For our next analysis, we want to observe the behavior of NBA teams through the different seasons in our dataset (2001-2002 through 2015-2016). First, we add the New Orleans teams to the first three seasons of our NBA dataset, with stats. of 0 across the board for all three years, since the New Orleans teams were not introduced until the 2004-2005 NBA Season.

In [9]:
# take a deep copy of X as a plain NumPy array so X itself stays untouched
Xarr = np.array(deepcopy(X))

# number of leading seasons for which the NOrleans team gets an all-zero row
norRows = 3

# put the all-zero rows on top of the existing team rows
zeroRows = np.array([[0] * Xarr.shape[1]] * norRows)
Xarr = np.vstack((zeroRows, Xarr))

# back to a dataframe, labelling the new rows 'nor' ahead of the original index
XarrDF = pd.DataFrame(Xarr, columns = X.columns.tolist(), index = ['nor'] * norRows + X.index.tolist())

#### Sub-Task I: Team Field Goals per Season
Next, we construct our bar graph to analyze the performance of all NBA teams throughout each of the NBA seasons in our dataset. The first feature we analyze is the number of field goals a team makes in each NBA season.

In [10]:
# build the season labels, one string per season key, in the form: 2001-2002
teamYears = []
for year in play_years:
    teamYears.append('{0}-{1}'.format(year + 2000, year + 2001))

In [11]:
# Plot field goals

# One bar trace per team: its 'FGM' values plotted against the season labels.
# `year` is not assigned in this cell; it is read from the notebook's global state.
data = [
    go.Bar(
        x = teamYears,
        y = np.array(XarrDF[XarrDF.index == team]['FGM']).tolist(),
        name = team
    )
    for team in teams[year]['key']
]

# grouped bars, sized in pixels
layout = go.Layout(
    barmode = 'group',
    title = 'Field Goals Made by Teams Per Season (2001-2016)',
    xaxis = {'title': 'NBA Season'},
    yaxis = {'title': 'Number of Field Goals Made in Season'},
    width = 2500,
    height = 400
)
fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'field goals made per season')

#### Sub-Task II: Team Three Pointers per Season
The following feature we analyze is the number of three-pointers a team scores in a season.

In [12]:
# Plot threes made

# One bar trace per team: its '3M' values plotted against the season labels.
# `year` is not assigned in this cell; it is read from the notebook's global state.
data = [
    go.Bar(
        x = teamYears,
        y = np.array(XarrDF[XarrDF.index == team]['3M']).tolist(),
        name = team
    )
    for team in teams[year]['key']
]

# grouped bars, sized in pixels
layout = go.Layout(
    barmode = 'group',
    title = 'Threes Made by Teams Per Season (2001-2016)',
    xaxis = {'title': 'NBA Season'},
    yaxis = {'title': 'Threes Made in a Season'},
    width = 2500,
    height = 550
)
fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'threes made per season')

#### Sub-Task III: Team Assists per Season
Afterwards, we look closely at the number of assists per NBA season for each NBA team.

In [13]:
# Plot assists made per team

# One bar trace per team: its 'AS' values plotted against the season labels.
# `year` is not assigned in this cell; it is read from the notebook's global state.
data = [
    go.Bar(
        x = teamYears,
        y = np.array(XarrDF[XarrDF.index == team]['AS']).tolist(),
        name = team
    )
    for team in teams[year]['key']
]

# grouped bars, sized in pixels
layout = go.Layout(
    barmode = 'group',
    title = 'Assists per Teams per Season (2001-2016)',
    xaxis = {'title': 'NBA Season'},
    yaxis = {'title': 'Assists in a Season'},
    width = 2500,
    height = 550
)
fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'assists per season')

#### Sub-Task IV: Points Scored per Season
The next feature we analyze is the number of points a team scores in each NBA season.

In [14]:
# Plot points scored per team

# One bar trace per team: its 'PTS' values plotted against the season labels.
# `year` is not assigned in this cell; it is read from the notebook's global state.
data = [
    go.Bar(
        x = teamYears,
        y = np.array(XarrDF[XarrDF.index == team]['PTS']).tolist(),
        name = team
    )
    for team in teams[year]['key']
]

# grouped bars, sized in pixels
layout = go.Layout(
    barmode = 'group',
    title = 'Points Made by Teams Per Season (2001-2016)',
    xaxis = {'title': 'NBA Season'},
    yaxis = {'title': 'Points Made in a Season'},
    width = 2500,
    height = 550
)
fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'points made per season')

#### Sub-Task V: Team Free-Throws Made per Season
The next feature we analyze is the number of free-throws a team scores in each NBA season.

In [15]:
# Plot free throws made per team

# One bar trace per team: its 'FTM' values plotted against the season labels.
# `year` is not assigned in this cell; it is read from the notebook's global state.
data = [
    go.Bar(
        x = teamYears,
        y = np.array(XarrDF[XarrDF.index == team]['FTM']).tolist(),
        name = team
    )
    for team in teams[year]['key']
]

# grouped bars, sized in pixels
layout = go.Layout(
    barmode = 'group',
    title = 'Free-Throws Made by Teams Per Season (2001-2016)',
    xaxis = {'title': 'NBA Season'},
    yaxis = {'title': 'Free-Throws Made in a Season'},
    width = 2500,
    height = 550
)
fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'free throws made per season')

#### Sub-Task VI: Team Steals per Season
The next feature we analyze is the number of steals teams commit per season.

In [16]:
# Plot steals made per team

# One bar trace per team: its 'ST' values plotted against the season labels.
# `year` is not assigned in this cell; it is read from the notebook's global state.
data = [
    go.Bar(
        x = teamYears,
        y = np.array(XarrDF[XarrDF.index == team]['ST']).tolist(),
        name = team
    )
    for team in teams[year]['key']
]

# grouped bars, sized in pixels
layout = go.Layout(
    barmode = 'group',
    title = 'Steals Made by Teams Per Season (2001-2016)',
    xaxis = {'title': 'NBA Season'},
    yaxis = {'title': 'Steals Made in a Season'},
    width = 2500,
    height = 550
)
fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'steals made per season')

# Predicting NBA Playoff Teams
## Sub-process I: Determining the Statistical Significance of Team Stats to Playoff Status
For the first prediction, we will focus on predicting whether an NBA team is capable of becoming a Playoff contender.

In this initial step, we focus on determining the statistical significance of features to NBA-Playoff contention. First, we look up the row index of every NBA team that made the NBA Playoffs in its respective season and of every team that did not, and store these indices in separate NumPy arrays.

In [17]:
# record the row positions of playoff teams and of non-playoff teams

# pull the playoff column out once and compare it against each outcome
playoffFlags = np.array(allTeamHist['Playoff'])
Iplay = np.where( playoffFlags == True )[0]
Inon = np.where( playoffFlags == False )[0]

Next, we create a deep copy of the allTeamHist dataframe, but remove irrelevant features such as:
- Playoff Indication (Playoff)

because this is what we're trying to predict. In other words, we remove the target variable.

In [18]:
# one column vector of team names per season, stacked once into an (N, 1) array
nameColumns = []
for year in play_years:
    nameColumns.append( np.transpose( np.array( [teams[year]['team']] ) ) )
teamNames = np.vstack(nameColumns)

# put the team name in front of a deep copy of the team statistics
X = np.hstack( ( teamNames, np.array(deepcopy(allTeamHist)) ) )
X = pd.DataFrame(X, columns = ["Team"] + columns)

In [19]:
# only tabulate the first 6 rows of teams to save computation and plotting time (takes very long to plot ALL teams)

firstTeams = X.head(6)
allNBATeamsTable = FF.create_table(firstTeams)

# size (in pixels), margins and title of the table figure
tableLayout = allNBATeamsTable.layout
tableLayout.width = 2500
tableLayout.margin.update(dict(t = 75, l = 50))
tableLayout.update(dict(title = 'All NBA Team Statistics (First 6)'))

py.iplot(allNBATeamsTable, filename = 'All NBA Team Stats')

In [20]:
# remove the team label and the target column in place, leaving only candidate features
for dropped in ('Team', 'Playoff'):
    del X[dropped]

Then, we use the ztest function from the Python statsmodels.stats.weightstats module to perform a Z-test on our features. We do this to determine the significance of each feature in determining the NBA-Playoff contenders.

In [21]:
# run a Z-test per feature, comparing playoff rows against non-playoff rows

# NOTE: alternative='larger' makes this a one-sided test, so a feature only scores
# as significant when the playoff group's values are higher
statDF = []
for label in X.columns.tolist():
    featureValues = np.array(X[label])
    stat, pval = ztest(featureValues[Iplay], featureValues[Inon], value = 0, alternative = 'larger')
    statDF.append( [label, stat, pval] )

# tabulate the results, smallest p-value first
statDF = pd.DataFrame(statDF, columns = ['Feature', 'statistic', 'p-val']).sort_values(['p-val'])

In [22]:
# show the Z-test results as a plotly table
allTeamsZtestTable = FF.create_table(statDF)

# size (in pixels), margins and title of the table figure
ztestLayout = allTeamsZtestTable.layout
ztestLayout.width = 600
ztestLayout.margin.update(dict(t = 75, l = 50))
ztestLayout.update(dict(title = 'Features Statistical Significance'))

py.iplot(allTeamsZtestTable, filename = 'Feat Stat Sig')

Our analysis shows that features such as:
- Offensive Rebounding Percentage (REB%)
- Personal Fouls (PF)
- Turnovers (TO)
- Offensive Rebounds (OR)
- Free Throw Percentage (FT%)

do not have a real significance in the determination of NBA-Playoff contention. Therefore, we remove these features from our feature matrix, since their p-values are too large for us to reject the null hypothesis.

In [23]:
# drop, in place, the features the Z-test did not flag as significant
for dropped in ('REB%', 'PF', 'TO', 'OR', 'FT%'):
    del X[dropped]

## Sub-process II: Developing Training Data
Now, we prepare a vector of boolean values indicating whether a team was an NBA-Playoff contender or not. We also cast the dataframe of predictors we have to a NumPy array type.

In [24]:
# targets: the playoff indicator of every team row; features: X standardized column by column
y = list(deepcopy(allTeamHist['Playoff']))
Xs = preprocessing.scale(np.array(X))

# 70/30 split taken in the existing row order (no shuffling)
nsamp = len(Xs)
ntr = int(0.7 * nsamp)
nts = nsamp - ntr

Xtr, Xts = Xs[:ntr], Xs[ntr:]
ytr, yts = y[:ntr], y[ntr:]


Data with input dtype object was converted to float64 by the scale function.



## Sub-process III: Logistic Regression Classifier
### Task I: Train Classifier on Entire Dataset
Next, we develop a logistic classifier model using sklearn to predict NBA Playoff Contenders. First, we train our logistic regression classifier on the team data gathered above. Using this, we test out a range of values for the parameter $C$ (in sklearn, the inverse of the regularization strength) to determine which one is best.

In [25]:
# candidate values of C, log-spaced from 1e-2 to 1e6
Cvals = np.logspace(-2,6,15)

# error rate of each model for every C value tried
err_l1 = []
err_l2 = []
err = []

# best error seen so far per model, starting from an error rate of 100%
l1_err_min = 1
l2_err_min = 1
err_min = 1

# For each C value: fit the three classifiers on the training rows and measure
# how often their predictions disagree with the training labels.
# NOTE: the error is computed on the same rows the models were fitted on (Xtr/ytr).
for C in Cvals:

    # The solver is pinned to liblinear, which handles both the l1 and the l2
    # penalty; scikit-learn releases whose default solver is lbfgs reject penalty='l1'.
    logregl1 = linear_model.LogisticRegression(penalty = 'l1', C = C, solver = 'liblinear')
    logregl2 = linear_model.LogisticRegression(penalty = 'l2', C = C, solver = 'liblinear')
    # NOTE(review): no penalty is passed here, so the library default applies. The
    # recorded output shows exactly the same errors as the l2 model, i.e. this
    # model is not actually unregularized.
    logreg = linear_model.LogisticRegression(C = C, solver = 'liblinear')

    # train model with training data
    logregl1.fit(Xtr, ytr)
    logregl2.fit(Xtr, ytr)
    logreg.fit(Xtr, ytr)

    # make predictions with each model
    y_hat_l1 = logregl1.predict(Xtr)
    y_hat_l2 = logregl2.predict(Xtr)
    y_hat = logreg.predict(Xtr)

    # fraction of rows where the prediction differs from the ground truth
    err_i_l1 = np.mean(y_hat_l1 != ytr)
    err_i_l2 = np.mean(y_hat_l2 != ytr)
    err_i = np.mean(y_hat != ytr)

    # append error rates
    err_l1.append(err_i_l1)
    err_l2.append(err_i_l2)
    err.append(err_i)

    # C_min is only updated when all three models improve strictly at the same time
    if (l1_err_min > err_i_l1) and (l2_err_min > err_i_l2) and (err_min > err_i):
        l1_err_min = err_i_l1
        l2_err_min = err_i_l2
        err_min = err_i
        C_min = C

    print("C={0:12.4e} \t l1-err = {1:10.4e} \t l2-err = {2:10.4e} \t err = {3:10.4e}".format(
        C, err_i_l1, err_i_l2, err_i))

C=  1.0000e-02 	 l1-err = 5.4167e-01 	 l2-err = 2.9808e-01 	 err = 2.9808e-01
C=  3.7276e-02 	 l1-err = 2.9808e-01 	 l2-err = 2.7244e-01 	 err = 2.7244e-01
C=  1.3895e-01 	 l1-err = 2.6282e-01 	 l2-err = 2.7244e-01 	 err = 2.7244e-01
C=  5.1795e-01 	 l1-err = 2.5962e-01 	 l2-err = 2.6603e-01 	 err = 2.6603e-01
C=  1.9307e+00 	 l1-err = 2.7564e-01 	 l2-err = 2.7244e-01 	 err = 2.7244e-01
C=  7.1969e+00 	 l1-err = 2.7244e-01 	 l2-err = 2.7244e-01 	 err = 2.7244e-01
C=  2.6827e+01 	 l1-err = 2.6923e-01 	 l2-err = 2.6923e-01 	 err = 2.6923e-01
C=  1.0000e+02 	 l1-err = 2.6923e-01 	 l2-err = 2.6923e-01 	 err = 2.6923e-01
C=  3.7276e+02 	 l1-err = 2.6923e-01 	 l2-err = 2.6923e-01 	 err = 2.6923e-01
C=  1.3895e+03 	 l1-err = 2.6923e-01 	 l2-err = 2.6923e-01 	 err = 2.6923e-01
C=  5.1795e+03 	 l1-err = 2.6923e-01 	 l2-err = 2.6923e-01 	 err = 2.6923e-01
C=  1.9307e+04 	 l1-err = 2.6923e-01 	 l2-err = 2.6923e-01 	 err = 2.6923e-01
C=  7.1969e+04 	 l1-err = 2.6923e-01 	 l2-err = 2.6923e-01 	 err

Based on our analysis, we keep the regularization strength, $C$, which minimizes the error in predicting the NBA-Playoff contenders for all three of our models.

Now, we test our trained model to predict NBA-Playoff contenders in each NBA season.

## Sub-process III: Logistic Regression Classifier
### Task II: Use Entire-Trained-Classifier to Predict Playoffs in Each NBA Season
Using the value of $C$ with the least error in predicting the NBA-Playoff contenders, we now use our trained classifier to predict Playoff teams in each individual NBA Season, by iterating through each season.

In [7]:
# Refit the three logistic classifiers with C_min on Xtr/ytr, then, season by season,
# rank the teams by predicted playoff probability and score the 16 highest-ranked
# teams against the teams that actually made the playoffs.

# liblinear supports both penalties; scikit-learn releases whose default solver is
# lbfgs reject penalty='l1'
logregl1 = linear_model.LogisticRegression(penalty = 'l1', C = C_min, solver = 'liblinear')
logregl2 = linear_model.LogisticRegression(penalty = 'l2', C = C_min, solver = 'liblinear')
logreg = linear_model.LogisticRegression(C = C_min, solver = 'liblinear')

logregl1.fit(Xtr, ytr)
logregl2.fit(Xtr, ytr)
logreg.fit(Xtr, ytr)

# per-season accuracy of each model
acc_l1 = []
acc_l2 = []
acc = []

for year in play_years:
    currSeason = nbaSeasons[year]

    # one stats row per team of this season, stacked in a single call
    teamArr = np.vstack([np.array(currSeason[team].totalStats) for team in teams[year]['key']])

    X = pd.DataFrame(teamArr, columns = columns)
    y = list(deepcopy(X['Playoff']))
    # drop the target and the features removed after the Z-test
    del_labels = ['REB%', 'PF', 'TO', 'OR', 'FT%', 'Playoff']
    for label in del_labels:
        del X[label]

    # NOTE: the features are standardized within this season only
    Xs = preprocessing.scale(X)

    y_hat_l1 = logregl1.predict_proba(Xs)
    y_hat_l2 = logregl2.predict_proba(Xs)
    y_hat = logreg.predict_proba(Xs)

    # The two conditions have to be combined elementwise with `&`; Python's `and`
    # between two arrays raises "truth value of an array ... is ambiguous".
    seasonTeamNames = np.array(teams[year]['team'])
    madePlayoffs = np.array(y) == True
    conference = np.array(teams[year]['Conference'])
    westPlayoffTeams = seasonTeamNames[madePlayoffs & (conference == 'West')]
    eastPlayoffTeams = seasonTeamNames[madePlayoffs & (conference == 'East')]

    # the 16 teams with the highest predicted playoff probability (column 1), per model
    sortedPlayPred_l1 = np.argsort(y_hat_l1[:,1])
    predictedTeams_l1 = seasonTeamNames[sortedPlayPred_l1[-16:]]

    sortedPlayPred_l2 = np.argsort(y_hat_l2[:,1])
    predictedTeams_l2 = seasonTeamNames[sortedPlayPred_l2[-16:]]

    sortedPlayPred = np.argsort(y_hat[:,1])
    predictedTeams = seasonTeamNames[sortedPlayPred[-16:]]

    # count how many predicted teams really made the playoffs in either conference
    actualPlayoffTeams = set(westPlayoffTeams) | set(eastPlayoffTeams)
    corr_l1 = sum(1 for predict_l1 in predictedTeams_l1 if predict_l1 in actualPlayoffTeams)
    corr_l2 = sum(1 for predict_l2 in predictedTeams_l2 if predict_l2 in actualPlayoffTeams)
    corr = sum(1 for predict in predictedTeams if predict in actualPlayoffTeams)

    # number of predictions made per model (16 unless the season has fewer teams)
    i = min(len(predictedTeams_l1), len(predictedTeams_l2), len(predictedTeams))

    acc_i_l1 = float(corr_l1)/i
    acc_i_l2 = float(corr_l2)/i
    acc_i = float(corr)/i

    acc_l1.append(acc_i_l1)
    acc_l2.append(acc_i_l2)
    acc.append(acc_i)

NameError: name 'C_min' is not defined

Using our predictions and ground truth data, we tabulate the accuracy of our entire-dataset-trained classifier for predicting the NBA-Playoff teams in each season.

In [27]:
# turn the season labels and each accuracy list into (n, 1) column arrays so they
# can be placed side by side with a horizontal stack
playYearsString = np.array([teamYears]).T

acc_l1 = np.array([acc_l1]).T
acc_l2 = np.array([acc_l2]).T
acc = np.array([acc]).T

In [28]:
# season label next to the three accuracy columns, one row per season
accFrame = pd.DataFrame(
    np.hstack((playYearsString, acc_l1, acc_l2, acc)),
    columns = ['NBA Season', 'l1-Predict Acc.', 'l2-Predict Acc.', 'Un-normalized Predict Acc.']
)
totAccTable = FF.create_table(accFrame)

# size (in pixels), margins and title of the table figure
accLayout = totAccTable.layout
accLayout.width = 800
accLayout.margin.update(dict(t = 75, l = 50))
accLayout.update(dict(title = 'Accuracy of Total-Trained Model Over Each NBA Season'))

py.iplot(totAccTable, filename = 'totAccTable')

## Sub-process III: Logistic Regression Classifier
### Focus III: Create Dictionary of NBA Team Stats by Season
Next, we create a dictionary of NBA teams by season, with each entry being a dataframe of seasonal stats for the teams in that season.

In [29]:
# create ordered dictionary of NBA stats by season: season key -> dataframe with one row per team
teamsDict = OrderedDict()
for year in play_years:
    # flatten every team's one-row stats frame and stack them in a single call,
    # so the result is 2-D even for a season with a single team
    seasonTeams = np.vstack(
        [np.array(nbaSeasons[year][team].totalStats).ravel() for team in teams[year]['key']]
    )

    seasonTeams = pd.DataFrame( seasonTeams, columns = columnsTeam, index = teams[year]['key'].tolist() )

    # delete irrelevant features as described by the Z-Test
    for label in ('REB%', 'PF', 'TO', 'OR', 'FT%'):
        del seasonTeams[label]
    teamsDict[year] = seasonTeams

## Sub-process III: Logistic Regression Classifier
### Focus IV: Train Classifier With All Years of NBA Team Data Prior to Each Season
In this focus area, we create two lists:
- Training Data List
- Testing Data List

The list organization is described by the following example:
Ex:
- Testing on 2013-2014 NBA Season
    - Training Data contains all team information from 2001-2013
    - Testing Data contains all team information for 2013-2014

In [30]:
def compileTrainData(teamsDict, year):
    """Stack the tables of seasons 1 through `year` into one array.

    Parameters
    ----------
    teamsDict : mapping
        Maps an integer season key to array-like data (anything `np.array`
        accepts). Keys 1..year are all looked up.
    year : int
        Last season key to include (inclusive); seasons are read starting
        from key 1.

    Returns
    -------
    numpy.ndarray
        The rows of season 1, followed by season 2, ..., up to season `year`.

    Raises
    ------
    ValueError
        If `year` is smaller than 1, i.e. there is no season to stack.
    """
    if year < 1:
        raise ValueError("year must be >= 1 to compile training data, got %r" % (year,))

    seasonArrays = [np.array(teamsDict[i]) for i in range(1, year + 1)]
    # A single season is returned unchanged; otherwise all seasons are stacked
    # in one call instead of re-copying a growing array on every iteration.
    if len(seasonArrays) == 1:
        return seasonArrays[0]
    return np.vstack(seasonArrays)
    
# For every season except the last one, build the cumulative training set:
# column 0 becomes the target vector, the remaining columns are the
# features, standardized with preprocessing.scale.
trainingFeatureSets = []
trainingTargetSets = []
for year in play_years[:-1]:
    Xtr = compileTrainData(teamsDict, year)
    ytr, Xtr = deepcopy(Xtr[:,0]), Xtr[:,1:]

    trainingTargetSets.append(ytr)
    trainingFeatureSets.append(preprocessing.scale(Xtr))

## Sub-process III: Logistic Regression Classifier
### Focus V: Predict NBA Playoff Contenders in Each Season & Determine Accuracy
Next, we use our trained model to predict NBA-Playoff contenders in each NBA season and compute the accuracy of our prediction for each season. Afterwards, we tabulate our results to analyze the relationship between progressively training the model with more data each season and predicting NBA-Playoff contenders. 

In [31]:
def _topPickAccuracy(positiveProba, teamNames, playoffTeams, numPicks = 16):
    """Return the fraction of the `numPicks` highest-scored teams found in `playoffTeams`.

    positiveProba : 1-D array of scores, one per entry of `teamNames`.
    teamNames     : array of team names in the same order as the scores.
    playoffTeams  : array of the teams to count as correct picks.
    """
    picks = teamNames[np.argsort(positiveProba)[-numPicks:]]
    hits = sum(1 for pick in picks if pick in playoffTeams)
    return float(hits)/len(picks)


accl1 = []
accl2 = []
acc = []

# trainingFeatureSets[i] / trainingTargetSets[i] were built from play_years[:-1],
# so entry i is paired here with the test season play_years[1:][i].
for i, year in enumerate(play_years[1:]):
    # Column 0 of the season table is used as the target; the rest are features.
    Xts = np.array(teamsDict[year])
    yts = deepcopy(Xts[:,0])
    Xts = preprocessing.scale(Xts[:,1:])

    # The solver is named explicitly for the l1 model: liblinear supports the
    # l1 penalty, while the default solver of newer scikit-learn releases
    # rejects it.
    logregl1 = linear_model.LogisticRegression(penalty = 'l1', C = C_min, solver = 'liblinear')
    logregl2 = linear_model.LogisticRegression(penalty = 'l2', C = C_min)
    # NOTE(review): LogisticRegression applies an l2 penalty by default, so with
    # the same C this model is configured exactly like logregl2 even though its
    # results are tabulated as 'Un-normalized'. Confirm whether an effectively
    # unpenalized model was intended.
    logreg = linear_model.LogisticRegression(C = C_min)

    logregl1.fit( trainingFeatureSets[i], trainingTargetSets[i] )
    logregl2.fit( trainingFeatureSets[i], trainingTargetSets[i] )
    logreg.fit( trainingFeatureSets[i], trainingTargetSets[i] )

    yts_hat_l1 = logregl1.predict_proba(Xts)
    yts_hat_l2 = logregl2.predict_proba(Xts)
    yts_hat = logreg.predict_proba(Xts)

    seasonTeamNames = np.array(teams[year]['team'])
    playoffTeams = seasonTeamNames[np.where(np.array(yts) == True)]

    # Score each model by how many of its 16 most probable teams are in
    # playoffTeams. The helper keeps its own counter, so the enumerate index
    # `i` is no longer overwritten inside the loop body.
    accl1.append( _topPickAccuracy(yts_hat_l1[:,1], seasonTeamNames, playoffTeams) )
    accl2.append( _topPickAccuracy(yts_hat_l2[:,1], seasonTeamNames, playoffTeams) )
    acc.append( _topPickAccuracy(yts_hat[:,1], seasonTeamNames, playoffTeams) )

In [32]:
# Season-range labels for each progressive run: one 'train' span and one
# 'test' season label per entry of play_years[1:].
trainYears = []
testYears = []

for curr in play_years[1:]:
    trainYears.append( str(2001) + '-' + str(2000 + curr) )
    testYears.append( str(2000 + curr) + '-' + str(2001 + curr) )

# reshape(-1, 1) produces an (n, 1) column whether the input is still a list
# or already a column. The accuracy names are rebound in place, so this keeps
# the cell safe to re-run (the transpose-of-nested-array form turned an
# existing column into a 3-D array and broke the hstack below).
trainYears = np.array(trainYears).reshape(-1, 1)
testYears = np.array(testYears).reshape(-1, 1)

accl1 = np.array(accl1).reshape(-1, 1)
accl2 = np.array(accl2).reshape(-1, 1)
acc = np.array(acc).reshape(-1, 1)

totAccTable = FF.create_table(
    pd.DataFrame(
        np.hstack((trainYears, testYears, accl1, accl2, acc)),
        columns = ['Train Years', 'Test NBA Season',
                   'l1-Predict Acc.',
                   'l2-Predict Acc.',
                   'Un-normalized Predict Acc.']
    )
)

totAccTable.layout.width = 1000
totAccTable.layout.margin.update({'t':75, 'l':50})
totAccTable.layout.update({'title': 'Accuracy of Progressively-Trained Model Over Each NBA Season'})
py.iplot(totAccTable, filename = 'progAccTable')

## Sub-process IV: C-Support Vector Classification
### Focus I: Reformat Entire Dataset for Total Training

After our analysis of the logistic regression classifier, we move on to analyzing the accuracy of classifications using support vector machines (SVMs), specifically C-support vector classification.

First, we reformat the Entire Dataset for total model training across the dataset to analyze the best regularization strength, $C$.

In [8]:
# Collect every season's team names into one (total_teams, 1) column with a
# single vstack instead of growing the array behind a first-iteration flag.
teamNames = np.vstack(
    [np.array(teams[year]['team']).reshape(-1, 1) for year in play_years]
)

# np.array() already produces a fresh array, so no extra deepcopy is needed.
X = np.hstack( ( teamNames, np.array(allTeamHist) ) )
X = pd.DataFrame(X, columns = ["Team"] + columns)

# Drop the identifier, the target, and the features ruled out earlier.
X = X.drop(['Team', 'Playoff', 'REB%', 'PF', 'TO', 'OR', 'FT%'], axis = 1)

y = list(allTeamHist['Playoff'])
# Stacking the name column next to the stats leaves X with a non-numeric
# dtype; convert to float explicitly rather than relying on scale() to do it
# implicitly.
Xs = preprocessing.scale(np.array(X, dtype = float))

# Positional 80/20 split: the first 80% of rows train, the rest test (no shuffling).
nsamp = Xs.shape[0]
ntr = int(0.8*nsamp)
nts = nsamp - ntr

Xtr, Xts = Xs[:ntr,:], Xs[ntr:,:]
ytr, yts = y[:ntr], y[ntr:]

NameError: name 'allTeamHist' is not defined

## Sub-process IV: C-Support Vector Classification
### Focus II: Initialize SVM Model Instance and Test Separate Regularization Strengths on Entire Dataset Prediction
Afterwards, we initialize a support vector machine model instance and test our classifier across several regularization strengths, $C$.

In [34]:
# Grid of 10 log-spaced regularization strengths from 1e-2 to 1e5.
Cvals = np.logspace(-2,5,10)

err_lin = []
err_rbf = []
err_poly = []
err_sig = []

# Lowest error rate seen so far for each kernel. Error rates are fractions,
# so 1 is a safe starting ceiling.
lin_err_min = 1
rbf_err_min = 1
poly_err_min = 1
sig_err_min = 1

# Start from the first grid value; it is replaced as soon as a kernel improves.
C_min = Cvals[0]

# NOTE(review): every model is fit on Xs/y and scored on Xtr/ytr. If those are
# the leading 80% slice produced by the reformatting cell, these are
# training-set error rates and the held-out split is unused; confirm intent.
for C in Cvals:
    svm_lin = svm.SVC(kernel = 'linear', C = C)
    svm_rbf = svm.SVC(kernel = 'rbf', C = C)
    svm_poly = svm.SVC(kernel = 'poly', C = C)
    svm_sig = svm.SVC(kernel = 'sigmoid', C = C)

    svm_lin.fit(Xs, y)
    svm_rbf.fit(Xs, y)
    svm_poly.fit(Xs, y)
    svm_sig.fit(Xs, y)

    y_hat_lin = svm_lin.predict(Xtr)
    y_hat_rbf = svm_rbf.predict(Xtr)
    y_hat_poly = svm_poly.predict(Xtr)
    y_hat_sig = svm_sig.predict(Xtr)

    err_i_lin = np.mean(y_hat_lin != ytr)
    err_i_rbf = np.mean(y_hat_rbf != ytr)
    err_i_poly = np.mean(y_hat_poly != ytr)
    err_i_sig = np.mean(y_hat_sig != ytr)

    err_lin.append(err_i_lin)
    err_rbf.append(err_i_rbf)
    err_poly.append(err_i_poly)
    err_sig.append(err_i_sig)

    # C_min moves whenever at least one kernel beats its own best error so far.
    # Each minimum is updated independently: previously all four were
    # overwritten together, so a kernel that got worse had its stored minimum
    # raised and a later, merely-less-bad error counted as an improvement.
    improved = (lin_err_min > err_i_lin) or \
    (rbf_err_min > err_i_rbf) or \
    (poly_err_min > err_i_poly) or (sig_err_min > err_i_sig)

    lin_err_min = min(lin_err_min, err_i_lin)
    rbf_err_min = min(rbf_err_min, err_i_rbf)
    poly_err_min = min(poly_err_min, err_i_poly)
    sig_err_min = min(sig_err_min, err_i_sig)

    if improved:
        C_min = C

## Sub-process IV: C-Support Vector Classification
### Focus III: Tabulate & Analyze Entire Dataset Training Errors
Next, we tabulate our results and analyze the errors based on the regularization strengths for each of our SVM classifier models.

In [35]:
# Turn the C grid and each kernel's error list into (n, 1) columns so they
# can sit side by side in a single table.
CvalsArr = np.array(Cvals).reshape(-1, 1)

err_linArr = np.array(err_lin).reshape(-1, 1)
err_rbfArr = np.array(err_rbf).reshape(-1, 1)
err_polyArr = np.array(err_poly).reshape(-1, 1)
err_sigArr = np.array(err_sig).reshape(-1, 1)

# One row per regularization strength, one error column per kernel.
totAccTable = FF.create_table(
    pd.DataFrame(
        np.hstack((CvalsArr, err_linArr, err_rbfArr, err_polyArr, err_sigArr)),
        columns = ['Regularizaion Strengths, C', 'linear SVM err',
                   'rbf SVM err',
                   'poly SVM err',
                   'sigmoid SVM err']
    )
)

totAccTable.layout.width = 900
totAccTable.layout.margin.update({'t':75, 'l':50})
totAccTable.layout.update({'title': 'Support Vector Machine Classifier Error Rates'})
py.iplot(totAccTable, filename = 'overSVMTable')

## Sub-process IV: C-Support Vector Classification
### Focus IV: Use SVM's with Minimal Errors to Progressively Predict Throughout NBA Seasons
Based on our tabulated results, we see that using radial basis function and 3rd-degree polynomial kernels yields minimal errors when the regularization strength, $C$, is approximately equal to:

$ C = 10^5 $

In [36]:
def compileTrainData(teamsDict, year):
    """Stack the tables of seasons 1 through `year` into one array.

    Parameters
    ----------
    teamsDict : mapping
        Maps an integer season key to array-like data (anything `np.array`
        accepts). Keys 1..year are all looked up.
    year : int
        Last season key to include (inclusive); seasons are read starting
        from key 1.

    Returns
    -------
    numpy.ndarray
        The rows of season 1, followed by season 2, ..., up to season `year`.

    Raises
    ------
    ValueError
        If `year` is smaller than 1, i.e. there is no season to stack.
    """
    if year < 1:
        raise ValueError("year must be >= 1 to compile training data, got %r" % (year,))

    seasonArrays = [np.array(teamsDict[i]) for i in range(1, year + 1)]
    # A single season is returned unchanged; otherwise all seasons are stacked
    # in one call instead of re-copying a growing array on every iteration.
    if len(seasonArrays) == 1:
        return seasonArrays[0]
    return np.vstack(seasonArrays)
    
# Rebuild the cumulative training sets, one per season except the last:
# column 0 is split off as the target vector and the remaining columns are
# standardized with preprocessing.scale.
trainingFeatureSets = []
trainingTargetSets = []
for year in play_years[:-1]:
    Xtr = compileTrainData(teamsDict, year)
    ytr, Xtr = deepcopy(Xtr[:,0]), Xtr[:,1:]

    trainingTargetSets.append(ytr)
    trainingFeatureSets.append(preprocessing.scale(Xtr))

In [37]:
# Progressive evaluation: for each test season, fit an rbf and a polynomial
# SVM on the matching cumulative training set and record the fraction of
# that season's rows whose predicted label equals the target.
acc_rbf = []
acc_poly = []
for i, year in enumerate(play_years[1:]):
    # Column 0 is the target; the remaining columns are scaled per season.
    Xts = np.array(teamsDict[year])
    yts = deepcopy(Xts[:,0])
    Xts = preprocessing.scale(Xts[:,1:])

    svm_rbf = svm.SVC(kernel = 'rbf', C = C_min)
    svm_rbf.fit( trainingFeatureSets[i], trainingTargetSets[i] )
    yts_hat_rbf = svm_rbf.predict(Xts)

    svm_poly = svm.SVC(kernel = 'poly', C = C_min)
    svm_poly.fit( trainingFeatureSets[i], trainingTargetSets[i] )
    yts_hat_poly = svm_poly.predict(Xts)

    acc_i_rbf = np.mean(yts_hat_rbf == yts)
    acc_i_poly = np.mean(yts_hat_poly == yts)

    acc_rbf.append(acc_i_rbf)
    acc_poly.append(acc_i_poly)

In [38]:
# Season-range labels for each progressive run: one 'train' span and one
# 'test' season label per entry of play_years[1:].
trainYears = []
testYears = []

for curr in play_years[1:]:
    trainYears.append( str(2001) + '-' + str(2000 + curr) )
    testYears.append( str(2000 + curr) + '-' + str(2001 + curr) )

trainYears = np.array(trainYears).reshape(-1, 1)
testYears = np.array(testYears).reshape(-1, 1)

# acc_rbf / acc_poly are rebound in place; reshape(-1, 1) gives the same
# (n, 1) column from a list or from an existing column, so re-running this
# cell no longer produces a 3-D array.
acc_rbf = np.array(acc_rbf).reshape(-1, 1)
acc_poly = np.array(acc_poly).reshape(-1, 1)

In [39]:
# Tabulate the per-season accuracies of both SVM kernels next to the season
# labels and publish the table.
totAccTable = FF.create_table(
    pd.DataFrame(
        np.hstack((trainYears, testYears, acc_rbf, acc_poly)),
        columns = ['Train Years', 'Test NBA Season',
                   'rbf SVM Predict Acc.',
                   'poly SVM Predict Acc.']
    )
)

totAccTable.layout.width = 700
totAccTable.layout.margin.update({'t':75, 'l':50})
totAccTable.layout.update({'title': 'Accuracy of Progressively-Trained SVM Model Over Each NBA Season'})
py.iplot(totAccTable, filename = 'progAccSVMTable')

## Sub-process V: Random Forest Classification
### Focus I: Reformat Entire Dataset for Total Training
After our analysis of the support vector machine (SVM) classifier, we move forward to analyzing the accuracy of classifications using random forests or random decision forests for classification.

Random forests operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes predicted by the individual trees. Random decision forests correct for decision trees' habit of overfitting to their training set.

First, we reformat the entire dataset for total model training across the dataset to analyze the best number of trees (estimators).

In [40]:
# Collect every season's team names into one (total_teams, 1) column with a
# single vstack instead of growing the array behind a first-iteration flag.
teamNames = np.vstack(
    [np.array(teams[year]['team']).reshape(-1, 1) for year in play_years]
)

# np.array() already produces a fresh array, so no extra deepcopy is needed.
X = np.hstack( ( teamNames, np.array(allTeamHist) ) )
X = pd.DataFrame(X, columns = ["Team"] + columns)

# Drop the identifier, the target, and the features ruled out earlier.
X = X.drop(['Team', 'Playoff', 'REB%', 'PF', 'TO', 'OR', 'FT%'], axis = 1)

y = list(allTeamHist['Playoff'])
# Stacking the name column next to the stats leaves X with object dtype,
# which made scale() emit a dtype-conversion warning; convert to float
# explicitly instead.
Xs = preprocessing.scale(np.array(X, dtype = float))

# Positional 80/20 split: the first 80% of rows train, the rest test (no shuffling).
nsamp = Xs.shape[0]
ntr = int(0.8*nsamp)
nts = nsamp - ntr

Xtr, Xts = Xs[:ntr,:], Xs[ntr:,:]
ytr, yts = y[:ntr], y[ntr:]


Data with input dtype object was converted to float64 by the scale function.



## Sub-process V: Random Forest Classification
### Focus II: Initialize Random Forest Model Instance and Test Separate Numbers of Trees on Entire Dataset Prediction
Afterwards, we initialize a random forest model instance and test our classifier across several numbers of trees to use.

In [41]:
# Candidate forest sizes: 15 log-spaced values between 10 and 1000,
# truncated to integers.
num_est_vals = [int(val) for val in np.logspace(1,3,15)]

err = []

# Error rates are fractions, so 1 is a safe starting ceiling; the first
# candidate is the fallback choice until a lower error is observed.
err_min = 1
est_min = num_est_vals[0]

# NOTE(review): each forest is fit on Xs/y and scored on Xtr/ytr. If those are
# the leading 80% slice of the same data, this is a training-set error rate.
# No random_state is passed either, so results can differ between runs.
for num_est in num_est_vals:
    rfModel = RandomForestClassifier(n_estimators = num_est)

    rfModel.fit(Xs,y)

    y_hat = rfModel.predict(Xtr)

    err_i = np.mean(y_hat != ytr)

    err.append(err_i)

    # Strict comparison: on ties the smaller forest found first is kept.
    if err_i < err_min:
        err_min = err_i
        est_min = num_est

## Sub-process V: Random Forest Classification
### Focus III: Tabulate & Analyze Entire Dataset Training Errors

Next, we tabulate our results and analyze the errors based on the tree numbers on each of our Random Forest classifier models.

In [42]:
# Pair every candidate forest size with its error rate as two (n, 1) columns.
num_est_valsArr = np.array(num_est_vals).reshape(-1, 1)

errArr = np.array(err).reshape(-1, 1)

totAccTable = FF.create_table(
    pd.DataFrame(
        np.hstack((num_est_valsArr, errArr)),
        columns = ['Number of Estimators (Trees)', 'error rate']
    )
)

totAccTable.layout.width = 500
totAccTable.layout.margin.update({'t':75, 'l':50})
totAccTable.layout.update({'title': 'Random Forest Classifier Error Rates'})
py.iplot(totAccTable, filename = 'overRandomForestTable')

## Sub-process V: Random Forest Classification
### Focus IV: Use Random Forest Models with Minimal Errors to Progressively Predict Throughout NBA Seasons

In [43]:
def compileTrainData(teamsDict, year):
    """Stack the tables of seasons 1 through `year` into one array.

    Parameters
    ----------
    teamsDict : mapping
        Maps an integer season key to array-like data (anything `np.array`
        accepts). Keys 1..year are all looked up.
    year : int
        Last season key to include (inclusive); seasons are read starting
        from key 1.

    Returns
    -------
    numpy.ndarray
        The rows of season 1, followed by season 2, ..., up to season `year`.

    Raises
    ------
    ValueError
        If `year` is smaller than 1, i.e. there is no season to stack.
    """
    if year < 1:
        raise ValueError("year must be >= 1 to compile training data, got %r" % (year,))

    seasonArrays = [np.array(teamsDict[i]) for i in range(1, year + 1)]
    # A single season is returned unchanged; otherwise all seasons are stacked
    # in one call instead of re-copying a growing array on every iteration.
    if len(seasonArrays) == 1:
        return seasonArrays[0]
    return np.vstack(seasonArrays)
    
# Rebuild the cumulative training sets, one per season except the last:
# column 0 is split off as the target vector and the remaining columns are
# standardized with preprocessing.scale.
trainingFeatureSets = []
trainingTargetSets = []
for year in play_years[:-1]:
    Xtr = compileTrainData(teamsDict, year)
    ytr, Xtr = deepcopy(Xtr[:,0]), Xtr[:,1:]

    trainingTargetSets.append(ytr)
    trainingFeatureSets.append(preprocessing.scale(Xtr))

In [44]:
# Progressive evaluation of the random forest: for each test season, fit on
# the matching cumulative training set and score the 16 most probable teams
# against the teams whose target is True.
accRF = []
for i, year in enumerate(play_years[1:]):
    # Column 0 is the target; the remaining columns are scaled per season.
    Xts = np.array(teamsDict[year])
    yts = deepcopy(Xts[:,0])
    Xts = preprocessing.scale(Xts[:,1:])

    # Pass the forest size by keyword, matching the search cell.
    rfModel = RandomForestClassifier(n_estimators = est_min)

    rfModel.fit( trainingFeatureSets[i], trainingTargetSets[i] )

    yts_hat = rfModel.predict_proba(Xts)

    seasonTeamNames = np.array(teams[year]['team'])
    playoffTeams = seasonTeamNames[np.where(np.array(yts) == True)]

    sortedPlayPred = np.argsort(yts_hat[:,1])
    predictedTeams = seasonTeamNames[sortedPlayPred[-16:]]

    # Count the hits without a manual counter, so the enumerate index `i`
    # is no longer overwritten inside the loop body.
    corr = sum(1 for predict in predictedTeams if predict in playoffTeams)
    acc_i = float(corr)/len(predictedTeams)

    accRF.append(acc_i)

In [45]:
# Season-range labels for each progressive run, plus the random forest
# accuracies, all arranged as (n, 1) columns for tabulation.
trainYears = []
testYears = []

for curr in play_years[1:]:
    trainYears.append( str(2001) + '-' + str(2000 + curr) )
    testYears.append( str(2000 + curr) + '-' + str(2001 + curr) )

trainYears = np.array(trainYears).reshape(-1, 1)
testYears = np.array(testYears).reshape(-1, 1)

acc_RF = np.array(accRF).reshape(-1, 1)

In [46]:
# Tabulate the random forest's per-season accuracy next to the season labels
# and publish the table.
totAccTable = FF.create_table(
    pd.DataFrame(
        np.hstack((trainYears, testYears, acc_RF)),
        columns = ['Train Years', 'Test NBA Season',
                   'accuracy']
    )
)

totAccTable.layout.width = 750
totAccTable.layout.margin.update({'t':75, 'l':50})
totAccTable.layout.update({'title': 'Accuracy of Progressively-Trained Random Forest Model Over Each NBA Season'})
py.iplot(totAccTable, filename = 'progAccRandomForestTable')

## Conclusion for NBA-Playoff Predictions

Based on our results for the earlier seasons in the NBA, it makes sense to use the logistic regression classifiers for predicting NBA-Playoff contention. However, we see that as we obtain more training data and try to predict some of the more recent NBA-Playoff contenders, the random forest classifier model's accuracy begins to level with that of the logistic regression classifiers. Therefore, we hypothesize that as we continue to obtain more data for the coming NBA seasons, the random forest classifier and logistic classifiers will continue to be the most accurate of the classifier models that we analyze in this project.

In [47]:
# Per-season accuracies of every progressively-trained model, in the same
# order as `names`. The random forest column built above is named acc_RF;
# the previously referenced acc_RFArr is never defined in that cell.
totalAcc = [accl1, accl2, acc, acc_rbf, acc_poly, acc_RF]
names = ['l1-penalized logistic', 'l2-penalized logistic'
         , 'unregularized logistic', 'radial basis SVM'
         , 'polynomial SVM', 'random forest']

# Each accuracy list has one entry per test season, so the bars are labelled
# with the flattened testYears labels.
# NOTE(review): this replaces `teamYears`, which is not defined in the cells
# shown here; confirm no other label set was intended.
seasonLabels = np.array(testYears).ravel().tolist()

data = []
for currAcc, name in zip(totalAcc, names):
    trace = go.Bar(
                x = np.array(currAcc).ravel().tolist(),
                y = seasonLabels,
                orientation = 'h',
                name = name
    )
    data.append(trace)

layout = go.Layout(
            title = 'Progressive Classifier Model Accuracies Throughout NBA Seasons',
            autosize = True,
            width = 800,
            height = 800)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'All Model Acccuracies Throughout NBA Seasons')
allAccPlot