# Supervised Machine Learning Methods for Predicting NBA Playoff Contention

## Created by: Emanuel Azcona
## Course: Introduction to Machine Learning & Senior Thesis

Import the required libraries for handling dataframes, arrays, and generating the dataframes.

In [1]:
import sys
sys.path.insert(0, '../')
from gen_data import gen_data

import numpy as np, pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
py.sign_in(username='emanuelazcona2022', api_key='mHqOmwUSmqFYjzMNuQVz')

from copy import deepcopy

from operator import itemgetter

from collections import OrderedDict

from sklearn import linear_model, preprocessing
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


from statsmodels.stats.weightstats import ztest
import _pickle as cPickle

## Create Cumulative Player Databases

First, we use our user-created function to develop two seperate dictionaries:

- A dictionary of players by NBA Season
- A dictionary of teams by NBA Season

Each dictionary entry (in both dictionaries) is a Pandas dataframe

In [2]:
# first year of NBA seasons we'd like to store
start = 1
# end year ....
end = 16

# returns two dictionaries:
# 1. player dictionary where all NBA players are available in a dataframe, 
#       each dict. key is the NBA season's start year, ex: 2001 = 1, 2003 = 3
# 2. team dictionary .....
players, teams = gen_data(start,end)

# store all the NBA years we used in a list
play_years = list( players.keys() )

## Create Dictionary of NBA Seasons with NBA teams

Next, we create a dictionary of NBA seasons with each key referencing a dictionary of NBA teams for that respective season. Each entry of the teams dictionary has a dataframe of players on that team, in that year essentially.

- NBA Season
    - Team
        - Players
        
Example:

- '15-16'
    - Oklahoma City Thunder (okl)
        - Total Team Stats.
        - Players & Player Stats.
            - durant,kevin
                - GP
                - FGM
                - etc..
            - westbrook,russell
                - GP
                - FGM
                - etc..
            - etc..

In [3]:
# create basketball team class which stores players for team and total team statistics
class basketBallTeam:
    def __init__(self, players):
        self.players = players
        totalStats = []

# function for computing the overall team statistics of a team
def computeTeamStats(currTeam, i):
    
    # sum up field goals made by all players
    FGM = np.sum(currTeam[:,5])
    
    # sum up field goals attempted
    FGA = np.sum(currTeam[:,6])
    
    # Three pointers made
    TM = np.sum(currTeam[:,7])
    
    # Three pointers attempted
    TA = np.sum(currTeam[:,8])
    
    # Free-throws made
    FTM = np.sum(currTeam[:,9])
    
    # Free-throws attempted
    FTA = np.sum(currTeam[:,10])
    
    # Offensive rebounds made
    OR = np.sum(currTeam[:, 11])
    
    # Total rebounds made
    TR = np.sum(currTeam[:,12])
    
    # Totals assist count
    AS = np.sum(currTeam[:,13])
    
    # Total assists count
    ST = np.sum(currTeam[:,14])
    
    # Total turnovers count
    TO = np.sum(currTeam[:,15])
    
    # Total blocks count
    BK = np.sum(currTeam[:,16])
    
    # Personal fouls count
    PF = np.sum(currTeam[:,17])
    
    # Total points made
    PTS = np.sum(currTeam[:,19])
    
    # Playoff
    PLAY = bool(np.array(teams[year])[i,21])
    
    # MVP Status defaulted to 0
    MVP = 0
    
    # Conference (0 for East, 1 for West)
    CONF = 0
    
    # If MVP is found, 
    mvpArray = currTeam[:,23]
    if True in mvpArray:
        MVP = 1
    
    # If West is found,
    if 'West' in currTeam[:,-2]:
        CONF = 1
    
    # create list of teams stats
    """ [Conference, Playoff, FGM, FG%, 3M, 3PT%, FTM, FT%, OR, REB%,
         AS, ST, TO, BK, PF, PTS, MVP] """
    currTeamStats = [CONF, PLAY, FGM,float(FGM)/FGA, TM, float(TM)/TA\
                     , FTM,float(FTM)/FTA, OR, float(OR)/TR\
                     , AS, ST, TO, BK, PF, PTS, MVP]
    return currTeamStats

# create dictionary of NBA teams
columns = ['Conference','Playoff', 'FGM', 'FG%'\
            ,'3M', '3PT%', 'FTM'\
            ,'FT%', 'OR', 'REB%'\
            ,'AS', 'ST', 'TO'\
            ,'BK', 'PF', 'PTS', 'MVP']

# create empty dictionary of NBA Seasons
nbaSeasons = {}

# iterate through NBA seasons
for year in play_years:
    
    # create lists of all team names for this year
    teamNames = teams[year]['key'].tolist()
    
    # create empty dictionary of teams for this year
    teamsDict = {}
    for i, team in enumerate(teamNames):
        
        # initialize empty Team list
        currTeam = []
        
        # grab all players for this current season
        currPlaySeason = np.array( players[year] )
        
        # parse through all players in current season
        for j in range(currPlaySeason.shape[0]):
            
            # if the current player's team is equal to the team we are creating
            if(currPlaySeason[j,1] == team):
                
                # append all of their features as an element in the list
                currTeam.append(currPlaySeason[j,:])
        
        # turn team list into a numpy array
        currTeam = np.array(currTeam)
        
        # store current team as a dataframe in the dictionary of teams
        teamsDict[team] = basketBallTeam(pd.DataFrame(currTeam\
                                                      , columns = players[play_years[0]].columns.tolist()))
        
        # compute total stats for current team
        currTeamStats = computeTeamStats(currTeam, i)
        
        # add total stats as dataframe for current team
        teamsDict[team].totalStats = pd.DataFrame(np.array([currTeamStats])\
                                                  , columns = columns)
    
    # store dictionary to NBA year
    nbaSeasons[year] = teamsDict

## Create Pandas DataFrames of All NBA Teams & All NBA Players

Parsing through the nbaSeasons dictionary, we create a dataframe of every NBA player that's ever played in any NBA season. The same process is done for the teams as well.

In [4]:
# boolean flag to indicate not to vertically stack but instead initialize a NumPy array
flag = False
for year in play_years:
    # create lists of all team names for this year
    teamNames = teams[year]['key'].tolist()
    
    # parse through team names
    for team in teamNames:
        
        # if it's the first team, initialize numpy array
        if(not flag):
            flag = True
            allTeamHist = np.array( nbaSeasons[year][team].totalStats )
            allPlayHist = np.array( nbaSeasons[year][team].players )
            continue
        
        # else vertically stack next few teams
        allTeamHist = np.vstack( (allTeamHist, np.array(nbaSeasons[year][team].totalStats) ) )
        allPlayHist = np.vstack( (allPlayHist, np.array(nbaSeasons[year][team].players) ) )

# create columns labels for teams and players 
columnsTeam = columns
columnsPlayers = players[play_years[0]].columns.tolist()

# create all team dataframe
allTeamHist = pd.DataFrame(allTeamHist, columns = columnsTeam)
allTeamHist = allTeamHist.dropna()

# create all player dataframe
allPlayHist = pd.DataFrame(allPlayHist, columns = columnsPlayers )
allPlayHist = allPlayHist.dropna()

## Analyze NBA Team Data
### Task I: Overall NBA Team Statistics

Let's take a deeper look into the NBA Team data and analyze relationships across seasons. First, we compile a dataframe of total integer stats and average fractional stats for each individual NBA team's franchise history.

In [5]:
flag = False
for year in play_years:
    if not flag:
        flag = True
        teamKeys = np.transpose( np.array( [teams[year]['key']] ) )
        continue
    teamKeys = np.vstack( (teamKeys, np.transpose( np.array( [teams[year]['key']] ) ) ) )

# create a deep copy of allTeamHist in order to not manipulate or lose important data in allTeamHist
X = np.array(deepcopy(allTeamHist)) 
X = pd.DataFrame(X, columns = columns, index = teamKeys.ravel())
del X['Conference']
del X['Playoff']
del X['MVP']

In [6]:
# function for computing overall sum of stats for each team (not by season but for entire NBA track starting in 2001)
def computeOverallTeamStats(X, team, statNames):
    overTeamStats = []
    
    # parse through stats you wish to compute for 
    for stat in statNames:
        # append sum of those stats for the same team across all seasons
        if stat in ['FG%', '3PT%', 'FT%', 'REB%']:
            overTeamStats.append( X[X.index == team][stat].mean(axis = 0) )
        else:
            overTeamStats.append( X[X.index == team][stat].sum(axis = 0) )
    return np.array(overTeamStats)

# all stats we wish to compute overall sum for
allStatNames = columnsTeam[2:-1]

# create list of teams already checked
teamsAlreadyChecked = []
flag = False

# parse through teams
for team in list(X.index):
    
    # if the team was already checked, continue to the next team
    if team in teamsAlreadyChecked:
        continue
    else:
        teamsAlreadyChecked.append(team)
        
        # if the numpy matrix for the overall team stats hasn't been made, initialize it
        if not flag:
            flag = True
            # compute overall team stats for current team
            overTeamStats = computeOverallTeamStats(X, team, allStatNames)
            continue
        # compute overall team stats for current team
        currTeamStats = computeOverallTeamStats(X, team, allStatNames)
        # vertically stack next team
        overTeamStats = np.vstack( (overTeamStats, currTeamStats) )

# create dataframe out of numpy array
overTeamStatsDF = pd.DataFrame(overTeamStats, columns = allStatNames, index = teamsAlreadyChecked)

Next, we obtain the plot of all the integer stats to analyze the team's total sum of franchise stats. Using the Python library, plotly, we obtain an organized (and visually asthetic), multi-bar graph.

- Integer Stats
    - Field Goals Made (FGM)
    - Three Pointers Made (3M)
    - Free Throws Made (FTM)
    - Offensive Rebounds (OR)
    - Assists (AS)
    - Steals (ST)
    - Turnovers (TO)
    - Blocks (BK)
    - Personal Fouls (PF)
    - Points Made (PTS)

In [7]:
# obtain integer stat labels

# turn keys for fractional stats into a list of keys
intStats = overTeamStatsDF.columns.tolist()

# fractional stat keys we wish to remove (non-integer stats)
removeFields = ['FG%', '3PT%', 'FT%', 'REB%']

# remove fractional stat keys
for field in removeFields:
    intStats.remove(field)

# extract all integer data and create multicolumn barcharts of overall integer stats
data = []
for team in list(overTeamStatsDF.index):
    teamStats = np.array(overTeamStatsDF[overTeamStatsDF.index == team]).ravel().tolist()
    
    
    for fracLoc in [1, 2, 3, 4]:
        teamStats.remove(teamStats[fracLoc])

    # create bar chart traces for plotly chart
    trace = go.Bar(
        x = intStats,
        y = teamStats,
        name = team
    )
    data.append(trace)

layout = go.Layout(
    barmode = 'group',
    title = 'Overall Total Sum of Team Integer Statistics (2001-2016)',
    xaxis = dict(
        title = 'Integer Statistics'
    ),
    yaxis = dict(
        title = 'Integer Score'
    )
)

Afterwards, we plot the fractional stats the same manner we did the integer stats.

- Fractional Stats
    - Field Goal Percentage (FG%)
    - Three Point Percentage (3PT%)
    - Free Throw Percentage (FT%)
    - Offensive Rebound Perentage (REB%)

In [8]:
fig = go.Figure(data = data, layout = layout)
fig.layout.width=1500 #width in pixels
fig.layout.height = 800
py.iplot(fig, filename = 'overall team integer features')

High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~emanuelazcona2022/0 or inside your plot.ly account where it is named 'overall team integer features'


In [9]:
# obtain fractional stats (same as integer stats process)
fracStats = overTeamStatsDF.columns.tolist()

# integer stat keys we wish to remove
removeFields = ['FGM', '3M', 'FTM', 'OR', 'AS', 'ST', 'TO', 'BK', 'PF', 'PTS']

# remove integer stat keys and keep fractional stat keys
for field in removeFields:
    fracStats.remove(field)

# repeat for fractional stats
data = []
for team in list(overTeamStatsDF.index):
    teamStats = itemgetter(*[1,3,5,7])(np.array(
        overTeamStatsDF[overTeamStatsDF.index == team]).ravel().tolist())

    trace = go.Bar(
        x = fracStats,
        y = teamStats,
        name = team
    )
    data.append(trace)

layout = go.Layout(
    barmode = 'group',
    title = 'Overall Average of Team Fraction Statistics (2001-2016)',
    xaxis = dict(
        title = 'Fractional Statistics'
    ),
    yaxis = dict(
        title = 'Percentage Score'
    )
)

In [10]:
fig = go.Figure(data = data, layout = layout)
fig.layout.width=1500 #width in pixels
fig.layout.height = 750
py.iplot(fig, filename = 'overall team fractional features')

## Analyze NBA Team Data
### Task II: Analyze Individual NBA Team Statistics By Year
For our next analysis, we want to observe the behavior of NBA teams through the different seasons in our dataset (2001-2002 through 2015-2016). First, we add the New Orleans teams to the first three seasons of our NBA dataset, with stats. of 0 across the board for all three years, since the New Orleans teams were not introduced until the 2004-2005 NBA Season.

In [11]:
# create deepcopy of X in order to not damage data
Xarr = deepcopy(X)

# convert to NumPy array
Xarr = np.array(Xarr)

# add NOrleans team with stats of 0 across the board for the first three seasons
for i in range(3):
    Xarr = np.vstack((np.array([0 for i in range(Xarr.shape[1])]), Xarr ))

# reconvert back to dataframe with newly added NOrleans team stats
XarrDF = pd.DataFrame(Xarr, columns = X.columns.tolist(), index = ['nor']*3 + X.index.tolist())

#### Sub-Task I: Team Field Goals per Season
Next, we construct our bar graph to analyze the performance of all NBA teams throughout each of the NBA seasons in our dataset. The first feature we analyze is the numebr of field goals a team makes in each NBA season.

In [12]:
# extract team years list where each entry is a string in the form of: 2001-2002
teamYears = []
for year in play_years:
    teamYears.append(str(year + 2000) + '-' + str(year + 2001))

In [13]:
# Plot field goals

data = []
for team in teams[year]['key']:
    teamStats = np.array(XarrDF[XarrDF.index == team]['FGM']).tolist()

    trace = go.Bar(
        x = teamYears,
        y = teamStats,
        name = team
    )
    data.append(trace)

layout = go.Layout(
    barmode = 'group',
    title = 'Field Goals Made by Teams Per Season (2001-2016)',
    xaxis = dict(
        title = 'NBA Season'
    ),
    yaxis = dict(
        title = 'Number of Field Goals Made in Season'
    )
)

In [14]:
fig = go.Figure(data = data, layout = layout)
fig.layout.width=2500 #width in pixels
fig.layout.height = 750
py.iplot(fig, filename = 'field goals made per season')

#### Sub-Task II: Team Three Pointers per Season
The folllowing feature we analyze is the number fo three-pointers a team scores in an season.

In [15]:
# Plot threes made

data = []
for team in teams[year]['key']:
    teamStats = np.array(XarrDF[XarrDF.index == team]['3M']).tolist()

    trace = go.Bar(
        x = teamYears,
        y = teamStats,
        name = team
    )
    data.append(trace)

layout = go.Layout(
    barmode = 'group',
    title = 'Threes Made by Teams Per Season (2001-2016)',
    xaxis = dict(
        title = 'NBA Season'
    ),
    yaxis = dict(
        title = 'Threes Made in a Season'
    )
)

In [16]:
fig = go.Figure(data = data, layout = layout)
fig.layout.width=2500 #width in pixels
fig.layout.height = 750
py.iplot(fig, filename = 'threes made per season')

#### Sub-Task III: Team Assists per Season
Afterwards, we look closely at the number of assists per NBA season for each NBA team.

In [17]:
# Plot assists made per team

data = []
for team in teams[year]['key']:
    teamStats = np.array(XarrDF[XarrDF.index == team]['AS']).tolist()

    trace = go.Bar(
        x = teamYears,
        y = teamStats,
        name = team
    )
    data.append(trace)

layout = go.Layout(
    barmode = 'group',
    title = 'Assists per Teams per Season (2001-2016)',
    xaxis = dict(
        title = 'NBA Season'
    ),
    yaxis = dict(
        title = 'Assists in a Season'
    )
)

In [18]:
fig = go.Figure(data = data, layout = layout)
fig.layout.width=2500 #width in pixels
fig.layout.height = 750
py.iplot(fig, filename = 'assists per season')

#### Sub-Task IV: Points Scored per Season
The next feature we analyze is the number of points a team scores in each NBA season.

In [19]:
# Plot points scored per team

data = []
for team in teams[year]['key']:
    teamStats = np.array(XarrDF[XarrDF.index == team]['PTS']).tolist()

    trace = go.Bar(
        x = teamYears,
        y = teamStats,
        name = team
    )
    data.append(trace)

layout = go.Layout(
    barmode = 'group',
    title = 'Points Made by Teams Per Season (2001-2016)',
    xaxis = dict(
        title = 'NBA Season'
    ),
    yaxis = dict(
        title = 'Points Made in a Season'
    )
)

In [20]:
fig = go.Figure(data = data, layout = layout)
fig.layout.width=2500 #width in pixels
fig.layout.height = 750
py.iplot(fig, filename = 'points made per season')

#### Sub-Task V: Team Free-Throws Made per Season
The next feature we analyze is the number of free-throws a team scores in each NBA season.

In [21]:
# Plot free throws made per team

data = []
for team in teams[year]['key']:
    teamStats = np.array(XarrDF[XarrDF.index == team]['FTM']).tolist()

    trace = go.Bar(
        x = teamYears,
        y = teamStats,
        name = team
    )
    data.append(trace)

layout = go.Layout(
    barmode = 'group',
    title = 'Free-Throws Made by Teams Per Season (2001-2016)',
    xaxis = dict(
        title = 'NBA Season'
    ),
    yaxis = dict(
        title = 'Free-Throws Made in a Season'
    )
)

In [22]:
fig = go.Figure(data = data, layout = layout)
fig.layout.width=2500 #width in pixels
fig.layout.height = 750
py.iplot(fig, filename = 'free throws made per season')

#### Sub-Task VI: Team Steals per Season
The next feature we analyze is the number of steals teams commit per season.

In [23]:
# Plot steals made per team

data = []
for team in teams[year]['key']:
    teamStats = np.array(XarrDF[XarrDF.index == team]['ST']).tolist()

    trace = go.Bar(
        x = teamYears,
        y = teamStats,
        name = team
    )
    data.append(trace)

layout = go.Layout(
    barmode = 'group',
    title = 'Steals Made by Teams Per Season (2001-2016)',
    xaxis = dict(
        title = 'NBA Season'
    ),
    yaxis = dict(
        title = 'Steals Made in a Season'
    )
)

In [24]:
fig = go.Figure(data = data, layout = layout)
fig.layout.width=2500 #width in pixels
fig.layout.height = 750
py.iplot(fig, filename = 'steals made per season')

# Predicting NBA Playoff Teams
## Sub-process I: Determining the Statistical Significance of Team Stats to Playoff Status
For the first prediction, we will focus on predicting a whether an NBA team is cable of becoming a Playoff contender.

In this initial step, we focus on determining the statistical significance of features to NBA-Playoff contention. First, we look for the index of every NBA team that has been in the NBA-Playoffs during their respective season and those that weren't and store these indices in seperate NumPy arrays.

In [25]:
# record the indices of playoff teams, and non playoff teams

Iplay = np.where( np.array(allTeamHist['Playoff']) == True )[0]
Inon = np.where( np.array(allTeamHist['Playoff']) == False )[0]

Next, we create a deep copy of the allTeamHist dataframe, but remove irrelevant features such as:
- Playoff Indication (Playoff)
- Conference Indication (Conference)

because this is what we're trying to predict. In other words, we remove the target variable.

In [26]:
flag = False
for year in play_years:
    if not flag:
        flag = True
        teamNames = np.transpose( np.array( [teams[year]['team']] ) )
        continue
    teamNames = np.vstack( (teamNames, np.transpose( np.array( [teams[year]['team']] ) ) ) )
X = np.hstack( ( teamNames, np.array(deepcopy(allTeamHist)) ) )
X = pd.DataFrame(X, columns = ["Team"] + columns)

In [27]:
# only plot the first 6 rows of teams to save computation and plotting time (takes very long to plot ALL teams)

# allNBATeamsTable = FF.create_table(X.head(6))
# allNBATeamsTable.layout.width=2500 #width in pixels
# allNBATeamsTable.layout.margin.update({'t':75, 'l':50})
# allNBATeamsTable.layout.update({'title': 'All NBA Team Statistics (First 6)'})
# py.iplot(allNBATeamsTable, filename = 'All NBA Team Stats')
X.head(6)

Unnamed: 0,Team,Conference,Playoff,FGM,FG%,3M,3PT%,FTM,FT%,OR,REB%,AS,ST,TO,BK,PF,PTS,MVP
0,AtlantaHawks,0,0,2903,0.43905,423,0.354271,1487,0.765311,955,0.280717,1656,667,1204,350,1703,7716,0
1,BostonCeltics,0,1,3085,0.425517,759,0.359545,1580,0.771108,976,0.268353,1746,820,1136,301,1886,8509,0
2,CharlotteHornets,0,1,2893,0.439666,346,0.348089,1568,0.744893,1059,0.297138,1759,653,1108,456,1746,7700,0
3,ChicagoBulls,0,0,2458,0.441372,280,0.348259,1115,0.734036,670,0.248148,1498,493,973,329,1597,6311,0
4,ClevelandCavaliers,0,0,2948,0.447888,387,0.377193,1529,0.772222,968,0.280498,1891,572,1129,470,1752,7812,0
5,DallasMavericks,1,1,3660,0.461248,687,0.381032,1730,0.800185,946,0.259391,2219,623,1071,520,1925,9737,0


In [28]:
del X['Team']
del X['Playoff']
del X['Conference']

Then, we import the ztest function from the Python statsmodel.stats.weightstats module to perform a Z-test on our features. We do this to determine the significance of our features in the determination of the NBA-Playoff contenders.

In [29]:
# compute Z-Test on all features and compute statistical significance of each feature

statDF = []
for label in X.columns.tolist():
    play = np.array(X[label])[Iplay]
    non = np.array(X[label])[Inon]
    stat, pval = ztest(play, non)
    statDF.append( [label, stat, pval] )
    
statDF = pd.DataFrame(statDF, columns = ['Feature', 'statistic', 'p-val'])
statDF = statDF.sort_values (['p-val'])

In [30]:
#tabulate results of Z-Test

# allTeamsZtestTable = FF.create_table(statDF)
# allTeamsZtestTable.layout.width = 600
# allTeamsZtestTable.layout.margin.update({'t':75, 'l':50})
# allTeamsZtestTable.layout.update({'title': 'Features Statistical Significance'})
# py.iplot(allTeamsZtestTable, filename = 'Feat Stat Sig')

statDF

Unnamed: 0,Feature,statistic,p-val
1,FG%,8.768354,1.8129710000000002e-18
8,AS,8.384611,5.089396e-17
13,PTS,7.405338,1.30817e-13
0,FGM,6.609486,3.856556e-11
9,ST,6.408128,1.473176e-10
3,3PT%,6.251027,4.077616e-10
2,3M,5.57398,2.489848e-08
7,REB%,-5.208846,1.900186e-07
11,BK,5.141859,2.720336e-07
4,FTM,4.598369,4.258115e-06


Our analysis shows that features such as:
- Offensive Rebounding Percentage (REB%)
- Personal Fouls (PF)
- Turnovers (TO)
- Offensive Rebounds (OR)
- Free Throw Percentage (FT%)

do not have a real significance in the determination of the NBA-Playoff contention. Therefore, we remove these features from our feature matrix since they probabilistically reject our null hypothesis. Therefore, our most significant performance attributes (in order from most important down) are:
- Field Goal Percentage (FG%)
- Assists (AS)
- Points (PTS)
- Field Goals Made (FGM)
- Steals (ST)
- Three Point Percentage (3PT%)
- Threes Made (3M)
- Blocks (BK)

In [31]:
del X['REB%']
del X['PF']
del X['TO']
del X['OR']
del X['FT%']

In [32]:
from time import time

## Sub-process II: Developing Training Data
Now, we prepare a vector of boolean values indicating whether a team was an NBA-Playoff contender or not. We also cast the dataframe of predictors we have to a NumPy array type.

In [33]:
y = list( deepcopy(allTeamHist['Playoff']) )
Xs = preprocessing.scale(np.array(X))


Data with input dtype object was converted to float64 by the scale function.



## Sub-process III: Logistic Regression Classifier
### Task I: Train Classifier on Entire Dataset
Next, we develop a logistic classifier model using sklearn to predict NBA Playoff Contenders. First we train our logistic regression classifier on all of the NBA teams that have ever existed. Using this, we test out a range of regularization strengths, $C$, to determine what the best one is.

In [34]:
Cvals = np.logspace(-2,6,20)
C_max = [None for _ in range(3)]


# initialize minimum error rate of 100%
acc_max = [0 for _ in range(3)]
accLog = [[] for _ in range(3)]

y_hat = [None for _ in range(3)]

logReg = [None for _ in range(3)]

# parse through C values, train the logistic classifier for each C value, and compute the error rate of the model
# for the corresponding C value
for Cl1, Cl2, C in zip(Cvals, Cvals, Cvals):
    
    # create instance of logistic classifier (l1-penalized, l2-penalized, and unregularized)
    logReg[0] = linear_model.LogisticRegression(penalty = 'l1', C = Cl1)
    logReg[1] = linear_model.LogisticRegression(penalty = 'l2', C = Cl2)
    logReg[2] = linear_model.LogisticRegression(C = C)
    
    # train model with training data
    for i in range(3):
        logReg[i].fit(Xs, y)
        y_hat[i] = logReg[i].predict(Xs)
        accLog[i].append( np.mean(y_hat[i] == y)*100 )
        
        if (acc_max[i] < accLog[i][-1]):
            acc_max[i] = accLog[i][-1]
            
            if i is 0:
                C_max[i] = Cl1
            elif i is 1:
                C_max[i] = Cl2
            else:
                C_max[i] = C

In [35]:
acc_l1_log = np.transpose( np.array([accLog[0]]) )
acc_l2_log = np.transpose( np.array([accLog[1]]) )
acc_log = np.transpose( np.array([accLog[2]]) )
Cvals = np.transpose( np.array([Cvals]) )

In [36]:
accTable = pd.DataFrame( np.hstack((Cvals, acc_l1_log, acc_l2_log, acc_log))\
                                            , columns = ['Reg. Strength, $C$'
                                                         , '$\ell_1$ Acc., %'\
                                                         , '$\ell_2$ Acc., %'\
                                                         , 'Un-norm. Acc., %'])

# totAccTable = FF.create_table(accTable)

# totAccTable.layout.width = 1e3
# totAccTable.layout.margin.update({'t':75, 'l':50})
# totAccTable.layout.update({'title': 'Accuracy of Logistic Regression, Regularization Strengths on Total-Train Total-Predict Model'})
# py.iplot(totAccTable, filename = 'logRegCAccTable')
accTable


Unnamed: 0,"Reg. Strength, $C$","$\ell_1$ Acc., %","$\ell_2$ Acc., %","Un-norm. Acc., %"
0,0.01,46.308725,71.812081,71.812081
1,0.026367,70.469799,72.930649,72.930649
2,0.069519,71.14094,73.378076,73.378076
3,0.183298,73.60179,72.930649,72.930649
4,0.483293,72.706935,72.930649,72.930649
5,1.274275,73.60179,73.154362,73.154362
6,3.359818,73.154362,73.154362,73.154362
7,8.858668,73.378076,73.378076,73.378076
8,23.357215,73.378076,73.378076,73.378076
9,61.584821,73.378076,73.378076,73.378076


In [37]:
accTable.to_csv("logRegCAccuracy.csv", encoding='utf-8', index = False)
C_max

[0.18329807108324356, 0.069519279617756058, 0.069519279617756058]

Based on our analysis, we keep the regularization strength, $C$, which minimizes the error in predicting the NBA-Playoff contenders for all three of our models.

Now, we test our trained model to predict NBA-Playoff contenders in each NBA season.

## Sub-process III: Logistic Regression Classifier
### Task II: Use Entire-Trained-Classifier to Predict Playoffs in Each NBA Season
Using the regularization strength, $C$, with the least error in predicting the NBA-Playoff contender, we now use our traiend classifier to predict Playoff teams in each individual NBA Season, by iterating through each season.

In [38]:
# function that seperates top NBA teams so that there are 8 Eastern & 8 Western Playoff teams
def sepPredTeamsToEastAndWest(predTeams, eastTeams, westTeams):
    west, westPredict = 0, []
    east, eastPredict = 0, []
    
    # parse through predicted teams
    for team in predTeams:
        if team in westTeams and west < 8:
            west += 1
            westPredict.append(team)
            continue
        if team in eastTeams and east < 8:
            east += 1
            eastPredict.append(team)
            continue
    return eastPredict, westPredict

# function for computing accuracy of prediction
def computeAccForPredict(eastPredict, eastTruth, westPredict, westTruth):
    # to ensure that we do not have more than 8 teams per conference
    west = 0
    east = 0
    
    # intialize number of correct predictions to 0
    corr = 0
    for eastTeam, westTeam in zip(eastPredict, westPredict):
        
        if westTeam in westTruth and west < 8:
            west += 1
            corr += 1
        if eastTeam in eastTruth and east < 8:
            east += 1
            corr += 1
            
    return float(corr)/16

In [39]:
# Use logistic classifier that predicts with minimal error to predict playoff contenders per season, with
# training data from all the previous seasons before the testing set

logReg[0] = linear_model.LogisticRegression(penalty = 'l1', C = C_max[0])
logReg[1] = linear_model.LogisticRegression(penalty = 'l2', C = C_max[1])
logReg[2] = linear_model.LogisticRegression(C = C_max[2])

for i in range(3):
    logReg[i].fit(Xs,y)

accLog = [[] for _ in range(3)]
acc_l1_log = []
acc_l2_log = []
acc_log = []

for year in play_years:
    currSeason = nbaSeasons[year]
    teamNames = teams[year]['key']
    
    flag = False
    for team in teamNames:
        if(not flag):
            flag = True
            teamArr = np.array(currSeason[team].totalStats)
            continue
        teamArr = np.vstack((teamArr,np.array(currSeason[team].totalStats)))
    
    # Create training data for current year
    X = pd.DataFrame(teamArr, columns = columns)
    
    # copy expected output values
    y = list(deepcopy(X['Playoff']))
    
    # labels to delete, uneccesary for data fitting
    del_labels = ['REB%', 'PF', 'TO', 'OR', 'FT%', 'Playoff', 'Conference']
    for label in del_labels:
        del X[label]
    
    # scale/normalize X
    Xts = preprocessing.scale(X)
    
    # predict y
    y_hat_l1 = logReg[0].predict_proba(Xts)
    y_hat_l2 = logReg[1].predict_proba(Xts)
    y_hat = logReg[2].predict_proba(Xts)
    
    # find indices of expected Playoff teams
    playOffIndices = np.where(np.array(y) == True)
    
    teamNames = teams[year]['team']
    
    # determine names for Western Conference teams
    westTeams = list(np.where(teams[year]['Conference'] == 'West')[0])
    westTeamNames = list(np.array(teamNames)[westTeams])

    # of those Western conference teams, create a list of the 8 Western Playoff teams
    temp = []
    westPlayOffIndices = list(playOffIndices[:][0])
    for i in westPlayOffIndices:
        if i in westTeams:
            temp.append(i)
    westPlayOffIndices = temp[:]
    westPlayoffTeams = np.array(teamNames)[westPlayOffIndices]
    
    # repeat for Eastern Conference teams
    eastTeams = list(np.where(teams[year]['Conference'] == 'East')[0])
    eastTeamNames = list(np.array(teamNames)[eastTeams])
    
    temp = []
    eastPlayOffIndices = list(playOffIndices[:][0])
    for i in eastPlayOffIndices:
        if i in eastTeams:
            temp.append(i)
    eastPlayOffIndices = temp[:]
    eastPlayoffTeams = np.array(teamNames)[eastPlayOffIndices]
    
    
    # seperate top 8 predicted Eastern Teams, and top 8 predicted Western Teams
    sortedPlayPred_l1 = np.argsort(y_hat_l1[:,1])
    predictedTeams_l1 = np.array(teamNames)[sortedPlayPred_l1[-1::-1]]
    eastPredictTeams_l1, westPredictTeams_l1 =\
        sepPredTeamsToEastAndWest(predictedTeams_l1, eastTeamNames, westTeamNames)
        
    # repeat for l2-normalized classifier
    sortedPlayPred_l2 = np.argsort(y_hat_l2[:,1])
    predictedTeams_l2 = np.array(teamNames)[sortedPlayPred_l2[-1::-1]]
    eastPredictTeams_l2, westPredictTeams_l2 =\
        sepPredTeamsToEastAndWest(predictedTeams_l2, eastTeamNames, westTeamNames)
    
    # repeat for unnormalized classifier
    sortedPlayPred = np.argsort(y_hat[:,1])
    predictedTeams = np.array(teamNames)[sortedPlayPred[-1::-1]]
    eastPredictTeams, westPredictTeams =\
        sepPredTeamsToEastAndWest(predictedTeams, eastTeamNames, westTeamNames)
    
    acc_i_l1 = computeAccForPredict(eastPredictTeams_l1\
                                   , eastPlayoffTeams\
                                   , westPredictTeams_l1\
                                   , westPlayoffTeams)
    
    acc_i_l2 = computeAccForPredict(eastPredictTeams_l2\
                                   , eastPlayoffTeams\
                                   , westPredictTeams_l2\
                                   , westPlayoffTeams)
    
    acc_i = computeAccForPredict(eastPredictTeams\
                                   , eastPlayoffTeams\
                                   , westPredictTeams\
                                   , westPlayoffTeams)
    
    acc_l1_log.append(acc_i_l1*100)
    acc_l2_log.append(acc_i_l2*100)
    acc_log.append(acc_i*100)
    
    accLog[0].append(acc_i_l1*100)
    accLog[1].append(acc_i_l2*100)
    accLog[2].append(acc_i*100)

Using, our predictions and ground truth data, we tabulate the accuracy of our entire-dataset-trained classifier for predicting the NBA-Playoff teams in each season.

In [40]:
# create NumPy array of team years and accuracies in order to horizontally stack them
playYearsString = np.transpose( np.array([teamYears]) )

acc_l1_log = np.transpose( np.array([acc_l1_log]) )
acc_l2_log = np.transpose( np.array([acc_l2_log]) )
acc_log = np.transpose( np.array([acc_log]) )

In [41]:
accTable = pd.DataFrame( np.hstack((playYearsString, acc_l1_log, acc_l2_log, acc_log))\
                                            , columns = ['NBA Season'
                                                         , '$\ell_1$ Acc., %'\
                                                         , '$\ell_2$ Acc., %'\
                                                         , 'Un-normalized Acc., %'])

# totAccTable = FF.create_table(accTable)

# totAccTable.layout.width = 800
# totAccTable.layout.margin.update({'t':75, 'l':50})
# totAccTable.layout.update({'title': 'Accuracy of Total-Trained Model Over Each NBA Season'})
# py.iplot(totAccTable, filename = 'totAccTable')

accTable

Unnamed: 0,NBA Season,"$\ell_1$ Acc., %","$\ell_2$ Acc., %","Un-normalized Acc., %"
0,2001-2002,87.5,87.5,87.5
1,2002-2003,87.5,87.5,87.5
2,2003-2004,87.5,87.5,87.5
3,2004-2005,75.0,75.0,75.0
4,2005-2006,68.75,68.75,68.75
5,2006-2007,87.5,87.5,87.5
6,2007-2008,68.75,68.75,68.75
7,2008-2009,75.0,75.0,75.0
8,2009-2010,68.75,68.75,68.75
9,2010-2011,75.0,75.0,75.0


## Sub-process III: Logistic Regression Classifier
### Focus III: Create Dictionary of NBA Team Stats by Season
Next, we create a dictionary of NBA teams by the season, with each entry being the a dataframe of seasonal stats for a team.

In [42]:
# create ordered dictionary of NBA stats by season
teamsDict = OrderedDict()
for year in play_years:
    flag = False
    seasonTeams = []
    for team in teams[year]['key']:
        if not flag:
            flag = True
            seasonTeams = np.array(nbaSeasons[year][team].totalStats).ravel()
            continue
        seasonTeams = np.vstack( (seasonTeams, np.array(nbaSeasons[year][team].totalStats).ravel()) )
    
    seasonTeams = pd.DataFrame( seasonTeams, columns = columnsTeam, index = teams[year]['key'].ravel().tolist() )
    
    # delete irrelevant features as described by the Z-Test
    del seasonTeams['REB%']
    del seasonTeams['PF']
    del seasonTeams['TO']
    del seasonTeams['OR']
    del seasonTeams['FT%']
    del seasonTeams['Conference']
    teamsDict[year] = seasonTeams

## Sub-process III: Logistic Regression Classifier
### Focus IV: Train Classifier With All Years of NBA Team Data Prior to Each Season
In this focus area, we create two lists:
- Training Data List
- Testing Data List

The list organization is described by the following example:
Ex:
- Testing on 2013-2014 NBA Season
    - Training Data contains all team information from 2001-2013
    - Testing Data contains all team information for 2013-2014

In [43]:
def compileTrainData(teamsDict, year):
    flag = False
    for i in range(1,year+1):
        if not flag:
            flag = True
            Xtr = np.array(teamsDict[i])
            continue
        Xtr = np.vstack( (Xtr, np.array(teamsDict[i]) ) )
    return Xtr
    
trainingFeatureSets = []
trainingTargetSets = []
for year in play_years[:-1]:
    Xtr = compileTrainData(teamsDict, year)
    ytr = deepcopy(Xtr[:,0])
    Xtr = Xtr[:,1:]
    
    trainingFeatureSets.append(preprocessing.scale(Xtr))
    trainingTargetSets.append(ytr)

## Sub-process III: Logistic Regression Classifier
### Focus V: Predict NBA Playoff Contenders in Each Season & Determine Accuracy
Next, we use our trained model to predict NBA-Playoff contenders in each NBA season and compute the accuracy of our prediction for each season. Afterwards, we tabulate our results to analyze the relationship between progressively training the model with more data each season and predicting NBA-Playoff contenders. 

In [44]:
accLog = [[] for _ in range(3)]
trainTime = [[] for _ in range(3)]
predTime = [[] for _ in range(3)]

acc_i = [None for _ in range(3)]
logReg = [None for _ in range(3)]
yts_hat = [None for _ in range(3)]

predictedTeams = [None for _ in range(3)]
sortedPlayPred = [None for _ in range(3)]

eastPredictTeams = [None for _ in range(3)]
westPredictTeams = [None for _ in range(3)]

for i, year in enumerate(play_years[1:]):
    Xts = np.array(teamsDict[year])
    yts = deepcopy(Xts[:,0])
    Xts =  preprocessing.scale(Xts[:,1:])
    
    for j in range(3):
        if j in [0,1]:
            if j is 0:
                pen = 'l1'
            elif j is 1:
                pen = 'l2'
            logReg[j] = linear_model.LogisticRegression(penalty = pen, C = C_max[j])
        else:
            logReg[j] = linear_model.LogisticRegression(C = C_max[j])
        startTime = time()
        logReg[j].fit (trainingFeatureSets[i], trainingTargetSets[i])
        trainTime[j].append( (time() - startTime)*1e6 )
        
        startTime = time()
        yts_hat[j] = logReg[j].predict_proba(Xts)
        predTime[j].append( (time() - startTime)*1e6 )
    
    playOffIndices = np.where(np.array(yts) == True)
    
    westTeams = list(np.where(teams[year]['Conference'] == 'West')[0])
    westTeamNames = list(np.array(teams[year]['team'])[westTeams])

    temp = []
    westPlayOffIndices = list(playOffIndices[:][0])
    for j in westPlayOffIndices:
        if j in westTeams:
            temp.append(j)
    westPlayOffIndices = temp[:]
    westPlayoffTeams = np.array(teams[year]['team'])[westPlayOffIndices]
    
    eastTeams = list(np.where(teams[year]['Conference'] == 'East')[0])
    eastTeamNames = list(np.array(teams[year]['team'])[eastTeams])
    
    temp = []
    eastPlayOffIndices = list(playOffIndices[:][0])
    for j in eastPlayOffIndices:
        if j in eastTeams:
            temp.append(j)
    eastPlayOffIndices = temp[:]
    eastPlayoffTeams = np.array(teams[year]['team'])[eastPlayOffIndices]
    
    for j in range(3):
        sortedPlayPred[j] = np.argsort(yts_hat[j][:,1])
        predictedTeams[j] = np.array(teams[year]['team'])[sortedPlayPred[j][-1::-1]]
        eastPredictTeams[j], westPredictTeams[j] =\
            sepPredTeamsToEastAndWest(predictedTeams[j], eastTeamNames, westTeamNames)
        
        acc_i[j] = computeAccForPredict(eastPredictTeams[j]\
                                   , eastPlayoffTeams\
                                   , westPredictTeams[j]\
                                   , westPlayoffTeams)
    
        accLog[j].append(acc_i[j]*100)
        

In [45]:
cPickle.dump(logReg[0], open("logregl1.p", "wb"))
cPickle.dump(logReg[1], open("logregl2.p", "wb"))
cPickle.dump(logReg[2], open("logreg.p", "wb"))

In [46]:
trainYears = []
testYears = []

for curr in play_years[1:]:
    trainYears.append( str(2001) + '-' + str(2000 + curr) )
    testYears.append( str(2000 + curr) + '-' + str(2001 + curr) )

trainYears = np.transpose(np.array([trainYears]))
testYears = np.transpose(np.array([testYears]))

accl1 = np.transpose( np.array([accLog[0]]) )
accl2 = np.transpose( np.array([accLog[1]]) )
acc_ = np.transpose( np.array([accLog[2]]) )

trainTimel1 = np.transpose( np.array([trainTime[0]]) )
trainTimel2 = np.transpose( np.array([trainTime[1]]) )
trainTimeUn = np.transpose( np.array([trainTime[2]]) )

predTimel1 = np.transpose( np.array([predTime[0]]) )
predTimel2 = np.transpose( np.array([predTime[1]]) )
predTimeUn = np.transpose( np.array([predTime[2]]) )

In [47]:
# totAccTable = FF.create_table( pd.DataFrame( np.hstack((trainYears, testYears, accl1, accl2, acc_))\
#                                             , columns = ['Train Years', 'Test NBA Season'\
#                                                          , 'l1-Predict Acc.'\
#                                                          , 'l2-Predict Acc.'\
#                                                          , 'Un-normalized Predict Acc.']) )

# totAccTable.layout.width = 1000
# totAccTable.layout.margin.update({'t':75, 'l':50})
# totAccTable.layout.update({'title': 'Accuracy of Progressively-Trained Model Over Each NBA Season'})
# py.iplot(totAccTable, filename = 'progAccTable')

accTable = pd.DataFrame( np.hstack((trainYears, testYears, accl1, accl2, acc_))\
                                            , columns = ['Train Years', 'Test Season'\
                                                         , '$\ell_1$-Predict Acc., %'\
                                                         , '$\ell_2$-Predict Acc., %'\
                                                         , 'Un-normalized Predict Acc., %'] )

trainTable = pd.DataFrame( np.hstack((testYears, trainTimel1, trainTimel2, trainTimeUn))\
                        , columns = ['Test Season'\
                                    , '$\ell_1$ (usec.)'\
                                    , '$\ell_2$ (usec.)'\
                                    , '$un-norm.$ (usec.)'] )

predTable = pd.DataFrame( np.hstack((testYears, predTimel1, predTimel2, predTimeUn))\
                        , columns = ['Test Season'\
                                    , '$\ell_1$ (usec.)'\
                                    , '$\ell_2$ (usec.)'\
                                    , '$un-norm.$ (usec.)'] )

In [48]:
accTable.to_csv("logRegPredictAcc.csv", encoding='utf-8', index = False)
trainTable.to_csv("logRegTrainTime.csv", encoding = 'utf-8', index = False)
predTable.to_csv("logRegPredTime.csv", encoding = 'utf-8', index = False)
accTable

Unnamed: 0,Train Years,Test Season,"$\ell_1$-Predict Acc., %","$\ell_2$-Predict Acc., %","Un-normalized Predict Acc., %"
0,2001-2002,2002-2003,87.5,81.25,81.25
1,2001-2003,2003-2004,75.0,75.0,75.0
2,2001-2004,2004-2005,81.25,75.0,75.0
3,2001-2005,2005-2006,68.75,68.75,68.75
4,2001-2006,2006-2007,75.0,81.25,81.25
5,2001-2007,2007-2008,75.0,68.75,68.75
6,2001-2008,2008-2009,75.0,75.0,75.0
7,2001-2009,2009-2010,68.75,68.75,68.75
8,2001-2010,2010-2011,75.0,75.0,75.0
9,2001-2011,2011-2012,81.25,81.25,81.25


In [49]:
for i in range(len(accLog)):
    print(len(accLog[i]))

14
14
14


## Sub-process IV: C-Support Vector Classification
### Focus I: Reformat Entire Dataset for Total Training

After our analysis of the logistic regression classifier, we move forward to analyzing the accuracy of classifications using support vectore machines and c-support vector classification.

First, we reformat the Entire Dataset for total model training across the dataset to analyze the best regularization strength, $C$.

In [50]:
flag = False
for year in play_years:
    if not flag:
        flag = True
        teamNames = np.transpose( np.array( [teams[year]['team']] ) )
        continue
    teamNames = np.vstack( (teamNames, np.transpose( np.array( [teams[year]['team']] ) ) ) )
X = np.hstack( ( teamNames, np.array(deepcopy(allTeamHist)) ) )
X = pd.DataFrame(X, columns = ["Team"] + columns)

del X['Team']
del X['Playoff']
del X['REB%']
del X['PF']
del X['TO']
del X['OR']
del X['FT%']
del X['Conference']

y = list( deepcopy(allTeamHist['Playoff']) )
Xs = preprocessing.scale(np.array(X))


Data with input dtype object was converted to float64 by the scale function.



## Sub-process IV: C-Support Vector Classification
### Focus II: Intialize SVM Model Instance and Test Seperate Regularization Strengths on Entire Datatset Prediction
Afterwards, we intialize an support vector machine model instance and test our classifier across several regularization strengths, $C$.

In [51]:
Cvals = np.logspace(-2,5,8)

acc = [[] for _ in range(4)]

accMax = [0 for _ in range(4)]
svmModels = [None for _ in range(4)]
yHat = [None for _ in range(4)]
C_max = [Cvals[0] for _ in range(4)]

flag = False
for C in Cvals:
    
    for i in range(4):
        if i is 0:
            k = 'linear'
        elif i is 1:
            k = 'rbf'
        elif i is 2:
            k = 'poly'
        else:
            k = 'sigmoid'
        
        svmModels[i] = svm.SVC(kernel = k, C = C)
        
        svmModels[i].fit(Xs, y)
        yHat[i] = svmModels[i].predict(Xs)
        acc[i].append( round(np.mean(yHat[i] == y)*100,2) )
    
    for i in range(4):
        if accMax[i] < acc[i][-1]:
            accMax[i] = acc[i][-1]
            C_max[i] = C
            
    for i, a in enumerate(acc):
        if i is 0:
            k = 'linear'
        elif i is 1:
            k = 'rbf'
        elif i is 2:
            k = 'poly'
        else:
            k = 'sigmoid'
    print(C)

0.01
0.1
1.0
10.0
100.0
1000.0
10000.0
100000.0


In [52]:
Cvals = np.transpose( np.array([Cvals]) )
acc_lin = np.transpose( np.array([acc[0]]) )
acc_rbf = np.transpose( np.array([acc[1]]) )
acc_poly = np.transpose( np.array([acc[2]]) )
acc_sig = np.transpose( np.array([acc[3]]) )

accTable = pd.DataFrame(np.hstack((Cvals, acc_lin, acc_rbf, acc_poly, acc_sig))\
                        ,columns = ['Reg. $C$', '$linear$ Acc.'\
                                    , '$rbf$ Acc.', '$poly$ Acc.'\
                                    , '$sigmoid$ Acc.'])

In [53]:
accTable.to_csv('svmRegStrength.csv', encoding = 'utf-8')
print(C_max)
accTable

[100000.0, 1000.0, 100000.0, 0.10000000000000001]


Unnamed: 0,Reg. $C$,$linear$ Acc.,$rbf$ Acc.,$poly$ Acc.,$sigmoid$ Acc.
0,0.01,71.81,53.69,57.27,63.31
1,0.1,73.83,72.71,66.22,70.92
2,1.0,73.38,77.63,76.96,65.77
3,10.0,73.6,86.35,80.31,66.0
4,100.0,73.6,95.08,87.7,66.22
5,1000.0,73.38,100.0,91.05,66.22
6,10000.0,73.38,100.0,93.74,66.22
7,100000.0,74.27,100.0,95.08,66.22


## Sub-process IV: C-Support Vector Classification
### Focus IV: Use SVM's with Minimal Errors to Progressively Predict Throughout NBA Seasons
Based on our tabulated results, we seee that using radial basis function and 3rd-degree polynomial kernels yield minimal errors when the regularization strength, $C$, is approximately equal to:

In [54]:
C_max

[100000.0, 1000.0, 100000.0, 0.10000000000000001]

In [55]:
def compileTrainData(teamsDict, year):
    flag = False
    for i in range(1,year+1):
        if not flag:
            flag = True
            Xtr = np.array(teamsDict[i])
            continue
        Xtr = np.vstack( (Xtr, np.array(teamsDict[i]) ) )
    return Xtr
    
trainingFeatureSets = []
trainingTargetSets = []
for year in play_years[:-1]:
    Xtr = compileTrainData(teamsDict, year)
    ytr = deepcopy(Xtr[:,0])
    Xtr = Xtr[:,1:]
    
    trainingFeatureSets.append(preprocessing.scale(Xtr))
    trainingTargetSets.append(ytr)

In [56]:
accSVM = [[] for _ in range(4)]

trainTime = [[] for _ in range(4)]
predTime = [[] for _ in range(4)]

acc_i = [None for _ in range(4)]
svmModels = [None for _ in range(4)]
yts_hat = [None for _ in range(4)]

predictedTeams = [None for _ in range(4)]
sortedPlayPred = [None for _ in range(4)]

eastPredictTeams = [None for _ in range(4)]
westPredictTeams = [None for _ in range(4)]

for i, year in enumerate(play_years[1:]):
    Xts = np.array(teamsDict[year])
    yts = deepcopy(Xts[:,0])
    Xts =  preprocessing.scale(Xts[:,1:])
    
    playOffIndices = np.where(np.array(yts) == True)
    
    westTeams = list(np.where(teams[year]['Conference'] == 'West')[0])
    westTeamNames = list(np.array(teams[year]['team'])[westTeams])

    temp = []
    westPlayOffIndices = list(playOffIndices[:][0])
    for j in westPlayOffIndices:
        if j in westTeams:
            temp.append(j)
    westPlayOffIndices = temp[:]
    westPlayoffTeams = np.array(teams[year]['team'])[westPlayOffIndices]
    
    eastTeams = list(np.where(teams[year]['Conference'] == 'East')[0])
    eastTeamNames = list(np.array(teams[year]['team'])[eastTeams])
    
    temp = []
    eastPlayOffIndices = list(playOffIndices[:][0])
    for j in eastPlayOffIndices:
        if j in eastTeams:
            temp.append(j)
    eastPlayOffIndices = temp[:]
    eastPlayoffTeams = np.array(teams[year]['team'])[eastPlayOffIndices]
    
    for j in range(4):
        if j is 0:
            k = 'linear'
        elif j is 1:
            k = 'rbf'
        elif j is 2:
            k = 'poly'
        else:
            k = 'sigmoid'
        
        svmModels[j] = svm.SVC(kernel = k, C = C_max[j])
        startTime = time()
        svmModels[j].fit( trainingFeatureSets[i], trainingTargetSets[i] )
        trainTime[j].append((time() - startTime)*1e3)
        
        startTime = time()
        yts_hat[j] = svmModels[j].predict(Xts)
        predTime[j].append((time() - startTime)*1e3)
        
        sortedPlayPred[j] = np.argsort(yts_hat[j][:])
        predictedTeams[j] = np.array(teams[year]['team'])[sortedPlayPred[j][-1::-1]]
        
        eastPredictTeams[j], westPredictTeams[j] = sepPredTeamsToEastAndWest(predictedTeams[j], eastTeamNames, westTeamNames)
        
        acc_i[j] = computeAccForPredict(eastPredictTeams[j]\
                                   , eastPlayoffTeams\
                                   , westPredictTeams[j]\
                                   , westPlayoffTeams)
        
        accSVM[j].append( round(acc_i[j]*100,2) )

In [57]:
trainYears = []
testYears = []

for curr in play_years[1:]:
    trainYears.append( str(2001) + '-' + str(2000 + curr) )
    testYears.append( str(2000 + curr) + '-' + str(2001 + curr) )

trainYears = np.transpose(np.array([trainYears]))
testYears = np.transpose(np.array([testYears]))

SVMaccArr = []
trainTimeArr = []
predTimeArr = []

for i in range(len(accSVM)):
    SVMaccArr.append( np.transpose(np.array([accSVM[i]])) )
    trainTimeArr.append( np.transpose(np.array([trainTime[i]])) )
    predTimeArr.append( np.transpose(np.array([predTime[i]])) )


In [58]:
# totAccTable = FF.create_table( pd.DataFrame( np.hstack((trainYears, testYears, acc_rbfArr, acc_polyArr))\
#                                             , columns = ['Train Years', 'Test NBA Season'\
#                                                          , 'rbf SVM Predict Acc.'\
#                                                          , 'poly SVM Predict Acc.']) )

# totAccTable.layout.width = 700
# totAccTable.layout.margin.update({'t':75, 'l':50})
# totAccTable.layout.update({'title': 'Accuracy of Progressively-Trained SVM Model Over Each NBA Season'})
# py.iplot(totAccTable, filename = 'progAccSVMTable')

accTable = pd.DataFrame( np.hstack((trainYears, testYears, SVMaccArr[0], SVMaccArr[1], SVMaccArr[2], SVMaccArr[3]))\
                                            , columns = ['Train Years', 'Test NBA Season'\
                                                         , 'linear SVM Predict Acc.'\
                                                         , 'rbf SVM Predict Acc.'\
                                                         , 'poly SVM Predict Acc.'\
                                                         , 'sigmoid SVM Predict Acc.'])

trainTable = pd.DataFrame( np.hstack((testYears, trainTimeArr[0], trainTimeArr[1], trainTimeArr[2], trainTimeArr[3]))\
                                            , columns = ['Test Season'\
                                                         , '$linear$ (msec.)'\
                                                         , '$rbf$ (msec.)'\
                                                         , '$poly$ (msec.)'\
                                                         , '$sigmoid$ (msec.)'])

predTable = pd.DataFrame( np.hstack((testYears, predTimeArr[0], predTimeArr[1], predTimeArr[2], predTimeArr[3]))\
                                            , columns = ['Test Season'\
                                                         , '$linear$ (msec.)'\
                                                         , '$rbf$ (msec.)'\
                                                         , '$poly$ (msec.)'\
                                                         , '$sigmoid$ (msec.)'])

In [59]:
accTable.to_csv('svmPredictAccuracy.csv', encoding = 'utf-8', index = False)
trainTable.to_csv('svmTrainTable.csv', encoding = 'utf-8', index = False)
predTable.to_csv('svmPredTable.csv', encoding = 'utf-8', index = False)

accTable

Unnamed: 0,Train Years,Test NBA Season,linear SVM Predict Acc.,rbf SVM Predict Acc.,poly SVM Predict Acc.,sigmoid SVM Predict Acc.
0,2001-2002,2002-2003,81.25,87.5,56.25,62.5
1,2001-2003,2003-2004,81.25,75.0,75.0,62.5
2,2001-2004,2004-2005,75.0,62.5,75.0,68.75
3,2001-2005,2005-2006,75.0,68.75,62.5,75.0
4,2001-2006,2006-2007,75.0,62.5,62.5,75.0
5,2001-2007,2007-2008,62.5,75.0,62.5,75.0
6,2001-2008,2008-2009,68.75,62.5,62.5,75.0
7,2001-2009,2009-2010,81.25,62.5,68.75,68.75
8,2001-2010,2010-2011,62.5,56.25,50.0,75.0
9,2001-2011,2011-2012,75.0,75.0,56.25,81.25


In [60]:
for i in range(len(accSVM)):
    print(len(accSVM[i]))

14
14
14
14


## Sub-process V: Random Forest Classification
### Focus I: Reformat Entire Dataset for Total Training
After our analysis of the support vector machine (SVM) classifier, we move forward to analyzing the accuracy of classifications using random forests or random decision forests for classification.

Random forests operate by constructing a multitude of decision tress at training time and outputting the class that is the mode of the classes. Rnadom decision forests correct for decision trees' habit of overfitting to their training set.

First, we reformat the Entire Dataset for total model training across the dataset to analyze the best regularization strength, $C$.

In [61]:
flag = False
for year in play_years:
    if not flag:
        flag = True
        teamNames = np.transpose( np.array( [teams[year]['team']] ) )
        continue
    teamNames = np.vstack( (teamNames, np.transpose( np.array( [teams[year]['team']] ) ) ) )
X = np.hstack( ( teamNames, np.array(deepcopy(allTeamHist)) ) )
X = pd.DataFrame(X, columns = ["Team"] + columns)

del X['Team']
del X['Playoff']
del X['REB%']
del X['PF']
del X['TO']
del X['OR']
del X['FT%']
del X['Conference']

y = list( deepcopy(allTeamHist['Playoff']) )
Xs = preprocessing.scale(np.array(X))


Data with input dtype object was converted to float64 by the scale function.



## Sub-process V: Random Forest Classification
### Focus II: Intialize Random Forest Model Instance and Test Seperate Tree Counts on Entire Datatset Prediction
Afterwards, we intialize a random forest model instance and test our classifier across several numbers of trees to use.

In [62]:
num_est_vals = np.logspace(1,3,20)
num_est_vals = num_est_vals.ravel().tolist()
for i in range(len(num_est_vals)):
    num_est_vals[i] = int(num_est_vals[i])

acc = []

acc_min = 0

flag = False
for num_est in num_est_vals:
    if not flag:
        flag = True
        est_min = num_est
        
    rfModel = RandomForestClassifier(n_estimators = num_est)
    
    rfModel.fit(Xs,y)
    
    y_hat = rfModel.predict(Xs)
    
    acc_i = round( np.mean(y_hat == y)*100, 2)
    
    acc.append(acc_i)
    
    if (acc_min < acc_i):
        acc_min = acc_i
        est_min = num_est

## Sub-process V: Random Forest Classification
### Focus III: Tabulate & Analyze Entire Dataset Training Errors

Next, we tabulate our results and analyze the errors based on the tree numbers on each of our Random Forest classifier models.

In [63]:
num_est_valsArr = np.transpose( np.array([num_est_vals]))

RFaccArr = np.transpose( np.array([acc]) )

In [64]:
# totAccTable = FF.create_table( pd.DataFrame( np.hstack((num_est_valsArr, accArr))\
#                                                    , columns = ['Number of Estimators (Trees)', 'Accuracy (%)']) )

# totAccTable.layout.width = 500
# totAccTable.layout.margin.update({'t':75, 'l':50})
# totAccTable.layout.update({'title': 'Random Forest Classifier Error Rates'})
# py.iplot(totAccTable, filename = 'overRandomForestTable')

accTable = pd.DataFrame( np.hstack((num_est_valsArr, RFaccArr))\
                                                   , columns = ['Number of Estimators (Trees)', 'Accuracy (%)'])

accTable

Unnamed: 0,Number of Estimators (Trees),Accuracy (%)
0,10.0,99.33
1,12.0,99.55
2,16.0,99.78
3,20.0,99.78
4,26.0,99.55
5,33.0,100.0
6,42.0,100.0
7,54.0,100.0
8,69.0,100.0
9,88.0,100.0


In [65]:
accTable.to_csv("rfTreeAcc.csv", encoding = 'utf-8', index = False)

In [66]:
n_max = est_min
n_max

33

## Sub-process V: Random Forest Classification
### Focus IV: Use Random Forest Models with Minimal Errors to Progressively Predict Throughout NBA Seasons

In [67]:
def compileTrainData(teamsDict, year):
    flag = False
    for i in range(1,year+1):
        if not flag:
            flag = True
            Xtr = np.array(teamsDict[i])
            continue
        Xtr = np.vstack( (Xtr, np.array(teamsDict[i]) ) )
    return Xtr
    
trainingFeatureSets = []
trainingTargetSets = []
for year in play_years[:-1]:
    Xtr = compileTrainData(teamsDict, year)
    ytr = deepcopy(Xtr[:,0])
    Xtr = Xtr[:,1:]
    
    trainingFeatureSets.append(preprocessing.scale(Xtr))
    trainingTargetSets.append(ytr)

In [68]:
accRF = []
trainTime = []
predTime = []

for i, year in enumerate(play_years[1:]):
    Xts = np.array(teamsDict[year])
    yts = deepcopy(Xts[:,0])
    Xts =  preprocessing.scale(Xts[:,1:])
    
    rfModel = RandomForestClassifier(est_min)
    
    startTime = time()
    rfModel.fit( trainingFeatureSets[i], trainingTargetSets[i] )
    trainTime.append( round( (time() - startTime)*1e3, 2 ) )
    
    startTime = time()
    yts_hat = rfModel.predict_proba(Xts)
    predTime.append( round( (time() - startTime)*1e3, 2 ) )
    
    playOffIndices = np.where(np.array(yts) == True)
    
    westTeams = list(np.where(teams[year]['Conference'] == 'West')[0])
    westTeamNames = list(np.array(teams[year]['team'])[westTeams])

    temp = []
    westPlayOffIndices = list(playOffIndices[:][0])
    for j in westPlayOffIndices:
        if j in westTeams:
            temp.append(j)
    westPlayOffIndices = temp[:]
    westPlayoffTeams = np.array(teams[year]['team'])[westPlayOffIndices]
    
    eastTeams = list(np.where(teams[year]['Conference'] == 'East')[0])
    eastTeamNames = list(np.array(teams[year]['team'])[eastTeams])
    
    temp = []
    eastPlayOffIndices = list(playOffIndices[:][0])
    for j in eastPlayOffIndices:
        if j in eastTeams:
            temp.append(j)
    eastPlayOffIndices = temp[:]
    eastPlayoffTeams = np.array(teams[year]['team'])[eastPlayOffIndices]
    
    sortedPlayPred = np.argsort(yts_hat[:,1])
    predictedTeams = np.array(teams[year]['team'])[sortedPlayPred[-1::-1]]
    eastPredictTeams, westPredictTeams =\
    sepPredTeamsToEastAndWest(predictedTeams, eastTeamNames, westTeamNames)
    
    acc_i = computeAccForPredict(eastPredictTeams\
                                   , eastPlayoffTeams\
                                   , westPredictTeams\
                                   , westPlayoffTeams)
    
    accRF.append(round( acc_i*100, 2 ))

In [69]:
cPickle.dump(rfModel, open("rfModel.p", "wb"))

In [70]:
trainYears = []
testYears = []

for curr in play_years[1:]:
    trainYears.append( str(2001) + '-' + str(2000 + curr) )
    testYears.append( str(2000 + curr) + '-' + str(2001 + curr) )

trainYears = np.transpose(np.array([trainYears]))
testYears = np.transpose(np.array([testYears]))

acc_RFArr = np.transpose( np.array([accRF]) )
trainTime = np.transpose( np.array([trainTime]) )
predTime = np.transpose( np.array([predTime]) )

In [71]:
# totAccTable = FF.create_table( pd.DataFrame( np.hstack((trainYears, testYears, acc_RFArr))\
#                                             , columns = ['Train Years', 'Test NBA Season'\
#                                                          , 'Random Forest Accuracy']) )

# totAccTable.layout.width = 750
# totAccTable.layout.margin.update({'t':75, 'l':50})
# totAccTable.layout.update({'title': 'Accuracy of Progressively-Trained Random Forest Model Over Each NBA Season'})
# py.iplot(totAccTable, filename = 'progAccRandomForestTable')

accTable = pd.DataFrame( np.hstack((trainYears, testYears, acc_RFArr, trainTime, predTime))\
                                            , columns = ['Train Years', 'Test Season'\
                                                         , 'RF Acc. %'\
                                                         , 'Train. Time (msec.)'\
                                                         , 'Predict. Time (msec.)'])

accTable

Unnamed: 0,Train Years,Test Season,RF Acc. %,Train. Time (msec.),Predict. Time (msec.)
0,2001-2002,2002-2003,81.25,84.27,12.87
1,2001-2003,2003-2004,75.0,58.29,2.2
2,2001-2004,2004-2005,75.0,36.83,2.3
3,2001-2005,2005-2006,68.75,33.54,2.39
4,2001-2006,2006-2007,68.75,36.68,2.14
5,2001-2007,2007-2008,75.0,38.49,2.16
6,2001-2008,2008-2009,75.0,44.5,2.63
7,2001-2009,2009-2010,68.75,46.8,2.39
8,2001-2010,2010-2011,81.25,39.76,2.16
9,2001-2011,2011-2012,81.25,43.77,2.47


In [72]:
accTable.to_csv("RFpredictAccuracy.csv", encoding = 'utf-8', index = False)

In [73]:
print(len(accRF))

14


## Sub-process VI: Decision Tree Classification
### Focus: Intialize Decision Tree Model Instance and Predict Outcomes
Afterwards, we intialize a decision tree model instance and test our classifier across several NBA seasons

In [74]:
def compileTrainData(teamsDict, year):
    flag = False
    for i in range(1,year+1):
        if not flag:
            flag = True
            Xtr = np.array(teamsDict[i])
            continue
        Xtr = np.vstack( (Xtr, np.array(teamsDict[i]) ) )
    return Xtr
    
trainingFeatureSets = []
trainingTargetSets = []
for year in play_years[:-1]:
    Xtr = compileTrainData(teamsDict, year)
    ytr = deepcopy(Xtr[:,0])
    Xtr = Xtr[:,1:]
    
    trainingFeatureSets.append(preprocessing.scale(Xtr))
    trainingTargetSets.append(ytr)

In [75]:
accTree = []
trainTime = []
predTime = []
for i, year in enumerate(play_years[1:]):
    Xts = np.array(teamsDict[year])
    yts = deepcopy(Xts[:,0])
    Xts =  preprocessing.scale(Xts[:,1:])
    
    treeModel = DecisionTreeClassifier()
    
    startTime = time()
    treeModel.fit( trainingFeatureSets[i], trainingTargetSets[i] )
    trainTime.append( round( (time() - startTime)*1e3, 2) )
    
    startTime = time()
    yts_hat = treeModel.predict_proba(Xts)
    predTime.append( round( (time() - startTime)*1e3, 2) )
    
    playOffIndices = np.where(np.array(yts) == True)
    
    westTeams = list(np.where(teams[year]['Conference'] == 'West')[0])
    westTeamNames = list(np.array(teams[year]['team'])[westTeams])

    temp = []
    westPlayOffIndices = list(playOffIndices[:][0])
    for j in westPlayOffIndices:
        if j in westTeams:
            temp.append(j)
    westPlayOffIndices = temp[:]
    westPlayoffTeams = np.array(teams[year]['team'])[westPlayOffIndices]
    
    eastTeams = list(np.where(teams[year]['Conference'] == 'East')[0])
    eastTeamNames = list(np.array(teams[year]['team'])[eastTeams])
    
    temp = []
    eastPlayOffIndices = list(playOffIndices[:][0])
    for j in eastPlayOffIndices:
        if j in eastTeams:
            temp.append(j)
    eastPlayOffIndices = temp[:]
    eastPlayoffTeams = np.array(teams[year]['team'])[eastPlayOffIndices]
    
    sortedPlayPred = np.argsort(yts_hat[:,1])
    predictedTeams = np.array(teams[year]['team'])[sortedPlayPred[-1::-1]]
    eastPredictTeams, westPredictTeams =\
    sepPredTeamsToEastAndWest(predictedTeams, eastTeamNames, westTeamNames)
    
    acc_i = computeAccForPredict(eastPredictTeams\
                                   , eastPlayoffTeams\
                                   , westPredictTeams\
                                   , westPlayoffTeams)
    
    accTree.append(round(acc_i*100,2))

In [76]:
cPickle.dump(treeModel, open("treeModel.p", "wb"))

In [77]:
trainYears = []
testYears = []

for curr in play_years[1:]:
    trainYears.append( str(2001) + '-' + str(2000 + curr) )
    testYears.append( str(2000 + curr) + '-' + str(2001 + curr) )

trainYears = np.transpose(np.array([trainYears]))
testYears = np.transpose(np.array([testYears]))

acc_TreeArr = np.transpose( np.array([accTree]) )
trainTime = np.transpose( np.array([trainTime]) )
predTime = np.transpose( np.array([predTime]) )

In [78]:
# totAccTable = FF.create_table( pd.DataFrame( np.hstack((trainYears, testYears, acc_TreeArr))\
#                                             , columns = ['Train Years', 'Test NBA Season'\
#                                                          , 'Decision Tree Accuracy']) )

# totAccTable.layout.width = 750
# totAccTable.layout.margin.update({'t':75, 'l':50})
# totAccTable.layout.update({'title': 'Accuracy of Progressively-Trained Decision Tree Model Over Each NBA Season'})
# py.iplot(totAccTable, filename = 'progAccDecisionTreeTable')

accTable = pd.DataFrame( np.hstack((trainYears, testYears, acc_TreeArr, trainTime, predTime))\
                                            , columns = ['Train Years', 'Test Season'\
                                                         , 'DT Acc., %'\
                                                         , 'Train Time (msec.)'\
                                                         , 'Predict Time (msec.)'])

accTable

Unnamed: 0,Train Years,Test Season,"DT Acc., %",Train Time (msec.),Predict Time (msec.)
0,2001-2002,2002-2003,81.25,0.5,0.15
1,2001-2003,2003-2004,75.0,0.74,0.16
2,2001-2004,2004-2005,75.0,0.91,0.18
3,2001-2005,2005-2006,62.5,0.95,0.16
4,2001-2006,2006-2007,68.75,1.41,0.22
5,2001-2007,2007-2008,68.75,1.6,0.18
6,2001-2008,2008-2009,68.75,1.81,0.16
7,2001-2009,2009-2010,68.75,2.06,0.2
8,2001-2010,2010-2011,75.0,2.39,0.22
9,2001-2011,2011-2012,68.75,2.57,0.13


In [79]:
accTable.to_csv("decTreePredictAccuracy.csv", encoding = 'utf-8', index = False)
accTable.columns

Index(['Train Years', 'Test Season', 'DT Acc., %', 'Train Time (msec.)',
       'Predict Time (msec.)'],
      dtype='object')

In [80]:
print(len(accTree))

14


## Conclusion for NBA-Playoff Predictions

Based on our results for the earlier seasons in the NBA, it makes sense to use the logistic regression classifiers for predicting NBA-Playoff contention, however we see that as we obtain more training data and try to predict some of the more recent NBA-Playoff contenders, the random forest classifier model's accuracy begins to level with that of the logistic regression classifiers. Therefore, we hypothesize that as we continue to obtain more data for the coming NBA seasons, the random forest classifier and logistic classifiers will continue to be the most accurate of the classifier models that we analyze in this project.

In [81]:
totalAcc = [accLog[0], accLog[1], accLog[2], accSVM[0], accSVM[1], accSVM[2], accSVM[3], accRF, accTree]
names = ['l1-penalized logistic', 'l2-penalized logistic',
         'unregularized logistic', 'linear SVM',
         'radial basis SVM', ' polynomial SVM', 
         'sigmoid SVM', 'random forest', 'decision tree']

In [82]:
data = []

predictYears = []
for i in play_years[1:]:
    predictYears.append( str(2000+i) + '-' + str(2000+i+1) )
for currAcc, name in zip(totalAcc, names):
    print(len(currAcc))
    trace = go.Bar(
                x = currAcc,
                y = predictYears,
                orientation = 'h',
                name = name
    )
    data.append(trace)

14
14
14
14
14
14
14
14
14


In [83]:
layout = go.Layout(
            title = 'Progressive Classifier Model Accuracies Throughout NBA Seasons',
            autosize = True,
            width = 1000,
            height = 1000)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'All Model Acccuracies Throughout NBA Seasons')
allAccPlot

In [84]:
data = []
for currAcc, name in zip(totalAcc, names):
    trace = go.Bar(
                y = currAcc[0],
                x = predictYears[0],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2002, Predict: 2002-2003)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2002, Predict: 2002-2003)')
allAccPlot

In [85]:
data = []
for currAcc, name in zip(totalAcc, names):
    trace = go.Bar(
                y = currAcc[1],
                x = predictYears[1],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2003, Predict: 2003-2004)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2003, Predict: 2003-2004)')
allAccPlot

In [86]:
data = []
for currAcc, name in zip(totalAcc, names):
    trace = go.Bar(
                y = currAcc[2],
                x = predictYears[2],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2004, Predict: 2004-2005)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2004, Predict: 2004-2005)')
allAccPlot

In [87]:
data = []
for currAcc, name in zip(totalAcc, names):
    trace = go.Bar(
                y = currAcc[3],
                x = predictYears[3],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2005, Predict: 2005-2006)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2005, Predict: 2005-2006)')
allAccPlot

In [88]:
data = []
for currAcc, name in zip(totalAcc, names):
    trace = go.Bar(
                y = currAcc[4],
                x = predictYears[4],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2006, Predict: 2006-2007)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2006, Predict: 2006-2007)')
allAccPlot

In [89]:
data = []
for currAcc, name in zip(totalAcc, names):
    trace = go.Bar(
                y = currAcc[5],
                x = predictYears[5],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2007, Predict: 2007-2008)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2007, Predict: 2007-2008)')
allAccPlot

In [90]:
data = []
for currAcc, name in zip(totalAcc, names):
    trace = go.Bar(
                y = currAcc[6],
                x = predictYears[6],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2008, Predict: 2008-2009)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2008, Predict: 2008-2009)')
allAccPlot

In [91]:
data = []
for currAcc, name in zip(totalAcc, names):
    trace = go.Bar(
                y = currAcc[7],
                x = predictYears[7],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2009, Predict: 2009-2010)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2009, Predict: 2009-2010)')
allAccPlot

In [92]:
data = []
for currAcc, name in zip(totalAcc, names):
    trace = go.Bar(
                y = currAcc[8],
                x = predictYears[8],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2010, Predict: 2010-2011)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2010, Predict: 2010-2011)')
allAccPlot

In [93]:
data = []
for currAcc, name in zip(totalAcc, names):
    print(currAcc[9])
    trace = go.Bar(
                y = currAcc[9],
                x = predictYears[9],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2011, Predict: 2011-2012)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2011, Predict: 2011-2012)')
allAccPlot

81.25
81.25
81.25
75.0
75.0
56.25
81.25
81.25
68.75


In [94]:
data = []
for currAcc, name in zip(totalAcc, names):
    trace = go.Bar(
                y = currAcc[10],
                x = predictYears[10],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2012, Predict: 2012-2013)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2012, Predict: 2012-2013)')
allAccPlot

In [95]:
data = []
for currAcc, name in zip(totalAcc, names):
    print(currAcc[9])
    trace = go.Bar(
                y = currAcc[11],
                x = predictYears[11],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2013, Predict: 2013-2014)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2013, Predict: 2013-2014)')
allAccPlot

81.25
81.25
81.25
75.0
75.0
56.25
81.25
81.25
68.75


In [96]:
data = []
for currAcc, name in zip(totalAcc, names):
    print(currAcc[9])
    trace = go.Bar(
                y = currAcc[12],
                x = predictYears[12],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2014, Predict: 2014-2015)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2014, Predict: 2014-2015)')
allAccPlot

81.25
81.25
81.25
75.0
75.0
56.25
81.25
81.25
68.75


In [97]:
data = []
for currAcc, name in zip(totalAcc, names):
    print(currAcc[9])
    trace = go.Bar(
                y = currAcc[13],
                x = predictYears[13],
                name = name
    )
    data.append(trace)
layout = go.Layout(
            title = 'Classifier Model Accuracies (Train: 2001-2015, Predict: 2015-2016)',
            autosize = True,
            width = 700,
            height = 900)
fig = go.Figure(data = data, layout = layout)

allAccPlot = py.iplot(fig, filename = 'Classifier Model Accuracies (Train: 2001-2015, Predict: 2015-2016)')
allAccPlot

81.25
81.25
81.25
75.0
75.0
56.25
81.25
81.25
68.75


In [98]:
accMeans = []
accVar = []
for i in totalAcc:
    accMeans.append(np.mean(i))
    accVar.append(np.var(i))
    
accMeans = np.argsort(accMeans)[-1::-1]
accVar = np.argsort(accVar)

accMeans = accMeans.ravel().tolist()

for i in accMeans:
    print(names[i])
print()
for i in accVar:
    print(names[i])

l1-penalized logistic
unregularized logistic
l2-penalized logistic
random forest
sigmoid SVM
linear SVM
decision tree
radial basis SVM
 polynomial SVM

random forest
decision tree
sigmoid SVM
l2-penalized logistic
unregularized logistic
linear SVM
l1-penalized logistic
 polynomial SVM
radial basis SVM


In [105]:
dataTable = pd.DataFrame(totalAcc)
dataTable.head(len(dataTable))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,87.5,75.0,81.25,68.75,75.0,75.0,75.0,68.75,75.0,81.25,81.25,75.0,87.5,62.5
1,81.25,75.0,75.0,68.75,81.25,68.75,75.0,68.75,75.0,81.25,81.25,81.25,81.25,62.5
2,81.25,75.0,75.0,68.75,81.25,68.75,75.0,68.75,75.0,81.25,81.25,81.25,81.25,62.5
3,81.25,81.25,75.0,75.0,75.0,62.5,68.75,81.25,62.5,75.0,68.75,68.75,75.0,62.5
4,87.5,75.0,62.5,68.75,62.5,75.0,62.5,62.5,56.25,75.0,62.5,50.0,62.5,56.25
5,56.25,75.0,75.0,62.5,62.5,62.5,62.5,68.75,50.0,56.25,62.5,43.75,68.75,50.0
6,62.5,62.5,68.75,75.0,75.0,75.0,75.0,68.75,75.0,81.25,75.0,68.75,81.25,75.0
7,81.25,75.0,75.0,68.75,68.75,75.0,75.0,68.75,81.25,81.25,81.25,68.75,75.0,75.0
8,81.25,75.0,75.0,62.5,68.75,68.75,68.75,68.75,75.0,68.75,68.75,68.75,62.5,68.75
