# NBA PREDICTIVE MODELLING

#### Resources
Predicting Results for Professional Basketball Using NBA API Data (2016)
http://cs229.stanford.edu/proj2016/report/PerriconeShawSwiechowicz-PredictingResultsforProfessionalBasketballUsingNBAAPIData.pdf

Predicting NBA games using neural networks (2009)
http://www.perducosports.com/media/NBA_Article.pdf

## 1. Build a Historical data set with the help of NBA_PY

#### Season name:
Format: NNNN-NN (eg. 1995-96)
#### Game ID:
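Format: 002 + YY + NNNNN, where YY is the last two digits of the season's starting year and NNNNN is the zero-padded game number (e.g. 0021600001 is the first game of the 2016-17 season)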
#### SeasonType:
One of: "Regular Season", "Pre Season", "Playoffs", "All-Star", "All Star", "Preseason"

In [2]:
import nba_py
import nba_py.game as api_game
import nba_py.league as api_league
import nba_py.team as api_team
import nba_py.player as api_player
# from nba_py import constants
import pandas as pd
import time
import numpy as np
from IPython.display import display, HTML
import pyarrow

## Get team level stats, for each season:

In [33]:
nba_py.constants.TEAMS

{'ATL': {'abbr': 'ATL',
  'city': 'Atlanta',
  'code': 'hawks',
  'color': 'E2373E',
  'colors': ['E2373E', '002A5C', 'BAC4CA'],
  'conference': 'Eastern',
  'displayAbbr': 'ATL',
  'displayConference': 'Eastern',
  'division': 'Southeast',
  'id': '1610612737',
  'name': 'Hawks'},
 'BKN': {'abbr': 'BKN',
  'city': 'Brooklyn',
  'code': 'nets',
  'color': '000000',
  'colors': ['000000', 'FFFFFF'],
  'conference': 'Eastern',
  'displayAbbr': 'BKN',
  'displayConference': 'Eastern',
  'division': 'Atlantic',
  'id': '1610612751',
  'name': 'Nets'},
 'BOS': {'abbr': 'BOS',
  'city': 'Boston',
  'code': 'celtics',
  'color': '007239',
  'colors': ['007239', 'AE8445', '982527', '000000'],
  'conference': 'Eastern',
  'displayAbbr': 'BOS',
  'displayConference': 'Eastern',
  'division': 'Atlantic',
  'id': '1610612738',
  'name': 'Celtics'},
 'CHA': {'abbr': 'CHA',
  'city': 'Charlotte',
  'code': 'hornets',
  'color': '00848E',
  'colors': ['00848E', '260F54', 'CCCCCC'],
  'conference': 'E

In [36]:
def get_team_ids():
    '''
    Return the list of team ids found in nba_py.constants.TEAMS.

    The id is the primary team identifier used by the rest of the notebook.
    As a side effect, the id -> position mapping is printed.
    '''
    teams = nba_py.constants.TEAMS
    # Position of each team in TEAMS iteration order, keyed by team id.
    id_to_position = {teams[abbr]['id']: pos for pos, abbr in enumerate(teams)}
    print(id_to_position)
    return [teams[abbr]['id'] for abbr in teams]

def get_team_ids_dict():
    '''
    Map each team id to its position in nba_py.constants.TEAMS (for 1 hot encoding).
    '''
    teams = nba_py.constants.TEAMS
    return {teams[abbr]['id']: pos for pos, abbr in enumerate(teams)}


def get_team_name_dict():
    '''
    Map each team id to its team name, as listed in nba_py.constants.TEAMS.
    '''
    return {info['id']: info['name'] for info in nba_py.constants.TEAMS.values()}


def get_seasons_list(first, last):
    '''
    Build season names in Format: NNNN-NN (eg. 1995-96).

    One name is produced per starting year in range(first, last), so `first`
    is the first starting year and `last` itself is excluded.
    '''
    return ['{}-{}'.format(start, str(start + 1)[2:])
            for start in range(first, last)]


def get_season_lengths(seasons, team_ids):
    '''
    Count the distinct game ids of each season.

    Every team's game log is requested for every season and the Game_ID
    values are pooled; the season length is the number of unique ids.
    Probably a better way but can't find it in api.

    Prints each season with its count and returns a dict of season -> count.
    '''
    lengths_by_season = {}
    for season in seasons:
        unique_game_ids = set()
        for team in team_ids:
            game_log = api_team.TeamGameLogs(season=season, team_id=team)
            unique_game_ids.update(game_log.info().Game_ID.values)
            # Pause for a second after every request.
            time.sleep(1)

        game_count = len(unique_game_ids)
        lengths_by_season[season] = game_count
        print(season, game_count)
    return lengths_by_season

# team_ids = get_team_ids()
# seasons = get_seasons_list(2000, 2018)
# season_lengths = get_season_lengths(seasons, team_ids)

In [37]:
get_team_name_dict()

{'1610612737': 'Hawks',
 '1610612738': 'Celtics',
 '1610612739': 'Cavaliers',
 '1610612740': 'Pelicans',
 '1610612741': 'Bulls',
 '1610612742': 'Mavericks',
 '1610612743': 'Nuggets',
 '1610612744': 'Warriors',
 '1610612745': 'Rockets',
 '1610612746': 'Clippers',
 '1610612747': 'Lakers',
 '1610612748': 'Heat',
 '1610612749': 'Bucks',
 '1610612750': 'Timberwolves',
 '1610612751': 'Nets',
 '1610612752': 'Knicks',
 '1610612753': 'Magic',
 '1610612754': 'Pacers',
 '1610612755': 'Sixers',
 '1610612756': 'Suns',
 '1610612757': 'Trail Blazers',
 '1610612758': 'Kings',
 '1610612759': 'Spurs',
 '1610612760': 'Thunder',
 '1610612761': 'Raptors',
 '1610612762': 'Jazz',
 '1610612763': 'Grizzlies',
 '1610612764': 'Wizards',
 '1610612765': 'Pistons',
 '1610612766': 'Hornets'}

In [18]:
# Number of games per season, keyed by season name (NNNN-NN).
# NOTE(review): presumably the recorded result of the commented-out
# get_season_lengths(seasons, team_ids) run — confirm before relying on it.
season_lengths = {
    '2000-01': 1189,
    '2001-02': 1189,
    '2002-03': 1189,
    '2003-04': 1189,
    '2004-05': 1230,
    '2005-06': 1230,
    '2006-07': 1230,
    '2007-08': 1230,
    '2008-09': 1230,
    '2009-10': 1230,
    '2010-11': 1230,
    '2011-12': 990,
    '2012-13': 1230,
    '2013-14': 1230,
    '2014-15': 1230,
    '2015-16': 1230,
    '2016-17': 1230,
    '2017-18': 1230,
}

SyntaxError: invalid syntax (<ipython-input-18-5190236f6838>, line 1)

In [20]:
season_lengths

{'2000-01': 1189,
 '2001-02': 1189,
 '2002-03': 1189,
 '2003-04': 1189,
 '2004-05': 1230,
 '2005-06': 1230,
 '2006-07': 1230,
 '2007-08': 1230,
 '2008-09': 1230,
 '2009-10': 1230,
 '2010-11': 1230,
 '2011-12': 990,
 '2012-13': 1230,
 '2013-14': 1230,
 '2014-15': 1230,
 '2015-16': 1230,
 '2016-17': 1230,
 '2017-18': 1230}

In [24]:
def get_game_features(game_id, sleep_between_requests=0.2):
    '''
    Fetch the team-level boxscore tables of one game and join them into one frame.

    game_id: game identifier handed to every nba_py boxscore request.
    sleep_between_requests: seconds to pause after each of the first five requests.

    Returns the four-factors table merged, in order, with the advanced, misc,
    scoring and line-score tables (on GAME_ID and TEAM_ID), then the game
    summary (on GAME_ID) and the other-stats table (on TEAM_ID).
    '''

    # Columns retained from each table.
    traditional_cols = ['GAME_ID', 'TEAM_ID', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
                        'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS']
    advanced_cols = ['GAME_ID', 'TEAM_ID', 'OFF_RATING', 'DEF_RATING', 'NET_RATING',
                     'AST_PCT', 'AST_TOV', 'AST_RATIO', 'DREB_PCT', 'REB_PCT',
                     'TS_PCT', 'USG_PCT', 'PACE', 'PIE']
    four_factors_cols = ['GAME_ID', 'TEAM_ID', 'EFG_PCT', 'FTA_RATE', 'TM_TOV_PCT',
                         'OREB_PCT', 'OPP_EFG_PCT', 'OPP_FTA_RATE', 'OPP_TOV_PCT', 'OPP_OREB_PCT']
    misc_cols = ['GAME_ID', 'TEAM_ID', 'PTS_OFF_TOV', 'PTS_2ND_CHANCE', 'PTS_FB', 'PTS_PAINT',
                 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE', 'OPP_PTS_FB', 'OPP_PTS_PAINT',
                 'BLK', 'BLKA', 'PF', 'PFD']
    scoring_cols = ['GAME_ID', 'TEAM_ID', 'PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_2PT',
                    'PCT_PTS_2PT_MR', 'PCT_PTS_3PT', 'PCT_PTS_FB', 'PCT_PTS_FT', 'PCT_PTS_OFF_TOV',
                    'PCT_PTS_PAINT', 'PCT_AST_2PM', 'PCT_UAST_2PM', 'PCT_AST_3PM',
                    'PCT_UAST_3PM', 'PCT_AST_FGM', 'PCT_UAST_FGM']
    summary_cols = ['GAME_ID', 'GAME_DATE_EST', 'GAME_SEQUENCE', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID']
    line_score_cols = ['GAME_ID', 'TEAM_ID', 'PTS_QTR1', 'PTS_QTR2', 'PTS_QTR3', 'PTS_QTR4', 'PTS']
    other_stats_cols = ['TEAM_ID', 'LARGEST_LEAD', 'LEAD_CHANGES']

    # NOTE(review): the traditional boxscore is requested but never merged into
    # the returned frame — confirm whether it should be joined or the request dropped.
    boxscore = api_game.Boxscore(game_id).team_stats()[traditional_cols]
    time.sleep(sleep_between_requests)

    advanced = api_game.BoxscoreAdvanced(game_id).sql_team_advanced()[advanced_cols]
    time.sleep(sleep_between_requests)

    four_factors = api_game.BoxscoreFourFactors(game_id).sql_team_four_factors()[four_factors_cols]
    time.sleep(sleep_between_requests)

    misc = api_game.BoxscoreMisc(game_id).sql_team_misc()[misc_cols]
    time.sleep(sleep_between_requests)

    scoring = api_game.BoxscoreScoring(game_id).sql_team_scoring()[scoring_cols]
    time.sleep(sleep_between_requests)

    # One summary request provides three tables.
    summary = api_game.BoxscoreSummary(game_id)
    game_summary = summary.game_summary()[summary_cols]
    line_score = summary.line_score()[line_score_cols]
    other_stats = summary.other_stats()[other_stats_cols]

    # Per-team tables share the (GAME_ID, TEAM_ID) key.
    combined = four_factors
    for team_table in (advanced, misc, scoring, line_score):
        combined = combined.merge(team_table, on=['GAME_ID', 'TEAM_ID'])

    # The game summary is joined on the game only, other stats on the team only.
    combined = combined.merge(game_summary, on=['GAME_ID'])
    combined = combined.merge(other_stats, on=['TEAM_ID'])

    return combined



def scrape_season_boxscores(season_lengths, skip_seasons=None):
    '''
    Use season_lengths dict to collect each season's box scores and save to parquet.

    season_lengths: dict mapping season name (NNNN-NN) to the number of games
        to fetch for that season.
    skip_seasons: optional collection of season names to leave out; nothing is
        skipped when omitted.

    For every season that is not skipped, the frames returned by
    get_game_features are stacked and written to
    'boxscores_raw_<season>.parquet'. Returns None.
    '''
    # None instead of a mutable [] default, so no list is shared between calls.
    skipped = () if skip_seasons is None else skip_seasons

    for season, n_games in season_lengths.items():
        if season in skipped:
            print("Skipping " + season)
            continue

        # Game ids are '002' + characters 2-3 of the season name + the
        # game number zero-padded to five digits.
        year_code = season[2:4]
        game_frames = []

        for game in range(1, n_games + 1):
            gameid = '002{}{}'.format(year_code, str(game).zfill(5))
            game_frames.append(get_game_features(gameid, sleep_between_requests=0))

            if game % 10 == 0:
                print('Up to {}, game {} / {}'.format(season, game, n_games))

        # Concatenate once per season: DataFrame.append inside the loop re-copied
        # the accumulated frame on every game and no longer exists in pandas 2.0.
        # pd.concat rejects an empty list, so fall back to an empty frame.
        if game_frames:
            boxscore_combined = pd.concat(game_frames, ignore_index=True)
        else:
            boxscore_combined = pd.DataFrame()

        boxscore_combined.to_parquet('boxscores_raw_{}.parquet'.format(season))


In [28]:
scrape_season_boxscores({'2002-03': 1189})#, skip_seasons=['2000-01','2001-02','2002-03','2003-04', '2004-05'])
# 04-05 is broken and will need to be fixed by loading from disc

Up to 2002-03, game 10 / 1189
Up to 2002-03, game 20 / 1189
Up to 2002-03, game 30 / 1189
Up to 2002-03, game 40 / 1189
Up to 2002-03, game 50 / 1189
Up to 2002-03, game 60 / 1189
Up to 2002-03, game 70 / 1189
Up to 2002-03, game 80 / 1189
Up to 2002-03, game 90 / 1189
Up to 2002-03, game 100 / 1189
Up to 2002-03, game 110 / 1189
Up to 2002-03, game 120 / 1189
Up to 2002-03, game 130 / 1189
Up to 2002-03, game 140 / 1189
Up to 2002-03, game 150 / 1189
Up to 2002-03, game 160 / 1189
Up to 2002-03, game 170 / 1189
Up to 2002-03, game 180 / 1189
Up to 2002-03, game 190 / 1189
Up to 2002-03, game 200 / 1189
Up to 2002-03, game 210 / 1189
Up to 2002-03, game 220 / 1189
Up to 2002-03, game 230 / 1189
Up to 2002-03, game 240 / 1189
Up to 2002-03, game 250 / 1189
Up to 2002-03, game 260 / 1189
Up to 2002-03, game 270 / 1189
Up to 2002-03, game 280 / 1189
Up to 2002-03, game 290 / 1189
Up to 2002-03, game 300 / 1189
Up to 2002-03, game 310 / 1189
Up to 2002-03, game 320 / 1189
Up to 2002-03, ga

In [6]:
%timeit

## Player level stats:

In [5]:
api_player.get_player('Kevin', 'Durant')

1040    201142
Name: PERSON_ID, dtype: int64

In [None]:
## Take a look at NBA Finals 2017-18
# NOTE(review): the id built below starts with "002", like the ids the season
# scraper builds, while the id left in the trailing comment ("0041700404")
# starts with "004" — confirm which game this cell is meant to show.
good_cols = ['GAME_DATE_EST', 'TEAM_NICKNAME',
       'TEAM_WINS_LOSSES', 'PTS_QTR1', 'PTS_QTR2', 'PTS_QTR3', 'PTS_QTR4', 'PTS']
game_number = 4
idx = "00217{}".format(str(game_number).zfill(5)) #"0041700404"
print(idx)
# nba_py.game is imported as api_game; the name `api` is never defined, so the
# previous `api.game.BoxscoreSummary(idx)` raised NameError.
boxscore_summary = api_game.BoxscoreSummary(idx)
# Show only the selected line-score columns.
boxscore_summary.line_score()[good_cols]
# game_stats = game.BoxscoreAdvanced(id1)
# game_stats.sql_team_advanced()

In [13]:
# Request the boxscore summary of game "0021600457" and print its
# season-series table.
boxscore_summary = api_game.BoxscoreSummary("0021600457")
# print(boxscore_summary.line_score().iloc[:,8:])
print(boxscore_summary.season_series())

      GAME_ID  HOME_TEAM_ID  VISITOR_TEAM_ID        GAME_DATE_EST  \
0  0021600457    1610612739       1610612744  2016-12-25T00:00:00   

   HOME_TEAM_WINS  HOME_TEAM_LOSSES SERIES_LEADER  
0               1                 1          Tied  


In [26]:
boxscore_summary.line_score().iloc[0,5:]

TEAM_CITY_NAME      Cleveland
TEAM_NICKNAME       Cavaliers
TEAM_WINS_LOSSES         23-6
PTS_QTR1                   25
PTS_QTR2                   27
PTS_QTR3                   28
PTS_QTR4                   29
PTS_OT1                     0
PTS_OT2                     0
PTS_OT3                     0
PTS_OT4                     0
PTS_OT5                     0
PTS_OT6                     0
PTS_OT7                     0
PTS_OT8                     0
PTS_OT9                     0
PTS_OT10                    0
PTS                       109
Name: 0, dtype: object