In [1]:
import nba_api
import pandas as pd
import requests
import pickle
import os
import glob

In [2]:
headers  = {
    'Connection': 'keep-alive',
    'Accept': 'application/json, text/plain, */*',
    'x-nba-stats-token': 'true',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'x-nba-stats-origin': 'stats',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://stats.nba.com/',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
}

In [3]:
# import all the necesssary packages
import time
from requests.exceptions import ReadTimeout
from nba_api.stats.endpoints import leaguegamelog
from nba_api.stats.endpoints import boxscoretraditionalv2
from nba_api.stats.endpoints import boxscoresummaryv2
from nba_api.stats.endpoints import playbyplayv2

# create list of seasons and final dictionary for lineups
seasons = list(range(21996, 22000))
lineup_dict = {}
run = 1

In [4]:
# build API retry function to use for every npa_api call
def api_call_with_retries(api_class, retries=3, delay=10, **kwargs):
    for attempt in range(retries):
        try:
            api_instance = api_class(**kwargs)
            return api_instance
        except ReadTimeout as e:
            print(f"ReadTimeout occurred: {e}")
            if attempt < retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"Failed to retrieve data after {retries} attempts.")
                return None

In [5]:
def time_to_seconds(time_str):
    # Split the time string into minutes and seconds
    minutes, seconds = map(int, time_str.split(':'))
    # Convert the time to total seconds
    total_seconds = minutes * 60 + seconds
    return total_seconds

In [6]:
def get_season(season_id):
    start_year = int(season_id) % 10000
    end_year = (start_year + 1) % 100
    season = f"{start_year}-{end_year:02d}"
    return season

In [7]:
def get_season_games(season):
    gamefinder = api_call_with_retries(leaguegamelog.LeagueGameLog, season=season)
    games = gamefinder.get_data_frames()[0]['GAME_ID']
    games = games.drop_duplicates().sort_values()
    return games

In [8]:
def get_team_ids(game_id):
    teams = api_call_with_retries(boxscoresummaryv2.BoxScoreSummaryV2, game_id=game_id)
    home_team_id = teams.get_data_frames()[0][['HOME_TEAM_ID']].iloc[0].iloc[0]
    away_team_id = teams.get_data_frames()[0][['VISITOR_TEAM_ID']].iloc[0].iloc[0]
    return home_team_id, away_team_id

In [9]:
def get_starting_lineup_1q(game_id, home_team_id, away_team_id):
    box_score = api_call_with_retries(boxscoretraditionalv2.BoxScoreTraditionalV2, game_id=game_id)
    box_score = box_score.get_data_frames()[0]
    starters = box_score[box_score['START_POSITION']!=""][['TEAM_ID','PLAYER_ID']].sort_values(by=['TEAM_ID','PLAYER_ID'])
    home_team_lineup_1q = starters[starters['TEAM_ID']==home_team_id]['PLAYER_ID'].tolist()
    away_team_lineup_1q = starters[starters['TEAM_ID']==away_team_id]['PLAYER_ID'].tolist()
    starting_lineup_1q = home_team_lineup_1q + away_team_lineup_1q
    return starting_lineup_1q

In [10]:
def get_pbp(game_id):
    pbp = api_call_with_retries(playbyplayv2.PlayByPlayV2, game_id=game_id)
    pbp = pbp.get_data_frames()[0]
    return pbp

In [11]:
def get_team_starting_lineup_quarter(pbp_quarter, team_id):
        players = []
        exculde_event_types = [6, 7, 11]
        players.extend(pbp_quarter[(pbp_quarter['PLAYER1_TEAM_ID'] == team_id) & (~pbp_quarter['EVENTMSGTYPE'].isin(exculde_event_types))]['PLAYER1_ID'].tolist())
        players.extend(pbp_quarter[(pbp_quarter['PLAYER2_TEAM_ID'] == team_id) & (~pbp_quarter['EVENTMSGTYPE'].isin(exculde_event_types))]['PLAYER2_ID'].tolist())
        players.extend(pbp_quarter[(pbp_quarter['PLAYER3_TEAM_ID'] == team_id) & (~pbp_quarter['EVENTMSGTYPE'].isin(exculde_event_types))]['PLAYER3_ID'].tolist())
        # unique set of players that did something in the second half for the team
        players = sorted(set(players))
        players_dict = {}
        # loop through each home player and count how many times they were involved and if they were subbed in or out first
        for player in players:
            player_involvements = 0
            for index, row in pbp_quarter.iterrows():
                if (row['PLAYER1_ID']==player) or (row['PLAYER2_ID']==player) or (row['PLAYER3_ID']==player):
                    if row['EVENTMSGTYPE']!=8:
                        player_involvements += 1
            subbed_in = pbp_quarter[(pbp_quarter['EVENTMSGTYPE']==8) & (pbp_quarter['PLAYER2_ID']==player)].index
            subbed_in = subbed_in.to_series().reset_index(drop=True)
            subbed_out = pbp_quarter[(pbp_quarter['EVENTMSGTYPE']==8) & (pbp_quarter['PLAYER1_ID']==player)].index
            subbed_out = subbed_out.to_series().reset_index(drop=True)
            first_time_subbed_in = min(subbed_in) if len(subbed_in)>0 else 9999
            first_time_subbed_out = min(subbed_out) if len(subbed_out)>0 else 9999
            players_dict[player] = {'player_involvements': player_involvements
                                   ,'first_time_subbed_in': first_time_subbed_in
                                   ,'first_time_subbed_out': first_time_subbed_out}
        team_lineup = [player_id for player_id, stats in players_dict.items()
                       if stats['first_time_subbed_in'] >= stats['first_time_subbed_out']]
        return team_lineup

In [12]:
def get_row_lineup(index, row, pbp, current_lineup, quarter):
    # check if there were any subs
    if row['EVENTMSGTYPE']==8:
        # Create the new lineup
        current_lineup = [row['PLAYER2_ID'] if x == row['PLAYER1_ID'] else x for x in current_lineup]
        current_lineup = sorted(current_lineup[:5]) + sorted(current_lineup[5:])
    # if quarter changes get new lineups
    if quarter != row['PERIOD']:
        home_team_lineup = get_team_starting_lineup_quarter(pbp[pbp['PERIOD']==row['PERIOD']], home_team_id)
        away_team_lineup = get_team_starting_lineup_quarter(pbp[pbp['PERIOD']==row['PERIOD']], away_team_id)
        current_lineup = home_team_lineup + away_team_lineup
    return current_lineup

In [13]:
def get_first_possession(pbp_action):
    if pbp_action.iloc[0]['HOMEDESCRIPTION'] is not None:
        current_possession = 1
    else:
        current_possession = 0
    return current_possession

In [14]:
# Go through each season and get the list of games for that season
for season_id in seasons:
    # convert season_id to actual season
    season = get_season(season_id)
    lineup_dict[season_id] = {}
    print("Season:", season)
    # get list of all game IDs for that season
    games = get_season_games(season)
    # set this based on how much was saved on last one
    games = games[games > "0021201214"]
    # Go game by game and accumulate lineup stats
    for game_id in games:
        print("Run:", run, " Game:", game_id)
        if run % 1000 == 0:
            time.sleep(2)
        elif run % 100 == 0:
            time.sleep(1)
        else:
            time.sleep(0.65)
        run += 1
        # Get Team IDs
        home_team_id, away_team_id = get_team_ids(game_id)
        # Get PBP
        pbp = get_pbp(game_id)
        # Get 1st quarter starters
        current_lineup = get_starting_lineup_1q(game_id, home_team_id, away_team_id)
        # loop through play by play and get the lineup for every row
        quarter = 1
        pbp_lineups = []
        for index, row in pbp.iterrows(): 
            current_lineup = get_row_lineup(index, row, pbp, current_lineup, quarter)
            quarter = row['PERIOD']
            pbp_lineups.append(current_lineup)
    
        pbp['lineups'] = pbp_lineups
        
        # remove steal and block info as its clouding possession count
        pbp.loc[pbp['HOMEDESCRIPTION'].str.contains('STEAL|BLOCK', na=False), 'HOMEDESCRIPTION'] = None
        pbp.loc[pbp['VISITORDESCRIPTION'].str.contains('STEAL|BLOCK', na=False), 'VISITORDESCRIPTION'] = None

        # create new pbp dataframe of only action events
        skip_events = [6, 7, 8, 9, 10, 11, 12, 13]
        action_rows = ~pbp['EVENTMSGTYPE'].isin(skip_events)
        pbp_action = pbp[action_rows].reset_index(drop=True)

        # get the team who starts with possession
        last_possession = get_first_possession(pbp_action)

        # initialize variables
        home_score = 0
        away_score = 0
        home_start_score = 0
        away_start_score = 0
        game_possession = 1
        last_quarter = 1
        start_time = "12:00"
        lineup_dict[season_id][game_id] = {"home_team": home_team_id
                                          ,"away_team": away_team_id
                                          ,"plays": {}}
        lineup_dict[season_id][game_id]['plays'][0] = {"players": pbp_lineups[0]
                                                       ,"game_possession": game_possession
                                                       ,"home_possession": last_possession
                                                       ,"points": 0
                                                       ,"event": ""
                                                       ,"description": ""
                                                       ,"start_period": 1
                                                       ,"start_time": start_time
                                                       ,"end_period": ""
                                                       ,"end_time": ""}
        
        for index, row in pbp_action.iterrows():
            if row['SCORE'] is not None:
                away_score, home_score = row['SCORE'].split(' - ')
            if row['HOMEDESCRIPTION'] is not None:
                play_description = row['HOMEDESCRIPTION']
                current_possession = 1
                points = int(home_score)-int(home_start_score)
            else:
                play_description = row['VISITORDESCRIPTION']
                current_possession = 0
                points = int(away_score)-int(away_start_score)
                
            lineup = row['lineups']
            current_quarter = row['PERIOD']
            event = row['EVENTMSGTYPE']
            end_time = row['PCTIMESTRING']
            
            if (last_possession == current_possession) & (last_quarter == current_quarter):
                lineup_dict[season_id][game_id]['plays'][index] = {"players": lineup
                                                                   ,"game_possession": game_possession
                                                                   ,"home_possession": current_possession
                                                                   ,"points": points
                                                                   ,"event": event
                                                                   ,"description": play_description
                                                                   ,"period": current_quarter
                                                                   ,"start_time": start_time
                                                                   ,"end_time": end_time}
            else:
                game_possession += 1
                last_possession = current_possession
                last_quarter = current_quarter
                lineup_dict[season_id][game_id]['plays'][index] = {"players": lineup
                                                                   ,"game_possession": game_possession
                                                                   ,"home_possession": current_possession
                                                                   ,"points": points
                                                                   ,"event": event
                                                                   ,"description": play_description
                                                                   ,"period": current_quarter
                                                                   ,"start_time": start_time
                                                                   ,"end_time": end_time}
            home_start_score = home_score
            away_start_score = away_score
            start_time = end_time

Season: 1996-97
Run: 1  Game: 0029600001
Run: 2  Game: 0029600002
Run: 3  Game: 0029600003
Run: 4  Game: 0029600004
Run: 5  Game: 0029600005
Run: 6  Game: 0029600006
Run: 7  Game: 0029600007
Run: 8  Game: 0029600008
Run: 9  Game: 0029600009
Run: 10  Game: 0029600010
Run: 11  Game: 0029600011
Run: 12  Game: 0029600012
Run: 13  Game: 0029600013
Run: 14  Game: 0029600014
Run: 15  Game: 0029600015
Run: 16  Game: 0029600016
Run: 17  Game: 0029600017
Run: 18  Game: 0029600018
Run: 19  Game: 0029600019
Run: 20  Game: 0029600020
Run: 21  Game: 0029600021
Run: 22  Game: 0029600022
Run: 23  Game: 0029600023
Run: 24  Game: 0029600024
Run: 25  Game: 0029600025
Run: 26  Game: 0029600026
Run: 27  Game: 0029600027
Run: 28  Game: 0029600028
Run: 29  Game: 0029600029
Run: 30  Game: 0029600030
Run: 31  Game: 0029600031
Run: 32  Game: 0029600032
Run: 33  Game: 0029600033
Run: 34  Game: 0029600034
Run: 35  Game: 0029600035
Run: 36  Game: 0029600036
Run: 37  Game: 0029600037
Run: 38  Game: 0029600038
Run: 

In [15]:
def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

def nested_dict_to_dataframe(nested_dict):
    rows = []
    for season, games in nested_dict.items():
        for game, game_data in games.items():
            base_info = {
                'season': season,
                'game': game,
                'home_team': game_data['home_team'],
                'away_team': game_data['away_team']
            }
            for play, plays_data in game_data['plays'].items():
                flat_dict = flatten_dict(plays_data)
                flat_dict.update(base_info)
                flat_dict['play'] = play
                rows.append(flat_dict)
    return pd.DataFrame(rows)

In [16]:
# Convert the nested dictionary to a DataFrame
df = nested_dict_to_dataframe(lineup_dict)

df.groupby(['game','season','home_possession']).agg(possessions=('game_possession', 'max'), points=('points', 'sum')).reset_index()

Unnamed: 0,game,season,home_possession,possessions,points
0,0029600001,21996,0,192,107
1,0029600001,21996,1,193,98
2,0029600002,21996,0,164,90
3,0029600002,21996,1,165,77
4,0029600003,21996,0,197,111
...,...,...,...,...,...
8579,0029901187,21999,1,203,88
8580,0029901188,21999,0,178,96
8581,0029901188,21999,1,179,95
8582,0029901189,21999,0,187,114


In [17]:
os.chdir('/Users/graallen/Documents/Personal Docs/Grad School/fall_2024/ds_785/capstone')

with open('lineup_dict_0029600001_0029901189.pkl','wb') as file:
    pickle.dump(lineup_dict, file)

### Import data back in for QA

In [14]:
os.chdir('/Users/graallen/Documents/Personal Docs/Grad School/fall_2024/ds_785/capstone')

# Find all files matching the pattern
file_list = glob.glob('lineup_dict_*.pkl')

In [15]:
# Initialize an empty list to store DataFrames
lineups = []

# Iterate over the list of files
for file in file_list:
    with open(file, 'rb') as f:
        lineup_dict = pickle.load(f)
        
        # Convert the dictionary to a DataFrame (assuming the dictionary can be directly converted)
        df = nested_dict_to_dataframe(lineup_dict)
        
        # Append the DataFrame to the list
        lineups.append(df)

# Concatenate all DataFrames into a single DataFrame
lineups_df_raw = pd.concat(lineups, ignore_index=True)

lineups_df_raw.head(10)

ValueError: No objects to concatenate

In [16]:
lineups_df_raw.groupby(['game','season'])['home_possessions','home_score','away_possessions','away_score','seconds_elapsed'].sum()

  lineups_df_raw.groupby(['game','season'])['home_possessions','home_score','away_possessions','away_score','seconds_elapsed'].sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,home_possessions,home_score,away_possessions,away_score,seconds_elapsed
game,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0020000001,22000,93,72,95,101,2880
0020000002,22000,108,82,113,86,2880
0020000003,22000,105,97,107,86,2880
0020000004,22000,98,82,105,106,2880
0020000005,22000,111,95,105,104,2880
...,...,...,...,...,...,...
0029901185,21999,105,103,106,98,3180
0029901186,21999,99,95,105,86,2880
0029901187,21999,110,88,104,99,2880
0029901188,21999,96,95,95,96,2880


In [17]:
lineups_df_raw.groupby('season')[('home_possessions','home_score','away_possessions','away_score','seconds_elapsed')].sum()

  lineups_df_raw.groupby('season')[('home_possessions','home_score','away_possessions','away_score','seconds_elapsed')].sum()


Unnamed: 0_level_0,home_possessions,home_score,away_possessions,away_score,seconds_elapsed
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21996,117652,116737,117205,113689,3450995
21997,118653,115386,118145,111882,3450696
21998,71394,67643,71170,65149,2103874
21999,121610,117999,121181,113790,3445602
22000,118821,114463,118533,110996,3453091
22001,117897,115542,117572,111501,3448017
22002,118455,115360,118111,110742,3452520
22003,117515,113139,116826,108861,3448620
22004,123202,121477,122820,117632,3570000
22005,122701,121392,121992,117249,3573300


In [18]:
lineups_df_raw['unique_players'] = lineups_df_raw['players'].apply(lambda x: len(set(x)))

In [19]:
print(len(lineups_df_raw[lineups_df_raw['unique_players']<10]))
print(len(lineups_df_raw[lineups_df_raw['unique_players']==10]))
print(len(lineups_df_raw[lineups_df_raw['unique_players']>10]))

2800
1459584
24550


In [20]:
print(min(lineups_df_raw['unique_players']))
print(max(lineups_df_raw['unique_players']))

0
13


In [21]:
lineups_df_raw[lineups_df_raw['unique_players']==min(lineups_df_raw['unique_players'])]['game']

30286      0029600332
31761      0029600370
1018157    0021500916
1117666    0029700873
1159613    0029800661
1291622    0020300778
Name: game, dtype: object

In [22]:
lineups_df_raw[lineups_df_raw['unique_players']==max(lineups_df_raw['unique_players'])]['game']

1125935    0029701075
1125936    0029701075
1125937    0029701075
1125938    0029701075
1125939    0029701075
1125940    0029701075
Name: game, dtype: object