# NBA Shot Data 4 :: NBA-API connector

## Trevor Rowland :: 2/2/2025

This notebook connects to [nba_api](<https://github.com/swar/nba_api>) and pulls team and player statistics for each season from 2004 to 2024. A data dictionary for the API data can be found [here](<link-to-data-dir-in-gh>).

## 1. Importing Packages and Data

In [12]:
import pandas as pd
import polars as pl
import numpy as np

# Load the pickled shot data from the local repository checkout.
# NOTE(review): the next line calls .to_pandas(), which is a polars DataFrame
# method, so this presumably relies on the pickle holding a polars DataFrame
# (pd.read_pickle returns whatever object was pickled) — confirm.
df = pd.read_pickle('/Users/dB/Documents/repos/github/bint-capstone/data-sources/nba/all-shots.pkl')
# Convert to a pandas DataFrame, requesting pyarrow extension arrays for the columns.
df = df.to_pandas(use_pyarrow_extension_array=True)

## 2. Connecting to the NBA API (nba_api)

### 2.a. Importing Packages

In [13]:
import pandas as pd
import polars as pl
import numpy as np
from nba_api.stats.static import teams, players
from nba_api.stats.endpoints import (
    teamyearbyyearstats,
    leaguedashteamstats,
    leaguedashplayerstats,
    commonteamroster,
    boxscoresummaryv2,
    boxscoretraditionalv2,
    LeagueGameFinder
)
from tqdm import tqdm
import time
import random

### 2.b. Team Data

`get_all_teams()`

Retrieve all NBA teams as a pandas DataFrame.
    
_Returns_:

pd.DataFrame: DataFrame of NBA teams with their details

In [14]:
def get_all_teams()->pd.DataFrame:
    """
    Build a DataFrame from the team records returned by ``teams.get_teams()``.

    Returns:
    pd.DataFrame: One row per team record
    """
    return pd.DataFrame(teams.get_teams())

`get_team_ids()`

Extract the IDs of every team returned by `teams.get_teams()` as a pandas Series.
    
_Returns_:

pd.Series: Series of team IDs

In [15]:
def get_team_ids():
    """
    Collect the ``'id'`` field of every record from ``teams.get_teams()``.

    Returns:
    pd.Series: Team IDs, in the order the records are returned
    """
    team_records = teams.get_teams()
    ids = [record['id'] for record in team_records]
    return pd.Series(ids)

`collect_team_stats(start_year:int, end_year:int)`

Collect yearly team statistics.
    
Args:

start_year (int): Starting year for data collection

end_year (int): Ending year for data collection
    
_Returns_:

pd.DataFrame: Comprehensive team statistics across seasons


In [16]:
def collect_team_stats(start_year=2004, end_year=2024):
    """
    Collect yearly team statistics, issuing one API request per season.

    ``LeagueDashTeamStats`` is queried with a season only (no team filter), so
    the same league-wide frame would come back for every team. It is therefore
    requested once per season, and the ``TEAM_ID`` values supplied by the API
    are left untouched instead of being overwritten.
    NOTE(review): assumes the first result frame already carries a ``TEAM_ID``
    column with one row per team — confirm against the endpoint documentation.

    Args:
    start_year (int): Start year of the first season (e.g. 2004 for 2004-05)
    end_year (int): Start year of the last season, inclusive

    Returns:
    pd.DataFrame: Concatenated per-season frames with an added ``season``
    column and lower-cased, underscore-separated column names; an empty
    DataFrame if no season could be collected
    """
    team_stats_list = []

    for season in tqdm(range(start_year, end_year + 1), desc="Collecting Team Stats"):
        # Convert year to NBA season format (e.g., 2020-21). Built outside the
        # try block so the error message below can always reference it.
        season_str = f"{season}-{str(season+1)[-2:]}"

        try:
            team_stats = leaguedashteamstats.LeagueDashTeamStats(
                season=season_str
            )

            # First result frame holds the stats table
            df = team_stats.get_data_frames()[0]

            # Tag rows with their season so frames can be told apart after concat
            df['SEASON'] = season_str

            team_stats_list.append(df)

            # Randomized rate limiting to avoid predictable patterns
            time.sleep(random.uniform(1.5, 3.5))

        except Exception as e:
            print(f"Error collecting team stats for season {season_str}: {e}")
            # Wait longer on failure with some randomness
            time.sleep(random.uniform(4, 7))
            continue

    if not team_stats_list:
        print("No team stats collected. Check network or API issues.")
        return pd.DataFrame()

    # Combine all seasons into a single DataFrame
    final_df = pd.concat(team_stats_list, ignore_index=True)

    # Clean column names
    final_df.columns = [col.lower().replace(' ', '_') for col in final_df.columns]

    return final_df

`get_team_roster(team_id:int, season:str)`

Retrieve team roster for a specific season.
    
Args:

team_id (int): NBA team ID

season (str): NBA season in format 'YYYY-YY'
    
Returns:

pd.DataFrame: Team roster details

In [17]:
def get_team_roster(team_id, season):
    """
    Fetch one team's roster for one season via ``CommonTeamRoster``.

    Args:
    team_id (int): NBA team ID
    season (str): NBA season in format 'YYYY-YY'

    Returns:
    pd.DataFrame: First result frame with cleaned column names plus
    ``team_id`` and ``season`` columns; an empty DataFrame on any error
    """
    try:
        endpoint = commonteamroster.CommonTeamRoster(team_id=team_id, season=season)
        roster_df = endpoint.get_data_frames()[0]

        # Normalise headers to lower-case with underscores
        roster_df.columns = [name.lower().replace(' ', '_') for name in roster_df.columns]

        # Tag every row with the request parameters
        roster_df['team_id'] = team_id
        roster_df['season'] = season
    except Exception as err:
        print(f"Error collecting roster for team {team_id} in season {season}: {err}")
        return pd.DataFrame()
    return roster_df

### 2.c. Player Data

`collect_player_stats(start_year:int, end_year:int)`
    
Collect comprehensive player statistics.
    
Args:
    
start_year (int): Starting year for data collection
    
end_year (int): Ending year for data collection
    
    
Returns:
    
pd.DataFrame: Comprehensive player statistics across seasons


In [18]:
def collect_player_stats(start_year=2004, end_year=2024):
    """
    Gather per-season player statistics from ``LeagueDashPlayerStats``.

    Seasons that raise are reported with ``print`` and skipped after a
    longer pause.

    Args:
    start_year (int): Start year of the first season
    end_year (int): Start year of the last season, inclusive

    Returns:
    pd.DataFrame: Concatenation of every collected season frame, or an
    empty DataFrame when nothing was collected
    """
    frames_by_season = []

    for year in tqdm(range(start_year, end_year + 1), desc="Collecting Player Stats"):
        # NBA season label, e.g. 2020 -> "2020-21"
        season_label = f"{year}-{str(year+1)[-2:]}"
        try:
            endpoint = leaguedashplayerstats.LeagueDashPlayerStats(season=season_label)
            season_df = endpoint.get_data_frames()[0]

            # Tag the season first, then normalise all headers (including the new one)
            season_df['SEASON'] = season_label
            season_df.columns = [name.lower().replace(' ', '_') for name in season_df.columns]

            frames_by_season.append(season_df)

            # Randomised pause between successful requests
            time.sleep(random.uniform(1.5, 3.5))
        except Exception as err:
            print(f"Error collecting player stats for season {season_label}: {err}")
            # Back off for longer after a failure
            time.sleep(random.uniform(4, 7))

    if not frames_by_season:
        print("No player stats collected. Check network or API issues.")
        return pd.DataFrame()

    return pd.concat(frames_by_season, ignore_index=True)

### 2.d. Testing

In [19]:
# Collect team stats
data_dir = '/Users/dB/Documents/repos/github/bint-capstone/data-sources/nba'
print("Talking to the API...")
print("Collecting Team Statistics...")
#team_stats = collect_team_stats()
    
# Collect player stats
print("Collecting Player Statistics...")
#player_stats = collect_player_stats()

print("Data pulled from API.")

Talking to the API...
Collecting Team Statistics...
Collecting Player Statistics...
Data pulled from API.


## 2. Writing to CSV and PKL

These files will be saved to the OneDrive

In [20]:
# Team export. The writes are commented out, so only the progress message runs.
print('Writing Team Stats to Folder')
#team_stats.to_csv(f'{data_dir}/nba_team_stats_2004_2024.csv')
#team_stats.to_pickle(f'{data_dir}/nba_team_stats_2004_2024.pkl')

# Player export. The writes are likewise commented out.
print('Writing Player Stats to Folder')
#player_stats.to_csv(f'{data_dir}/nba_player_stats_2004_2024.csv')
#player_stats.to_pickle(f'{data_dir}/nba_player_stats_2004_2024.pkl')

Writing Player Stats to Folder
Writing Player Stats to Folder


## 3. Game Data

Additionally, we need game-level data of NBA games. These are stats from each game from 2004 to 2024.

In [26]:
def get_season_game_ids(season):
    """
    Retrieve game IDs for a specific season.

    The lookup is restricted to the requested season and to league id '00'
    instead of downloading every game the API knows about on each call.
    NOTE(review): an unfiltered ``LeagueGameFinder()`` query is reportedly
    subject to a server-side row cap, which would silently drop older
    seasons — confirm against the nba_api documentation. '00' is taken to be
    the NBA league id — confirm.

    Args:
    season (str): NBA season in format 'YYYY-YY'. Only the first four
    characters (the start year) are used, so any string beginning with the
    start year is accepted.

    Returns:
    list: Unique game IDs whose SEASON_ID equals '2' followed by the start
    year (presumably the regular-season id — confirm); an empty list if
    anything fails
    """
    try:
        year_prefix = season[:4]
        start_year = int(year_prefix)

        # Rebuild the 'YYYY-YY' label the endpoint's season filter expects
        season_str = f"{start_year}-{str(start_year + 1)[-2:]}"

        game_finder = LeagueGameFinder(
            season_nullable=season_str,
            league_id_nullable='00',
        )
        games_df = game_finder.get_data_frames()[0]

        # Keep the original SEASON_ID filter so the result set is unchanged
        season_games = games_df[games_df['SEASON_ID'] == f"2{year_prefix}"]

        return season_games['GAME_ID'].unique().tolist()
    except Exception as e:
        print(f"Error retrieving game IDs for season {season}: {e}")
        return []


In [27]:
def collect_game_data(start_year=2004, end_year=2024):
    """
    Collect game summaries and team box-score stats across multiple seasons.

    For every season the game IDs come from ``get_season_game_ids``; each
    game is then fetched from ``BoxScoreSummaryV2`` (first result frame) and
    ``BoxScoreTraditionalV2`` (second result frame). A game is recorded only
    when both requests succeed, so the two outputs always describe the same
    set of games.

    Args:
    start_year (int): Start year of the first season to collect
    end_year (int): Exclusive upper bound — the last season collected starts
    in ``end_year - 1``

    Returns:
    tuple: ``(game_summary, team_game_stats)`` DataFrames with lower-cased,
    underscore-separated column names; two empty DataFrames if no game was
    collected
    """
    game_summary_list = []
    team_game_stats_list = []

    # Iterate through seasons
    for season in tqdm(range(start_year, end_year), desc="Collecting Game Data"):
        # NBA season format, e.g. 2020 -> "2020-21"
        season_str = f"{season}-{str(season+1)[-2:]}"

        try:
            # Pass the documented 'YYYY-YY' form; the helper derives the
            # season ID from the leading four-digit year itself.
            game_ids = get_season_game_ids(season_str)

            # Iterate through game IDs
            for game_id in tqdm(game_ids, desc=f"Processing games in {season_str}", leave=False):
                try:
                    # Collect game summary
                    summary = boxscoresummaryv2.BoxScoreSummaryV2(game_id=game_id)
                    summary_df = summary.get_data_frames()[0]
                    summary_df['SEASON'] = season_str

                    # Collect team game stats
                    team_stats = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
                    team_stats_df = team_stats.get_data_frames()[1]  # Team stats are in the second DataFrame
                    team_stats_df['SEASON'] = season_str
                    team_stats_df['GAME_ID'] = game_id

                    # Append only after both fetches succeeded so a failure
                    # on the second request cannot leave an unpaired summary.
                    game_summary_list.append(summary_df)
                    team_game_stats_list.append(team_stats_df)

                    # Randomized rate limiting
                    time.sleep(random.uniform(0.5, 1.5))

                except Exception as e:
                    print(f"Error processing game {game_id} in {season_str}: {e}")
                    time.sleep(random.uniform(2, 4))
                    continue

        except Exception as e:
            print(f"Error processing season {season_str}: {e}")
            continue

    # Combine collected data
    if not (game_summary_list and team_game_stats_list):
        print("No game data collected. Check network or API issues.")
        return pd.DataFrame(), pd.DataFrame()

    game_summary_final = pd.concat(game_summary_list, ignore_index=True)
    team_game_stats_final = pd.concat(team_game_stats_list, ignore_index=True)

    # Clean column names
    game_summary_final.columns = [col.lower().replace(' ', '_') for col in game_summary_final.columns]
    team_game_stats_final.columns = [col.lower().replace(' ', '_') for col in team_game_stats_final.columns]

    return game_summary_final, team_game_stats_final

In [31]:
# Run the game-level collection with its default year range and unpack the
# returned (game summary, team game stats) pair.
game_summary, team_game_stats = collect_game_data()

Collecting Game Data:   0%|          | 0/20 [00:09<?, ?it/s]


KeyboardInterrupt: 

In [30]:
 import time
import random
import pandas as pd
from tqdm import tqdm
from nba_api.stats.endpoints import LeagueGameFinder, BoxScoreSummaryV2, BoxScoreTraditionalV2

def fetch_nba_game_data(start_year=2004, end_year=2024):
    """
    Fetch game-by-game NBA data from nba_api for all games between start_year and end_year.

    Games are looked up one season at a time, filtered by season and league
    id '00', rather than through a single unfiltered ``LeagueGameFinder()``
    call.
    NOTE(review): the unfiltered query is reportedly subject to a server-side
    row cap, which would silently drop older seasons — confirm against the
    nba_api documentation. '00' is taken to be the NBA league id — confirm.

    A season whose game lookup fails is reported and skipped. A game is
    recorded only when both its summary and its team stats were fetched, so
    the two outputs describe the same set of games.

    Args:
    start_year (int): Starting season year (e.g., 2004 for 2004-05 season)
    end_year (int): Ending season year, exclusive (e.g., 2024 for 2023-24 season)

    Returns:
    tuple: DataFrames (game_summaries, team_stats) containing detailed game
    data with lower-cased, underscore-separated column names; either may be
    empty
    """
    game_summary_list = []
    team_stats_list = []

    # Iterate through each season
    for season in tqdm(range(start_year, end_year), desc="Processing Seasons"):
        season_label = f"{season}-{str(season+1)[-2:]}"
        season_id = f"2{season}"  # Example: "22004" for 2004-05 season

        try:
            games_df = LeagueGameFinder(
                season_nullable=season_label,
                league_id_nullable='00',
            ).get_data_frames()[0]
        except Exception as e:
            print(f"Error retrieving games for {season_label}: {e}")
            time.sleep(random.uniform(2, 4))
            continue

        # Keep the original SEASON_ID filter so the same games are selected
        season_games = games_df[games_df['SEASON_ID'] == season_id]

        # Extract unique game IDs
        game_ids = season_games['GAME_ID'].unique().tolist()

        for game_id in tqdm(game_ids, desc=f"Fetching games for {season}-{season+1}", leave=False):
            try:
                # Fetch game summary
                summary = BoxScoreSummaryV2(game_id=game_id).get_data_frames()[0]
                summary['SEASON'] = season_label

                # Fetch team statistics (second result frame)
                team_stats = BoxScoreTraditionalV2(game_id=game_id).get_data_frames()[1]
                team_stats['SEASON'] = season_label
                team_stats['GAME_ID'] = game_id

                # Append only after both fetches succeeded to keep the lists paired
                game_summary_list.append(summary)
                team_stats_list.append(team_stats)

                # Randomized sleep to avoid API rate limits
                time.sleep(random.uniform(0.5, 1.5))

            except Exception as e:
                print(f"Error processing game {game_id} in {season}-{season+1}: {e}")
                time.sleep(random.uniform(2, 4))
                continue

    # Convert lists to DataFrames
    game_summary_df = pd.concat(game_summary_list, ignore_index=True) if game_summary_list else pd.DataFrame()
    team_stats_df = pd.concat(team_stats_list, ignore_index=True) if team_stats_list else pd.DataFrame()

    # Clean column names
    game_summary_df.columns = [col.lower().replace(' ', '_') for col in game_summary_df.columns]
    team_stats_df.columns = [col.lower().replace(' ', '_') for col in team_stats_df.columns]

    return game_summary_df, team_stats_df


In [25]:
# Write the game summaries as CSV and pickle under data_dir, skipping the
# export entirely when the frame is empty.
if not game_summary.empty:
    game_summary.to_csv(f'{data_dir}/game_summary.csv')
    game_summary.to_pickle(f'{data_dir}/game_summary.pkl')

# Same for the per-game team stats.
if not team_game_stats.empty:
    team_game_stats.to_csv(f'{data_dir}/team_stats_by_game.csv')
    team_game_stats.to_pickle(f'{data_dir}/team_stats_by_game.pkl')

We now have player, team, and game-level stats from 2004 to 2024. The next notebook performs an exploratory data analysis (EDA) on the collected API data.