In [None]:
import pandas as pd
from nba_api.stats.endpoints import playbyplayv2, leaguegamefinder
from nba_api.stats.library.parameters import SeasonType
import time
import re

In [None]:
# HTTP request headers passed as `headers=` to the PlayByPlayV2 endpoint in
# get_play_by_play. The values imitate a desktop browser request coming from
# nba.com.
# NOTE(review): presumably needed so the stats API does not reject or throttle
# script traffic - confirm against current nba_api guidance.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "application/json, text/plain, */*",
    "Connection": "keep-alive",
    "Referer": "https://www.nba.com/",
}

In [None]:
def get_play_by_play(game_id):
    """Download the play-by-play table for a single game.

    Parameters
    ----------
    game_id
        Identifier handed to ``playbyplayv2.PlayByPlayV2``.

    Returns
    -------
    The first data frame of the endpoint response on success. If anything
    goes wrong (request error, empty response, ...), the error is printed and
    ``game_id`` itself is returned as a failure sentinel, so callers must
    check the type of the result.
    """
    try:
        # Everything that can fail stays inside the try so that any error is
        # reported through the sentinel return value instead of propagating.
        endpoint = playbyplayv2.PlayByPlayV2(game_id, headers=HEADERS)
        first_frame = endpoint.get_data_frames()[0]
    except Exception as e:
        print(f"Error fetching game {game_id}: {e}")
        return game_id
    else:
        return first_frame

In [None]:
def safe_split(text):
    """Split an event description into words, never failing on missing text.

    Parameters
    ----------
    text
        A description string, or a missing value (``None``, NaN, or any other
        non-string).

    Returns
    -------
    list of str
        The whitespace-separated words of ``text``. When ``text`` is missing,
        empty, or whitespace-only, ten placeholder ``"a"`` tokens are returned
        so callers can index the first few positions without an IndexError.
    """
    placeholder = "a a a a a a a a a a"

    # NaN is a truthy float, so a plain `text or placeholder` would try to
    # call .split() on it; only real strings are split.
    words = text.split() if isinstance(text, str) else []
    return words if words else placeholder.split()

def is_last_made_free_throw(description):
    """Tell whether a description is a made, final free throw of a trip.

    Parameters
    ----------
    description
        Event description text, or a missing value (``None``, NaN, or any
        other non-string).

    Returns
    -------
    bool
        ``True`` only when the description does not start with ``"MISS"`` and
        contains an ``"X of Y"`` pattern with ``X == Y`` (e.g. ``"2 of 2"``).
        Missing, empty, or pattern-less descriptions yield ``False``.
    """
    # Missing values can arrive as None or as a float NaN; neither is text.
    if not isinstance(description, str):
        return False

    words = description.split()
    # An empty or whitespace-only description has no first word to inspect.
    if not words or words[0] == "MISS":
        return False

    # Look for the attempt counter, e.g. "2 of 2" or "3 of 3".
    match = re.search(r"(\d+) of (\d+)", description)
    if match is None:
        return False

    attempt, total = map(int, match.groups())
    return attempt == total  # last attempt of the sequence


def _new_stint(home_players=(), away_players=()):
    """Build an empty stint record for the given home/away lineups."""
    return {
        "home_players": list(home_players),
        "away_players": list(away_players),
        "points_scored": 0,
        "points_allowed": 0,
        "total_possessions": 0,
        "offensive_possessions": 0,
        "defensive_possessions": 0,
    }


def _desc_or_none(value):
    """Map a missing description (None or NaN) to None; pass text through."""
    return None if pd.isna(value) else value


def _team_of(row):
    """Return 'away' when the row has no HOMEDESCRIPTION, otherwise 'home'."""
    return "away" if pd.isna(row["HOMEDESCRIPTION"]) else "home"


def _token(text, position):
    """Return word number `position` of a description, or None if absent."""
    words = safe_split(text)
    return words[position] if position < len(words) else None


def _finalize_stint(stint, start_score, end_score, closing_row):
    """Fill in the points and the offensive/defensive split of a finished stint.

    Scores are (away, home) tuples. Possessions are split evenly; when the
    total is odd, the extra one goes to the home team if `closing_row` carries
    a HOMEDESCRIPTION and to the away team otherwise.
    """
    stint["points_scored"] = end_score[1] - start_score[1]   # home points
    stint["points_allowed"] = end_score[0] - start_score[0]  # away points

    half, remainder = divmod(stint["total_possessions"], 2)
    stint["offensive_possessions"] = half
    stint["defensive_possessions"] = half
    if remainder:
        if pd.notna(closing_row["HOMEDESCRIPTION"]):
            stint["offensive_possessions"] += 1
        else:
            stint["defensive_possessions"] += 1


def _infer_period_lineups(df, start_idx):
    """Infer both teams' on-court players at the period starting at `start_idx`.

    Players who are substituted out before ever being substituted in during
    the period are taken as starters. If that leaves a team short of five,
    PLAYER1 ids seen on that team's single-team events in the period (and
    never substituted in) are added until five are reached.
    """
    # The period runs up to the next end-of-period marker (EVENTMSGTYPE 13);
    # if the data has none, fall back to the end of the frame.
    later_period_ends = df[(df["EVENTMSGTYPE"] == 13) & (df.index > start_idx)].index
    end_idx = later_period_ends.min() if len(later_period_ends) else df.index[-1]
    period_rows = df.loc[start_idx:end_idx]

    starters = {"home": set(), "away": set()}
    subbed_in = {"home": set(), "away": set()}

    for _, sub in period_rows[period_rows["EVENTMSGTYPE"] == 8].iterrows():
        team = _team_of(sub)
        out_player, in_player = sub["PLAYER1_ID"], sub["PLAYER2_ID"]

        # Leaving the floor without having entered it means a period starter.
        if out_player not in subbed_in[team]:
            starters[team].add(out_player)
        subbed_in[team].add(in_player)

        if len(starters["home"]) == 5 and len(starters["away"]) == 5:
            break

    if len(starters["home"]) < 5 or len(starters["away"]) < 5:
        # Some starters were never substituted: recover them from the events
        # they took part in.
        seen = {"home": set(), "away": set()}
        for _, event in period_rows.iterrows():
            home_filled = pd.notna(event["HOMEDESCRIPTION"])
            visitor_filled = pd.notna(event["VISITORDESCRIPTION"])
            # Only rows described for exactly one team identify PLAYER1's team.
            if (home_filled ^ visitor_filled) and pd.notna(event["PLAYER1_NAME"]):
                seen["home" if home_filled else "away"].add(event["PLAYER1_ID"])

        for team in ("home", "away"):
            for player in seen[team] - subbed_in[team] - starters[team]:
                if len(starters[team]) < 5:
                    starters[team].add(player)

    return starters


def read_pbp(df):
    """Convert one game's play-by-play frame into a table of lineup stints.

    A stint is a stretch of play with unchanged home and away lineups. A new
    stint starts at every period start (EVENTMSGTYPE 12) and after every batch
    of consecutive substitutions (EVENTMSGTYPE 8).

    Parameters
    ----------
    df : pandas.DataFrame
        Play-by-play rows with the columns GAME_ID, EVENTMSGTYPE, SCORE,
        HOMEDESCRIPTION, VISITORDESCRIPTION, PLAYER1_ID, PLAYER1_NAME,
        PLAYER2_ID and PLAYER2_NAME. SCORE strings are parsed as
        "<away> - <home>". The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per stint with home_players, away_players, points_scored
        (home points), points_allowed (away points), total_possessions,
        offensive_possessions, defensive_possessions and id (the GAME_ID of
        the first row). An empty frame with these columns is returned when
        `df` is empty or contains no period start.

    Notes
    -----
    Possessions are counted on EVENTMSGTYPE 1 and 5 (presumably made field
    goals and turnovers in the NBA stats coding - confirm), on type 4 rows
    (rebounds) as decided below, and on type 3 rows that
    `is_last_made_free_throw` accepts. Events before the first period start
    are ignored because there is no stint to attribute them to.
    """
    stint_columns = list(_new_stint()) + ["id"]
    if df is None or len(df) == 0:
        return pd.DataFrame(columns=stint_columns)

    # Work on a positionally indexed copy so that row labels and positions
    # coincide (previous/next-row lookups below rely on it).
    df = df.reset_index(drop=True)
    game_id = df["GAME_ID"].iloc[0]

    stints = []
    pending_subs = []  # substitutions batched until the run of type-8 rows ends
    current_lineups = {"home": set(), "away": set()}
    last_score = (0, 0)  # latest known (away, home) score
    stint_start_score = last_score

    for idx, row in df.iterrows():
        score = row["SCORE"]
        if isinstance(score, str) and "-" in score:
            last_score = tuple(map(int, score.split("-")))

        event_type = row["EVENTMSGTYPE"]

        if event_type == 12:  # period start
            if stints:
                # Close the stint that was running when the last period ended.
                _finalize_stint(stints[-1], stint_start_score, last_score, row)
                stint_start_score = last_score

            starters = _infer_period_lineups(df, idx)
            current_lineups["home"] = starters["home"].copy()
            current_lineups["away"] = starters["away"].copy()
            stints.append(
                _new_stint(
                    list(starters["home"].copy()),
                    list(starters["away"].copy()),
                )
            )

        if not stints:
            continue  # nothing to attribute events to before the first period

        active = stints[-1]

        if event_type in (1, 5):
            active["total_possessions"] += 1

        elif event_type == 4:
            if idx > 0:
                prev_row = df.iloc[idx - 1]
                prev_home_desc = _desc_or_none(prev_row["HOMEDESCRIPTION"])
                prev_visitor_desc = _desc_or_none(prev_row["VISITORDESCRIPTION"])
            else:
                prev_home_desc = prev_visitor_desc = None
            curr_home_desc = _desc_or_none(row["HOMEDESCRIPTION"])
            curr_visitor_desc = _desc_or_none(row["VISITORDESCRIPTION"])

            if prev_home_desc is None or prev_visitor_desc is None:
                active["total_possessions"] += 1
            elif (
                _token(prev_home_desc, 0) == "MISS"
                and _token(curr_visitor_desc, 1) == "REBOUND"
            ) or (
                _token(prev_visitor_desc, 0) == "MISS"
                and _token(curr_home_desc, 1) == "REBOUND"
            ):
                # The previous row described both teams: count the rebound
                # only when the team that did not miss collected it.
                active["total_possessions"] += 1

        elif event_type == 3:
            curr_home_desc = _desc_or_none(row["HOMEDESCRIPTION"])
            curr_visitor_desc = _desc_or_none(row["VISITORDESCRIPTION"])
            if is_last_made_free_throw(curr_home_desc) or is_last_made_free_throw(curr_visitor_desc):
                active["total_possessions"] += 1

        elif event_type == 8:  # substitution
            pending_subs.append((_team_of(row), row["PLAYER1_ID"], row["PLAYER2_ID"]))

            # Keep batching while the next row is another substitution, so a
            # multi-player change produces a single new stint.
            next_event_type = df.iloc[idx + 1]["EVENTMSGTYPE"] if idx + 1 < len(df) else None
            if next_event_type == 8:
                continue

            _finalize_stint(active, stint_start_score, last_score, row)

            for sub_team, sub_out, sub_in in pending_subs:
                if sub_out in current_lineups[sub_team]:  # ignore unknown players
                    current_lineups[sub_team].remove(sub_out)
                    current_lineups[sub_team].add(sub_in)
            pending_subs = []

            stints.append(_new_stint(current_lineups["home"], current_lineups["away"]))
            stint_start_score = last_score

    if not stints:
        return pd.DataFrame(columns=stint_columns)

    # Close the stint that was still running at the end of the data; the last
    # row of the frame decides who gets an odd extra possession.
    _finalize_stint(stints[-1], stint_start_score, last_score, df.iloc[-1])

    data = pd.DataFrame(stints)
    data['id'] = game_id

    return data

In [None]:
# Team abbreviations (30 entries) used to filter the LeagueGameFinder results
# down to games whose TEAM_ABBREVIATION is one of these teams.
nba_team_abbreviations = [
    "ATL", "BOS", "BKN", "CHA", "CHI", "CLE", "DAL", "DEN", "DET", "GSW",
    "HOU", "IND", "LAC", "LAL", "MEM", "MIA", "MIL", "MIN", "NOP", "NYK",
    "OKC", "ORL", "PHI", "PHX", "POR", "SAC", "SAS", "TOR", "UTA", "WAS"
]

# Seasons to download, as the season strings passed to LeagueGameFinder
# (ten seasons, 2014-15 through 2023-24).
seasons = ['2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24']

In [None]:
# Collect the regular-season game ids: one set of ids per entry of `seasons`,
# stored in the same order so that game_ids[i] belongs to seasons[i].

game_ids = []

for s in seasons:
    pull = leaguegamefinder.LeagueGameFinder(
        season_nullable=s,
        season_type_nullable=SeasonType.regular,
    )
    games = pd.DataFrame(pull.get_normalized_dict()['LeagueGameFinderResults'])

    # Keep only rows of the listed teams; a set removes the duplicate id that
    # appears once per participating team.
    listed_team = games['TEAM_ABBREVIATION'].isin(nba_team_abbreviations)
    unique_ids = set(games.loc[listed_team, 'GAME_ID'])

    game_ids.append(unique_ids)

    # Pause between requests (presumably to stay under the API's rate limit).
    time.sleep(2)

In [None]:
# Pull the play-by-play data of every collected game, then retry the games
# whose download failed.

MAX_RETRY_ROUNDS = 5  # upper bound on full passes over the failed games

all_pbp = []
missed_ids = []  # (game_id, season) pairs whose download failed

for i, s in enumerate(game_ids):

    for g in s:

        pull = get_play_by_play(g)

        # get_play_by_play returns the game id itself on failure, whatever its
        # type, so success is recognised by getting a DataFrame back.
        if isinstance(pull, pd.DataFrame):
            pull['Season'] = seasons[i]
            all_pbp.append(pull)
        else:
            missed_ids.append((g, seasons[i]))

        time.sleep(2)


retry_round = 0

while missed_ids and retry_round < MAX_RETRY_ROUNDS:
    retry_round += 1
    still_missing = []

    for game_id, season in missed_ids:

        # Retry with the bare game id, not the (game_id, season) pair.
        pull = get_play_by_play(game_id)

        if isinstance(pull, pd.DataFrame):
            pull['Season'] = season
            all_pbp.append(pull)
        else:
            still_missing.append((game_id, season))

        time.sleep(2)

    missed_ids = still_missing

# Games that never downloaded stay in `missed_ids` for inspection instead of
# keeping the loop spinning forever.
if missed_ids:
    print(f"Could not fetch {len(missed_ids)} game(s) after {MAX_RETRY_ROUNDS} retry rounds; see missed_ids.")

In [None]:
# Convert every downloaded play-by-play frame into its table of stints.
all_stints = [read_pbp(pbp) for pbp in all_pbp]

In [None]:
# Stack the per-game stint tables into one frame (with a fresh 0..n-1 index)
# and write it to disk as CSV without the index column.
stints_df = pd.concat(objs=all_stints, ignore_index=True)
stints_df.to_csv(path_or_buf="../datasets/processed_pbp_stints.csv", index=False)