In [1]:
import requests
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

In [2]:
load_dotenv()
BASE_URL = "https://api3.natst.at"
API_KEY = os.getenv("NAT_API_KEY")

In [None]:
def list_all_dates_for_season(league, year):
    url = f"{BASE_URL}/{API_KEY}/games/{league}/{year}"
    print(url)

    # initialize set
    dates = set()
    res = requests.get(url).json()
    for key, value in res['games'].items():
        dates.add(value['gameday'])
    next_page_url = res['meta'].get('page-next', False)
    
    # loop through pages
    while(next_page_url is not False):
        print(next_page_url)
        res = requests.get(next_page_url).json()
        for key, value in res['games'].items():
            dates.add(value['gameday'])
        next_page_url = res['meta'].get('page-next', False)
    
    return dates

def get_pbp_by_date(league, date):
    url = f"{BASE_URL}/{API_KEY}/playbyplay/{league}/{date}"
    print(url)

    # initialize df
    df = pd.DataFrame()
    res = requests.get(url).json()
    for key, value in res['playbyplay'].items():
        df_temp = pd.json_normalize(value)
        df = pd.concat([df, df_temp])
    next_page_url = res['meta'].get('page-next', False)

    # loop through pages
    while(next_page_url is not False):
        print(next_page_url)
        res = requests.get(next_page_url).json()
        for key, value in res['playbyplay'].items():
            df_temp = pd.json_normalize(value)
            df = pd.concat([df, df_temp])
        next_page_url = res['meta'].get('page-next', False)

    return df

def get_list_of_game_codes_from_single_day_pbp(df):
    return df['game.code'].unique().tolist()

def get_final_score(df, game_code):
    last_row = df[df['game.code'] == game_code].tail(1)
    home_team = last_row.iloc[0]['game.home']
    away_team = last_row.iloc[0]['game.visitor']
    winner = home_team if int(last_row.iloc[0]['game.score-home']) > int(last_row.iloc[0]['game.score-vis']) else away_team
    return {
        'gameTitle': last_row.iloc[0]['game.description'],
        'date': last_row.iloc[0]['game.gameday'],
        'home': home_team,
        'away': away_team,
        'winner': winner,
        'winnerScore': max(last_row.iloc[0]['game.score-home'], last_row.iloc[0]['game.score-vis']),
        'loseScore': min(last_row.iloc[0]['game.score-home'], last_row.iloc[0]['game.score-vis'])
    }

def clean_pbp_df(df: pd.DataFrame):
    mapping = {
        'Atlanta': 'Atlanta Hawks',
        'Boston': 'Boston Celtics',
        'Brooklyn': 'Brooklyn Nets',
        'Charlotte': 'Charlotte Hornets',
        'Chicago': 'Chicago Bulls',
        'Cleveland': 'Cleveland Cavaliers',
        'Dallas': 'Dallas Mavericks',
        'Denver': 'Denver Nuggets',
        'Detroit': 'Detroit Pistons',
        'Golden State': 'Golden State Warriors',
        'Houston': 'Houston Rockets',
        'Indiana': 'Indiana Pacers',
        'L.A. Clippers': 'Los Angeles Clippers',
        'L.A. Lakers': 'Los Angeles Lakers',
        'Memphis': 'Memphis Grizzlies',
        'Miami': 'Miami Heat',
        'Milwaukee': 'Milwaukee Bucks',
        'Minnesota': 'Minnesota Timberwolves',
        'New Orleans': 'New Orleans Pelicans',
        'New York': 'New York Knicks',
        'Oklahoma City': 'Oklahoma City Thunder',
        'Orlando': 'Orlando Magic',
        'Philadelphia': 'Philadelphia 76ers',
        'Phoenix': 'Phoenix Suns',
        'Portland': 'Portland Trail Blazers',
        'Sacramento': 'Sacramento Kings',
        'San Antonio': 'San Antonio Spurs',
        'Toronto': 'Toronto Raptors',
        'Utah': 'Utah Jazz',
        'Washington': 'Washington Wizards'
    }
    df['game.visitor'] = df['game.visitor'].map(mapping).fillna(df['game.visitor'])
    df['game.home'] = df['game.home'].map(mapping).fillna(df['game.home'])

In [None]:
active_game_dates = list_all_dates_for_season("NBA", 2024)
active_game_dates

In [11]:
len(active_game_dates)

208

In [5]:
df = pd.read_csv('outputs/full_output.csv')
# df = df[df['game.home'].notna()] #filter non-null rows out
df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], inplace=True)
df

  df = pd.read_csv('outputs/full_output.csv')


Unnamed: 0,id,explanation,scoringplay,tags,thediff,players.secondary.code,players.secondary.name,team.code,team.team,opponent.opponent_id,...,venue.location,venue.longitude,venue.latitude,gametype.series,gametype.seriesname,gametype.seriesgameno,gametype.seriesstatus,players.primary.code,players.primary.name,distance
0,40672415,"Myles Turner vs, Al Horford (Pascal Siakam gai...",N,MISC,0,119739.0,Pascal Siakam,5,Boston Celtics,18,...,"Boston, MA",-71.062390,42.365340,2024-BOS|IND-CF,Conference Finals,1.0,BOS leads 1-0,,,
1,40672416,Aaron Nesmith bad pass (Jayson Tatum steals),N,TO,0,329820.0,Jayson Tatum,5,Boston Celtics,18,...,"Boston, MA",-71.062390,42.365340,2024-BOS|IND-CF,Conference Finals,1.0,BOS leads 1-0,,,
2,40672417,Jaylen Brown makes dunk (Jayson Tatum assists),Y,FGM|FGA|AST,-2,329820.0,Jayson Tatum,18,Indiana Pacers,5,...,"Boston, MA",-71.062390,42.365340,2024-BOS|IND-CF,Conference Finals,1.0,BOS leads 1-0,119722.0,Jaylen Brown,
3,40672418,Al Horford blocks Pascal Siakam 1-foot driving...,N,FGA|BLK,2,119739.0,Pascal Siakam,5,Boston Celtics,18,...,"Boston, MA",-71.062390,42.365340,2024-BOS|IND-CF,Conference Finals,1.0,BOS leads 1-0,1754.0,Al Horford,1.0
4,40672419,Celtics Full timeout,N,TIMEOUT,-2,84053334.0,Nae'Qwan Tomlin,18,Indiana Pacers,5,...,"Boston, MA",-71.062390,42.365340,2024-BOS|IND-CF,Conference Finals,1.0,BOS leads 1-0,84053334.0,Nae'Qwan Tomlin,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570996,40632234,Jusuf Nurkic makes free throw 2 of 2,Y,FTM|FTA,-1,84053334.0,Nae'Qwan Tomlin,15,Sacramento Kings,14,...,"Sacramento, CA",-121.525154,38.525231,,,,,2867.0,Jusuf Nurkic,
570997,40632235,Kings Full timeout,N,TIMEOUT,1,84053334.0,Nae'Qwan Tomlin,14,Phoenix Suns,15,...,"Sacramento, CA",-121.525154,38.525231,,,,,84053334.0,Nae'Qwan Tomlin,
570998,40632236,Royce O'Neale replaces Jusuf Nurkic,N,SUB,-1,2867.0,Jusuf Nurkic,15,Sacramento Kings,14,...,"Sacramento, CA",-121.525154,38.525231,,,,,,,
570999,40632237,Bradley Beal personal foul,N,PF,-1,84053334.0,Nae'Qwan Tomlin,15,Sacramento Kings,14,...,"Sacramento, CA",-121.525154,38.525231,,,,,242.0,Bradley Beal,


In [24]:
# continue querying
already_queried_dates = set(df['game.gameday'])
remaining = active_game_dates - already_queried_dates
len(remaining)

0

In [6]:
# for date in remaining:
#     print(f"Querying for day: {date}...")
#     single_day_pbp_df = get_pbp_by_date("NBA", date)
#     df = pd.concat([df, single_day_pbp_df])
#     print("Successfully added day. Sleeping for 5s...")
#     time.sleep(5)

In [12]:
clean_pbp_df(df)

In [7]:
sorted(df['game.home'].unique().tolist())

['Atlanta Hawks',
 'Boston Celtics',
 'Brooklyn Nets',
 'Charlotte Hornets',
 'Chicago Bulls',
 'Cleveland Cavaliers',
 'Dallas Mavericks',
 'Denver Nuggets',
 'Detroit Pistons',
 'Golden State Warriors',
 'Houston Rockets',
 'Indiana Pacers',
 'Los Angeles Clippers',
 'Los Angeles Lakers',
 'Memphis Grizzlies',
 'Miami Heat',
 'Milwaukee Bucks',
 'Minnesota Timberwolves',
 'New Orleans Pelicans',
 'New York Knicks',
 'Oklahoma City Thunder',
 'Orlando Magic',
 'Philadelphia 76ers',
 'Phoenix Suns',
 'Portland Trail Blazers',
 'Sacramento Kings',
 'San Antonio Spurs',
 'Toronto Raptors',
 'Utah Jazz',
 'Washington Wizards']

In [52]:
df.to_csv('full_output.csv')

In [8]:
len(get_list_of_game_codes_from_single_day_pbp(df))

1247

In [13]:
df.columns

Index(['id', 'explanation', 'scoringplay', 'tags', 'thediff',
       'players.secondary.code', 'players.secondary.name', 'team.code',
       'team.team', 'opponent.opponent_id', 'opponent.opponent', 'game.code',
       'game.gameday', 'game.description', 'game.visitor', 'game.visitor-code',
       'game.score-vis', 'game.home', 'game.home-code', 'game.score-home',
       'game.period', 'game.time', 'game.sequence', 'game.playoffs',
       'game.finals', 'game.onfloorvis', 'game.onfloorhome', 'venue.code',
       'venue.name', 'venue.location', 'venue.longitude', 'venue.latitude',
       'gametype.series', 'gametype.seriesname', 'gametype.seriesgameno',
       'gametype.seriesstatus', 'players.primary.code', 'players.primary.name',
       'distance'],
      dtype='object')

In [None]:
def get_deficit_time(df, game_code):
    box_score = get_final_score(df, game_code)
    winning_team = box_score['winner']

    df_game = df[df['game.code'] == game_code]
    df_winner = df_game[df_game['team.team'] == winning_team]
    
    largest_deficit = min(pd.to_numeric(df_winner['thediff'], errors='coerce', downcast='integer'))
    df_largest_deficit = find_largest_comeback(df_winner, largest_deficit)

    df_latest_deficit = dffind_latest_comeback(df_winner)
    
    ret_val =  {
        'game title': box_score['gameTitle'],
        'date': box_score['date'],
        'game_id': game_code,
        'winning_team': winning_team,
        'largest_deficit': largest_deficit,
        'largest_deficit_period': df_largest_deficit.iloc[0]['game.period'],
        'largest_deficit_clock': df_largest_deficit.iloc[0]['game.time'],
        'largest_deficit_seconds_remaining': get_seconds_remaining(df_largest_deficit.iloc[0]['game.period'], df_largest_deficit.iloc[0]['game.time']),
        'largest_deficit_analytical_diff': df_largest_deficit.iloc[0]['analytical_diff'],

        'latest_deficit': df_latest_deficit.iloc[0]['thediff'],
        'latest_deficit_period': df_latest_deficit.iloc[0]['game.period'],
        'latest_deficit_clock': df_latest_deficit.iloc[0]['game.time'],
        'latest_deficit_seconds_remaining': get_seconds_remaining(df_latest_deficit.iloc[0]['game.period'], df_latest_deficit.iloc[0]['game.time']),
        'latest_deficit_analytical_diff': df_latest_deficit.iloc[0]['analytical_diff']
    }
    print(f"Game: {box_score['gameTitle']} ({box_score['date']})")
    # print(ret_val)
    return ret_val

def get_seconds_remaining(period, clock: str):
    minutes, seconds = clock.split(':')
    if period < 4:
        return (4-int(period))*12*60 + int(minutes)*60 + float(seconds)
    else:
        return int(minutes)*60 + float(seconds)

def find_largest_comeback(df, largest_deficit):
    df = df.copy()
    df = df[df['thediff'] == largest_deficit]

    df['thediff'] = pd.to_numeric(df['thediff'], errors='coerce')

    # df['seconds_remaining'] = df['game.period'].apply(lambda x: 4 - int(x)) * 12 * 60 + df['game.time'].apply(lambda x: int(x.split(':')[0]) * 60 + float(x.split(':')[1]))
    df['seconds_remaining'] = df.apply(lambda row: get_seconds_remaining(row['game.period'], row['game.time']), axis=1)
    df['analytical_lead_minus_five_abs'] = df['thediff'].apply(lambda x: max(x + 5, 0))
    df['analytical_time_required'] = df['analytical_lead_minus_five_abs'] ** 2
    df['analytical_diff'] = df['analytical_time_required'] - df['seconds_remaining']

    df.sort_values(by='seconds_remaining', ascending=False, inplace=True)
    return df.tail(1)


def dffind_latest_comeback(df):
    # Add new columns
    df = df.copy()  # Avoid SettingWithCopyWarning
    min_threshold = max(min(df['thediff']), 0)
    df = df[df['thediff'] <= min_threshold]
    
    # Ensure 'thediff' and 'seconds_remaining' are numeric
    df['thediff'] = pd.to_numeric(df['thediff'], errors='coerce')

    # remove overtime finishes
    df = df[~df['game.time'].isin(["0:00.0", "0:00"]) ]


    # df = df[df['thediff'] <= 0]  # Filter for negative deficits
    df['seconds_remaining'] = df.apply(lambda row: get_seconds_remaining(row['game.period'], row['game.time']), axis=1)
    df['analytical_lead_minus_five_abs'] = df['thediff'].apply(lambda x: min(x + 5, 0))
    df['analytical_time_required'] = df['analytical_lead_minus_five_abs'] ** 2
    df['analytical_diff'] = df['analytical_time_required'] - df['seconds_remaining']

    # Find the closest comeback
    closest_diff = max(pd.to_numeric(df['analytical_diff'], errors='coerce', downcast='integer'))
    df = df[df['analytical_diff'] == closest_diff].tail(1)
    return df

comebacks = []
for game in get_list_of_game_codes_from_single_day_pbp(df):
    output = get_deficit_time(df, game)
    comebacks.append(output)

Game: Indiana Pacers at Boston Celtics (2024-05-21)
Game: Brooklyn Nets at Boston Celtics (2023-11-10)
Game: Charlotte Hornets at Washington Wizards (2023-11-10)
Game: Los Angeles Clippers at Dallas Mavericks (2023-11-10)
Game: Philadelphia 76ers at Detroit Pistons (2023-11-10)
Game: New Orleans Pelicans at Houston Rockets (2023-11-10)
Game: Los Angeles Lakers at Phoenix Suns (2023-11-10)
Game: Utah Jazz at Memphis Grizzlies (2023-11-10)
Game: Minnesota Timberwolves at San Antonio Spurs (2023-11-10)
Game: Oklahoma City Thunder at Sacramento Kings (2023-11-10)
Game: Sacramento Kings at New Orleans Pelicans (2024-04-19)
Game: Chicago Bulls at Miami Heat (2024-04-19)
Game: Atlanta Hawks at Philadelphia 76ers (2023-12-08)
Game: Washington Wizards at Brooklyn Nets (2023-12-08)
Game: Toronto Raptors at Charlotte Hornets (2023-12-08)
Game: Chicago Bulls at San Antonio Spurs (2023-12-08)
Game: Cleveland Cavaliers at Miami Heat (2023-12-08)
Game: Dallas Mavericks at Portland Trail Blazers (2023

In [74]:
comebacks_df = pd.DataFrame(comebacks)
comebacks_df

Unnamed: 0,game title,date,game_id,winning_team,largest_deficit,largest_deficit_period,largest_deficit_clock,largest_deficit_seconds_remaining,largest_deficit_analytical_diff,latest_deficit,latest_deficit_period,latest_deficit_clock,latest_deficit_seconds_remaining,latest_deficit_analytical_diff
0,Indiana Pacers at Boston Celtics,2024-05-21,1079251,Boston Celtics,-5,4,1:43,103.0,-103.0,0,4,0:05.1,5.1,-5.1
1,Brooklyn Nets at Boston Celtics,2023-11-10,1076985,Boston Celtics,-1,1,10:28,2788.0,-2772.0,-1,1,10:28,2788.0,-2788.0
2,Charlotte Hornets at Washington Wizards,2023-11-10,1077137,Charlotte Hornets,-16,2,6:45,1845.0,-1845.0,-2,4,4:29,269.0,-269.0
3,Los Angeles Clippers at Dallas Mavericks,2023-11-10,1077344,Dallas Mavericks,-11,1,1:53,2273.0,-2273.0,-1,2,11:12,2112.0,-2112.0
4,Philadelphia 76ers at Detroit Pistons,2023-11-10,1077482,Philadelphia 76ers,-15,2,2:39,1599.0,-1599.0,-1,3,2:20,860.0,-860.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1242,Houston Rockets at Portland Trail Blazers,2024-04-12,1077652,Houston Rockets,-8,1,4:58,2458.0,-2458.0,-1,2,11:20,2120.0,-2120.0
1243,Utah Jazz at Los Angeles Clippers,2024-04-12,1077749,Utah Jazz,-13,1,0:28.0,2188.0,-2188.0,-1,4,2:42,162.0,-162.0
1244,Toronto Raptors at Miami Heat,2024-04-12,1077877,Miami Heat,0,1,7:52,2632.0,-2607.0,0,1,7:52,2632.0,-2632.0
1245,Milwaukee Bucks at Oklahoma City Thunder,2024-04-12,1077911,Oklahoma City Thunder,-3,1,9:26,2726.0,-2722.0,0,1,6:17,2537.0,-2537.0


In [75]:
comebacks_df.to_csv('outputs/comebacks_v1.csv')