In [None]:
import requests
import pandas as pd
import itertools
import time
import numpy as np

import os
from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
# Root of the API; the request helpers below append the key and endpoint path to it.
BASE_URL = "https://api3.natst.at"
# API key read from the NAT_API_KEY environment variable (None when it is unset).
API_KEY = os.getenv("NAT_API_KEY")

In [None]:
def ncaab_list_all_dates_for_season(league, year):
    """Collect every distinct game day the API lists for one league season.

    Requests ``{BASE_URL}/{API_KEY}/games/{league}/{year}`` and keeps following
    the ``meta['page-next']`` link until a page no longer provides one.

    Args:
        league: League code placed in the URL path (the notebook passes "MBB").
        year: Season identifier placed in the URL path.

    Returns:
        set: The ``gameday`` value of every entry under ``games`` on every page.

    Raises:
        requests.HTTPError: If any page responds with an error status.
        KeyError: If a page lacks the ``games`` or ``meta`` keys.
    """
    dates = set()
    next_page_url = f"{BASE_URL}/{API_KEY}/games/{league}/{year}"

    # One loop serves the first page and every follow-up page. Truthiness
    # (instead of ``is not False``) also stops the loop if the API ever sends
    # null or an empty string as the next-page link.
    while next_page_url:
        # The key is part of the URL path; mask it so it never ends up in
        # saved notebook output.
        print(next_page_url.replace(API_KEY, "<API_KEY>") if API_KEY else next_page_url)
        response = requests.get(next_page_url, timeout=30)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        response.raise_for_status()
        res = response.json()
        dates.update(game['gameday'] for game in res['games'].values())
        next_page_url = res['meta'].get('page-next', False)

    return dates

def ncaab_get_pbp_by_date(league, date):
    """Download all play-by-play entries for one league on one date.

    Requests ``{BASE_URL}/{API_KEY}/playbyplay/{league}/{date}`` and keeps
    following the ``meta['page-next']`` link until a page no longer provides one.

    Args:
        league: League code placed in the URL path (the notebook passes "MBB").
        date: Game day placed in the URL path.

    Returns:
        pandas.DataFrame: One row per entry under ``playbyplay``, flattened with
        ``pd.json_normalize`` (nested keys become dotted column names). Row
        labels are not reset. An empty frame is returned when there are no entries.

    Raises:
        requests.HTTPError: If any page responds with an error status.
        KeyError: If a page lacks the ``playbyplay`` or ``meta`` keys.
    """
    frames = []
    next_page_url = f"{BASE_URL}/{API_KEY}/playbyplay/{league}/{date}"

    # One loop serves the first page and every follow-up page. Truthiness
    # (instead of ``is not False``) also stops the loop if the API ever sends
    # null or an empty string as the next-page link.
    while next_page_url:
        # The key is part of the URL path; mask it so it never ends up in
        # saved notebook output.
        print(next_page_url.replace(API_KEY, "<API_KEY>") if API_KEY else next_page_url)
        response = requests.get(next_page_url, timeout=30)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        response.raise_for_status()
        res = response.json()
        frames.extend(pd.json_normalize(play) for play in res['playbyplay'].values())
        next_page_url = res['meta'].get('page-next', False)

    # Concatenate once at the end; growing a frame inside the loop copies all
    # accumulated rows on every iteration. pd.concat rejects an empty list.
    return pd.concat(frames) if frames else pd.DataFrame()

def ncaab_get_list_of_game_codes_from_single_day_pbp(df):
    """Return the distinct values of the 'game.code' column as a plain list.

    Values appear in order of first occurrence in ``df``.
    """
    game_codes = df['game.code']
    distinct_codes = game_codes.unique()
    return distinct_codes.tolist()

def ncaab_get_final_score(df, game_code):
    """Summarise the result of one game from its last play-by-play row.

    Args:
        df: Play-by-play frame with the columns 'game.code', 'game.home',
            'game.visitor', 'game.score-home', 'game.score-vis',
            'game.description' and 'game.gameday'.
        game_code: Value matched against the 'game.code' column.

    Returns:
        dict: Keys 'gameTitle', 'date', 'home', 'away', 'winner',
        'winnerScore' and 'loseScore'. Both scores are returned as ``int``.
        When the scores are equal the visiting team is reported as the winner.

    Raises:
        IndexError: If ``df`` has no rows for ``game_code``.
    """
    game_rows = df[df['game.code'] == game_code]
    if game_rows.empty:
        raise IndexError(f"no play-by-play rows found for game code {game_code!r}")
    final_play = game_rows.iloc[-1]

    home_team = final_play['game.home']
    away_team = final_play['game.visitor']
    # Convert once and reuse: taking max/min of the raw values compares
    # strings lexicographically when the scores arrive as text ("9" > "85").
    home_score = int(final_play['game.score-home'])
    away_score = int(final_play['game.score-vis'])
    winner = home_team if home_score > away_score else away_team

    return {
        'gameTitle': final_play['game.description'],
        'date': final_play['game.gameday'],
        'home': home_team,
        'away': away_team,
        'winner': winner,
        'winnerScore': max(home_score, away_score),
        'loseScore': min(home_score, away_score)
    }

def ncaab_clean_pbp_df(df: pd.DataFrame):
    """Expand short team names in 'game.visitor' and 'game.home' in place.

    Names found in the lookup table are replaced by their long form; every
    other value is left as it is. ``df`` is modified directly and nothing is
    returned.

    NOTE(review): the lookup table holds NBA franchise names although this
    helper carries the ncaab_ prefix. A college team called e.g. 'Houston' or
    'Utah' would be renamed to the NBA franchise. Confirm this is intended
    before calling it on college data.
    """
    full_names = {
        'Atlanta': 'Atlanta Hawks',
        'Boston': 'Boston Celtics',
        'Brooklyn': 'Brooklyn Nets',
        'Charlotte': 'Charlotte Hornets',
        'Chicago': 'Chicago Bulls',
        'Cleveland': 'Cleveland Cavaliers',
        'Dallas': 'Dallas Mavericks',
        'Denver': 'Denver Nuggets',
        'Detroit': 'Detroit Pistons',
        'Golden State': 'Golden State Warriors',
        'Houston': 'Houston Rockets',
        'Indiana': 'Indiana Pacers',
        'L.A. Clippers': 'Los Angeles Clippers',
        'L.A. Lakers': 'Los Angeles Lakers',
        'Memphis': 'Memphis Grizzlies',
        'Miami': 'Miami Heat',
        'Milwaukee': 'Milwaukee Bucks',
        'Minnesota': 'Minnesota Timberwolves',
        'New Orleans': 'New Orleans Pelicans',
        'New York': 'New York Knicks',
        'Oklahoma City': 'Oklahoma City Thunder',
        'Orlando': 'Orlando Magic',
        'Philadelphia': 'Philadelphia 76ers',
        'Phoenix': 'Phoenix Suns',
        'Portland': 'Portland Trail Blazers',
        'Sacramento': 'Sacramento Kings',
        'San Antonio': 'San Antonio Spurs',
        'Toronto': 'Toronto Raptors',
        'Utah': 'Utah Jazz',
        'Washington': 'Washington Wizards'
    }
    for team_column in ('game.visitor', 'game.home'):
        short_names = df[team_column]
        # Names missing from the table map to NaN; fall back to the original value.
        df[team_column] = short_names.map(full_names).fillna(short_names)

def ncaab_get_deficit_time(df, game_code):
    """Locate the winner's lowest 'thediff' value in a game and when it occurred.

    The winner comes from ``ncaab_get_final_score``. Only rows whose
    'team.team' equals the winner are considered. When several rows share the
    minimum, the last of them is reported.

    Args:
        df: Play-by-play frame with the columns used by
            ``ncaab_get_final_score`` plus 'team.team', 'thediff',
            'game.period' and 'game.time'.
        game_code: Value matched against the 'game.code' column.

    Returns:
        dict: Keys 'game title', 'date', 'winning_team', 'deficit', 'period',
        'clock' and 'seconds_remaining'. 'deficit' is an ``int`` when the
        minimum is a whole number.

    Raises:
        ValueError: If the winner has no row with a numeric 'thediff'.
    """
    box_score = ncaab_get_final_score(df, game_code)
    winning_team = box_score['winner']

    df_game = df[df['game.code'] == game_code]
    df_winner = df_game[df_game['team.team'] == winning_team]

    # Parse once and match on the parsed values, so "+3", "-5", -5 and -5.0
    # are all handled without guessing the original string form.
    numeric_diff = pd.to_numeric(df_winner['thediff'], errors='coerce')
    if not numeric_diff.notna().any():
        raise ValueError(
            f"no numeric 'thediff' rows for {winning_team!r} in game {game_code!r}"
        )
    # Series.min() skips NaN; the builtin min() gives an order-dependent
    # answer as soon as an unparseable value has been coerced to NaN.
    deficit = numeric_diff.min()
    if float(deficit).is_integer():
        deficit = int(deficit)

    # Use a positional mask so repeated row labels cannot confuse the selection.
    low_point = df_winner[(numeric_diff == deficit).to_numpy()].iloc[-1]
    return {
        'game title': box_score['gameTitle'],
        'date': box_score['date'],
        'winning_team': winning_team,
        'deficit': deficit,
        'period': low_point['game.period'],
        'clock': low_point['game.time'],
        'seconds_remaining': ncaab_get_seconds_remaining(low_point['game.period'], low_point['game.time'])
    }

def ncaab_get_seconds_remaining(period, clock: str):
    """Convert a period number and a clock string into a number of seconds.

    ``clock`` is split on ':' into either 'minutes:seconds' or
    'minutes:seconds:fraction'; the third field is divided by 100 before it is
    added. For periods below 2, twenty minutes are added for each period
    still to come before period 2. For period 2 and later only the clock
    value itself is returned.

    Raises:
        ValueError: If ``clock`` does not split into two or three fields, or a
            field is not numeric.
    """
    fields = clock.split(':')
    if len(fields) == 3:
        minutes, seconds, ms = fields
    else:
        # Unpacking raises ValueError for anything other than two fields.
        minutes, seconds = fields
        ms = 0

    period_number = int(period)
    if period_number < 2:
        whole_seconds = (2 - period_number) * 20 * 60 + int(minutes) * 60
    else:
        whole_seconds = int(minutes) * 60
    return whole_seconds + float(seconds) + float(ms) / 100

In [None]:
# Fetch the set of game days for league "MBB", season 2024 (performs HTTP requests).
active_game_dates = ncaab_list_all_dates_for_season("MBB", 2024)
active_game_dates

In [None]:
# Number of distinct game days returned for the season.
len(active_game_dates)

In [None]:
# continue querying
# Take the already-fetched dates from the cached CSV instead of `df`: `df` is
# only created in a later cell, so on a fresh kernel this cell raised NameError.
cached_pbp_path = 'outputs/full_filtered_ncaab_output.csv'
if os.path.exists(cached_pbp_path):
    # Only the date column is needed here, so skip loading the rest of the file.
    already_queried_dates = set(
        pd.read_csv(cached_pbp_path, usecols=['game.gameday'])['game.gameday']
    )
else:
    # No cache yet: every date still has to be queried.
    already_queried_dates = set()
remaining = active_game_dates - already_queried_dates
len(remaining)

In [None]:
# Load the cached play-by-play data. (The former `df = pd.DataFrame()` was
# overwritten on the very next line, so it has been dropped.)
df = pd.read_csv('outputs/full_filtered_ncaab_output.csv')

# for date in remaining:
#     print(f"Querying for day: {date}...")
#     single_day_pbp_df = ncaab_get_pbp_by_date("MBB", date)
#     df = pd.concat([df, single_day_pbp_df])
#     print("Successfully added day. Sleeping for 0s...")
# df.to_csv('outputs/full_ncaab_output.csv')

In [None]:
# Drop every row whose 'tags' value contains 'MISC'; rows without tags are kept.
df = df[~df['tags'].str.contains('MISC', na=False)]
# index=False: this file is read back with a plain read_csv, so writing the
# row index would add one more 'Unnamed: 0' column on every load/save round trip.
df.to_csv('outputs/full_filtered_ncaab_output.csv', index=False)

In [None]:
# Alphabetical list of the distinct 'game.home' values, for eyeballing team names.
sorted(df['game.home'].unique().tolist())

In [None]:
# Number of distinct game codes in the loaded data, followed by the column names.
print(len(ncaab_get_list_of_game_codes_from_single_day_pbp(df)))
df.columns

In [None]:
# Game excluded from the analysis; it is inspected separately in the
# "Investigation" section at the end of the notebook.
SKIPPED_GAME_CODE = 1258358

comebacks = []
for game in ncaab_get_list_of_game_codes_from_single_day_pbp(df):
    # Match the code whether the column holds numbers or text.
    if game == SKIPPED_GAME_CODE or str(game) == str(SKIPPED_GAME_CODE):
        continue
    print(game)
    output = ncaab_get_deficit_time(df, game)
    comebacks.append(output)

In [None]:
comebacks_df = pd.DataFrame(comebacks)
# Games removed from the analysis by title.
exclude = [
    "Canisius Golden Griffins vs Wofford Terriers", 
    "Colorado Buffaloes vs Miami (Fla.) Hurricanes"
]
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice (SettingWithCopyWarning).
comebacks_df = comebacks_df[~comebacks_df['game title'].isin(exclude)].copy()
# deficit + 5, capped at 0 — the vectorized equivalent of min(x + 5, 0) per row.
comebacks_df['analytical_lead_minus_five_abs'] = (comebacks_df['deficit'] + 5).clip(upper=0)
comebacks_df['analytical_time_required'] = (comebacks_df['analytical_lead_minus_five_abs'])**2
comebacks_df['analytical_diff'] = comebacks_df['analytical_time_required'] - comebacks_df['seconds_remaining']
comebacks_df

In [None]:
# Persist the comeback table (the row index is written as the first column).
comebacks_df.to_csv('outputs/ncaab_comebacks_v1.csv')

## Investigation

In [None]:
# Compare numerically: the loop above treats this game code as an int, and an
# equality test against the string "1258358" selects nothing when the
# 'game.code' column is numeric.
df_error = df[pd.to_numeric(df['game.code'], errors='coerce') == 1258358]
df_error['game.description']