In [1]:
import pandas as pd
import requests
import os
from nba_api.stats.static import teams
import io

def get_dates(start_year=2025, end_year=2026):
    """Collect one schedule row per game from the per-team playoff CSVs.

    For every year in ``range(start_year, end_year)`` and every team returned
    by ``teams.get_teams()``, reads ``../team/{year}ps/{team_id}.csv`` when it
    exists and keeps the ``PLAYER_ID``, ``HTM``, ``VTM``, ``GAME_DATE`` and
    ``GAME_ID`` columns plus a ``year`` column holding the loop year. Files
    that lack any of those columns are skipped.

    Args:
        start_year: First year to scan (inclusive).
        end_year: Year at which scanning stops (exclusive).

    Returns:
        A DataFrame with one row per distinct ``GAME_ID`` (the first row
        encountered is kept). When no usable file is found, an empty
        DataFrame with the same columns is returned instead of raising.
    """
    wanted = ['PLAYER_ID', 'HTM', 'VTM', 'GAME_DATE', 'GAME_ID']
    frames = []
    for year in range(start_year, end_year):
        for team in teams.get_teams():
            team_id = team['id']
            path = f'../team/{year}ps/{team_id}.csv'
            if not os.path.exists(path):
                continue
            df = pd.read_csv(path)
            if not set(wanted).issubset(df.columns):
                continue
            # Explicit copy so adding 'year' does not write into a slice of
            # the frame returned by read_csv.
            subset = df[wanted].copy()
            subset['year'] = year
            frames.append(subset.drop_duplicates())

    if not frames:
        # pd.concat([]) raises ValueError; give callers an empty frame with
        # the expected schema instead.
        return pd.DataFrame(columns=wanted + ['year'])
    return pd.concat(frames).drop_duplicates(subset='GAME_ID')

def fetch_game_csvs(dateframe, save_dir='game_data', timeout=30):
    """Download the per-game report CSV for every row of ``dateframe``.

    Each row must provide ``GAME_ID``, ``GAME_DATE``, ``HTM``, ``VTM`` and
    ``year``. The report is fetched from the ``player_sheets`` GitHub
    repository under ``game_report/{calendar year of GAME_DATE}/{GAME_ID}.csv``,
    parsed, tagged with ``GAME_ID``, ``date``, ``HTM``, ``VTM`` and ``year``
    columns taken from the row, and the raw response text is also written to
    ``{save_dir}/{year}_{game_id}.csv``.

    Failures for an individual game are printed and skipped so that one bad
    game does not abort the whole run.

    Args:
        dateframe: DataFrame with one row per game to fetch.
        save_dir: Directory for the raw CSV copies; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        All successfully parsed reports concatenated into one DataFrame, or
        an empty DataFrame when nothing could be fetched.
    """
    os.makedirs(save_dir, exist_ok=True)

    all_game_data = []
    for _, row in dateframe.iterrows():
        game_id = row['GAME_ID']
        # Parse via str(): if GAME_DATE was read from CSV as an integer such
        # as 20250419, to_datetime would treat it as nanoseconds since the
        # epoch and yield 1970.
        # NOTE(review): confirm the on-disk GAME_DATE format.
        year = pd.to_datetime(str(row['GAME_DATE'])).year
        url = f'https://raw.githubusercontent.com/gabriel1200/player_sheets/refs/heads/master/game_report/{year}/{game_id}.csv'

        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            df = pd.read_csv(io.StringIO(response.text))
            df['GAME_ID'] = game_id
            df['date'] = row['GAME_DATE']
            df['HTM'] = row['HTM']
            df['VTM'] = row['VTM']
            df['year'] = row['year']
            all_game_data.append(df)

            # Save raw CSV
            with open(os.path.join(save_dir, f'{year}_{game_id}.csv'), 'w', encoding='utf-8') as f:
                f.write(response.text)

        except requests.RequestException as e:
            # Covers HTTP status errors as well as connection/timeout errors.
            print(f'Failed to fetch {url}: {e}')
        except Exception as e:
            # Best-effort: parsing or disk errors for one game are reported
            # and the loop moves on to the next game.
            print(f'Unexpected error for {game_id}: {e}')

    return pd.concat(all_game_data, ignore_index=True) if all_game_data else pd.DataFrame()

def process_and_save_series_data(df, dateframe):
    """Attach opponent info to game rows and write the series CSV files.

    Steps:
      1. Left-merge ``dateframe`` onto ``df`` by ``GAME_ID``.
      2. For rows whose ``TEAM_ABBREVIATION`` equals ``HTM`` the opponent is
         ``VTM`` and vice versa; rows with a missing ``HTM`` are kept and get
         their opponent from another row with the same ``TEAM_ID``/``date``.
      3. Write ``series_index_players.csv`` (distinct player/team/opponent/
         year combinations).
      4. Drop rows without an opponent, add ``opp_id`` and ``series_key``
         (``{team}_{opp_team}_{year}``), and write ``playoff_data.csv`` plus
         one ``../series/data/{series_key}.csv`` per series.

    Args:
        df: Game rows; must contain ``GAME_ID``, ``TEAM_ABBREVIATION``,
            ``TEAM_ID``, ``PLAYER_NAME``, ``PLAYER_ID`` and ``date``.
        dateframe: Schedule rows keyed by ``GAME_ID``.

    Returns:
        The processed DataFrame that was written to ``playoff_data.csv``.
        An empty ``df`` is returned unchanged and no files are written.
    """
    if df.empty:
        # Nothing was fetched; there is nothing to merge or write.
        return df

    # fetch_game_csvs already stamps HTM/VTM/year (and the report carries
    # PLAYER_ID), so merging the full schedule would suffix those columns to
    # *_x / *_y and break the lookups below. Only bring over what is missing.
    schedule_cols = [
        col for col in dateframe.columns
        if col == 'GAME_ID' or col not in df.columns
    ]
    df = df.merge(dateframe[schedule_cols], how='left', on='GAME_ID').reset_index(drop=True)

    home = df[df['HTM'] == df['TEAM_ABBREVIATION']].copy()
    home.drop(columns='HTM', inplace=True)
    home.rename(columns={'VTM': 'opp_team'}, inplace=True)

    away = df[df['VTM'] == df['TEAM_ABBREVIATION']].copy()
    away.drop(columns='VTM', inplace=True)
    away.rename(columns={'HTM': 'opp_team'}, inplace=True)

    # Rows with no home-team info; their opponent is back-filled below.
    none = df[df['HTM'].isna()].copy().reset_index(drop=True)

    df = pd.concat([home, away, none], ignore_index=True)
    oppframe = df[['TEAM_ID', 'date', 'opp_team']].dropna(subset=['opp_team']).drop_duplicates()

    # Re-derive opp_team per (TEAM_ID, date) so rows that lacked it inherit
    # the opponent from other rows with the same team and date.
    df.drop(columns='opp_team', inplace=True)
    df = df.merge(oppframe, on=['TEAM_ID', 'date'], how='left')

    df['team'] = df['TEAM_ABBREVIATION']
    teammap = dict(zip(df['TEAM_ABBREVIATION'], df['TEAM_ID']))

    player_index = df[['PLAYER_NAME', 'PLAYER_ID', 'team', 'TEAM_ID', 'opp_team', 'year']].copy()
    player_index['opp_id'] = player_index['opp_team'].map(teammap)
    player_index.drop_duplicates(inplace=True)
    player_index.to_csv('series_index_players.csv', index=False)

    # Copy so the column assignments below do not write into a filtered view.
    df = df.dropna(subset=['opp_team']).copy()
    teammap = dict(zip(df['TEAM_ABBREVIATION'], df['TEAM_ID']))
    df['opp_id'] = df['opp_team'].map(teammap)

    df.sort_values(by='date', inplace=True)
    df['series_key'] = df['team'] + '_' + df['opp_team'] + '_' + df['year'].astype(str)

    df.to_csv('playoff_data.csv', index=False)

    # === Save each unique series_key ===
    series_dir = '../series/data'
    os.makedirs(series_dir, exist_ok=True)

    for key, group in df.groupby('series_key'):
        safe_key = key.replace('/', '-').replace('\\', '-')  # Just in case of any illegal characters
        group.to_csv(os.path.join(series_dir, f'{safe_key}.csv'), index=False)

    return df


# === Run Full Process ===
# Season window to process: start is inclusive, end is exclusive.
start_year, end_year = 2025, 2026

# 1) build the per-game schedule, 2) download each game's report,
# 3) derive opponents/series keys and write the output CSVs.
dates = get_dates(start_year=start_year, end_year=end_year)
raw_df = fetch_game_csvs(dateframe=dates)
processed_df = process_and_save_series_data(df=raw_df, dateframe=dates)



Index(['GRID_TYPE', 'GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_NAME',
       'TEAM_ID', 'TEAM_NAME', 'PERIOD', 'MINUTES_REMAINING',
       'SECONDS_REMAINING', 'EVENT_TYPE', 'ACTION_TYPE', 'SHOT_TYPE',
       'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE', 'SHOT_DISTANCE',
       'LOC_X', 'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE',
       'HTM', 'VTM', 'SHOT_ID', 'time', 'extra', 'PLAYERS_ON', 'assisted'],
      dtype='object')
Index(['GRID_TYPE', 'GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_NAME',
       'TEAM_ID', 'TEAM_NAME', 'PERIOD', 'MINUTES_REMAINING',
       'SECONDS_REMAINING', 'EVENT_TYPE', 'ACTION_TYPE', 'SHOT_TYPE',
       'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE', 'SHOT_DISTANCE',
       'LOC_X', 'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE',
       'HTM', 'VTM', 'SHOT_ID', 'time', 'extra', 'PLAYERS_ON', 'assisted'],
      dtype='object')
Index(['GRID_TYPE', 'GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_N