In [18]:
import pandas as pd
import requests
import os
from nba_api.stats.static import teams
import io

def get_dates(start_year=2025, end_year=2026):
    dates = []
    print(f"DEBUG: Looking for data from {start_year} to {end_year-1}")
    
    for year in range(start_year, end_year):
        print(f"DEBUG: Processing year {year}")
        year_files_found = 0
        
        for team in teams.get_teams():
            team_id = team['id']
            team_name = team['full_name']
            path = f'../team/{year}ps/{team_id}.csv'
            
            if os.path.exists(path):
                year_files_found += 1
                print(f"DEBUG: Found file for {team_name} ({team_id}): {path}")
                
                try:
                    df = pd.read_csv(path)
                    print(f"DEBUG: File shape: {df.shape}, Columns: {list(df.columns)}")
                    
                    # Check if required columns exist
                    required_cols = {'PLAYER_ID', 'HTM', 'VTM', 'GAME_DATE', 'GAME_ID'}
                    if required_cols.issubset(df.columns):
                        df = df[['PLAYER_ID', 'HTM', 'VTM', 'GAME_DATE', 'GAME_ID']]
                        df['year'] = year
                        df.drop_duplicates(inplace=True)
                        print(f"DEBUG: Added {len(df)} unique games from {team_name}")
                        dates.append(df)
                    else:
                        missing_cols = required_cols - set(df.columns)
                        print(f"DEBUG: Missing columns in {team_name} file: {missing_cols}")
                        
                except Exception as e:
                    print(f"DEBUG: Error reading {path}: {e}")
            else:
                # Only print for first few teams to avoid spam
                if team_id <= 1610612739:  # First few team IDs
                    print(f"DEBUG: File not found for {team_name} ({team_id}): {path}")
        
        print(f"DEBUG: Year {year} - Found {year_files_found} team files")
    
    if dates:
        combined_df = pd.concat(dates).drop_duplicates(subset='GAME_ID')
        print(f"DEBUG: Total unique games found: {len(combined_df)}")
        print(f"DEBUG: Date range: {combined_df['GAME_DATE'].min()} to {combined_df['GAME_DATE'].max()}")
        return combined_df
    else:
        print("DEBUG: No valid data files found")
        return pd.DataFrame()

def fetch_game_csvs(dateframe, save_dir='game_data'):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    all_game_data = []
    successful_fetches = 0
    failed_fetches = 0
    
    print(f"DEBUG: Attempting to fetch {len(dateframe)} games")
    
    for idx, (_, row) in enumerate(dateframe.iterrows()):
        year = int(row['year'])
        game_id = row['GAME_ID']
        url = f'https://raw.githubusercontent.com/gabriel1200/player_sheets/refs/heads/master/game_report/{year}/{game_id}.csv'
        
        # Print progress every 10 games
        if idx % 10 == 0:
            print(f"DEBUG: Processing game {idx+1}/{len(dateframe)}")
        
        print(f"DEBUG: Fetching {url}")
        
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            
            # Check if response is actually CSV data
            if len(response.text.strip()) == 0:
                print(f"DEBUG: Empty response for {game_id}")
                failed_fetches += 1
                continue
                
            df = pd.read_csv(io.StringIO(response.text))
            print(f"DEBUG: Successfully loaded CSV with {len(df)} rows, {len(df.columns)} columns")
            
            df['GAME_ID'] = game_id
            df['date'] = row['GAME_DATE']
            df['HTM'] = row['HTM']
            df['VTM'] = row['VTM']
            df['year'] = year
            all_game_data.append(df)
            successful_fetches += 1

            # Save raw CSV locally
            filename = os.path.join(save_dir, f'{year}_{game_id}.csv')
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"DEBUG: Saved to {filename}")

        except requests.HTTPError as e:
            print(f'DEBUG: HTTP Error for {url}: {e} (Status: {e.response.status_code if hasattr(e, "response") else "Unknown"})')
            failed_fetches += 1
        except requests.Timeout:
            print(f'DEBUG: Timeout for {url}')
            failed_fetches += 1
        except Exception as e:
            print(f'DEBUG: Unexpected error for {game_id}: {type(e).__name__}: {e}')
            failed_fetches += 1
    
    print(f"DEBUG: Fetch summary - Success: {successful_fetches}, Failed: {failed_fetches}")
    
    if all_game_data:
        combined_df = pd.concat(all_game_data, ignore_index=True)
        print(f"DEBUG: Combined dataframe shape: {combined_df.shape}")
        return combined_df
    else:
        print("DEBUG: No game data successfully fetched")
        return pd.DataFrame()

def process_and_save_series_data(df, dateframe):
    print(f"DEBUG: Processing dataframe with {len(df)} rows")
    
    # Merge HTM, VTM, GAME_DATE, year from dateframe
    df.rename(columns={'GAME_DATE': 'date'}, inplace=True)
    if 'opp_team' in df.columns:
        df.drop(columns='opp_team',inplace=True)

    # Debug team abbreviations
    print(f"DEBUG: Unique team abbreviations in data: {sorted(df['TEAM_ABBREVIATION'].unique())}")
    print(f"DEBUG: Unique HTM values: {sorted(df['HTM'].dropna().unique())}")
    print(f"DEBUG: Unique VTM values: {sorted(df['VTM'].dropna().unique())}")

    # Home/away split using merged HTM/VTM
    home = df[df.HTM == df.TEAM_ABBREVIATION].copy()
    away = df[df.VTM == df.TEAM_ABBREVIATION].copy()
    none = df[df.HTM.isna()].copy().reset_index(drop=True)
    
    print(f"DEBUG: Home games: {len(home)}, Away games: {len(away)}, Games with no HTM: {len(none)}")
    
    home.drop(columns='HTM', inplace=True)
    home.rename(columns={'VTM': 'opp_team'}, inplace=True)

    away.drop(columns='VTM', inplace=True)
    away.rename(columns={'HTM': 'opp_team'}, inplace=True)

    home.drop_duplicates(inplace=True)
    away.drop_duplicates(inplace=True)
    
    frames = [home, away]
    if len(none) > 0:
        frames.append(none)
    df = pd.concat(frames, ignore_index=True)

    print(f"DEBUG: After home/away split: {len(df)} rows")

    # Map opp_team from date/team_id
    oppframe = df[['TEAM_ID', 'date', 'opp_team']].dropna(subset=['opp_team']).drop_duplicates()
    print(f"DEBUG: Opponent mapping frame: {len(oppframe)} rows")
    print(oppframe.head())
    
    df.drop(columns='opp_team', inplace=True)
    df = df.merge(oppframe, on=['TEAM_ID', 'date'], how='left')

    df['team'] = df['TEAM_ABBREVIATION']
    teammap = dict(zip(df['TEAM_ABBREVIATION'], df['TEAM_ID']))

    print(f"DEBUG: Team mapping created with {len(teammap)} teams")
    print('Printing Team Map')
    print(teammap)
    # Build current player index
    player_index = df[['PLAYER_NAME', 'PLAYER_ID', 'team', 'TEAM_ID', 'opp_team', 'year']].copy()
    player_index['opp_id'] = player_index['opp_team'].map(teammap)
    player_index.drop_duplicates(inplace=True)

    print(f"DEBUG: Player index created with {len(player_index)} unique entries")

    # Merge with existing index file if it exists
    index_path = 'series_index_players.csv'
    if os.path.exists(index_path):
        existing_index = pd.read_csv(index_path)
        print(f"DEBUG: Found existing player index with {len(existing_index)} rows")
        combined = pd.concat([existing_index, player_index], ignore_index=True)
        combined.drop_duplicates(
            subset=['PLAYER_ID', 'team', 'TEAM_ID', 'opp_team', 'year'], keep='last', inplace=True
        )
        combined.to_csv(index_path, index=False)
        print(f"DEBUG: Updated player index saved with {len(combined)} rows")
    else:
        player_index.to_csv(index_path, index=False)
        print(f"DEBUG: New player index saved with {len(player_index)} rows")

    # Continue data cleanup
    df = df.dropna(subset=['opp_team'])
    print(f"DEBUG: After dropping rows with no opponent: {len(df)} rows")
    print(f"DEBUG: Unique opponents: {sorted(df.opp_team.unique())}")

    df['opp_id'] = df['opp_team'].map(teammap)
    df.sort_values(by='date', inplace=True)
    df['series_key'] = df['team'] + '_' + df['opp_team'] + '_' + df['year'].astype(str)
    
    unique_series = df['series_key'].nunique()
    print(f"DEBUG: Created {unique_series} unique series")
    
    series_index = df[['series_key', 'team', 'opp_team', 'TEAM_ID','opp_id', 'year']]
    series_index.drop_duplicates(inplace=True)
    
    series_index_path = 'series_index.csv'
    if os.path.exists(series_index_path):
        existing_series_index = pd.read_csv(series_index_path)
        print(f"DEBUG: Found existing series index with {len(existing_series_index)} rows")
        combined_series_index = pd.concat([existing_series_index, series_index], ignore_index=True)
        combined_series_index.drop_duplicates(
            subset=['series_key', 'team', 'TEAM_ID', 'opp_team', 'opp_id', 'year'], keep='last', inplace=True
        )
        combined_series_index.to_csv(series_index_path, index=False)
        print(f"DEBUG: Updated series index saved with {len(combined_series_index)} rows")
    else:
        series_index.to_csv(series_index_path, index=False)
        print(f"DEBUG: New series index saved with {len(series_index)} rows")
    
    df.to_csv('playoff_data.csv', index=False)
    print(f"DEBUG: Main playoff data saved with {len(df)} rows")

    # Save series-specific dataframes
    series_dir = '../series/data'
    os.makedirs(series_dir, exist_ok=True)
    
    series_count = 0
    for key, group in df.groupby('series_key'):
        safe_key = key.replace('/', '-').replace('\\', '-')  # sanitize for filesystem
        filename = os.path.join(series_dir, f'{safe_key}.csv')
        group.to_csv(filename, index=False)
        series_count += 1
        if series_count <= 5:  # Only print first 5 to avoid spam
            print(f"DEBUG: Saved series {safe_key} with {len(group)} rows to {filename}")
    
    print(f"DEBUG: Saved {series_count} individual series files")
    return df

def check_directory_structure():
    """Check what years and team directories actually exist"""
    print("DEBUG: Checking directory structure...")
    
    base_path = '../team'
    if not os.path.exists(base_path):
        print(f"DEBUG: Base path {base_path} does not exist!")
        return
    
    # Check what year directories exist
    year_dirs = []
    for item in os.listdir(base_path):
        item_path = os.path.join(base_path, item)
        if os.path.isdir(item_path):
            year_dirs.append(item)
    
    print(f"DEBUG: Available year directories: {sorted(year_dirs)}")
    
    # Check each year directory
    for year_dir in sorted(year_dirs):
        year_path = os.path.join(base_path, year_dir)
        files = [f for f in os.listdir(year_path) if f.endswith('.csv')]

# === Enhanced Run Pipeline ===
def run_pipeline(start_year=2024, end_year=2025):
    print("="*50)
    print("STARTING NBA DATA PIPELINE DEBUG")
    print("="*50)
    
    # First check directory structure
    check_directory_structure()
    
    print(f"\nAttempting to process years {start_year} to {end_year-1}")
    
    dates = get_dates(start_year, end_year)
    if dates.empty:
        print(f'ERROR: No game date data found for {start_year}-{end_year - 1}.')
        print("This could be because:")
        print("1. The year directories don't exist")
        print("2. The CSV files don't exist in those directories")
        print("3. The CSV files exist but don't have the required columns")
        print("4. The path structure is different than expected")
        return None
    else:
        print(f"SUCCESS: Found game dates for {len(dates)} games")
        raw_df = fetch_game_csvs(dates)
        if raw_df.empty:
            print('ERROR: No game CSVs successfully fetched.')
            print("This could be because:")
            print("1. The GitHub repository structure is different for older years")
            print("2. The game IDs have a different format")
            print("3. Network connectivity issues")
            print("4. The files don't exist in the repository for those years")
            return None
        else:
            print(f"SUCCESS: Fetched {len(raw_df)} rows of game data")
            processed_df = process_and_save_series_data(raw_df, dates)
            print("PIPELINE COMPLETED SUCCESSFULLY")
            return processed_df

# Run with different years to test
if __name__ == "__main__":
    # Test with 2025 first (working)
    print("Testing 2025 (should work):")
    result_2025 = run_pipeline(2025, 2026)
    
    print("\n" + "="*80 + "\n")
    
    # Test with 2024 (not working)
    print("Testing 2024 (problematic):")
    result_2024 = run_pipeline(2024, 2025)
    
    print("\n" + "="*80 + "\n")
    
    # Test with 2023 (not working)
    print("Testing 2023 (problematic):")
    result_2023 = run_pipeline(2023, 2024)



Testing 2025 (should work):
STARTING NBA DATA PIPELINE DEBUG
DEBUG: Checking directory structure...
DEBUG: Available year directories: ['1997', '1997ps', '1998', '1998ps', '1999', '1999ps', '2000', '2000ps', '2001', '2001ps', '2002', '2002ps', '2003', '2003ps', '2004', '2004ps', '2005', '2005ps', '2006', '2006ps', '2007', '2007ps', '2008', '2008ps', '2009', '2009ps', '2010', '2010ps', '2011', '2011ps', '2012', '2012ps', '2013', '2013ps', '2014', '2014ps', '2015', '2015ps', '2016', '2016ps', '2017', '2017ps', '2018', '2018ps', '2019', '2019ps', '2020', '2020ps', '2021', '2021ps', '2022', '2022ps', '2023', '2023ps', '2024', '2024ps', '2025', '2025ps']
DEBUG: 1997 contains 59 CSV files
DEBUG: Sample files in 1997: ['1610612741vs.csv', '1610612764vs.csv', '1610612751vs.csv']
DEBUG: 1997ps contains 33 CSV files
DEBUG: Sample files in 1997ps: ['1610612741vs.csv', '1610612764vs.csv', '1610612765.csv']
DEBUG: 1998 contains 59 CSV files
DEBUG: Sample files in 1998: ['1610612741vs.csv', '1610612

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series_index.drop_duplicates(inplace=True)


DEBUG: Main playoff data saved with 1645 rows
DEBUG: Saved series BOS_NYK_2025 with 64 rows to ../series/data/BOS_NYK_2025.csv
DEBUG: Saved series BOS_ORL_2025 with 49 rows to ../series/data/BOS_ORL_2025.csv
DEBUG: Saved series CLE_IND_2025 with 54 rows to ../series/data/CLE_IND_2025.csv
DEBUG: Saved series CLE_MIA_2025 with 51 rows to ../series/data/CLE_MIA_2025.csv
DEBUG: Saved series DEN_LAC_2025 with 71 rows to ../series/data/DEN_LAC_2025.csv
DEBUG: Saved 28 individual series files
PIPELINE COMPLETED SUCCESSFULLY


Testing 2024 (problematic):
STARTING NBA DATA PIPELINE DEBUG
DEBUG: Checking directory structure...
DEBUG: Available year directories: ['1997', '1997ps', '1998', '1998ps', '1999', '1999ps', '2000', '2000ps', '2001', '2001ps', '2002', '2002ps', '2003', '2003ps', '2004', '2004ps', '2005', '2005ps', '2006', '2006ps', '2007', '2007ps', '2008', '2008ps', '2009', '2009ps', '2010', '2010ps', '2011', '2011ps', '2012', '2012ps', '2013', '2013ps', '2014', '2014ps', '2015', '2015ps

ValueError: Grouper for 'opp_team' not 1-dimensional