In [4]:
import pandas as pd
import requests
import time
import os

index_frame = pd.read_csv('index_master.csv')
filtered_df = index_frame[index_frame['year'] >= 2017]

# Find player-year combinations with exactly one team
unique_teams = (
    filtered_df.groupby(['nba_id', 'year'])['team_id']
    .nunique()
    .reset_index()
    .rename(columns={'team_id': 'team_count'})
)
single_team_players = unique_teams[unique_teams['team_count'] == 1]
result_df = pd.merge(filtered_df, single_team_players[['nba_id', 'year']], on=['nba_id', 'year'])
player_team_map = result_df[['nba_id', 'team_id', 'year']].drop_duplicates()
player_team_map = player_team_map.rename(columns={'nba_id': 'player_id', 'year': 'season_end_year'})
def pull_data(url):
    headers = {
        "Host": "stats.nba.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Referer": "https://stats.nba.com/"
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    json = response.json()

    if len(json["resultSets"]) == 1:
        data = json["resultSets"][0]["rowSet"]
        columns = json["resultSets"][0]["headers"]
    else:
        data = json["resultSets"]["rowSet"]
        columns = json["resultSets"]["headers"][1]['columnNames']
    
    return pd.DataFrame.from_records(data, columns=columns)

def get_matchups_for_date(season, date, mode='Offense', is_playoffs=False):
    stype = 'Playoffs' if is_playoffs else 'Regular%20Season'
    url = (
        'https://stats.nba.com/stats/leagueseasonmatchups?'
        f'DateFrom={date}&DateTo={date}&DefPlayerID=&DefTeamID=&LeagueID=00'
        f'&Matchup={mode}&OffPlayerID=&OffTeamID=&Outcome=&PORound=0&PerMode=Totals'
        f'&Season={season}&SeasonType={stype}'
    )
    return pull_data(url)

# === Pipeline Start ===
df = pd.read_csv("game_dates.csv")
df['season_end_year'] = df['season'].str.split('-').str[0].astype(int) + 1
filtered_df = df[df['season_end_year'] >= 2018]
unique_dates = filtered_df[['date', 'season', 'season_end_year', 'playoffs']].drop_duplicates()
unique_dates=unique_dates.head(1)

mode = 'Defense'  # Change to 'Defense' if needed

# Optional: create output folder
output_dir = 'matchup_outputs'
os.makedirs(output_dir, exist_ok=True)

grouped = unique_dates.groupby('season_end_year')
for year, group in grouped:
    filename = os.path.join(output_dir, f"matchups_{mode.lower()}_{year}.csv")

    # Step 1: Check for existing file and load scraped dates
    if os.path.exists(filename):
        existing_df = pd.read_csv(filename)
        scraped_dates = set(existing_df['game_date'].unique())
        print(f"📁 Found existing file for {year} with {len(scraped_dates)} dates already scraped.")
    else:
        scraped_dates = set()

    # Step 2: Filter group to exclude scraped dates
    group_to_scrape = group[~group['date'].isin(scraped_dates)]

    if group_to_scrape.empty:
        print(f"⏭️  All dates already scraped for {year}. Skipping.")
        continue

    all_matchups = []

    for _, row in group_to_scrape.iterrows():
        date = row['date']
        season = row['season']
        is_playoffs = row['playoffs']

        print(f"📅 Pulling {mode} matchup for {date} ({'Playoffs' if is_playoffs else 'Regular Season'})")

        try:
            matchups_df = get_matchups_for_date(season, date, mode=mode, is_playoffs=is_playoffs)
            print(matchups_df.GP)
            matchups_df['season_end_year']=2018
            def_map = player_team_map.rename(columns={'player_id': 'DEF_PLAYER_ID', 'team_id': 'DEF_TEAM_ID'})
            matchups_df = pd.merge(matchups_df, def_map, on=['DEF_PLAYER_ID', 'season_end_year'], how='left')
            def_known = matchups_df[['DEF_PLAYER_ID', 'season_end_year', 'DEF_TEAM_ID']].dropna().drop_duplicates()
            matchusp_df = pd.merge(
                matchups_df.drop(columns=['DEF_TEAM_ID']),
                def_known,
                on=['DEF_PLAYER_ID', 'season_end_year'],
                how='left'
            )
            print(matchups_df.columns)
            matchups_df['game_date'] = date
            matchups_df['season'] = season
            matchups_df['mode'] = mode
            matchups_df['playoffs'] = is_playoffs
            all_matchups.append(matchups_df)
        except Exception as e:
            print(f"❌ Error on {date} ({season}) - {e}")

        time.sleep(0.3)  # Rate limiting

    if all_matchups:
        new_df = pd.concat(all_matchups, ignore_index=True)
        new_df.drop_duplicates(inplace=True)

        # Step 3: Append to existing file if it exists
        if os.path.exists(filename):
            final_df = pd.concat([existing_df, new_df], ignore_index=True).drop_duplicates()
        else:
            final_df = new_df

        final_df.to_csv(filename, index=False)
        print(f"✅ Saved/updated file for {year}: {filename}")
    else:
        print(f"⚠️ No new data collected for {year}.")

print("\n✅ Done. Matchups saved.")

📅 Pulling Defense matchup for 20171017 (Regular Season)
Index(['SEASON_ID', 'OFF_PLAYER_ID', 'OFF_PLAYER_NAME', 'DEF_PLAYER_ID',
       'DEF_PLAYER_NAME', 'GP', 'MATCHUP_MIN', 'PARTIAL_POSS', 'PLAYER_PTS',
       'TEAM_PTS', 'MATCHUP_AST', 'MATCHUP_TOV', 'MATCHUP_BLK', 'MATCHUP_FGM',
       'MATCHUP_FGA', 'MATCHUP_FG_PCT', 'MATCHUP_FG3M', 'MATCHUP_FG3A',
       'MATCHUP_FG3_PCT', 'HELP_BLK', 'HELP_FGM', 'HELP_FGA', 'HELP_FG_PERC',
       'MATCHUP_FTM', 'MATCHUP_FTA', 'SFL', 'MATCHUP_TIME_SEC',
       'season_end_year', 'DEF_TEAM_ID'],
      dtype='object')
✅ Saved/updated file for 2018: matchup_outputs/matchups_defense_2018.csv

✅ Done. Matchups saved.
