# NBA Data Exploration - Play-by-Play Structure

Quick exploration of what data we have and how to use it for win probability modeling.

In [17]:
import json
import os
import pandas as pd
from glob import glob

## 1. What Files Do We Have?

In [18]:
# Check downloaded files
pbp_files = glob('./nba_data/pbp_*.json')
summary_files = glob('./nba_data/summary_*.json')

print(f"Play-by-play files: {len(pbp_files)}")
print(f"Summary files: {len(summary_files)}")
print(f"\nFirst few games:")
for f in pbp_files[:3]:
    print(f"  {os.path.basename(f)}")

Play-by-play files: 100
Summary files: 99

First few games:
  pbp_deea9000-6a18-4340-8084-e21d2ae854b0.json
  pbp_bb7154a0-c923-4a43-95a6-1c01ab13b52e.json
  pbp_b6da40ae-fed0-4ccb-b4e5-de2276d50cd8.json


## 2. Load One Game

In [19]:
# Load first game
with open(pbp_files[0], 'r') as f:
    pbp = json.load(f)

# Fix: Extract game_id correctly
game_id = os.path.basename(pbp_files[0]).replace('pbp_', '').replace('.json', '')

with open(f'./nba_data/summary_{game_id}.json', 'r') as f:
    summary = json.load(f)

print("Game Info:")
print(f"Home: {pbp['home']['name']}")
print(f"Away: {pbp['away']['name']}")
print(f"Final Score: {summary['home']['points']} - {summary['away']['points']}")

Game Info:
Home: Bucks
Away: Bulls
Final Score: 122 - 133


## 3. Play-by-Play Structure

In [20]:
# What's in the play-by-play?
print("Top-level keys:")
print(list(pbp.keys()))

print(f"\nNumber of periods: {len(pbp['periods'])}")

# First period structure
period1 = pbp['periods'][0]
print(f"\nPeriod 1 keys: {list(period1.keys())}")
print(f"Events in period 1: {len(period1['events'])}")

Top-level keys:
['id', 'status', 'coverage', 'scheduled', 'duration', 'attendance', 'lead_changes', 'times_tied', 'clock', 'quarter', 'track_on_court', 'reference', 'entry_mode', 'sr_id', 'clock_decimal', 'broadcasts', 'time_zones', 'season', 'home', 'away', 'periods', 'deleted_events']

Number of periods: 4

Period 1 keys: ['type', 'id', 'number', 'sequence', 'scoring', 'events']
Events in period 1: 119


## 4. Sample Events

In [21]:
# Look at first 10 events
events = period1['events'][:10]

event_data = []
for e in events:
    event_data.append({
        'clock': e.get('clock', ''),
        'type': e.get('event_type', ''),
        'description': e.get('description', ''),
        'points': e.get('points', 0)
    })

df_events = pd.DataFrame(event_data)
print("First 10 events:")
print(df_events)

First 10 events:
   clock            type                                        description  \
0  12:00    lineupchange  Bulls lineup change (Coby White, Patrick Willi...   
1  12:00    lineupchange  Bucks lineup change (Giannis Antetokounmpo, Br...   
2  12:00         opentip  Nikola Vucevic vs. Brook Lopez (Giannis Anteto...   
3  11:42  threepointmiss        Damian Lillard misses three point jump shot   
4  11:38         rebound                 Patrick Williams defensive rebound   
5  11:28    twopointmade  Josh Giddey makes two point floating jump shot...   
6  11:15  threepointmade  Taurean Prince makes three point jump shot (Ga...   
7  11:01    twopointmiss  Patrick Williams misses two point turnaround f...   
8  10:58         rebound                   Taurean Prince defensive rebound   
9  10:45    twopointmiss   Gary Trent Jr. misses two point pullup jump shot   

   points  
0       0  
1       0  
2       0  
3       0  
4       0  
5       0  
6       0  
7       0  
8    

## 5. Event Types Available

In [22]:
# Get all event types across all periods
event_types = set()
for period in pbp['periods']:
    for event in period['events']:
        event_types.add(event.get('event_type', 'unknown'))

print("Available event types:")
for et in sorted(event_types):
    print(f"  - {et}")

Available event types:
  - delay
  - endperiod
  - freethrowmade
  - freethrowmiss
  - lineupchange
  - opentip
  - personalfoul
  - rebound
  - review
  - shootingfoul
  - stoppage
  - teamtimeout
  - technicalfoulnonunsportsmanlike
  - threepointmade
  - threepointmiss
  - turnover
  - twopointmade
  - twopointmiss


## 6. Key Data Points for Win Probability

In [23]:
# Example: Extract game state at a specific moment
def extract_game_state(pbp, period_idx=2, event_idx=50):
    """Extract game state at a specific moment"""
    
    period = pbp['periods'][period_idx]
    event = period['events'][event_idx]
    
    # Calculate scores up to this point
    home_score = 0
    away_score = 0
    
    for p_idx in range(period_idx + 1):
        events = pbp['periods'][p_idx]['events']
        end_idx = event_idx if p_idx == period_idx else len(events)
        
        for e in events[:end_idx]:
            if e.get('event_type') == 'fieldgoalmade':
                points = e.get('points', 0)
                team_id = e.get('attribution', {}).get('team', {}).get('id')
                if team_id == pbp['home']['id']:
                    home_score += points
                else:
                    away_score += points
            elif e.get('event_type') == 'freethrow' and e.get('made'):
                team_id = e.get('attribution', {}).get('team', {}).get('id')
                if team_id == pbp['home']['id']:
                    home_score += 1
                else:
                    away_score += 1
    
    return {
        'quarter': period.get('number'),
        'clock': event.get('clock'),
        'home_score': home_score,
        'away_score': away_score,
        'score_diff': home_score - away_score,
    }

# Test it
state = extract_game_state(pbp, period_idx=2, event_idx=50)
print("Game state at Q3, event 50:")
print(state)

Game state at Q3, event 50:
{'quarter': 3, 'clock': '5:59', 'home_score': 0, 'away_score': 0, 'score_diff': 0}


## 7. What We Need for Training

In [24]:
print("Key features to extract for each game moment:")
print("\n1. GAME STATE")
print("   - Quarter (1-4)")
print("   - Time remaining")
print("   - Score differential")
print("   - Timeouts remaining")

print("\n2. MOMENTUM")
print("   - Points in last 10 possessions (each team)")
print("   - Recent FG percentage")

print("\n3. OUTCOME (TARGET)")
print("   - Did home team win? (1/0)")

print("\n4. SAMPLING STRATEGY")
print("   - Sample every ~5 events or every possession change")
print("   - Creates ~100 snapshots per game")
print(f"   - {len(pbp_files)} games × 100 snapshots = ~{len(pbp_files) * 100} training examples")

Key features to extract for each game moment:

1. GAME STATE
   - Quarter (1-4)
   - Time remaining
   - Score differential
   - Timeouts remaining

2. MOMENTUM
   - Points in last 10 possessions (each team)
   - Recent FG percentage

3. OUTCOME (TARGET)
   - Did home team win? (1/0)

4. SAMPLING STRATEGY
   - Sample every ~5 events or every possession change
   - Creates ~100 snapshots per game
   - 100 games × 100 snapshots = ~10000 training examples


## 8. Teams in Dataset

In [25]:
# Check structure of first file
with open(pbp_files[0], 'r') as f:
    data = json.load(f)
    
print("Top-level keys:")
print(list(data.keys()))
print("\nFirst few key-value pairs:")
for key in list(data.keys())[:5]:
    print(f"{key}: {type(data[key])}")

Top-level keys:
['id', 'status', 'coverage', 'scheduled', 'duration', 'attendance', 'lead_changes', 'times_tied', 'clock', 'quarter', 'track_on_court', 'reference', 'entry_mode', 'sr_id', 'clock_decimal', 'broadcasts', 'time_zones', 'season', 'home', 'away', 'periods', 'deleted_events']

First few key-value pairs:
id: <class 'str'>
status: <class 'str'>
coverage: <class 'str'>
scheduled: <class 'str'>
duration: <class 'str'>


In [26]:
# Get all teams with better error handling
teams = set()
problem_files = []

for pbp_file in pbp_files:
    try:
        with open(pbp_file, 'r') as f:
            data = json.load(f)
            
        if 'home' in data and 'away' in data:
            teams.add(data['home']['name'])
            teams.add(data['away']['name'])
        else:
            problem_files.append(pbp_file)
            
    except (KeyError, json.JSONDecodeError) as e:
        print(f"Error in {os.path.basename(pbp_file)}: {e}")
        problem_files.append(pbp_file)

print(f"\nTeams in dataset ({len(teams)} total):")
for team in sorted(teams):
    print(f"  - {team}")

if problem_files:
    print(f"\nProblem files ({len(problem_files)}):")
    for f in problem_files:
        print(f"  - {os.path.basename(f)}")


Teams in dataset (30 total):
  - 76ers
  - Bucks
  - Bulls
  - Cavaliers
  - Celtics
  - Clippers
  - Grizzlies
  - Hawks
  - Heat
  - Hornets
  - Jazz
  - Kings
  - Knicks
  - Lakers
  - Magic
  - Mavericks
  - Nets
  - Nuggets
  - Pacers
  - Pelicans
  - Pistons
  - Raptors
  - Rockets
  - Spurs
  - Suns
  - Thunder
  - Timberwolves
  - Trail Blazers
  - Warriors
  - Wizards

Problem files (2):
  - pbp_a06b10fa-fd80-4058-9e0a-d3d1d69cb6f1.json
  - pbp_28dfca06-a3e0-43f5-a841-53fb7d75cc00.json


## 9. Quick Stats

In [27]:
# Game statistics
total_events = 0
total_periods = 0
valid_games = 0

for pbp_file in pbp_files:
    try:
        with open(pbp_file, 'r') as f:
            data = json.load(f)
            
        if 'periods' in data:
            for period in data['periods']:
                total_events += len(period.get('events', []))
                total_periods += 1
            valid_games += 1
        else:
            print(f"No periods in: {os.path.basename(pbp_file)}")
            
    except Exception as e:
        print(f"Error in {os.path.basename(pbp_file)}: {e}")

if valid_games > 0:
    print(f"\nDataset Statistics:")
    print(f"  Valid games: {valid_games}/{len(pbp_files)}")
    print(f"  Total periods: {total_periods}")
    print(f"  Total events: {total_events:,}")
    print(f"  Avg events per game: {total_events / valid_games:.0f}")
    print(f"  Estimated training snapshots: ~{valid_games * 100:,}")
else:
    print("\nNo valid games found!")

No periods in: pbp_a06b10fa-fd80-4058-9e0a-d3d1d69cb6f1.json
No periods in: pbp_28dfca06-a3e0-43f5-a841-53fb7d75cc00.json

Dataset Statistics:
  Valid games: 98/100
  Total periods: 399
  Total events: 47,603
  Avg events per game: 486
  Estimated training snapshots: ~9,800


---

## Summary

**What we have:**
- Play-by-play events (shots, fouls, turnovers, etc.)
- Game clock and quarter info
- Team and player attribution
- Final outcomes

**Next steps:**
1. Process all games into snapshots (every ~5 events)
2. Extract features: score, time, momentum
3. Train XGBoost model on `home_won` target

**Model approach:**
- Input: Game state (score, time, momentum)
- Output: Probability home team wins
- Model: XGBoost classifier
- Metric: Log loss (probability calibration)