# NBA 2024-25: Slump Shots & Recovery Shots
## Notebook 02: Data Cleaning & Preparation
This notebook focuses on cleaning and preparing the raw shot-level data for analysis. The result of this notebook is a clean, analysis-ready dataset where each row represents a single shot attempt.

In [2]:
# Import libraries
import pandas as pd
import numpy as np

In [3]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Display options: no limit on displayed columns or cell width, at most 100 rows,
# a 160-character console width, and floats rendered with two decimals
display_settings = {
    "display.max_columns": None,
    "display.max_rows": 100,
    "display.width": 160,
    "display.max_colwidth": None,
    "display.float_format": "{:.2f}".format,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

---
## Load Data

In [6]:
# Read the raw shot-level parquet for the 2024-25 regular season into `shots`
raw_shots_path = r"...\03_python_outputs\nba_2024_25_shot_level_data_raw.parquet"
shots = pd.read_parquet(raw_shots_path)

In [7]:
# Preview the first five rows (`head()` returns 5 rows by default)
shots.head()

Unnamed: 0,GRID_TYPE,GAME_ID,GAME_EVENT_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_NAME,PERIOD,MINUTES_REMAINING,SECONDS_REMAINING,EVENT_TYPE,ACTION_TYPE,SHOT_TYPE,SHOT_ZONE_BASIC,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,SHOT_DISTANCE,LOC_X,LOC_Y,SHOT_ATTEMPTED_FLAG,SHOT_MADE_FLAG,GAME_DATE,HTM,VTM
0,Shot Chart Detail,22400001,7,1642258,Zaccharie Risacher,1610612737,Atlanta Hawks,1,11,43,Missed Shot,Jump Shot,3PT Field Goal,Above the Break 3,Left Side Center(LC),24+ ft.,26,-168,205,1,0,20241112,BOS,ATL
1,Shot Chart Detail,22400001,10,1630552,Jalen Johnson,1610612737,Atlanta Hawks,1,11,38,Missed Shot,Driving Floating Bank Jump Shot,2PT Field Goal,Mid-Range,Left Side(L),8-16 ft.,13,-136,-1,1,0,20241112,BOS,ATL
2,Shot Chart Detail,22400001,21,1630552,Jalen Johnson,1610612737,Atlanta Hawks,1,10,50,Made Shot,Jump Shot,3PT Field Goal,Above the Break 3,Right Side Center(RC),24+ ft.,25,157,203,1,1,20241112,BOS,ATL
3,Shot Chart Detail,22400001,34,1630811,Keaton Wallace,1610612737,Atlanta Hawks,1,9,47,Missed Shot,Jump Shot,3PT Field Goal,Above the Break 3,Left Side Center(LC),24+ ft.,25,-176,184,1,0,20241112,BOS,ATL
4,Shot Chart Detail,22400001,36,203991,Clint Capela,1610612737,Atlanta Hawks,1,9,44,Made Shot,Putback Layup Shot,2PT Field Goal,Restricted Area,Center(C),Less Than 8 ft.,2,-25,8,1,1,20241112,BOS,ATL


In [8]:
# Inspect columns: list the column labels of the raw frame to see which fields are available
shots.columns

Index(['GRID_TYPE', 'GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME', 'PERIOD', 'MINUTES_REMAINING', 'SECONDS_REMAINING',
       'EVENT_TYPE', 'ACTION_TYPE', 'SHOT_TYPE', 'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE', 'SHOT_DISTANCE', 'LOC_X', 'LOC_Y',
       'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE', 'HTM', 'VTM'],
      dtype='object')

---
## Sanity Checks

In [10]:
# Confirm: `SHOT_MADE_FLAG` is binary
# Evaluates to True only when every value in the column is 0 or 1
made_flag = shots["SHOT_MADE_FLAG"]
made_flag.isin([0, 1]).all()

True

In [11]:
# Confirm: no duplicate rows
# Count rows that exactly repeat an earlier row; the check passes when there are none
n_duplicate_rows = shots.duplicated().sum()
n_duplicate_rows == 0

True

In [12]:
# --- Confirm: shot order doesn't break within player-game ---
# Order each player's shots chronologically within a game: periods ascend while the
# game clock (minutes, then seconds remaining) descends. `GAME_EVENT_ID` is the last
# key so that shots sharing the same clock second get an explicit order instead of
# depending on the row order of the input file.
# NOTE(review): assumes GAME_EVENT_ID increases with game time — confirm.
shots_sorted = shots.sort_values(
    ["GAME_ID", "PLAYER_ID", "PERIOD",
     "MINUTES_REMAINING", "SECONDS_REMAINING", "GAME_EVENT_ID"],
    ascending=[True, True, True, False, False, True]
)

# 1-based running shot number within each (game, player) group, following the sort above
shots_sorted["shot_order"] = shots_sorted.groupby(["GAME_ID", "PLAYER_ID"]).cumcount() + 1

# Spot-check a single player's first 20 sorted shots by eye
shots_sorted.loc[
    shots_sorted["PLAYER_NAME"].isin(["Stephen Curry"]),
    ["GAME_ID", "PERIOD", "MINUTES_REMAINING", "SECONDS_REMAINING", "shot_order"]
].head(20)

Unnamed: 0,GAME_ID,PERIOD,MINUTES_REMAINING,SECONDS_REMAINING,shot_order
51821,22400007,1,11,28,1
51826,22400007,1,9,14,2
51827,22400007,1,9,0,3
51832,22400007,1,6,22,4
51835,22400007,1,4,26,5
51837,22400007,1,4,7,6
51838,22400007,1,3,33,7
51842,22400007,1,2,14,8
51860,22400007,2,4,18,9
51867,22400007,2,0,8,10


- `shot_order` increasing ✅
- The game clock (`MINUTES_REMAINING`:`SECONDS_REMAINING`) counts down within each `PERIOD` ✅
- `PERIOD` increments when clock resets ✅

In [14]:
# Confirm: location bounds
# Summary statistics (count, mean, std, min, quartiles, max) for the shot coordinates
location_cols = ["LOC_X", "LOC_Y"]
shots[location_cols].describe()

Unnamed: 0,LOC_X,LOC_Y
count,219527.0,219527.0
mean,-2.25,97.45
std,116.72,96.63
min,-250.0,-52.0
25%,-53.0,15.0
50%,0.0,57.0
75%,47.0,186.0
max,250.0,842.0


In [15]:
# --- Confirm: distance consistency ---
# Straight-line distance from the coordinate origin, stored as a temporary column
squared_dist = shots["LOC_X"]**2 + shots["LOC_Y"]**2
shots["calc_dist"] = np.sqrt(squared_dist)

# The reported distance should track the coordinate-derived one (correlation above 0.8)
distance_corr = shots[["SHOT_DISTANCE", "calc_dist"]].corr()
distance_corr.iloc[0, 1] > 0.8

True

In [16]:
# Drop `calc_dist` column
# The temporary column was only needed for the distance check. `errors="ignore"` keeps
# this cell safe to re-run: once the column is gone, a plain drop would raise KeyError.
shots = shots.drop(columns=["calc_dist"], errors="ignore")

---
## Cleaning

In [18]:
# Drop nonessential columns; the result is bound to a new name, `shots_clean`
nonessential_cols = ["GRID_TYPE", "EVENT_TYPE", "SHOT_ATTEMPTED_FLAG"]
shots_clean = shots.drop(columns=nonessential_cols)

In [19]:
# Check: data types
# Look for columns whose dtype needs converting before analysis
shots_clean.dtypes

GAME_ID              object
GAME_EVENT_ID         int64
PLAYER_ID             int64
PLAYER_NAME          object
TEAM_ID               int64
TEAM_NAME            object
PERIOD                int64
MINUTES_REMAINING     int64
SECONDS_REMAINING     int64
ACTION_TYPE          object
SHOT_TYPE            object
SHOT_ZONE_BASIC      object
SHOT_ZONE_AREA       object
SHOT_ZONE_RANGE      object
SHOT_DISTANCE         int64
LOC_X                 int64
LOC_Y                 int64
SHOT_MADE_FLAG        int64
GAME_DATE            object
HTM                  object
VTM                  object
dtype: object

In [20]:
# Convert `GAME_ID` to int64 (it is loaded with dtype object)
game_ids = shots_clean["GAME_ID"]
shots_clean["GAME_ID"] = game_ids.astype("int64")

In [21]:
# Convert `GAME_DATE` to datetime
# The raw values look like compact YYYYMMDD stamps (e.g. 20241112 in the preview), so the
# format is stated explicitly instead of being inferred from the data.
shots_clean["GAME_DATE"] = pd.to_datetime(
    shots_clean["GAME_DATE"], format="%Y%m%d", errors="coerce"
)

# `errors="coerce"` turns anything unparseable into NaT without raising. Surface the count
# so bad dates cannot slip through unnoticed (printed, because warnings are suppressed above).
n_unparsed_dates = int(shots_clean["GAME_DATE"].isna().sum())
if n_unparsed_dates:
    print(f"WARNING: {n_unparsed_dates} GAME_DATE values are missing or could not be parsed (NaT)")

# Show a few random converted values as a visual check
shots_clean["GAME_DATE"].sample(5)

52868    2024-11-06
101851   2025-04-08
122899   2025-03-27
102086   2025-04-11
201621   2025-02-01
Name: GAME_DATE, dtype: datetime64[ns]

In [22]:
# Confirm: updated data types
# `GAME_ID` and `GAME_DATE` were reassigned in the preceding cells; re-inspect all dtypes
shots_clean.dtypes

GAME_ID                       int64
GAME_EVENT_ID                 int64
PLAYER_ID                     int64
PLAYER_NAME                  object
TEAM_ID                       int64
TEAM_NAME                    object
PERIOD                        int64
MINUTES_REMAINING             int64
SECONDS_REMAINING             int64
ACTION_TYPE                  object
SHOT_TYPE                    object
SHOT_ZONE_BASIC              object
SHOT_ZONE_AREA               object
SHOT_ZONE_RANGE              object
SHOT_DISTANCE                 int64
LOC_X                         int64
LOC_Y                         int64
SHOT_MADE_FLAG                int64
GAME_DATE            datetime64[ns]
HTM                          object
VTM                          object
dtype: object

---
## Save

In [24]:
# Sort by shot order
# Game, then player, then period ascending, with the game clock (minutes, then seconds
# remaining) descending. `GAME_EVENT_ID` is the last key so that shots sharing the same
# clock second are saved in an explicit order rather than whatever order the rows
# happened to arrive in.
# NOTE(review): assumes GAME_EVENT_ID increases with game time — confirm.
shots_clean_sorted = shots_clean.sort_values(
    ["GAME_ID", "PLAYER_ID", "PERIOD",
     "MINUTES_REMAINING", "SECONDS_REMAINING", "GAME_EVENT_ID"],
    ascending=[True, True, True, False, False, True]
).reset_index(drop=True)  # replace the shuffled index with a fresh 0..n-1 range

In [25]:
# Write the sorted, cleaned shots to parquet; the path is relative, so the file is
# created in the current working directory
clean_shots_path = "nba_2024_25_shot_level_data_clean.parquet"
shots_clean_sorted.to_parquet(clean_shots_path)