# NBA 2024-25: Utilizing Roles
## Notebook 06: Role-Based Analysis
This notebook computes the core role output and consistency metrics used in the final dashboard, including PRA Signal, All-Star Output Rate, Output per Role, and Output Consistency.

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [3]:
# Display options
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
pd.set_option("display.width", 160)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", lambda x: f"{x:.2f}")

___
## Load

In [5]:
# Load game logs data and all-star baselines
game_logs = pd.read_parquet(r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\03_python_outputs\Merged_Player_Team_GameLogs_2024_25_final.parquet")
baselines = pd.read_parquet(r"C:\Users\dylan\OneDrive\Documents\Portfolio_Projects\NBA_2024_25_utilizing_roles\03_python_outputs\AS_baselines\NBA_Per_Game_2019_2024_baselines.parquet")

In [6]:
# Inspect game logs data
game_logs.sample(5)

Unnamed: 0,Player_Name,Player_ID,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FTA,TOV,REB,AST,PTS,TEAM_ABBREVIATION,TEAM_ID,TEAM_MIN,TEAM_FGA,TEAM_FTA,TEAM_TOV,USG%,PRA,Season,Age,Team,G,Pos,season_PTS,season_REB,season_AST,season_USG%,season_PRA
13818,Josh Hart,1628404,22400811,"Feb 23, 2025",NYK @ BOS,L,40,8,14,4,2,11,9,20,NYK,1610612752,240,92,11,11,19.76,40,2024-25,29,NYK,77,SG,13.6,9.6,5.9,15.3,29.1
5389,Danté Exum,203957,22400712,"Feb 04, 2025",DAL @ PHI,L,20,6,10,2,1,1,4,14,DAL,1610612742,240,85,31,12,25.77,19,2024-25,29,DAL,20,PG,8.7,1.7,2.8,19.6,13.2
2151,Bilal Coulibaly,1641731,22400341,"Dec 07, 2024",WAS vs. DEN,W,40,2,9,2,0,6,5,6,WAS,1610612764,240,93,22,17,9.91,17,2024-25,20,WAS,59,SF,12.3,5.0,3.4,17.9,20.7
6117,Deni Avdija,1630166,22400339,"Dec 06, 2024",POR vs. UTA,L,24,2,9,2,1,5,2,6,POR,1610612757,240,95,17,11,19.18,13,2024-25,24,POR,72,SF,16.9,7.3,3.9,23.2,28.1
269,Aaron Nesmith,1630174,22401119,"Apr 04, 2025",IND vs. UTA,W,20,5,7,1,1,3,0,12,IND,1610612754,240,89,38,7,17.97,15,2024-25,25,IND,45,SF,12.0,4.0,1.2,17.4,17.2


___
## 1) Filters and Cutoff Logic
#### a) Minutes played (minimum)

In [8]:
# Minutes played (per player) stats
game_logs["MIN"].describe(percentiles=[0.10, 0.20, 0.25, 0.30, 0.35, 0.40])

count   26306.00
mean       22.57
std        10.85
min         0.00
10%         6.00
20%        12.00
25%        15.00
30%        17.00
35%        19.00
40%        20.00
50%        24.00
max        53.00
Name: MIN, dtype: float64

> For a player's game to be included in this study, they must be on the court for at least 12 minutes (20th percentile of all minutes played values).
>
> This ensures that enough game activity occurs for meaningful USG% and PRA patterns to unfold.

In [10]:
# Filter for only (meaningful) game logs with 12+ minutes played
meaningful_game_logs = game_logs[game_logs["MIN"] >= 12]

#### b) Games Played (minimum)

In [12]:
# Games (per player) stats
games_per_player = meaningful_game_logs.groupby("Player_ID").size().reset_index(name="games_played")
games_per_player["games_played"].describe()

count   533.00
mean     40.32
std      26.08
min       1.00
25%      15.00
50%      42.00
75%      64.00
max      82.00
Name: games_played, dtype: float64

> A full regular season is typically 82 games. In order to qualify for this study, a player must play at least 20 games (roughly one quarter of the regular season).
>
> With this season-long filter, enough game-to-game activity occurs for consistency metrics to become meaningful.

In [14]:
# Merge into meaningful game logs
meaningful_game_logs = meaningful_game_logs.merge(
    games_per_player,
    on="Player_ID",
    how="left"
)

# Filter meaningful game logs for only players with 20+ games played
min_games = 20
meaningful_game_logs_filtered = meaningful_game_logs[meaningful_game_logs["games_played"] >= min_games].copy()

___
## 2) Import All-Star Baselines

In [16]:
# USG% baseline
AS_USG_baseline = baselines.loc[
    baselines["Type"] == "All-Star", "USG_baseline"
].values[0]

print(f"All-Star USG% baseline: {AS_USG_baseline}%")

All-Star USG% baseline: 29.3%


In [17]:
# PRA baseline
AS_PRA_baseline = baselines.loc[
    baselines["Type"] == "All-Star", "PRA_baseline"
].values[0]

print(f"All-Star PRA baseline: {AS_PRA_baseline} Points + Rebounds + Assists")

All-Star PRA baseline: 37.8 Points + Rebounds + Assists


___
## 3) Regression Analysis
#### a) Build Regression Dataset
We want to see **how PRA changes** in response to a **change in USG%**. In other words, how *elastic* is PRA?

In [19]:
# Only need USG% and PRA columns
reg = meaningful_game_logs_filtered[["USG%", "PRA"]].copy()

# Remove rows where log would break (zeroes or negatives)
reg = reg[
    (reg["USG%"] > 0) & (reg["PRA"] > 0)
].copy()

print(f"Regression dataset: {reg.shape[0]:,} rows | {reg.shape[1]} columns")

Regression dataset: 20,141 rows | 2 columns


In [20]:
# Regression dataset preview
reg.sample(10)

Unnamed: 0,USG%,PRA
2380,16.49,29
11118,16.05,13
14493,18.8,31
5505,27.08,28
1500,19.07,11
19702,19.13,24
1402,30.82,46
13688,22.74,23
14246,31.3,22
376,13.22,21


#### b) Fit the Log-Log Regression Model

In [22]:
# Log-transform both variables (USG% and PRA)
reg["log_USG"] = np.log(reg["USG%"])
reg["log_PRA"] = np.log(reg["PRA"])

In [23]:
# Regression setup
X = sm.add_constant(reg["log_USG"])  # multivariate independent variable -> multiple inputs
y = reg["log_PRA"]                   # univariate dependent variable -> single output

In [24]:
# Fit log-log model
model = sm.OLS(y, X).fit()

alpha = model.params["const"]   # intercept
beta = model.params["log_USG"]  # elasticity (how strongly PRA responds to USG%)

alpha, beta

(0.27541835726159747, 0.8984115299311776)

> Therefore, a **1.000% increase in USG%** is associated with a **0.898% increase in PRA**. It's not quite a linear 1:1 ratio, but it's close.
>
> ##### This is the elasticity.

___
## 4) Calculating PRA Signal

**PRA Signal** is a role-adjusted version of a player's **PRA**.

It rescales a player's raw (or actual) PRA to the **All-Star usage baseline**. This baseline is **29.3%** (from Step 2). Therefore, any game where a player's USG% is less than 29.3%, their raw PRA is **scaled upward** using the elasticity (from Step 3).

> For example, for a game where a player's USG% is 20% and PRA is 16:
>
> > The usage ratio is 29.3% / 20.0% = 1.465.
> >
> > The elasticity is applied: 1.465^(0.898) = 1.93
> >
> > 16 PRA * 1.93 = 30.9 (PRA Signal)
> >
> > *This will be better explained in the README.*

In [28]:
# --- Calculate PRA signal for every game ---
def scale_pra(row, AS_USG, beta):
    usg = row["USG%"]
    pra = row["PRA"]

    if pd.isna(usg) or pd.isna(pra):
        return np.nan

    if usg >= AS_USG:
        return pra
    
    if usg > 0 and pra > 0:
        scale = (AS_USG / usg) ** beta
        return pra * scale
    else:
        return np.nan

meaningful_game_logs_filtered["PRA_signal"] = meaningful_game_logs_filtered.apply(
    lambda r: scale_pra(r, AS_USG_baseline, beta),
    axis=1
)

meaningful_game_logs_filtered["PRA_signal"] = meaningful_game_logs_filtered["PRA_signal"].round(1)

In [29]:
# Calculate output per role (i.e., output per usage)
meaningful_game_logs_filtered["OPR"] = np.where(
    meaningful_game_logs_filtered["USG%"] > 0,
    meaningful_game_logs_filtered["PRA"] / meaningful_game_logs_filtered["USG%"],
    np.nan
)

In [30]:
# Preview results
meaningful_game_logs_filtered[["Player_Name", "GAME_DATE", "USG%", "PRA", "PRA_signal", "OPR"]].sample(10)

Unnamed: 0,Player_Name,GAME_DATE,USG%,PRA,PRA_signal,OPR
13727,Lauri Markkanen,"Jan 27, 2025",22.71,29,36.5,1.28
4745,De'Andre Hunter,"Feb 12, 2025",20.43,23,31.8,1.13
18489,Scotty Pippen Jr.,"Mar 01, 2025",21.19,23,30.8,1.09
1871,Blake Wesley,"Dec 13, 2024",12.09,18,39.9,1.49
10356,Jimmy Butler III,"Mar 04, 2025",21.27,27,36.0,1.27
11326,Jrue Holiday,"Nov 22, 2024",17.17,22,35.6,1.28
1449,Austin Reaves,"Apr 04, 2025",22.96,40,49.8,1.74
6955,Giannis Antetokounmpo,"Oct 27, 2024",27.32,41,43.7,1.5
5984,Dorian Finney-Smith,"Feb 10, 2025",7.98,9,29.0,1.13
13994,Luka Dončić,"Mar 24, 2025",37.7,46,46.0,1.22


___
## 5) Flag Projected All-Star Game Logs

In [32]:
# Flag projected all-star-level games
meaningful_game_logs_filtered["is_AS_level"] = (meaningful_game_logs_filtered["PRA_signal"] >= AS_PRA_baseline).astype(int)

In [33]:
# Preview results
meaningful_game_logs_filtered[["Player_Name", "TEAM_ABBREVIATION", "GAME_DATE", "USG%", "PRA", "PRA_signal", "is_AS_level"]].sample(5)

Unnamed: 0,Player_Name,TEAM_ABBREVIATION,GAME_DATE,USG%,PRA,PRA_signal,is_AS_level
17813,Rudy Gobert,MIN,"Mar 14, 2025",12.36,25,54.3,1
14199,Malik Beasley,DET,"Nov 27, 2024",12.41,15,32.4,0
6827,Gary Trent Jr.,MIL,"Mar 28, 2025",15.63,15,26.4,0
3220,Chet Holmgren,OKC,"Feb 23, 2025",19.08,24,35.3,0
9747,Jay Huff,MEM,"Apr 13, 2025",22.2,30,38.5,1


> Any game where the player's `PRA_signal` is greater than or equal to **37.8 (the all-star threshold)** is flagged as an **all-star level** game.

___
## 6) Player-Level Season Metrics

In [36]:
# Roll up projected metrics to player-level
player_season_projections = (
    meaningful_game_logs_filtered.groupby("Player_ID").agg(
        # Game-level aggregations
        games_played=("GAME_ID", "count"),
        AS_level_games=("is_AS_level", "sum"),
        avg_pra_signal=("PRA_signal", "mean"),
        avg_opr=("OPR", "mean"),
        # Season-level context
        season_USG=("season_USG%", "first"),
        season_PRA=("season_PRA", "first"),
        age=("Age", "first"),
        team=("Team", "first"),
        pos=("Pos", "first"),
        season_PTS=("season_PTS", "first"),
        season_REB=("season_REB", "first"),
        season_AST=("season_AST", "first")
    ).reset_index()
)

In [37]:
# Find percentage of games that each player reached the all-star threshold (i.e., the all-star output rate)
player_season_projections["AS_output_rate"] = player_season_projections["AS_level_games"] / player_season_projections["games_played"]

In [38]:
# Merge Player_Name column into this new DataFrame
player_season_projections = player_season_projections.merge(
    meaningful_game_logs_filtered[["Player_ID", "Player_Name"]].drop_duplicates(),
    on="Player_ID",
    how="left"
)

In [39]:
# Reorder columns so that `Player_Name` column is first
cols = ["Player_Name"] + [c for c in player_season_projections.columns if c != "Player_Name"]
player_season_projections = player_season_projections[cols]

In [40]:
# Usage % (season-level) stats
meaningful_game_logs_filtered["season_USG%"].describe(percentiles=[0.10, 0.20, 0.25, 0.75, 0.80, 0.90])

count   20196.00
mean       19.47
std         5.55
min         7.60
10%        13.00
20%        14.70
25%        15.40
50%        18.50
75%        23.30
80%        24.10
90%        27.70
max        35.90
Name: season_USG%, dtype: float64

In [41]:
# Create player-level usage cohorts
def categorize_usage(usg):
    if usg <= 15.0:
        return "Low Usage"
    elif usg < 23.0:
        return "Medium Usage"
    else:
        return "High Usage"

player_season_projections["USG_cohort"] = player_season_projections["season_USG"].apply(categorize_usage)

In [42]:
# Calculate output consistency for each player
player_season_projections["OC"] = np.sqrt(
    player_season_projections["AS_output_rate"] * player_season_projections["avg_opr"]
)

> A player's **output consistency** (the primary metric) is simply a mathematical combination of their **all-star output rate** and **output per role** (secondary metrics).

In [44]:
# Preview results
player_season_projections.sort_values("OC", ascending=False).head(10)

Unnamed: 0,Player_Name,Player_ID,games_played,AS_level_games,avg_pra_signal,avg_opr,season_USG,season_PRA,age,team,pos,season_PTS,season_REB,season_AST,AS_output_rate,USG_cohort,OC
59,Nikola Jokić,203999,70,67,57.47,1.84,29.5,52.5,29,DEN,C,29.6,12.7,10.2,0.96,High Usage,1.33
80,Domantas Sabonis,1627734,69,64,53.74,1.9,21.6,39.0,28,SAC,C,19.1,13.9,6.0,0.93,Medium Usage,1.33
38,Rudy Gobert,203497,72,61,53.34,2.0,13.0,24.7,32,MIN,C,12.0,10.9,1.8,0.85,Low Usage,1.3
116,Josh Hart,1628404,77,64,54.44,2.0,15.3,29.1,29,NYK,SG,13.6,9.6,5.9,0.83,Medium Usage,1.29
288,Walker Kessler,1631117,58,45,51.48,1.92,13.7,25.0,23,UTA,C,11.1,12.2,1.7,0.78,Low Usage,1.22
41,Giannis Antetokounmpo,203507,67,63,49.49,1.41,35.2,48.8,30,MIL,PF,30.4,11.9,6.5,0.94,High Usage,1.15
95,Ivica Zubac,1627826,80,61,46.99,1.68,19.5,32.1,27,LAC,C,16.8,12.6,2.7,0.76,Medium Usage,1.13
239,Jalen Johnson,1630552,35,28,45.07,1.58,22.5,33.9,23,ATL,SF,18.9,10.0,5.0,0.8,Medium Usage,1.12
279,Jalen Duren,1631105,75,54,44.5,1.62,16.4,24.8,21,DET,C,11.8,10.3,2.7,0.72,Medium Usage,1.08
65,Karl-Anthony Towns,1626157,72,55,44.98,1.49,27.4,40.3,29,NYK,C,24.4,12.8,3.1,0.76,High Usage,1.07


In [45]:
# --- Build value streaks for every player ---
def streak_lengths(flags):
    streaks = []
    current = 0

    for f in flags:
        if f == 1:
            current += 1
        else:
            if current > 0:
                streaks.append(current)
                current = 0

    # end of season
    if current > 0:
        streaks.append(current)

    return streaks

#### b) Compute streak stats

In [47]:
# --- Compute streak stats for every player ---
streak_rows = []

for pid, group in meaningful_game_logs_filtered.groupby("Player_ID"):

    group_sorted = group.sort_values("GAME_DATE")
    flags = group_sorted["is_AS_level"].tolist()
    streaks = streak_lengths(flags)

    streak_rows.append({
        "Player_ID": pid,
        "games_played": len(group_sorted),
        "projected_value_streak_count": len(streaks),
        "projected_max_value_streak": max(streaks) if streaks else 0,
        "projected_avg_value_streak": np.mean(streaks) if streaks else 0
    })

projected_streaks = pd.DataFrame(streak_rows)

In [48]:
# Merge Player_Name column into this new DataFrame
projected_streaks = projected_streaks.merge(
    meaningful_game_logs_filtered[["Player_ID", "Player_Name"]].drop_duplicates(),
    on="Player_ID",
    how="left"
)

In [49]:
# Reorder columns so that `Player_Name` column is first
cols = ["Player_Name"] + [c for c in projected_streaks.columns if c != "Player_Name"]
projected_streaks = projected_streaks[cols]

In [50]:
# Merge `USG_cohort` into DataFrame
projected_streaks = projected_streaks.merge(
    player_season_projections[["Player_ID", "USG_cohort"]],
    on="Player_ID",
    how="left"
)

In [51]:
# Preview results
projected_streaks.sort_values("projected_avg_value_streak", ascending=False).head(10)

Unnamed: 0,Player_Name,Player_ID,games_played,projected_value_streak_count,projected_max_value_streak,projected_avg_value_streak,USG_cohort
59,Nikola Jokić,203999,70,4,30,16.75,High Usage
41,Giannis Antetokounmpo,203507,67,4,27,15.75,High Usage
80,Domantas Sabonis,1627734,69,5,20,12.8,Medium Usage
152,Luka Dončić,1629029,50,6,26,6.67,High Usage
38,Rudy Gobert,203497,72,10,16,6.1,Low Usage
239,Jalen Johnson,1630552,35,5,12,5.6,Medium Usage
288,Walker Kessler,1631117,58,9,17,5.0,Low Usage
116,Josh Hart,1628404,77,14,15,4.57,Medium Usage
132,Shai Gilgeous-Alexander,1628983,76,14,26,4.43,High Usage
95,Ivica Zubac,1627826,80,14,12,4.36,Medium Usage


In [52]:
# Cohort-level streak summary
cohort_streaks = (
    projected_streaks.groupby("USG_cohort")["projected_avg_value_streak"].mean().reset_index().rename(columns={"projected_avg_value_streak": "avg_streak_length"})
)

In [53]:
# View results
cohort_streaks.sort_values("avg_streak_length", ascending=False)

Unnamed: 0,USG_cohort,avg_streak_length
0,High Usage,2.17
1,Low Usage,1.5
2,Medium Usage,1.5


___
## Save

In [55]:
# Save to CSV
meaningful_game_logs_filtered.to_csv("NBA_2024_25_game_logs_final.csv", index=False)
player_season_projections.to_csv("player_season_projections.csv", index=False)