In [17]:
import pandas as pd

inj = pd.read_csv("../data_raw/injuries_2022.csv")
inj.shape

(5682, 16)

In [18]:
inj["concussion_week"] = (
    inj["report_primary_injury"].str.contains("concussion", case=False, na=False)
    | inj["report_secondary_injury"].str.contains("concussion", case=False, na=False)
).astype(int)

inj["concussion_week"].value_counts()

concussion_week
0    5511
1     171
Name: count, dtype: int64

In [19]:
# sort so "previous week" is well-defined
inj = inj.sort_values(["gsis_id", "season", "week"])

# compute onset: concussion this week AND not last week
inj["concussion_onset_week"] = (
    (inj["concussion_week"] == 1)
    & (inj.groupby("gsis_id")["concussion_week"].shift(1).fillna(0) == 0)
).astype(int)

inj["concussion_onset_week"].value_counts()

concussion_onset_week
0    5560
1     122
Name: count, dtype: int64

In [20]:
import pandas as pd

pg = pd.read_parquet("../data_processed/player_game_2022.parquet")

pg.shape, pg.columns

((26381, 15),
 Index(['game_id', 'player_id', 'snaps', 'st_snaps', 'season', 'week',
        'full_name', 'team', 'position', 'status', 'run_snaps',
        'goal_to_go_snaps', 'short_yardage_snaps', 'red_zone_snaps',
        'penalty_contact_snaps'],
       dtype='object'))

In [21]:
# take only onset weeks
onsets = inj.loc[inj["concussion_onset_week"] == 1, 
                 ["gsis_id", "season", "week"]].copy()

# map onset to prior game week
onsets["event_week"] = onsets["week"] - 1

# rename for merge compatibility
onsets = onsets.rename(columns={"gsis_id": "player_id"})

# merge onto player-game table
pg = pg.merge(
    onsets[["player_id", "season", "event_week"]],
    left_on=["player_id", "season", "week"],
    right_on=["player_id", "season", "event_week"],
    how="left",
    indicator=True
)

# define game-level concussion event
pg["concussion_event"] = (pg["_merge"] == "both").astype(int)

# clean up merge artifacts
pg = pg.drop(columns=["event_week", "_merge"])

pg["concussion_event"].value_counts()

concussion_event
0    26381
Name: count, dtype: int64

In [None]:
# Keep only columns needed for mapping, and de-duplicate
pg_key = pg[["player_id", "season", "week", "team", "position", "full_name"]].drop_duplicates()
inj_key = inj[["gsis_id", "season", "week", "team", "position", "full_name"]].drop_duplicates()

# Inner join to create mapping rows where we can match identities
map_df = pg_key.merge(
    inj_key,
    on=["season", "week", "team", "position", "full_name"],
    how="inner"
)

# How many unique player_id values got a gsis_id??
map_df["player_id"].nunique(), map_df["gsis_id"].nunique(), map_df.shape

(573, 573, (1587, 7))

In [None]:
# collapse to a stable 1:1 mapping (player_id -> gsis_id)
player_map = (
    map_df[["player_id", "gsis_id"]]
    .drop_duplicates()
)

# ensure it's truly 1:1
assert player_map["player_id"].is_unique
assert player_map["gsis_id"].is_unique

# attach gsis_id onto the player-game table
pg2 = pg.merge(player_map, on="player_id", how="left")

# build onset events keyed by gsis_id, shifted back one week
onsets = inj.loc[inj["concussion_onset_week"] == 1, ["gsis_id", "season", "week"]].copy()
onsets["event_week"] = onsets["week"] - 1

# merge to mark events
pg2 = pg2.merge(
    onsets[["gsis_id", "season", "event_week"]],
    left_on=["gsis_id", "season", "week"],
    right_on=["gsis_id", "season", "event_week"],
    how="left",
    indicator=True
)

pg2["concussion_event"] = (pg2["_merge"] == "both").astype(int)

# clean
pg2 = pg2.drop(columns=["event_week", "_merge"])

pg2["concussion_event"].value_counts()

concussion_event
0    26332
1       49
Name: count, dtype: int64

In [24]:
# overall class balance (player-game level)
pg2["concussion_event"].mean(), pg2["concussion_event"].sum()

(np.float64(0.001857397369318828), np.int64(49))

In [25]:
# per-position incidence (min 200 player-games to avoid tiny denominators)
pos_rates = (
    pg2.groupby("position")["concussion_event"]
       .agg(games="count", concussions="sum", rate="mean")
       .reset_index()
       .query("games >= 200")
       .sort_values("rate", ascending=False)
)

pos_rates.head(15)

Unnamed: 0,position,games,concussions,rate
7,QB,685,8,0.011679
10,WR,2874,19,0.006611
9,TE,1761,8,0.004543
3,LB,3974,12,0.00302
8,RB,2044,2,0.000978
0,DB,5147,0,0.0
1,DL,3919,0,0.0
2,K,558,0,0.0
6,P,580,0,0.0
4,LS,568,0,0.0


In [27]:
# Baseline A: position-only probability
pos_rate_map = (
    pg2.groupby("position")["concussion_event"]
       .mean()
       .to_dict()
)

pg2["baseline_pos_prob"] = pg2["position"].map(pos_rate_map)

pg2[["position", "baseline_pos_prob"]].head()

Unnamed: 0,position,baseline_pos_prob
0,QB,0.011679
1,DL,0.0
2,LB,0.00302
3,LB,0.00302
4,OL,0.0


In [28]:
threshold = pg2["concussion_event"].mean()

pg2["baseline_pos_pred"] = (pg2["baseline_pos_prob"] >= threshold).astype(int)

confusion = pd.crosstab(
    pg2["concussion_event"],
    pg2["baseline_pos_pred"],
    rownames=["Actual"],
    colnames=["Predicted"]
)

confusion

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,17085,9247
1,2,47


In [30]:
# Baseline B: snap-only model
# probability proportional to snaps (normalized)

pg2["baseline_snap_prob"] = pg2["snaps"] / pg2["snaps"].max()

pg2[["snaps", "baseline_snap_prob"]].head()

Unnamed: 0,snaps,baseline_snap_prob
0,84,0.756757
1,49,0.441441
2,54,0.486486
3,32,0.288288
4,60,0.540541


In [31]:
threshold = pg2["concussion_event"].mean()

pg2["baseline_snap_pred"] = (pg2["baseline_snap_prob"] >= threshold).astype(int)

confusion_snap = pd.crosstab(
    pg2["concussion_event"],
    pg2["baseline_snap_pred"],
    rownames=["Actual"],
    colnames=["Predicted"]
)

confusion_snap

Predicted,1
Actual,Unnamed: 1_level_1
0,26332
1,49
