---
draft: true 
date: 2025-01-08
description: "Calculating adjusted xG coefficients"
slug: adjusted-xg
---

# **Calculating score- and venue-adjusted xG coefficients**

---

## **Intro**

Intro language here

---

## **Housekeeping**

### Import dependencies

Import the dependencies we'll need to calculate the adjusted xG coefficients

In [None]:
import datetime as dt
import math
import pickle
from pathlib import Path
from typing import Optional

import matplotlib as mpl
import matplotlib.patches as patches
import matplotlib.patheffects as mpe
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from hockey_rink import NHLRink
from matplotlib.lines import Line2D
from rich.progress import track
from scipy.stats import poisson

import chickenstats
import chickenstats.utilities
from chickenstats.api import ChickenStats
from chickenstats.chicken_nhl import Scraper, Season
from chickenstats.chicken_nhl.scrape import Game
from chickenstats.utilities import ChickenProgress

### Pandas options

Set pandas display options so that every column and up to 100 rows are shown

In [None]:
# Show every column of wide frames, and up to 100 rows before truncating
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

### Load environment variables

Load environment variables to access the chickenstats API

In [None]:
# The .env file sits two directories above the working directory
env_file = Path("..", "..", ".env")
load_dotenv(dotenv_path=env_file)

### Chickenstats matplotlib style

chickenstats.utilities includes a custom matplotlib style - this activates it

In [None]:
# Activate the custom "chickenstats" matplotlib style for every plot that follows
# NOTE(review): presumably the style is registered when chickenstats.utilities is imported - confirm
plt.style.use("chickenstats")

---

## **Load data**

In [None]:
# Only shot attempts (goals, saved shots, misses, and blocks) are kept from each season
events = ["GOAL", "SHOT", "MISS", "BLOCK"]

# One raw play-by-play CSV per season lives in this directory
pbp_directory = Path("../../../../chickenstats-api/data/chickenstats/raw/pbp")

# Seasons 2023 back through 2010, newest first
years = list(range(2023, 2009, -1))

concat_list = []

for year in track(years):
    file_path = pbp_directory / f"pbp_{year}.csv"
    raw_df = pd.read_csv(file_path, low_memory=False)

    # Filter before storing so that only the shot events are held in the list
    raw_df = raw_df.loc[raw_df.event.isin(events)]
    concat_list += [raw_df]

In [None]:
# Stack the per-season shot events into a single frame with a fresh 0..n-1 index
shot_events_df = pd.concat(concat_list, ignore_index=True)

In [None]:
def prepare_shots_data(play_by_play: pd.DataFrame, strength_state: str, max_score_diff: int = 3) -> pd.DataFrame:
    """Aggregate shot events into home and away totals per score state for one strength state.

    The events are filtered to ``strength_state``, the home score differential is
    capped at +/- ``max_score_diff``, and each event is assigned to the home or the
    away side before the counts are summed.

    Args:
        play_by_play: Shot events with at least the columns ``strength_state``,
            ``event_team``, ``opp_team``, ``home_team``, ``home_score_diff``,
            ``goal``, ``pred_goal``, ``shot``, ``miss``, ``block``, and
            ``teammate_block``. The frame is not modified.
        strength_state: Strength state to keep, e.g. ``"5v5"``.
        max_score_diff: Score differentials beyond +/- this value are pooled into
            the boundary bucket. Defaults to 3.

    Returns:
        One row per (capped) ``home_score_diff`` with ``home_<stat>`` and
        ``away_<stat>`` totals plus ``home_<stat>_percent`` (home share of the
        combined total) for goal, pred_goal, shot, miss, block, fenwick, and corsi.
        A share is NaN when neither side recorded the stat.
    """
    base_stats = ["goal", "pred_goal", "shot", "miss", "block"]
    all_stats = [*base_stats, "fenwick", "corsi"]

    shots_data = play_by_play.loc[play_by_play["strength_state"] == strength_state].reset_index(drop=True)

    # Bracket assignment is used throughout: attribute assignment (``df.col = ...``)
    # cannot create a column, so a missing ``is_home`` column would silently be
    # stored as an instance attribute and the groupby below would fail
    shots_data["strength_state"] = strength_state

    # Blocks are credited to ``opp_team`` instead of ``event_team``
    # NOTE(review): presumably because block events are recorded for the blocking team - confirm
    was_blocked = shots_data["block"] == 1
    shots_data["event_team"] = np.where(was_blocked, shots_data["opp_team"], shots_data["event_team"])

    # Blocks by a teammate count towards the block total as well
    shots_data["block"] = shots_data["block"] + shots_data["teammate_block"]

    shots_data["home_score_diff"] = shots_data["home_score_diff"].clip(lower=-max_score_diff, upper=max_score_diff)

    # Venue is recomputed after the block re-attribution above
    shots_data["is_home"] = np.where(shots_data["event_team"] == shots_data["home_team"], 1, 0)

    per_venue = shots_data.groupby(["strength_state", "home_score_diff", "is_home"], as_index=False).agg(
        {stat: "sum" for stat in base_stats}
    )

    # Fenwick is the unblocked shot attempts; corsi adds the blocked ones
    per_venue["fenwick"] = per_venue[["goal", "shot", "miss"]].sum(axis=1)
    per_venue["corsi"] = per_venue[["goal", "shot", "miss", "block"]].sum(axis=1)

    # Spread each stat into a home and an away column so that a second groupby
    # collapses the two venue rows of a score state into a single row
    home_rows = per_venue["is_home"] == 1
    split_columns = {}

    for stat in all_stats:
        split_columns[f"home_{stat}"] = np.where(home_rows, per_venue[stat], 0)
        split_columns[f"away_{stat}"] = np.where(~home_rows, per_venue[stat], 0)

    totals = (
        per_venue.assign(**split_columns)
        .groupby(["strength_state", "home_score_diff"], as_index=False)
        .agg({column: "sum" for column in split_columns})
    )

    percents = {
        f"home_{stat}_percent": totals[f"home_{stat}"] / (totals[f"home_{stat}"] + totals[f"away_{stat}"])
        for stat in all_stats
    }

    return totals.assign(**percents)

In [None]:
def _group_weights(shots_data: pd.DataFrame, stat: str, group_column: str) -> tuple:
    """Return (home, away) weight series for ``stat`` pooled within ``group_column``."""
    grouped = shots_data.groupby(group_column)

    home_total = grouped[f"home_{stat}"].transform("sum")
    away_total = grouped[f"away_{stat}"].transform("sum")

    # The weight scales each side's pooled total to the average of the two sides
    mean_total = (home_total + away_total) / 2

    return mean_total / home_total, mean_total / away_total


def calculate_weights(shots_data: pd.DataFrame) -> pd.DataFrame:
    """Add home and away adjustment weights to the aggregated shots data.

    A weight is the mean of the home and away totals divided by that side's own
    total. Depending on the strength state and the stat, the totals are taken
    per row (each score state on its own), pooled within trailing / tied /
    leading score groups, or pooled across the whole strength state.

    Args:
        shots_data: Frame with ``strength_state``, ``home_score_diff``, and
            ``home_<stat>`` / ``away_<stat>`` columns. Only the first
            ``strength_state`` value is used to pick the weighting scheme.
            The frame is not modified.

    Returns:
        A copy of ``shots_data`` with ``home_<stat>_weight`` and
        ``away_<stat>_weight`` columns added, plus ``score_diff_group`` when any
        stat is pooled by score group. An empty input, or a strength state
        without a weighting scheme, is returned as an unchanged copy.
    """
    all_stats = ["goal", "pred_goal", "shot", "fenwick", "corsi"]

    # Stats weighted separately for every score state
    row_level = {"5v5": ["shot", "fenwick", "corsi"], "4v4": ["fenwick", "corsi"]}

    # Stats pooled within the trailing / tied / leading groups
    by_score_group = {
        "5v5": ["goal", "pred_goal"],
        "4v4": ["shot"],
        "5v4": all_stats,
        "5v3": all_stats,
        "4v3": all_stats,
        "1v0": ["goal"],
    }

    # Stats pooled across every score state of the strength state
    by_strength_state = {"4v4": ["goal", "pred_goal"], "3v3": all_stats}

    shots_data = shots_data.copy()

    # Nothing to weight, e.g. when a strength state has no events at all
    if shots_data.empty:
        return shots_data

    strength_state = shots_data["strength_state"].iloc[0]

    for stat in row_level.get(strength_state, []):
        mean_values = shots_data[[f"home_{stat}", f"away_{stat}"]].mean(axis=1)

        shots_data[f"home_{stat}_weight"] = mean_values / shots_data[f"home_{stat}"]
        shots_data[f"away_{stat}_weight"] = mean_values / shots_data[f"away_{stat}"]

    if strength_state in by_score_group:
        group_conditions = [
            shots_data["home_score_diff"] < 0,
            shots_data["home_score_diff"] == 0,
            shots_data["home_score_diff"] > 0,
        ]
        group_names = ["trailing", "tied", "leading"]

        shots_data["score_diff_group"] = np.select(group_conditions, group_names, default="")

        for stat in by_score_group[strength_state]:
            home_weight, away_weight = _group_weights(shots_data, stat, "score_diff_group")

            shots_data[f"home_{stat}_weight"] = home_weight
            shots_data[f"away_{stat}_weight"] = away_weight

    for stat in by_strength_state.get(strength_state, []):
        home_weight, away_weight = _group_weights(shots_data, stat, "strength_state")

        shots_data[f"home_{stat}_weight"] = home_weight
        shots_data[f"away_{stat}_weight"] = away_weight

    return shots_data

In [None]:
# Build one weights table per strength state; the tables are stacked afterwards
strength_states = ["5v5", "4v4", "3v3", "5v4", "5v3", "4v3", "1v0"]

concat_list = []

for strength_state in strength_states:
    shots_data = prepare_shots_data(strength_state=strength_state, play_by_play=shot_events_df)
    weights = calculate_weights(shots_data=shots_data)
    concat_list += [weights]

In [None]:
# Identifier columns come first, followed by one group of columns per stat:
# weights (where they are calculated), then the home / away totals and home share
columns = ["strength_state", "score_diff_group", "home_score_diff"]

for stat in ["goal", "pred_goal", "shot", "miss", "block", "fenwick", "corsi"]:
    # No weights are calculated for misses or blocks
    if stat not in ("miss", "block"):
        columns += [f"home_{stat}_weight", f"away_{stat}_weight"]

    columns += [f"home_{stat}", f"away_{stat}", f"home_{stat}_percent"]

combined = pd.concat(concat_list, ignore_index=True)
test = combined[columns]

In [None]:
# Display the combined weights table for every strength state
test

In [None]:
# Nested lookup: strength state -> home score differential -> {column: value}
weights_dict = {}

# Every column except the strength state itself is stored for each score state.
# The list is derived once from the table, so it cannot drift from the table's
# columns and is not rebuilt on every loop iteration
value_columns = [column for column in test.columns if column != "strength_state"]

strength_states = test.strength_state.unique().tolist()

for strength_state in strength_states:
    # Filter by strength state once instead of once per score state
    state_rows = test.loc[test.strength_state == strength_state]

    score_state_dict = {}

    for score_state in state_rows.home_score_diff.unique().tolist():
        # If a score state appears more than once, the first row is used
        data = state_rows.loc[state_rows.home_score_diff == score_state].iloc[0]

        score_state_dict[score_state] = {column: data[column] for column in value_columns}

    weights_dict[strength_state] = score_state_dict

In [None]:
# Display the nested dictionary (strength state -> score state -> values)
weights_dict

In [None]:
# Distinct strength states in the combined weights table, as a plain list
strength_states = test.strength_state.unique().tolist()

In [None]:
# Inspect the rows for the first strength state in the list
strength_state = strength_states[0]

test.loc[test.strength_state == strength_state]

In [None]:
def process_plot_data(data: pd.DataFrame, strength_state: str) -> pd.DataFrame:
    """Reshape the weights table into one row per score state and venue.

    The rows for ``strength_state`` are split into a home and an away frame that
    keep only the raw totals (no ``percent`` or ``weight`` columns). The
    ``home_`` / ``away_`` prefixes are stripped so both frames share column
    names, an ``is_home`` flag is added, and the two frames are stacked.

    Args:
        data: Combined weights table with ``strength_state``,
            ``score_diff_group``, ``home_score_diff``, and ``home_<stat>`` /
            ``away_<stat>`` columns.
        strength_state: Strength state to keep, e.g. ``"5v5"``.

    Returns:
        The stacked frame, sorted by ``home_score_diff`` ascending with the home
        row ahead of the away row within each score state.
    """
    shared_columns = ["strength_state", "score_diff_group", "home_score_diff"]

    subset = data.loc[data.strength_state == strength_state].reset_index(drop=True)

    def is_raw_total(column: str, venue: str) -> bool:
        # Raw totals carry the venue name but are neither shares nor weights
        return venue in column and "percent" not in column and "weight" not in column

    venue_frames = []

    for venue, home_flag in (("home", 1), ("away", 0)):
        kept = [column for column in subset.columns if column in shared_columns or is_raw_total(column, venue)]

        venue_frame = subset[kept].copy()
        venue_frame["is_home"] = home_flag

        # ``home_score_diff`` is an identifier, so it keeps its prefix
        new_names = {column: column.replace(f"{venue}_", "") for column in kept if column != "home_score_diff"}
        venue_frames.append(venue_frame.rename(columns=new_names))

    stacked = pd.concat(venue_frames, ignore_index=True)

    return stacked.sort_values(by=["home_score_diff", "is_home"], ascending=[True, False])

In [None]:
# Reshape the 5v5 rows into one row per score state and venue
plot_data = process_plot_data(data=test, strength_state="5v5")

In [None]:
# Display the reshaped 5v5 data
plot_data

In [None]:
# Display the combined weights table again for comparison with the reshaped data
test