# Creating and Inserting Corsi Statistics for Hockey Analytics

This notebook demonstrates how to load hockey game data, calculate Corsi statistics, and prepare the data for further analysis or insertion into a database. We will use the following scripts:
- `load_data.py` to load the necessary data from the database.
- `corsi_make_stats.py` to calculate the Corsi statistics and save them for future use.

### Author:
        Eric Winiecke

### Date:
        August 11, 2024


In [2]:
# Import necessary libraries
import os
from time import perf_counter

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import (
    BigInteger,
    Column,
    Float,
    Integer,
    MetaData,
    Table,
    create_engine,
)
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import sessionmaker

# Load environment variables
load_dotenv()


# Define functions from load_data.py for loading data
def get_env_vars():
    """Collect the database connection settings from the environment.

    Returns:
        dict: One entry per setting, in the order DATABASE_TYPE, DBAPI,
        ENDPOINT, USER, PASSWORD, PORT, DATABASE. ``PORT`` is converted to
        ``int`` and falls back to 5432 when the variable is unset; every
        other value is the raw environment string, or ``None`` when the
        variable is missing.
    """
    plain_keys = ("DATABASE_TYPE", "DBAPI", "ENDPOINT", "USER", "PASSWORD")
    settings = {key: os.getenv(key) for key in plain_keys}
    # PORT is the only setting that is converted; it sits between PASSWORD
    # and DATABASE so the resulting key order is unchanged.
    settings["PORT"] = int(os.getenv("PORT", 5432))
    settings["DATABASE"] = os.getenv("DATABASE")
    return settings


def get_db_engine(env_vars):
    """Create a SQLAlchemy engine from the connection settings.

    The URL is assembled with ``URL.create`` instead of string formatting so
    that a user name or password containing characters such as ``@``, ``/``
    or ``:`` is passed through verbatim rather than corrupting the URL.

    NOTE(review): credentials must be stored in the environment as plain
    text. A value that was percent-encoded by hand to work around the old
    string formatting would now be used literally.

    Args:
        env_vars (dict): Settings as returned by ``get_env_vars``; reads the
            keys DATABASE_TYPE, DBAPI, USER, PASSWORD, ENDPOINT, PORT and
            DATABASE.

    Returns:
        The engine returned by ``sqlalchemy.create_engine``.
    """
    # Function-scope import so the block does not rely on a change to the
    # file-level import list.
    from sqlalchemy.engine import URL

    url = URL.create(
        drivername=f"{env_vars['DATABASE_TYPE']}+{env_vars['DBAPI']}",
        username=env_vars["USER"],
        password=env_vars["PASSWORD"],
        host=env_vars["ENDPOINT"],
        port=env_vars["PORT"],
        database=env_vars["DATABASE"],
    )
    return create_engine(url)


def load_data(env_vars):
    """Connect to the database and load the four source tables.

    Args:
        env_vars (dict): Connection settings as returned by ``get_env_vars``.

    Returns:
        dict: DataFrames keyed by table name: ``game_skater_stats``,
        ``game_plays``, ``game_shifts`` and ``game``.

    Side effects:
        Prints each table name and shows the first rows through ``display``,
        which this function expects the notebook environment to provide.
    """
    engine = get_db_engine(env_vars)

    queries = {
        "game_skater_stats": "SELECT * FROM game_skater_stats",
        "game_plays": "SELECT * FROM game_plays",
        "game_shifts": "SELECT * FROM game_shifts",
        "game": "SELECT * FROM game",
    }

    df = {}
    try:
        for name, query in queries.items():
            df[name] = pd.read_sql(query, engine)
            print(f"{name}:")
            display(df[name].head())  # Display first few rows for each DataFrame
    finally:
        # The engine is local to this function; release its pooled
        # connections whether or not every query succeeded.
        engine.dispose()

    return df


# Load environment variables and data.
# These statements run when the cell/module executes: the connection settings
# are read from the environment and every source table is pulled into memory.
env_vars = get_env_vars()
df_master = load_data(env_vars)  # dict of DataFrames keyed by table name
print("Data loaded successfully.")


game_skater_stats:


Unnamed: 0,game_id,player_id,team_id,timeOnIce,assists,goals,shots,hits,powerPlayGoals,powerPlayAssists,...,faceoffTaken,takeaways,giveaways,shortHandedGoals,shortHandedAssists,blocked,plusMinus,evenTimeOnIce,shortHandedTimeOnIce,powerPlayTimeOnIce
0,2016021052,8476483,24,1332,0,1,4,1.0,0,0,...,0,0.0,2.0,0,0,1.0,1,1332,0,0
1,2016021052,8475780,24,559,0,0,3,3.0,0,0,...,3,0.0,1.0,0,0,0.0,0,543,16,0
2,2016021052,8475164,24,1120,0,0,5,0.0,0,0,...,0,1.0,0.0,0,0,1.0,-1,1048,72,0
3,2016021052,8469598,24,1095,0,0,1,1.0,0,0,...,0,0.0,0.0,0,0,2.0,0,1095,0,0
4,2015020716,8476906,4,1286,0,1,4,0.0,1,0,...,0,0.0,0.0,0,0,1.0,1,1123,0,163


game_plays:


Unnamed: 0,play_id,game_id,team_id_for,team_id_against,event,secondaryType,x,y,period,periodType,periodTime,periodTimeRemaining,dateTime,goals_away,goals_home,description,st_x,st_y
0,2015030175_246,2015030175,18.0,24.0,Takeaway,,-42.0,-33.0,2,REGULAR,1123,77.0,2016-04-24 00:48:36,1,2,Takeaway by Paul Gaustad,-42.0,-33.0
1,2015030175_247,2015030175,24.0,18.0,Giveaway,,97.0,10.0,2,REGULAR,1128,72.0,2016-04-24 00:48:42,1,2,Giveaway by Frederik Andersen,-97.0,-10.0
2,2015030175_248,2015030175,18.0,24.0,Shot,Slap Shot,83.0,29.0,2,REGULAR,1132,68.0,2016-04-24 00:48:46,1,2,Paul Gaustad Slap Shot saved by Frederik Andersen,83.0,29.0
3,2015030175_249,2015030175,,,Stoppage,,0.0,0.0,2,REGULAR,1188,12.0,2016-04-24 00:49:44,1,2,Offside,,
4,2015030175_250,2015030175,18.0,24.0,Faceoff,,20.0,22.0,2,REGULAR,1188,12.0,2016-04-24 00:50:11,1,2,Ryan Johansen faceoff won against Nate Thompson,20.0,22.0


game_shifts:


Unnamed: 0,game_id,player_id,period,shift_start,shift_end
0,2018020157,8476994,3,3170,3222.0
1,2018020157,8476994,3,3336,3363.0
2,2018020157,8476994,3,3528,3592.0
3,2018020157,8477293,1,0,1200.0
4,2018020157,8477293,2,1200,2400.0


game:


Unnamed: 0,game_id,season,type,date_time_GMT,away_team_id,home_team_id,away_goals,home_goals,outcome,home_rink_side_start,venue,venue_link,venue_time_zone_id,venue_time_zone_offset,venue_time_zone_tz
0,2016020045,20162017,R,2016-10-18 19:30:00-05:00,4,16,4,7,home win REG,right,United Center,/api/v1/venues/null,America/Chicago,-5,CDT
1,2017020812,20172018,R,2018-02-06 18:00:00-06:00,24,7,4,3,away win OT,left,KeyBank Center,/api/v1/venues/null,America/New_York,-4,EDT
2,2015020314,20152016,R,2015-11-23 19:00:00-06:00,21,52,4,1,away win REG,right,MTS Centre,/api/v1/venues/null,America/Winnipeg,-5,CDT
3,2015020849,20152016,R,2016-02-16 18:00:00-06:00,52,12,1,2,home win REG,right,PNC Arena,/api/v1/venues/null,America/New_York,-4,EDT
4,2017020586,20172018,R,2017-12-29 21:00:00-06:00,20,24,1,2,home win REG,left,Honda Center,/api/v1/venues/null,America/Los_Angeles,-7,PDT


Data loaded successfully.


## 2. Calculating Corsi Statistics

After loading the data, we process it to calculate Corsi statistics: advanced hockey metrics that use shot attempts as a proxy for puck possession. For each player, only shot attempts taken while that player is on the ice are counted. The formulae are:

    Corsi For (CF) = shot attempts by the player's team at even strength: Shots + Goals + Blocked Shots + Missed Shots
    Corsi Against (CA) = shot attempts by the opposing team at even strength: Shots + Goals + Blocked Shots + Missed Shots
    Corsi = CF - CA
    Corsi For % = CF / (CF + CA)


### `get_num_players`
This function tracks the number of players on the ice over time, which is needed to detect when a team is short-handed. It turns the shift data into a running count of players on the ice and returns one row for each time at which that count changes.


In [4]:
def get_num_players(shift_df):
    """Return the number of players on the ice at each time the count changes.

    Args:
        shift_df (pandas.DataFrame): Shifts with the columns ``game_id``,
            ``player_id``, ``shift_start`` and ``shift_end``.

    Returns:
        pandas.DataFrame: Columns ``value`` (a shift start or end time) and
        ``num_players`` (players on the ice once every start and end at that
        time has been applied), sorted by ``value``. Consecutive times with
        the same count are collapsed, so each row marks a change.
    """
    # One row per shift boundary, ordered by time.
    shifts_melted = pd.melt(
        shift_df,
        id_vars=["game_id", "player_id"],
        value_vars=["shift_start", "shift_end"],
    ).sort_values("value", ignore_index=True)
    # +1 when a shift starts, -1 when it ends.
    shifts_melted["change"] = (
        2 * (shifts_melted["variable"] == "shift_start").astype(int) - 1
    )
    shifts_melted["num_players"] = shifts_melted["change"].cumsum()
    # Several boundaries can share a time; the last running total in each
    # group is the count after all of them, whatever their relative order.
    df_num_players = shifts_melted.groupby("value")["num_players"].last().reset_index()
    changed = df_num_players["num_players"].shift() != df_num_players["num_players"]
    return df_num_players[changed].reset_index(drop=True)


### `get_penalty_exclude_times`
This function identifies periods during a game when there is an imbalance in the number of players on the ice, usually due to penalties. It processes the shifts and skater stats data to find these periods.


In [13]:
# def get_penalty_exclude_times(game_shifts, game_skater_stats):
#     game_shifts = pd.merge(
#         game_shifts, game_skater_stats[["player_id", "team_id"]], on="player_id"
#     )
#     if len(game_shifts) == 0:
#         print("FIRE in the HOUSE")
#         print(game_shifts)

#     team_1 = game_shifts.loc[0, "team_id"]
#     mask = game_shifts["team_id"] == team_1

#     shifts_1 = game_shifts[mask]
#     shifts_2 = game_shifts[~mask]

#     df_num_players_1 = get_num_players(shifts_1)
#     df_num_players_2 = get_num_players(shifts_2)

#     df_num_players_1 = df_num_players_1.rename(
#         columns={"value": "time", "num_players": "team_1"}
#     )
#     df_num_players_1["team_2"] = np.nan
#     df_num_players_2 = df_num_players_2.rename(
#         columns={"value": "time", "num_players": "team_2"}
#     )
#     df_num_players_2["team_1"] = np.nan

#     df_exclude = pd.concat([df_num_players_1, df_num_players_2])
#     df_exclude = df_exclude.sort_values("time", ignore_index=True)
#     df_exclude = df_exclude.ffill()

#     mask = df_exclude["time"].shift(-1) != df_exclude["time"]
#     df_exclude = df_exclude[mask]

#     diff = df_exclude["team_1"] != df_exclude["team_2"]
#     missing = (df_exclude["team_1"] < 5) | (df_exclude["team_2"] < 5)
#     df_exclude["exclude"] = diff & missing
#     df_exclude = df_exclude.reset_index(drop=True)

#     return df_exclude

import logging


def get_penalty_exclude_times(game_shifts, game_skater_stats):
    """Find the times in one game at which the teams are at uneven strength.

    Args:
        game_shifts (pandas.DataFrame): Shifts of a single game with the
            columns ``game_id``, ``player_id``, ``shift_start`` and
            ``shift_end``. A ``team_id`` column is used as-is when present.
        game_skater_stats (pandas.DataFrame): Rows with ``game_id``,
            ``player_id`` and ``team_id``; supplies each player's team when
            ``game_shifts`` has no ``team_id`` column.

    Returns:
        pandas.DataFrame: One row per time at which either team's on-ice
        count changes, with the columns ``time``, ``team_1``, ``team_2``
        (players on the ice per team) and ``exclude`` (True when the counts
        differ and at least one team has fewer than five players). The frame
        is empty, with the same columns, when there are no usable shifts.
    """
    empty_result = pd.DataFrame(columns=["time", "team_1", "team_2", "exclude"])

    if game_shifts.empty:
        logging.warning("Warning: game_shifts is empty in get_penalty_exclude_times")
        return empty_result

    # Attach each player's team. When `game_shifts` has no `team_id` column
    # the merge yields a plain `team_id` (no `_x`/`_y` suffixes), so there is
    # nothing to drop or rename afterwards.
    if "team_id" not in game_shifts.columns:
        team_lookup = game_skater_stats[
            ["game_id", "player_id", "team_id"]
        ].drop_duplicates(["game_id", "player_id"])
        game_shifts = pd.merge(
            game_shifts,
            team_lookup,
            on=["game_id", "player_id"],
            how="left",
        )

    # Players without a skater-stats row (presumably goalies - confirm) have
    # no team after the left join. Left in, they would all be counted towards
    # "team 2" below and distort its player count.
    game_shifts = game_shifts.dropna(subset=["team_id"])
    if game_shifts.empty:
        logging.warning(
            "No shifts could be matched to a team in get_penalty_exclude_times"
        )
        return empty_result

    # Divide shifts by team: "team 1" is whichever team owns the first shift.
    team_1 = game_shifts["team_id"].iloc[0]
    mask = game_shifts["team_id"] == team_1
    shifts_1 = game_shifts[mask]
    shifts_2 = game_shifts[~mask]

    # Number of players on the ice per team at every change time.
    df_num_players_1 = get_num_players(shifts_1).rename(
        columns={"value": "time", "num_players": "team_1"}
    )
    df_num_players_2 = get_num_players(shifts_2).rename(
        columns={"value": "time", "num_players": "team_2"}
    )

    # Interleave both timelines; forward-fill so every row carries the
    # current count of both teams.
    df_exclude = pd.concat([df_num_players_1, df_num_players_2]).sort_values(
        "time", ignore_index=True
    )
    df_exclude = df_exclude.ffill()

    # When both teams change at the same time keep only the last row, which
    # holds both updated counts. Copy so the column added below is written to
    # an independent frame rather than a filtered view.
    mask = df_exclude["time"].shift(-1) != df_exclude["time"]
    df_exclude = df_exclude[mask].copy()

    # Determine exclusions based on player counts.
    diff = df_exclude["team_1"] != df_exclude["team_2"]
    missing = (df_exclude["team_1"] < 5) | (df_exclude["team_2"] < 5)
    df_exclude["exclude"] = diff & missing
    df_exclude = df_exclude.reset_index(drop=True)

    # Log the penalty exclude times for verification. The per-row loop is
    # skipped entirely unless INFO logging is actually enabled.
    if logging.getLogger().isEnabledFor(logging.INFO):
        logging.info("Penalty Exclude Times:")
        for _, row in df_exclude.iterrows():
            logging.info(
                f"""Time: {row["time"]}, Team 1 Players: {row["team_1"]},
            Team 2 Players: {row["team_2"]}, Exclude: {row["exclude"]}"""
            )

    return df_exclude


### `organize_by_season`
This function processes the hockey data for multiple seasons and computes Corsi metrics. It filters, merges, and manipulates data for each season to prepare it for analysis.


In [14]:
def organize_by_season(seasons, df):
    """Compute per-player Corsi statistics separately for each season.

    Args:
        seasons (iterable): Season identifiers matched against the
            ``season`` column of ``df["game"]`` (e.g. ``20152016``).
        df (dict): DataFrames keyed ``game``, ``game_skater_stats``,
            ``game_plays`` and ``game_shifts``. The dict and its frames are
            not modified.

    Returns:
        list: One ``[season, corsi_dataframe]`` pair per season, in input
        order, where the frame is the result of ``create_corsi_stats``.

    Side effects:
        Prints progress and intermediate previews for each season.
    """
    df_orig = df
    nhl_dfs = []
    for season in seasons:
        print(f"Processing season: {season}")
        # Shallow copy: entries are replaced below, never mutated in place,
        # so the caller's dict stays intact.
        df = df_orig.copy()
        df["game"] = df["game"].query(f"season == {season}")

        print(f"Games for season {season}:")
        print(df["game"].head())

        # Restrict every table to this season's games. Merging against the
        # de-duplicated ids keeps the join one-to-many.
        season_game_ids = df["game"][["game_id"]].drop_duplicates()
        for name in ["game_skater_stats", "game_plays", "game_shifts"]:
            df[name] = pd.merge(
                df[name], season_game_ids, on="game_id"
            ).drop_duplicates()

            for key, val in df.items():
                print(f"{key:>25}: {len(val)}")

        print("game_plays before filtering events:")
        print(df["game_plays"].head())

        cols = ["play_id", "game_id", "team_id_for", "event", "time"]
        events = ["Shot", "Blocked Shot", "Missed Shot", "Goal"]
        plays = df["game_plays"]
        plays = plays.loc[plays["event"].isin(events)]
        # Game clock: time within the period plus 1200 per completed period.
        # `assign` builds a new frame instead of writing into the filtered
        # slice.
        plays = plays.assign(
            time=plays["periodTime"] + (plays["period"] - 1) * 1200
        )
        df["game_plays"] = plays[cols]

        print(f"reduced game_plays num rows: {len(df['game_plays'])}")
        print(df["game_plays"].head())

        print("game_skater_stats before merging with game_shifts:")
        print(df["game_skater_stats"].head())
        print("game_shifts before merging with game_skater_stats:")
        print(df["game_shifts"].head())

        # Keep only skater rows for games that have shift data. Joining on
        # the unique game ids gives the same rows as joining on every shift
        # and de-duplicating afterwards, without materialising one copy of
        # each skater row per shift in the game.
        shift_game_ids = df["game_shifts"][["game_id"]].drop_duplicates()
        df["game_skater_stats"] = pd.merge(
            df["game_skater_stats"], shift_game_ids, on="game_id"
        ).drop_duplicates(ignore_index=True)

        print("Merged game_skater_stats:")
        print(df["game_skater_stats"].head())

        # Independent copy: create_corsi_stats adds columns to this frame.
        df_corsi = (
            df["game_skater_stats"]
            .sort_values(["game_id", "player_id"], ignore_index=True)[
                ["game_id", "player_id", "team_id"]
            ]
            .copy()
        )

        print(f"df_corsi for season {season}:")
        print(df_corsi.head())

        print(f"Calling create_corsi_stats for season: {season}")
        nhl_dfs.append([season, create_corsi_stats(df_corsi, df)])
        print(f"Completed create_corsi_stats for season: {season}")

    return nhl_dfs


### `create_corsi_stats`
This function calculates Corsi statistics for individual players using a DataFrame that contains player and game information. Corsi is a key metric used to measure puck possession in hockey.


In [15]:
def create_corsi_stats(df_corsi, df):
    """Compute Corsi for, Corsi against, Corsi and CF% per player and game.

    Args:
        df_corsi (pandas.DataFrame): One row per (game, player); the first
            three columns must be game id, player id and team id, in that
            order. Rows should be grouped by game so each game is prepared
            once. The result columns are added to this frame in place.
        df (dict): DataFrames keyed ``game_shifts`` (``game_id``,
            ``player_id``, ``shift_start``, ``shift_end``), ``game_plays``
            (``game_id``, ``team_id_for``, ``time``) and
            ``game_skater_stats`` (``game_id``, ``player_id``, ``team_id``).

    Returns:
        pandas.DataFrame: ``df_corsi`` with the float columns ``corsi_for``,
        ``corsi_against``, ``corsi`` and ``CF_Percent`` appended. Rows whose
        game id is missing, or whose game has no shift or skater data, keep
        NaN in all four.

    Side effects:
        Prints progress every 1000 rows and a preview of the result.
    """
    nan = float("nan")
    n_rows = len(df_corsi)
    # Results are collected by row position and written back once at the
    # end. This avoids mixing index labels with positions and the cost of a
    # DataFrame write per row.
    corsi_for_vals = [nan] * n_rows
    corsi_against_vals = [nan] * n_rows
    corsi_vals = [nan] * n_rows

    game_id_prev = None
    shifts_game, plays_game = None, None
    skip_game = False
    t1 = perf_counter()

    id_rows = df_corsi.iloc[:, :3].itertuples(index=False, name=None)
    for i, (game_id, player_id, team_id) in enumerate(id_rows):
        if i % 1000 == 0:
            print(f"{i:>6}/{n_rows}, {perf_counter() - t1:.2f} s")

        if pd.isna(game_id):
            print(f"Skipping row with NaN game_id: {df_corsi.iloc[i]}")
            continue

        if game_id != game_id_prev:
            # New game: select its shifts and plays once for all its players.
            game_id_prev = game_id
            shifts_all = df["game_shifts"]
            plays_all = df["game_plays"]
            stats_all = df["game_skater_stats"]
            shifts_game = shifts_all[shifts_all["game_id"] == game_id]
            plays_game = plays_all[plays_all["game_id"] == game_id]
            gss = stats_all[stats_all["game_id"] == game_id]

            skip_game = shifts_game.empty or gss.empty
            if skip_game:
                print(f"game_id: {game_id}")
                print("Empty DF before Merge.")
            else:
                df_num_players = get_penalty_exclude_times(
                    shifts_game, gss
                ).reset_index(drop=True)
                if df_num_players.empty:
                    # No on-ice counts available, so even strength cannot be
                    # established for this game.
                    skip_game = True
                else:
                    # For each play find the latest strength change strictly
                    # before it and drop plays made at uneven strength.
                    idx = df_num_players["time"].searchsorted(plays_game["time"]) - 1
                    idx[idx < 0] = 0
                    exclude = df_num_players["exclude"].to_numpy(dtype=bool)[idx]
                    plays_game = plays_game.loc[~exclude]

        # The skip decision applies to every player of the game, not only to
        # the first row that triggered it.
        if skip_game:
            continue

        shifts_player = shifts_game[shifts_game["player_id"] == player_id]
        play_times = plays_game["time"]
        # searchsorted needs ordered input; sort starts and ends separately
        # so the counts are right whatever order the shifts arrive in.
        starts = shifts_player["shift_start"].sort_values()
        ends = shifts_player["shift_end"].sort_values()
        # A play falls inside a shift when more shifts have started than
        # ended before it.
        on_ice = (
            starts.searchsorted(play_times) - ends.searchsorted(play_times)
        ).astype(bool)

        plays_player = plays_game[on_ice]

        corsi_for = int((plays_player["team_id_for"] == team_id).sum())
        corsi_against = len(plays_player) - corsi_for
        corsi_for_vals[i] = float(corsi_for)
        corsi_against_vals[i] = float(corsi_against)
        corsi_vals[i] = float(corsi_for - corsi_against)

    df_corsi["corsi_for"] = corsi_for_vals
    df_corsi["corsi_against"] = corsi_against_vals
    df_corsi["corsi"] = corsi_vals
    df_corsi["CF_Percent"] = df_corsi["corsi_for"] / (
        df_corsi["corsi_for"] + df_corsi["corsi_against"]
    )

    print(df_corsi.head())

    if game_id_prev is not None:
        print(f"Processed Corsi stats for game {game_id_prev}")

    return df_corsi


### `write_csv`
This function saves the processed Corsi data to CSV files. Each file is named according to the season it corresponds to.

In [11]:
def write_csv(dfs):
    """Write each season's Corsi DataFrame to ``corsi_stats/corsi_<season>.csv``.

    Args:
        dfs (iterable): ``[season, dataframe]`` pairs as produced by
            ``organize_by_season``.

    Side effects:
        Creates the ``corsi_stats`` directory (relative to the current
        working directory) when needed, writes one CSV per pair without the
        index, and prints each path written.
    """
    relative_directory = "corsi_stats"

    # exist_ok avoids the check-then-create race of testing for the
    # directory first.
    os.makedirs(relative_directory, exist_ok=True)

    for entry in dfs:
        season, frame = entry[0], entry[1]
        file_path = f"{relative_directory}/corsi_{season}.csv"
        frame.to_csv(file_path, index=False)
        print(f"Written to {file_path}")


### `calculate_and_save_corsi_stats`
This high-level function orchestrates the entire process, from loading data to calculating Corsi stats and saving the results.


In [12]:
def calculate_and_save_corsi_stats(seasons=None):
    """Run the full pipeline: load data, compute Corsi stats, write CSVs.

    Args:
        seasons (iterable, optional): Season identifiers to process.
            Defaults to 20152016, 20162017 and 20172018.

    Side effects:
        Reads the source tables from the database, prints progress and
        previews, and writes one CSV per season through ``write_csv``.
    """
    if seasons is None:
        seasons = [20152016, 20162017, 20172018]

    env_vars = get_env_vars()
    df_master = load_data(env_vars)
    print("Data loaded successfully")

    for name, df in df_master.items():
        print(f"{name}:")
        print(df.head())

    nhl_dfs = organize_by_season(seasons, df_master)
    print("Data organized by season")

    write_csv(nhl_dfs)


## 4. Inserting Corsi Statistics into Database

Finally, we insert the saved Corsi statistics from the CSV files into the appropriate tables in the `hockey_stats` database.


In [None]:
# Define database insertion functions from August 11, 2024 code


def create_corsi_table(metadata, table_name):
    """Register a Corsi statistics table on ``metadata`` and return it.

    Every season's table shares this layout: three identifier columns
    followed by four nullable float statistic columns.
    """
    id_columns = (
        Column("game_id", BigInteger),
        Column("player_id", BigInteger),
        Column("team_id", Integer),
    )
    stat_names = ("corsi_for", "corsi_against", "corsi", "CF_Percent")
    stat_columns = tuple(Column(name, Float, nullable=True) for name in stat_names)
    return Table(table_name, metadata, *id_columns, *stat_columns)


# Create tables for each season.
# `seasons` and `engine` are used below but were never defined at module
# level (they only exist as locals inside other functions), so define them
# here before they are needed.
seasons = [20152016, 20162017, 20172018]
engine = get_db_engine(get_env_vars())

metadata = MetaData()
tables = {
    season: create_corsi_table(metadata, f"raw_corsi_{season}") for season in seasons
}
metadata.create_all(engine)

Session = sessionmaker(bind=engine)


def insert_data_from_csv(engine, table_name, file_path):
    """Load one CSV file into a database table, then delete the file.

    The table is replaced if it already exists. Failures are reported on
    stdout and not re-raised, so the caller keeps going with the next file;
    the CSV is left on disk when the insert did not complete.
    """
    try:
        frame = pd.read_csv(file_path)
        frame.to_sql(name=table_name, con=engine, index=False, if_exists="replace")
        print(f"Data inserted successfully into {table_name}")

        # Reached only when the insert above did not raise.
        os.remove(file_path)
        print(f"File {file_path} deleted successfully.")

    except SQLAlchemyError as db_error:
        print(f"Error inserting data into {table_name}: {db_error}")
    except FileNotFoundError as missing:
        print(f"File not found: {file_path} - {missing}")
    except Exception as other:
        print(f"Error occurred while processing file '{file_path}': {other}")


# Define directories and mappings for insertion: (CSV path, target table).
# The paths match the files written by write_csv and the table names match
# the raw_corsi_<season> tables created above.
csv_files_and_mappings = [
    ("corsi_stats/corsi_20152016.csv", "raw_corsi_20152016"),
    ("corsi_stats/corsi_20162017.csv", "raw_corsi_20162017"),
    ("corsi_stats/corsi_20172018.csv", "raw_corsi_20172018"),
]

# Insert data into database tables.
# NOTE(review): `session` is opened but never used; the inserts go through
# `engine` directly.
with Session() as session:
    for file_path, table_name in csv_files_and_mappings:
        insert_data_from_csv(engine, table_name, file_path)

    # NOTE(review): insert_data_from_csv catches and prints its own errors,
    # so this line is printed even when one or more files failed to load.
    print("Data inserted successfully into all tables.")


### Execution
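We run the main function to load the data, calculate the Corsi stats, and save the results as CSV files. Note that the database insertion step in the previous section reads these CSV files, so this step must be run before it.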


In [None]:
# Entry point: run the load -> compute -> write-CSV pipeline only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    calculate_and_save_corsi_stats()


## Conclusion


In this notebook, we successfully loaded hockey game data, calculated Corsi statistics, saved the results to CSV files, and inserted the data into the database tables. These steps ensure that the data is now ready for further analysis and reporting within the `hockey_stats` database.



