# Creating Player and Team Data Sources

###### By:: Trevor Rowland(dBCooper2)

To use Corsi, Fenwick and xGoal Stats, player and team data must be added from Moneypuck. To do this, I have downloaded the player and team data from each season from Moneypuck, and will load it here. After loading it, there will be a series of python functions to concatenate and clean the resultant dataframes.

In [1]:
import polars as pl
import os
import re

# Define the directory containing the CSV files
data_directory = "/Users/dB/Desktop/fall_24/DS-4210/final-proj/data/"

# Initialize lists to store DataFrames
teams_dataframes = []
skaters_dataframes = []

# Loop through files in the directory
for file_name in os.listdir(data_directory):
    if file_name.endswith(".csv"):
        file_path = os.path.join(data_directory, file_name)
        
        # Extract the year from the filename using a regex
        match = re.search(r"_(\d{4})\.csv$", file_name)
        if match:
            year = int(match.group(1))
            
            # Read the file into a DataFrame
            df = pl.read_csv(file_path)
            df = df.with_columns(pl.lit(year).alias("year"))
            
            # Categorize the DataFrame based on the filename
            if file_name.startswith("teams"):
                teams_dataframes.append(df)
            elif file_name.startswith("skaters"):
                skaters_dataframes.append(df)

Now we have 2 lists of dataframes, 1 for teams and 1 for players

In [13]:
teams_dataframes

[shape: (155, 108)
 ┌──────┬────────┬──────┬────────────────┬───┬───────────────┬───────────────┬───────────────┬──────┐
 │ team ┆ season ┆ name ┆ team_duplicate ┆ … ┆ totalShotCred ┆ scoreAdjusted ┆ scoreFlurryAd ┆ year │
 │ ---  ┆ ---    ┆ ---  ┆ d_0            ┆   ┆ itAgainst     ┆ TotalShotCred ┆ justedTotalSh ┆ ---  │
 │ str  ┆ f64    ┆ str  ┆ ---            ┆   ┆ ---           ┆ itAg…         ┆ otCr…         ┆ i32  │
 │      ┆        ┆      ┆ str            ┆   ┆ f64           ┆ ---           ┆ ---           ┆      │
 │      ┆        ┆      ┆                ┆   ┆               ┆ f64           ┆ f64           ┆      │
 ╞══════╪════════╪══════╪════════════════╪═══╪═══════════════╪═══════════════╪═══════════════╪══════╡
 │ CBJ  ┆ 2019.0 ┆ CBJ  ┆ CBJ            ┆ … ┆ 18.04         ┆ 18.04         ┆ 17.65         ┆ 2019 │
 │ CBJ  ┆ 2019.0 ┆ CBJ  ┆ CBJ            ┆ … ┆ 168.71        ┆ 169.65        ┆ 166.69        ┆ 2019 │
 │ CBJ  ┆ 2019.0 ┆ CBJ  ┆ CBJ            ┆ … ┆ 123.4         ┆ 

In [3]:
for i, df in enumerate(teams_dataframes):
    print(f"Schema for teams DataFrame {i}:")
    print(df.schema)

for i, df in enumerate(skaters_dataframes):
    print(f"Schema for skaters DataFrame {i}:")
    print(df.schema)

Schema for teams DataFrame 0:
Schema({'team': String, 'season': Int64, 'name': String, 'team_duplicated_0': String, 'position': String, 'situation': String, 'games_played': Int64, 'xGoalsPercentage': Float64, 'corsiPercentage': Float64, 'fenwickPercentage': Float64, 'iceTime': Float64, 'xOnGoalFor': Float64, 'xGoalsFor': Float64, 'xReboundsFor': Float64, 'xFreezeFor': Float64, 'xPlayStoppedFor': Float64, 'xPlayContinuedInZoneFor': Float64, 'xPlayContinuedOutsideZoneFor': Float64, 'flurryAdjustedxGoalsFor': Float64, 'scoreVenueAdjustedxGoalsFor': Float64, 'flurryScoreVenueAdjustedxGoalsFor': Float64, 'shotsOnGoalFor': Float64, 'missedShotsFor': Float64, 'blockedShotAttemptsFor': Float64, 'shotAttemptsFor': Float64, 'goalsFor': Float64, 'reboundsFor': Float64, 'reboundGoalsFor': Float64, 'freezeFor': Float64, 'playStoppedFor': Float64, 'playContinuedInZoneFor': Float64, 'playContinuedOutsideZoneFor': Float64, 'savedShotsOnGoalFor': Float64, 'savedUnblockedShotAttemptsFor': Float64, 'pena

In [4]:
# Ensure consistent column types for teams
teams_dataframes = [
    df.with_columns([df[col].cast(pl.Float64) for col, dtype in df.schema.items() if dtype == pl.Int64])
    for df in teams_dataframes
]

# Ensure consistent column types for skaters
skaters_dataframes = [
    df.with_columns([df[col].cast(pl.Float64) for col, dtype in df.schema.items() if dtype == pl.Int64])
    for df in skaters_dataframes
]


In [20]:
for i, df in enumerate(teams_dataframes):
    print(f"Columns in teams DataFrame {i}:")
    print(df.columns)

Columns in teams DataFrame 0:
['team', 'season', 'name', 'team_duplicated_0', 'position', 'situation', 'games_played', 'xGoalsPercentage', 'corsiPercentage', 'fenwickPercentage', 'iceTime', 'xOnGoalFor', 'xGoalsFor', 'xReboundsFor', 'xFreezeFor', 'xPlayStoppedFor', 'xPlayContinuedInZoneFor', 'xPlayContinuedOutsideZoneFor', 'flurryAdjustedxGoalsFor', 'scoreVenueAdjustedxGoalsFor', 'flurryScoreVenueAdjustedxGoalsFor', 'shotsOnGoalFor', 'missedShotsFor', 'blockedShotAttemptsFor', 'shotAttemptsFor', 'goalsFor', 'reboundsFor', 'reboundGoalsFor', 'freezeFor', 'playStoppedFor', 'playContinuedInZoneFor', 'playContinuedOutsideZoneFor', 'savedShotsOnGoalFor', 'savedUnblockedShotAttemptsFor', 'penaltiesFor', 'penalityMinutesFor', 'faceOffsWonFor', 'hitsFor', 'takeawaysFor', 'giveawaysFor', 'lowDangerShotsFor', 'mediumDangerShotsFor', 'highDangerShotsFor', 'lowDangerxGoalsFor', 'mediumDangerxGoalsFor', 'highDangerxGoalsFor', 'lowDangerGoalsFor', 'mediumDangerGoalsFor', 'highDangerGoalsFor', 'score

In [11]:
for i, df in enumerate(teams_dataframes):
    print(f"Columns in teams DataFrame {i}:")
    print(df.columns)

Columns in teams DataFrame 0:
['team', 'season', 'name', 'team_duplicated_0', 'position', 'situation', 'games_played', 'xGoalsPercentage', 'corsiPercentage', 'fenwickPercentage', 'iceTime', 'xOnGoalFor', 'xGoalsFor', 'xReboundsFor', 'xFreezeFor', 'xPlayStoppedFor', 'xPlayContinuedInZoneFor', 'xPlayContinuedOutsideZoneFor', 'flurryAdjustedxGoalsFor', 'scoreVenueAdjustedxGoalsFor', 'flurryScoreVenueAdjustedxGoalsFor', 'shotsOnGoalFor', 'missedShotsFor', 'blockedShotAttemptsFor', 'shotAttemptsFor', 'goalsFor', 'reboundsFor', 'reboundGoalsFor', 'freezeFor', 'playStoppedFor', 'playContinuedInZoneFor', 'playContinuedOutsideZoneFor', 'savedShotsOnGoalFor', 'savedUnblockedShotAttemptsFor', 'penaltiesFor', 'penalityMinutesFor', 'faceOffsWonFor', 'hitsFor', 'takeawaysFor', 'giveawaysFor', 'lowDangerShotsFor', 'mediumDangerShotsFor', 'highDangerShotsFor', 'lowDangerxGoalsFor', 'mediumDangerxGoalsFor', 'highDangerxGoalsFor', 'lowDangerGoalsFor', 'mediumDangerGoalsFor', 'highDangerGoalsFor', 'score

# Hop Into Excel, Dataframe 4 for teams (2012 or 2020) does not have column names

In [7]:
import polars as pl
from typing import List

def vertical_concat_dataframes(dataframes: List[pl.DataFrame]) -> pl.DataFrame:
    """
    Vertically concatenate a list of Polars DataFrames.
    
    Args:
        dataframes (List[pl.DataFrame]): List of Polars DataFrames to concatenate
    
    Returns:
        pl.DataFrame: Vertically concatenated DataFrame
    
    Raises:
        ValueError: If the input list is empty or DataFrames have incompatible schemas
    """
    if not dataframes:
        raise ValueError("Input list of DataFrames is empty")
    
    return pl.concat(dataframes, how='vertical')

In [9]:
df = vertical_concat_dataframes(skaters_dataframes)
df.write_csv("combined_skaters.csv")

In [15]:
df2 = vertical_concat_dataframes(teams_dataframes)
df2.write_csv("combined_teams.csv")

ShapeError: unable to vstack, column names don't match: "penaltiesFor" and "penalitiesFor"