# Scraping, Inserting, and Aggregating Hockey Data

This notebook demonstrates how to scrape player data from Spotrac, insert the scraped data into a database, and then aggregate the data for further analysis.

## 1. Setup and Environment Configuration



In [None]:
# Import necessary libraries
import os
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from sqlalchemy import (
    BigInteger,
    Column,
    Float,
    Integer,
    MetaData,
    String,
    Table,
    create_engine,
)
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import sessionmaker

# Load environment variables from a local .env file into the process environment
load_dotenv()

# Set up the database connection
# NOTE(review): USER is normally already set by the shell (login name), and
# load_dotenv() does not override existing variables by default -- confirm that
# the intended database user is really the value being read here.
DATABASE_TYPE = os.getenv("DATABASE_TYPE")
DBAPI = os.getenv("DBAPI")
ENDPOINT = os.getenv("ENDPOINT")
USER = os.getenv("USER")
PASSWORD = os.getenv("PASSWORD")
PORT = int(os.getenv("PORT", 5432))  # falls back to 5432 when PORT is unset
DATABASE = os.getenv("DATABASE")

# Create the connection string
# The user name and password are percent-encoded so that reserved URL characters
# in them (e.g. "@", ":" or "/") cannot be mistaken for URL delimiters. The values
# in the environment must therefore be stored raw, not already URL-encoded.
connection_string = (
    f"{DATABASE_TYPE}+{DBAPI}://"
    f"{quote(str(USER), safe='')}:{quote(str(PASSWORD), safe='')}"
    f"@{ENDPOINT}:{PORT}/{DATABASE}"
)
engine = create_engine(connection_string)


## 2. Scraping Player Data from Spotrac

We will scrape player cap hit data for the years 2015, 2016, and 2017 from the Spotrac website using Selenium and BeautifulSoup.
The Selenium driver is configured for the Safari browser. If you are using a different browser, adjust the driver setup in the code below accordingly:
`driver = webdriver.Safari()`

In [None]:
# Start a Safari browser session controlled by Selenium
driver = webdriver.Safari()

# URL template for the rankings page; "{}" is replaced by the year
BASE_URL = "https://www.spotrac.com/nhl/rankings/player/_/year/{}/sort/cap_total"

# Years to scrape (2015 through 2017 inclusive)
years = list(range(2015, 2018))

# Folder that receives one CSV file per scraped year
OUTPUT_DIR = "player_cap_hits"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

# year -> DataFrame of the rows scraped for that year
dfs_by_year = dict()

def split_player_name(name):
    """Split a full player name into ``(first_name, last_name)``.

    The name is split on whitespace: the first token becomes the first name
    and all remaining tokens, joined by single spaces, become the last name.

    Args:
        name: Full name as a string; surrounding or repeated whitespace is ignored.

    Returns:
        A ``(first_name, last_name)`` tuple. ``last_name`` is ``""`` for a
        single-token name, and both parts are ``""`` when ``name`` is empty
        or contains only whitespace.
    """
    name_parts = name.split()
    if not name_parts:
        # Nothing to split; avoid indexing into an empty list
        return "", ""
    first_name, *rest = name_parts
    return first_name, " ".join(rest)

# Loop through each year and scrape the data
try:
    for year in years:
        url = BASE_URL.format(year)
        driver.get(url)

        # Wait (up to 10 seconds) until at least one list entry is present
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "list-group-item"))
        )

        # Scroll to the bottom of the page to load all content (if applicable):
        # keep pressing END until the document height stops growing
        while True:
            previous_height = driver.execute_script(
                "return document.body.scrollHeight"
            )
            driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
            time.sleep(2)  # Wait for new data to load
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == previous_height:
                break  # Exit the loop when no more new content is loaded

        # Get page source and parse with BeautifulSoup
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

        # Collect player names and cap hits from each list entry
        first_names = []
        last_names = []
        cap_hits = []

        for item in soup.find_all("li", class_="list-group-item"):
            name_div = item.find("div", class_="link")
            cap_hit_span = item.find("span", class_="medium")
            # Entries missing either the name or the cap hit are skipped
            if name_div and cap_hit_span:
                name = name_div.text.strip()
                first_name, last_name = split_player_name(name)
                cap_hit = cap_hit_span.text.strip()
                first_names.append(first_name)
                last_names.append(last_name)
                cap_hits.append(cap_hit)

        # Create a DataFrame to store the results
        df = pd.DataFrame(
            {"firstName": first_names, "lastName": last_names, "capHit": cap_hits}
        )

        # Store the DataFrame in the dictionary
        dfs_by_year[year] = df
finally:
    # Close the driver even if a page load, wait or parse step raised, so the
    # browser session is never left running
    driver.quit()

# Display the DataFrames for each year
for year, df in dfs_by_year.items():
    print(f"Data for {year}:")
    display(df.head())

# Save the DataFrames to CSV files (one file per year, overwritten on each run)
for year, df in dfs_by_year.items():
    csv_path = os.path.join(OUTPUT_DIR, f"player_cap_hits_{year}.csv")
    df.to_csv(csv_path, index=False, mode="w")

print(f"Data saved to {OUTPUT_DIR} directory.")


## 3. Inserting Scraped Data into the Database

Next, we will insert the scraped data from the CSV files into the `hockey_stats` database.


In [None]:
# Define function to create cap hit tables
def create_caphit_table(table_name):
    """Declare a cap-hit table named ``table_name`` on the module-level metadata.

    The table has three ``String(50)`` columns: ``firstName``, ``lastName``
    and ``capHit``. The resulting ``Table`` object is returned.
    """
    column_names = ("firstName", "lastName", "capHit")
    columns = [Column(column_name, String(50)) for column_name in column_names]
    return Table(table_name, metadata, *columns)

# Metadata registry that the per-season cap-hit tables are declared on
metadata = MetaData()
seasons = ["20152016", "20162017", "20172018"]

# Map each season to its "player_cap_hit_<season>" table definition
tables = dict(
    (season, create_caphit_table(f"player_cap_hit_{season}")) for season in seasons
)

# Create the declared tables in the database
metadata.create_all(bind=engine)

# Session factory bound to the engine
Session = sessionmaker(bind=engine)

def insert_data_from_csv(engine, table_name, file_path):
    """Load a CSV file into a database table and delete the file on success.

    The CSV is read with pandas and written with ``DataFrame.to_sql`` using
    ``if_exists="replace"``, so an existing table of the same name is replaced.
    Errors are printed rather than raised.

    Args:
        engine: Database connectable handed to ``DataFrame.to_sql`` as ``con``.
        table_name: Name of the destination table.
        file_path: Path of the CSV file to load.

    Returns:
        ``True`` if the data was written and the CSV file removed,
        ``False`` if any step failed (the error is printed).
    """
    try:
        df = pd.read_csv(file_path)
        df.to_sql(table_name, con=engine, if_exists="replace", index=False)
        print(f"Data inserted successfully into {table_name}")

        # Remove the file only after the insert went through
        os.remove(file_path)
        print(f"File {file_path} deleted successfully.")
        return True

    except SQLAlchemyError as e:
        print(f"Error inserting data into {table_name}: {e}")
    except FileNotFoundError as e:
        print(f"File not found: {file_path} - {e}")
    except Exception as e:
        # Last-resort handler: report anything else (e.g. CSV parsing problems)
        print(f"Error occurred while processing file '{file_path}': {e}")

    # Reached only when one of the handlers above ran
    return False

# Define directories and mappings for insertion
# Each entry pairs a scraped CSV file with the season table it is loaded into.
csv_files_and_mappings = [
    ("player_cap_hits/player_cap_hits_2015.csv", "player_cap_hit_20152016"),
    ("player_cap_hits/player_cap_hits_2016.csv", "player_cap_hit_20162017"),
    ("player_cap_hits/player_cap_hits_2017.csv", "player_cap_hit_20172018"),
]

# Insert data into database tables
# insert_data_from_csv writes through the engine directly, so no ORM session
# needs to be opened here.
for file_path, table_name in csv_files_and_mappings:
    insert_data_from_csv(engine, table_name, file_path)

# insert_data_from_csv prints its own per-file success or error message and does
# not raise, so this line only reports that the loop finished -- it must not
# claim that every insert succeeded.
print("Finished processing all CSV files; check the per-file messages above.")


## 4. Aggregating Data and Inserting into the Database

Finally, we will aggregate the data per season per player and insert the results into an aggregated table in the database.


In [None]:
# Define function to get data from the database
def get_data_from_db(query):
    """Run ``query`` on a connection from the module-level engine.

    The connection is opened for this call only and closed before returning.
    The result of ``pd.read_sql`` is returned.
    """
    with engine.connect() as conn:
        frame = pd.read_sql(query, conn)
    return frame

def create_aggregated_table(table_name):
    """Declare the aggregated-stats table ``table_name`` and create it.

    The table is declared on a fresh ``MetaData`` object with ``player_id``
    as primary key, two name columns, five float stat columns, an integer
    ``game_count`` and a float ``Cap_Hit``; ``create_all`` is then called
    against the module-level engine. Nothing is returned.
    """
    float_stat_columns = (
        "corsi_for",
        "corsi_against",
        "corsi",
        "CF_Percent",
        "timeOnIce",
    )
    schema = MetaData()
    Table(
        table_name,
        schema,
        Column("player_id", BigInteger, primary_key=True),
        Column("firstName", String),
        Column("lastName", String),
        *(Column(stat_name, Float) for stat_name in float_stat_columns),
        Column("game_count", Integer),
        Column("Cap_Hit", Float),
    )
    schema.create_all(engine)

# Time-on-ice and player info do not depend on the season, so they are fetched
# once here instead of once per loop iteration.
GSS_TOI_QUERY = 'SELECT game_id, player_id, "timeOnIce" FROM game_skater_stats'
df_gss_toi = get_data_from_db(GSS_TOI_QUERY)

PLAYER_INFO_QUERY = (
    'SELECT player_id, "firstName", "lastName", "primaryPosition" FROM player_info'
)
df_player_info = get_data_from_db(PLAYER_INFO_QUERY)

# Minimum number of games a player needs to be kept in the aggregate.
# NOTE(review): presumably 32% of an 82-game season -- confirm the intent.
THRESHOLD = 82 * 0.32

# Loop through each season to aggregate data
for season in ["20152016", "20162017", "20172018"]:
    CORSI_QUERY = f"SELECT * FROM raw_corsi_{season}"
    df_corsi = get_data_from_db(CORSI_QUERY)
    # Drop the stray index column if the raw table happens to carry one
    df_corsi = df_corsi.drop(columns=["Unnamed: 0"], errors="ignore")

    # Merge dataframes: per-game corsi rows + time on ice + player names
    df_all = pd.merge(df_corsi, df_gss_toi, on=["game_id", "player_id"])
    df_all = pd.merge(df_all, df_player_info, on="player_id")

    # Group and aggregate player stats (per-player means and a game count)
    df_grouped_all = (
        df_all.groupby("player_id")
        .agg(
            {
                "firstName": "first",
                "lastName": "first",
                "corsi_for": "mean",
                "corsi_against": "mean",
                "corsi": "mean",
                "CF_Percent": "mean",
                "timeOnIce": "mean",
                "game_id": "count",
            }
        )
        .reset_index()
        .rename(columns={"game_id": "game_count"})
    )

    PLAYER_SALARY_QUERY = (
        f'SELECT "firstName", "lastName", "capHit" FROM player_cap_hit_{season}'
    )
    df_player_salary = get_data_from_db(PLAYER_SALARY_QUERY)
    print(df_player_salary.head())

    # Convert capHit from string to float by stripping "$" and "," first
    df_player_salary["capHit"] = (
        df_player_salary["capHit"].replace(r"[\$,]", "", regex=True).astype(float)
    )

    # Merge aggregated stats with salary info.
    # NOTE(review): this is an inner join on the name pair, so players without a
    # matching cap-hit row are dropped and identical names would be
    # cross-matched -- confirm this is acceptable.
    df_grouped_all = pd.merge(
        df_grouped_all, df_player_salary, on=["firstName", "lastName"]
    )

    # Keep only players with enough games; .copy() makes the result an
    # independent frame so the column assignments below are unambiguous
    df_grouped_all = df_grouped_all[df_grouped_all["game_count"] >= THRESHOLD].copy()

    # Post-processing: CF_Percent as a percentage with 4 decimals, time on ice
    # rounded to 2 decimals
    df_grouped_all["CF_Percent"] = (df_grouped_all["CF_Percent"].round(4) * 100).round(4)
    df_grouped_all["timeOnIce"] = df_grouped_all["timeOnIce"].round(2)

    df_grouped_all = df_grouped_all.sort_values("CF_Percent", ascending=False)

    aggregated_table_name = f"aggregated_corsi_{season}"
    create_aggregated_table(aggregated_table_name)

    # Insert aggregated data into the new table.
    # NOTE(review): if_exists="replace" replaces the table just created, so the
    # declared schema (including its "Cap_Hit" column, while this frame has
    # "capHit") is not what ends up in the database -- confirm which is intended.
    df_grouped_all.to_sql(
        aggregated_table_name, con=engine, if_exists="replace", index=False
    )

    print(f"Data inserted successfully into {aggregated_table_name}")


## 5. Conclusion

In this notebook, we scraped player data from Spotrac, inserted the data into a database, and aggregated it for further analysis. These steps are crucial for generating meaningful insights into player performance and salary metrics.
