In [7]:
import requests
import pandas as pd
import numpy as np

In [8]:
def clean_spo_df(df):
    df.columns = [
        "Rank",
        "Team",
        "Record",
        "Players Active",
        "Avg Age Team",
        "Total Cap Allocations",
        "Long-Term IR Adjustment",
        "Cap Space All",
        "Active",
        "Injured",
        "Injured  Long-Term",
    ]
    df = df[["Rank", "Team", "Total Cap Allocations", "Cap Space All"]]
    df_trimmed = df.iloc[:-2].copy()
    # Splitting the "Team" column based on the first occurence of a space and removing duplicate from "Team"
    df_trimmed[["Team", "Drop"]] = df_trimmed["Team"].str.split(" ", n=1, expand=True)
    # Dropping the column "Drop"
    df_trimmed = df_trimmed.drop(columns=["Drop"])

    return df_trimmed


def clean_capfr_df(df):
    df = df[["PLAYER", "TEAM", "POS", "CAP HIT", "SALARY"]]
    # NEW LINES: Splitting "PLAYER" Column into first and last names
    # Splitting the "PLAYER" column based on the occurrence of a space
    df[["prefix", "firstName", "lastName"]] = df["PLAYER"].str.split(" ", n=2, expand=True)
    # Drop the original "PLAYER" column and the "prefix" column
    df = df.drop(columns=["PLAYER", "prefix"])

    return df

In [9]:
# Creates a pandas dataframe from a website table given a dictionary of URLs
def read_url(urls):
    total_dfs = []

    for url in urls.keys():
        if urls[url] == "single":
            df = pd.read_html(url)[0]
            df = clean_spo_df(df)
            total_dfs.append(df)
        if urls[url] == "multi":
            dfs = []
            i = "1"
            df = pd.read_html(url + i)[0]
            while len(df) != 0:
                df = clean_capfr_df(df)
                dfs.append(df)
                df = pd.read_html(url + i)[0]
                i = int(i)
                i += 1
                i = str(i)
            combined_df = pd.DataFrame()
            for df in dfs:
                combined_df = pd.concat([combined_df, df], ignore_index=True)
            total_dfs.append(combined_df)

    return total_dfs

In [10]:
# Writes .csv files from dfs given a list structured as [type, year, df]
def write_csv(dfs):
    for df in dfs:
        # TESTING
        print(df[0])
        print(df[1])
        print(df[2])

        df[2].to_csv(df[0] + "_files/" + df[0] + "_" + df[1] + ".csv")
    return

In [11]:
print("Running Main...")

# Spotrac URLs for team salary totals
spo_url_15 = "https://www.spotrac.com/nhl/cap/_/year/2015/sort/cap_maximum_space2"
spo_url_16 = "https://www.spotrac.com/nhl/cap/_/year/2016/sort/cap_maximum_space2"
spo_url_17 = "https://www.spotrac.com/nhl/cap/_/year/2017/sort/cap_maximum_space2"

# Cap Friendly URLs for player salary totals
cafr_base_15 = "https://www.capfriendly.com/browse/active/2016?hide=clauses,age,handed,skater-stats,goalie-stats&pg="
cafr_base_16 = "https://www.capfriendly.com/browse/active/2017?hide=clauses,age,handed,skater-stats,goalie-stats&pg="
cafr_base_17 = "https://www.capfriendly.com/browse/active/2018?hide=clauses,age,handed,skater-stats,goalie-stats&pg="


nhl_urls = {
    spo_url_15: "single",
    spo_url_16: "single",
    spo_url_17: "single",
    cafr_base_15: "multi",
    cafr_base_16: "multi",
    cafr_base_17: "multi",
}

team_sals_15, team_sals_16, team_sals_17, player_sals_15, player_sals_16, player_sals_17 = read_url(
    nhl_urls
)

dfs = [
    ["team", "20151016", team_sals_15],
    ["team", "20162017", team_sals_16],
    ["team", "20171018", team_sals_17],
    ["player", "20151016", player_sals_15],
    ["player", "20161017", player_sals_16],
    ["player", "20171018", player_sals_17],
]

dfs = [
    ["team", "20151016", team_sals_15],
    ["team", "20162017", team_sals_16],
    ["team", "20171018", team_sals_17],
    ["player", "20151016", player_sals_15],
    ["player", "20161017", player_sals_16],
    ["player", "20171018", player_sals_17],
]

write_csv(dfs)

Running Main...
team
20151016
    Rank Team Total Cap Allocations Cap Space All
0    1.0  WPG           $57,038,400   $14,361,600
1    2.0  CAR           $57,310,738   $14,089,262
2    3.0  NSH           $61,178,945   $10,221,055
3    4.0  BUF           $62,019,704    $9,380,296
4    5.0  COL           $62,767,942    $8,632,058
5    6.0  ANA           $63,192,523    $8,207,477
6    7.0  NJD           $63,197,460    $8,202,540
7    8.0  ARI           $65,050,932    $6,349,068
8    9.0  NYI           $65,254,015    $6,145,985
9   10.0  OTT           $65,450,504    $5,949,496
10  11.0  DAL           $66,132,966    $5,267,034
11  12.0  CGY           $67,570,257    $3,829,743
12  13.0  CBJ           $67,937,548    $3,462,452
13  14.0  EDM           $68,765,462    $2,634,538
14  15.0  PHI           $69,056,693    $2,343,307
15  16.0  FLA           $69,087,745    $2,312,255
16  17.0  BOS           $69,478,695    $1,921,305
17  18.0  SJS           $69,675,193    $1,724,807
18  19.0  MIN       

In [12]:
# print(team_sals_15.head())
# print(team_sals_16.head())
# print(team_sals_17.head())
# print(player_sals_15.head())
# print(player_sals_16.head())
# print(player_sals_17.head())