In [1]:
# pip install --upgrade s3fs fsspec

In [2]:
import pandas as pd
import s3fs
import fsspec

# df = pd.read_csv('s3://gang-green-hockey/OaklandHockeyData.csv')

In [3]:
# ### Oakland Gang Green Hockey All Time Stats
# Chris McAllister
 
# This script gathers all the Oakland Hockey Stats for all 4 GG teams, for every season for which the website has data.
 
# Outline:
# 1) Import libraries
# 2) Establish mapping of GG Team ID's in URL to Team Names (Gang Green 1, 2, etc.)
# 3/4) Get a base dataset of all the players who played for Gang Green in our season dim range
# 3/4) Read in a CSV that converts SeasonIDs to the Season Name
# 5) Light data manipulation. Removing columns, create Points per Game metric, etc.

# 1) Import libraries
import os
import pandas as pd
from datetime import datetime
# Run date formatted as 'YYYY-MM-DD'.
# NOTE(review): not referenced elsewhere in the visible notebook (data_manip computes
# the date again on its own) - confirm before removing.
today = datetime.today().strftime('%Y-%m-%d')

# Ignore any warning messages.
# The first filter silences FutureWarning only; the blanket filterwarnings('ignore')
# call after it silences every warning category for the rest of the session.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from warnings import filterwarnings
filterwarnings('ignore')


# 3/4) Get a base dataset of all the players who played for Gang Green in our season dim range
# This needs to be run occasionally, when a new season starts. Upload it to S3 when done.
def initial_web_data(season_dim_csv):
    """
    Scrape the stats table for every season / division combination.

    For each SeasonID in ``season_dim_csv`` and each division in the hard-coded
    ``division_dict``, the league stats page is read with ``pd.read_html``. The first
    table on the page is kept, the top level of its column header is dropped, and the
    rows are tagged with the season ID and the division name.

    Args:
        season_dim_csv (pd.DataFrame): Season dimension table (loaded from the season
            dim CSV). One row is one season. Only its ``SeasonID`` column is read here.

    Returns:
        pd.DataFrame: All scraped tables stacked together (each page keeps its own row
        index), with ``SeasonID`` and ``division`` columns added. If no page could be
        read, an empty frame holding only the base stat columns is returned.
    """

    print('Running For every season and division')

    # Base column layout: fixes the leading column order and is what an empty result looks like.
    base_columns = ['Name', '#', 'Team', 'GP', 'Goals', 'Ass.', 'Hat', 'Min', 'Pts/Game', 'Pts']

    # May need to change this if the league adds a new division
    division_dict = {'99': 'D1',
                     '210': 'D3',
                     '104': 'D5',
                     '98' : 'D6',
                     '138': 'D7',
                     '198': 'D8',
                     '211': 'D9'}

    base_url = 'https://stats.sharksice.timetoscore.com/display-league-stats?stat_class=1&league=27'

    # Collect the pages in a list and concatenate once at the end; growing a DataFrame
    # with pd.concat inside the loop re-copies all previously gathered rows every time.
    frames = [pd.DataFrame(columns=base_columns)]

    for seasons_done, season_id in enumerate(season_dim_csv['SeasonID'], start=1):
        for league_id, division_name in division_dict.items():

            url = base_url + '&season=' + str(season_id) + '&level=' + str(league_id) + '&conf=0'

            # There are gaps in season IDs (for example there's no season #34) and not every
            # division exists in every season, so a failed page is reported and skipped.
            # Only Exception is caught so Ctrl-C / kernel interrupt still stops the scrape.
            try:
                table = pd.read_html(url)[0]
                table.columns = table.columns.droplevel()
                table['SeasonID'] = int(season_id)
                table['division'] = str(division_name)
            except Exception as exc:
                print('Season ID: ' + str(season_id) + ' Division ID: ' + str(league_id) + ' does not exist. Skipping...')
                print(url)
                print('Reason: ' + repr(exc))
            else:
                frames.append(table)

        # Progress message counts completed seasons (1-based), printed every third season.
        if seasons_done % 3 == 0:
            print(f'Processed {seasons_done} seasons so far')

    return pd.concat(frames)

In [4]:
# 4) Read in a CSV that converts SeasonIDs to the Season Name
# season_dim is a notebook-level variable; data_manip() further down reads it.
season_dim = pd.read_csv('Input_data/OaklandHockeySeasonDim.csv')
# Full rebuild of the all-seasons dataset, left commented out so it is not run on every execution:
# all_players = initial_web_data(season_dim)
# all_players.to_csv('Output_data/ALL_OaklandHockeyData.csv', index=False)

In [5]:
def update_current_season(s3_path = 's3://gang-green-hockey/ALL_OaklandHockeyData.csv'):
    """
    Refresh the most recent season of the stored dataset with freshly scraped stats.

    The stored CSV is read from ``s3_path``. The rows belonging to its highest
    ``SeasonID`` are replaced, division by division, with the current contents of the
    league stats pages. Nothing is written back to ``s3_path`` by this function;
    persisting the returned frame is left to the caller.

    Args:
        s3_path (str): Location of the stored all-seasons CSV (anything ``pd.read_csv``
            accepts).

    Returns:
        pd.DataFrame: The rows of all other seasons followed by the current-season rows.
        For a division whose page could not be read, the previously stored rows of that
        division (if any) are kept, so a failed request does not silently remove data.

    Raises:
        ValueError: If the stored data contains no usable ``SeasonID`` value.
    """

    stored = pd.read_csv(s3_path)

    max_season = stored['SeasonID'].max()
    if pd.isna(max_season):
        raise ValueError('No SeasonID values found in ' + str(s3_path) + '; cannot determine the current season.')
    current_season = int(max_season)

    # Everything except the current season is carried over untouched.
    is_current = stored['SeasonID'] == max_season
    frames = [stored[stored['SeasonID'] != max_season]]

    # Stale rows can only be matched per division if the stored file has that column.
    has_division = 'division' in stored.columns

    # May need to change this if the league adds a new division
    division_dict = {'99': 'D1',
                     '210': 'D3',
                     '104': 'D5',
                     '98' : 'D6',
                     '138': 'D7',
                     '198': 'D8',
                     '211': 'D9'}

    for league_id, division_name in division_dict.items():

        url = 'https://stats.sharksice.timetoscore.com/display-league-stats?stat_class=1&league=27&season=' + str(current_season) + '&level=' + str(league_id) + '&conf=0'

        # Not every division exists in every season, so a page that cannot be read is
        # reported and skipped. Only Exception is caught so a kernel interrupt still works.
        try:
            table = pd.read_html(url)[0]
            table.columns = table.columns.droplevel()
            table['SeasonID'] = current_season
            table['division'] = str(division_name)
        except Exception as exc:
            print('Division ID: ' + str(league_id) + ' does not exist. Skipping...')
            print(url)
            print('Reason: ' + repr(exc))
            if has_division:
                # Fall back to what was stored for this division instead of losing it.
                stale = stored[is_current & (stored['division'] == division_name)]
                if not stale.empty:
                    print('Keeping ' + str(len(stale)) + ' previously stored rows for ' + str(division_name))
                    frames.append(stale)
        else:
            frames.append(table)

    return pd.concat(frames)

# Refresh the dataset from the default S3 path; the result is passed to data_manip() below.
data = update_current_season()

In [6]:
# Preview the first rows of the refreshed dataset.
data.head()

Unnamed: 0,Name,#,Team,GP,Goals,Ass.,Hat,Min,Pts/Game,Pts,SeasonID,division
0,Tim Kelly,16,Skateful Dead,14,15,20,1,2,2.5,35,11.0,D1
1,ryan mountford,13,Chicos,15,22,11,3,20,2.2,33,11.0,D1
2,Alex Bernstein,66,Bay Area Battalion,16,16,17,1,8,2.06,33,11.0,D1
3,Erik Davidson,9,Bonefish,18,15,18,1,16,1.83,33,11.0,D1
4,john arceo,5,SD 2.0,13,15,12,2,6,2.08,27,11.0,D1


In [7]:
# 5) Light data manipulation. Removing columns, create Points per Game metric, etc.
def data_manip(df, seasons=None, output_path='Output_data/OaklandHockeyData.csv'):

    """
    Trim the scraped stats to the needed columns, add per-game metrics and season attributes.

    The result stays at the same grain as the input (one row per input row); no
    aggregation is performed here.

    Args:
        df (pd.DataFrame): Scraped stats, e.g. the output of update_current_season().
            Must contain the columns 'Name', '#', 'Team', 'GP', 'Goals', 'Ass.', 'Hat',
            'Min', 'Pts/Game', 'Pts' and 'SeasonID'. It is not modified.
        seasons (pd.DataFrame, optional): Season dimension table with a 'SeasonID'
            column. Defaults to the notebook-level ``season_dim``.
        output_path (str, optional): Where the result is written as CSV. Pass None to
            skip writing. Defaults to 'Output_data/OaklandHockeyData.csv'.

    Returns:
        pd.DataFrame: The selected stat columns (without 'Pts/Game' and '#'), plus 'GPG',
        'Pts_PG' and 'lastupdated', left-joined with the season attributes on 'SeasonID'.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.
        ValueError: If 'SeasonID' holds values that cannot be cast to int (e.g. NaN).
    """
    if seasons is None:
        # Backward-compatible default: use the season table loaded earlier in the notebook.
        seasons = season_dim

    # Only select necessary columns
    col = ['Name', '#', 'Team', 'GP', 'Goals', 'Ass.', 'Hat', 'Min', 'Pts/Game', 'Pts', 'SeasonID']

    # Build a new frame instead of dropping in place on a slice of the input.
    # 'Pts/Game' from the website is replaced by the recomputed 'Pts_PG' below.
    stats = (
        df[col]
        .drop(columns = ['Pts/Game', '#'])
        .assign(
            # Goals per game and points per game.
            GPG = lambda d: d['Goals'] / d['GP'],
            Pts_PG = lambda d: d['Pts'] / d['GP'],
            # Cast to int so the key matches the season table (5.0 -> 5).
            SeasonID = lambda d: d['SeasonID'].astype(int),
            lastupdated = datetime.today().strftime('%Y-%m-%d'),
        )
    )

    # Convert season IDs (#40) to Season Name (Fall 2017) and other season attributes.
    # This is the only join needed: an earlier join whose columns were discarded
    # immediately by the column selection has been removed.
    df_final = pd.merge(left = stats, right = seasons, how = 'left', left_on = 'SeasonID', right_on = 'SeasonID')

    # Output results to CSV
    if output_path is not None:
        df_final.to_csv(output_path, index = False)

    return df_final

# Build the cleaned dataset; data_manip() also writes it to Output_data/OaklandHockeyData.csv.
df = data_manip(data)


In [11]:
# Total goals across all rows whose Name matches exactly (the comparison is case-sensitive).
df.loc[df['Name'] == 'CHRISTOPHER MCALLISTER', 'Goals'].sum()

68