# AUDL PROJECT - DATA SCRAPING 🕷

## I. Data Outline 🕸
- [Player Stats](https://theaudl.com/stats/player-stats) (G, PP, POS, etc.) for each player ✔
- [Team Stats](https://theaudl.com/stats/team) by year X team/opponent ✔
- [Team Game Stats](https://theaudl.com/stats/team-game-stats) - All teams, all seasons ✔
- [Individual Player Stats](https://theaudl.com/league/players)
    - Stats Per Game
    - Height / Weight (if available)
    - Defense / Offense (if available)
    - Handler / Cutter (if available)


In [9]:
# IMPORTS
import pandas as pd
import selenium
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

In [4]:
# SETUP FOR SCRAPING

def get_page_results(driver: webdriver.Chrome, webpage: str) -> pd.DataFrame:
    """
    Utilize a chrome webdriver to scrape data from an AUDL Stats page
    Save results of page to a CSV.
    input:
        - driver: Chrome webdriver, used as input to keep same driver while looping
        - webpage: The webpage from which the data will be scraped
        - save_path: the path to where the file will be saved
    output:
        - None
    """

    driver.get(webpage)
    time.sleep(2)

    df: pd.DataFrame = pd.read_html(driver.page_source)[0]
    return df


### Player All-Time Stats

In [11]:
# TODO: Update this section to be consistent with changes to above function
# SCRAPE PLAYER STATS PAGES
num_pages = 130
root_page = 'https://theaudl.com/stats/player-stats?page='

driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")

for i in range(num_pages):
    i_page = root_page + str(i+1) # +1 because range() starts @ 0

    get_page_results(driver=driver, webpage=i_page)

driver.close()
    


  driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")


In [12]:
save_file = '.\\DATA\\20220227_player_all_time.csv'
df = pd.read_csv(save_file)
print(f"SHAPE: {df.shape}")

df.sample(20)

SHAPE: (2558, 26)


Unnamed: 0,Player,G,PP,POS,SCR,AST,GLS,BLK,+/- ▼,Cmp,...,T,S,D,C,Hck,Hck%,Pul,OPP,DPP,MP
445,Nick Vogt,12,237,270,47,12,35,12,55,148,...,3,0,1,1,2,--,0,205,32,241
404,Ryan Purcell,47,1024,1224,156,124,32,20,60,1389,...,106,2,10,0,1,--,19,690,334,978
65,Ethan Beardsley,60,1395,1794,185,18,167,47,193,413,...,13,0,26,0,0,--,1,1071,324,1307
795,Joe Becker,21,285,167,47,18,29,7,28,87,...,19,0,7,0,0,--,2,227,58,114
2159,Jacob Shoyer,1,4,2,0,0,0,0,0,0,...,0,0,0,0,0,--,0,0,4,13
326,Kevin Underhill,18,501,589,100,60,40,14,73,778,...,40,4,1,0,0,--,29,379,122,452
725,Lance Blackstock,17,288,230,29,11,18,12,31,57,...,6,0,4,0,0,--,31,35,253,240
1542,Ted Schewe,13,145,148,18,5,13,1,7,65,...,9,0,3,0,3,--,0,77,68,184
1786,Steven Garlok,4,53,40,3,2,1,2,3,13,...,0,0,2,0,0,--,0,3,50,48
122,Chris Mazur,49,1065,1112,240,176,64,30,143,1389,...,117,2,10,0,5,--,162,762,303,892


### Team Stats

In [45]:
seasons= range(2014, 2015, 1)
team_root_page = 'https://theaudl.com/stats/team?year='
opp_root_page = 'https://theaudl.com/stats/team?opponent&year='

driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")

for i in seasons:
    i_team_page = team_root_page + str(i) # +1 because range() starts @ 0
    i_opp_page = opp_root_page + str(i)

    team_df = get_page_results(driver=driver, webpage=i_team_page)
    opp_df = get_page_results(driver=driver, webpage=i_opp_page)

    team_save_file = f".\\DATA\\{i}_TEAM_STATS.csv"
    opp_save_file = f".\\DATA\\{i}_OPPONENT_STATS.csv"

    if not os.path.isfile(team_save_file):
        team_df.to_csv(team_save_file, index=False)
    else:
        temp_df:pd.DataFrame = pd.read_csv(team_save_file)
        concat:pd.DataFrame = pd.concat([pd.read_csv(team_save_file), team_df])
        concat.drop_duplicates().to_csv(team_save_file, index=False)

    if not os.path.isfile(opp_save_file):
        team_df.to_csv(opp_save_file, index=False)
    else:
        temp_df:pd.DataFrame = pd.read_csv(opp_save_file)
        concat:pd.DataFrame = pd.concat([pd.read_csv(opp_save_file), opp_df])
        concat.drop_duplicates().to_csv(opp_save_file, index=False)

driver.close()

  driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")


In [35]:
opp_df

Unnamed: 0,Team,G,W ▼,L,S,SA,Opp C,Opp T,Opp CMP %,Opp H,Opp Huck %,Opp HLD %,Opp OLC %,Opp B,Opp BRK %,Opp DLC %,Opp RZC %
0,Empire,15,12,3,324,289,3882,248,94.0,92,57.5,68.5,57.11,115,16.46,40.0,78.97
1,Growlers,14,11,3,286,259,3629,288,92.65,90,55.56,65.38,49.51,127,19.5,39.57,75.57
2,Union,14,11,3,312,246,4088,267,93.87,85,59.86,60.77,48.13,111,14.23,40.86,73.04
3,Flyers,15,11,4,341,280,3930,287,93.19,86,54.78,62.47,50.65,122,15.61,40.52,79.43
4,Wind Chill,13,10,3,280,231,3102,275,91.86,82,58.16,60.07,47.23,111,20.63,37.96,73.6
5,Breeze,13,10,3,284,218,2961,258,91.99,80,54.05,57.81,45.79,104,18.33,42.72,82.14
6,Hustle,13,9,4,273,245,4053,240,94.41,86,57.72,62.8,49.86,109,22.22,49.18,81.35
7,Legion,13,8,5,261,241,2622,255,91.14,88,59.46,62.59,48.12,110,23.22,46.27,74.38
8,Radicals,12,7,5,257,222,3464,244,93.42,72,58.06,57.4,46.49,117,25.0,46.56,77.36
9,Royal,9,7,2,190,179,1801,187,90.59,75,79.79,67.65,52.67,94,21.24,36.61,82.54


### Team Game Stats

In [7]:
seasons = {
    2014: 13,
    2015: 19,
    2016: 20,
    2017: 18,
    2018: 18,
    2019: 14,
    2021: 14,
}

driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")

for season in seasons.keys():
    num_pages = seasons[season]
    season_save_file = f".\\DATA\\{season}SEASON_GAME_STATS.csv"

    for page in range(num_pages):

        stats_page = f'https://theaudl.com/stats/team-game-stats?page={page+1}&year={season}'
        season_games_df = get_page_results(driver=driver, webpage=stats_page)

        if not os.path.isfile(season_save_file):
            season_games_df.to_csv(season_save_file, index=False)
        else:
            temp_df:pd.DataFrame = pd.read_csv(season_save_file)
            concat:pd.DataFrame = pd.concat([pd.read_csv(season_save_file), season_games_df])
            concat.drop_duplicates().to_csv(season_save_file, index=False)



  driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")


### Individual Player Stats

In [97]:
# Iterate over pages of players
num_pages = 136
player_page_dict = {}

# Iterate over pages
for page in range(num_pages):

    # Navigate to webpage
    driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")
    driver.get(f"https://theaudl.com/league/players?page={page}")

    # Get table and list of rows
    table_el = driver.find_element(By.TAG_NAME,'table')
    row_el_list = table_el.find_elements(By.TAG_NAME, 'tr')

    # Iterate rows and copy player name and link to page to dictionary
    for row in row_el_list[1:]:
        player_name = row.find_elements(By.TAG_NAME, 'a')[0].text
        player_link = row.find_elements(By.TAG_NAME, 'a')[0].get_attribute('href')

        player_page_dict[player_name] = player_link

driver.close()

  driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")


In [98]:
# DATA SIZE
player_page_dict
print(f"TOTAL SIZE: {sys.getsizeof(player_page_dict)/1000}Kb")

TOTAL SIZE: 73.816Kb


In [99]:
# SAVE TO FILE
df = pd.DataFrame.from_dict(data=player_page_dict, orient='index', columns=['player_link'])
df.to_csv(".\\DATA\\20220304_player_page_links.csv")

Unnamed: 0,player_link
Leo Pepper,https://theaudl.com/league/players/lpepper
Justin Allen,https://theaudl.com/league/players/jallen
Bryson Simon-Fox,https://theaudl.com/league/players/bsimonfox
Dennis Bechis,https://theaudl.com/league/players/dbechis
Kuo Hsun Wang,https://theaudl.com/league/players/khsunwang
Max Trotter,https://theaudl.com/league/players/mtrotter
Nate Goff,https://theaudl.com/league/players/ngoff
JD Hastings,https://theaudl.com/league/players/jhastings
Jonah Malenfant,https://theaudl.com/league/players/jmalenfan
Luke Ryan,https://theaudl.com/league/players/lryan
