# AUDL PROJECT - DATA SCRAPING 🕷

## I. Data Outline 🕸
- [Player Stats](https://theaudl.com/stats/player-stats) (G, PP, POS, etc.) for each player ✔
- [Team Stats](https://theaudl.com/stats/team) by year X team/opponent ✔
- [Team Game Stats](https://theaudl.com/stats/team-game-stats) - All teams, all seasons ✔
- [Individual Player Stats](https://theaudl.com/league/players)
    - Stats Per Game
    - Height / Weight (if available)
    - Defense / Offense (if available)
    - Handler / Cutter (if available)


In [65]:
# IMPORTS
import pandas as pd
import selenium
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

In [66]:
# SETUP FOR SCRAPING

def get_page_results(driver: webdriver.Chrome, webpage: str) -> pd.DataFrame:
    """
    Utilize a chrome webdriver to scrape data from an AUDL Stats page
    Save results of page to a CSV.
    input:
        - driver: Chrome webdriver, used as input to keep same driver while looping
        - webpage: The webpage from which the data will be scraped
        - save_path: the path to where the file will be saved
    output:
        - None
    """

    driver.get(webpage)
    time.sleep(2)

    df: pd.DataFrame = pd.read_html(driver.page_source)[0]
    return df


### Player All-Time Stats

In [68]:
# SCRAPE PLAYER STATS PAGES
num_pages = 130
root_page = 'https://theaudl.com/stats/player-stats?page='
fname = '.\\DATA\\20220227_player_all_time.csv'

driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")

for i in range(num_pages):
    i_page = root_page + str(i+1) # +1 because range() starts @ 0

    results = get_page_results(driver=driver, webpage=i_page)

    if i == 0:
        results.to_csv(fname, index=False, header=False)
        continue

    results.to_csv(fname, index=False, header=False, mode='a')

driver.close()
del results


  driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")


In [70]:
# Sample
df = pd.read_csv(fname)
print(f"SHAPE: {df.shape}")
df.sample(20)

SHAPE: (60, 26)


Unnamed: 0,Player,G,PP,POS,SCR,AST,GLS,BLK,+/- ▼,Cmp,...,T,S,D,C,Hck,Hck%,Pul,OPP,DPP,MP
38,Sean Mott,71,1701,2216,383,236,147,54,244,1887,...,160,0,33,0,13,54.17,28,1381,320,1830
33,Alec Arsenault,60,1312,1567,285,88,197,65,256,672,...,68,2,26,0,1,--,0,921,391,1223
39,Steve Armitage,59,1025,1051,250,58,192,45,244,377,...,29,0,22,0,0,--,4,672,353,761
10,Jay Froude,70,1585,1823,329,147,182,116,350,863,...,80,1,15,2,2,--,80,814,771,1573
3,Jonathan Helton,110,2550,2288,609,362,247,147,458,2312,...,255,3,43,0,17,58.62,452,1738,812,1886
14,Beau Kittredge,80,1904,2154,332,128,204,109,337,1640,...,88,1,16,1,0,--,21,1200,704,1838
43,Anson Reppermund,76,1512,1349,153,66,87,145,231,489,...,58,1,9,1,2,--,142,385,1127,1452
0,Cameron Brock,122,2631,2838,756,240,516,49,585,1644,...,177,0,43,0,1,--,6,2204,427,2123
31,Justin Allen,83,1825,1932,336,178,158,77,265,972,...,130,0,18,1,1,--,438,846,979,1830
54,Zach Theodore,69,1371,1105,165,48,117,86,211,347,...,27,0,13,0,0,--,0,327,1044,1223


### Team Stats

In [45]:
seasons= range(2012, 2022, 1)
team_root_page = 'https://theaudl.com/stats/team?year='
opp_root_page = 'https://theaudl.com/stats/team?opponent&year='

driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")

for i in seasons:
    i_team_page = team_root_page + str(i) # +1 because range() starts @ 0
    i_opp_page = opp_root_page + str(i)

    team_df = get_page_results(driver=driver, webpage=i_team_page)
    opp_df = get_page_results(driver=driver, webpage=i_opp_page)

    team_save_file = f".\\DATA\\{i}_TEAM_STATS.csv"
    opp_save_file = f".\\DATA\\{i}_OPPONENT_STATS.csv"

    if not os.path.isfile(team_save_file):
        team_df.to_csv(team_save_file, index=False)
    else:
        temp_df:pd.DataFrame = pd.read_csv(team_save_file)
        concat:pd.DataFrame = pd.concat([pd.read_csv(team_save_file), team_df])
        concat.drop_duplicates().to_csv(team_save_file, index=False)

    if not os.path.isfile(opp_save_file):
        team_df.to_csv(opp_save_file, index=False)
    else:
        temp_df:pd.DataFrame = pd.read_csv(opp_save_file)
        concat:pd.DataFrame = pd.concat([pd.read_csv(opp_save_file), opp_df])
        concat.drop_duplicates().to_csv(opp_save_file, index=False)

driver.close()

  driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")


In [35]:
opp_df

Unnamed: 0,Team,G,W ▼,L,S,SA,Opp C,Opp T,Opp CMP %,Opp H,Opp Huck %,Opp HLD %,Opp OLC %,Opp B,Opp BRK %,Opp DLC %,Opp RZC %
0,Empire,15,12,3,324,289,3882,248,94.0,92,57.5,68.5,57.11,115,16.46,40.0,78.97
1,Growlers,14,11,3,286,259,3629,288,92.65,90,55.56,65.38,49.51,127,19.5,39.57,75.57
2,Union,14,11,3,312,246,4088,267,93.87,85,59.86,60.77,48.13,111,14.23,40.86,73.04
3,Flyers,15,11,4,341,280,3930,287,93.19,86,54.78,62.47,50.65,122,15.61,40.52,79.43
4,Wind Chill,13,10,3,280,231,3102,275,91.86,82,58.16,60.07,47.23,111,20.63,37.96,73.6
5,Breeze,13,10,3,284,218,2961,258,91.99,80,54.05,57.81,45.79,104,18.33,42.72,82.14
6,Hustle,13,9,4,273,245,4053,240,94.41,86,57.72,62.8,49.86,109,22.22,49.18,81.35
7,Legion,13,8,5,261,241,2622,255,91.14,88,59.46,62.59,48.12,110,23.22,46.27,74.38
8,Radicals,12,7,5,257,222,3464,244,93.42,72,58.06,57.4,46.49,117,25.0,46.56,77.36
9,Royal,9,7,2,190,179,1801,187,90.59,75,79.79,67.65,52.67,94,21.24,36.61,82.54


### Team Game Stats

In [7]:
# Number of pages of stats per season
seasons = {
    2014: 13,
    2015: 19,
    2016: 20,
    2017: 18,
    2018: 18,
    2019: 14,
    2021: 14,
}

driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")

for season in seasons.keys():
    num_pages = seasons[season]
    season_save_file = f".\\DATA\\{season}SEASON_GAME_STATS.csv"

    for page in range(num_pages):

        stats_page = f'https://theaudl.com/stats/team-game-stats?page={page+1}&year={season}'
        season_games_df = get_page_results(driver=driver, webpage=stats_page)

        if not os.path.isfile(season_save_file):
            season_games_df.to_csv(season_save_file, index=False)
        else:
            temp_df:pd.DataFrame = pd.read_csv(season_save_file)
            concat:pd.DataFrame = pd.concat([pd.read_csv(season_save_file), season_games_df])
            concat.drop_duplicates().to_csv(season_save_file, index=False)



  driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")


### Individual Player Stats

In [97]:
# Iterate over pages of players
num_pages = 136
player_page_dict = {}

# Iterate over pages
for page in range(num_pages):

    # Navigate to webpage
    driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")
    driver.get(f"https://theaudl.com/league/players?page={page}")

    # Get table and list of rows
    table_el = driver.find_element(By.TAG_NAME,'table')
    row_el_list = table_el.find_elements(By.TAG_NAME, 'tr')

    # Iterate rows and copy player name and link to page to dictionary
    for row in row_el_list[1:]:
        player_name = row.find_elements(By.TAG_NAME, 'a')[0].text
        player_link = row.find_elements(By.TAG_NAME, 'a')[0].get_attribute('href')

        player_page_dict[player_name] = player_link

driver.close()

  driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")


In [98]:
# DATA SIZE
player_page_dict
print(f"TOTAL SIZE: {sys.getsizeof(player_page_dict)/1000}Kb")

TOTAL SIZE: 73.816Kb


In [99]:
# SAVE TO FILE
df = pd.DataFrame.from_dict(data=player_page_dict, orient='index', columns=['player_link'])
df.to_csv(".\\DATA\\20220304_player_page_links.csv")

In [101]:
# Get individual player stats per game
test_link = "https://theaudl.com/league/players/lpepper"



Unnamed: 0,player_link
Leo Pepper,https://theaudl.com/league/players/lpepper
Justin Allen,https://theaudl.com/league/players/jallen
Bryson Simon-Fox,https://theaudl.com/league/players/bsimonfox
Dennis Bechis,https://theaudl.com/league/players/dbechis
Kuo Hsun Wang,https://theaudl.com/league/players/khsunwang
Max Trotter,https://theaudl.com/league/players/mtrotter
Nate Goff,https://theaudl.com/league/players/ngoff
JD Hastings,https://theaudl.com/league/players/jhastings
Jonah Malenfant,https://theaudl.com/league/players/jmalenfan
Luke Ryan,https://theaudl.com/league/players/lryan


In [4]:
player_df = pd.read_csv(".\\DATA\\20220304_player_page_links.csv")
player_df['player_link'].sample(3)

1148     https://theaudl.com/league/players/jrovner
2139        https://theaudl.com/league/players/rojo
55      https://theaudl.com/league/players/aalarcon
Name: player_link, dtype: object

Albert Alarcon of the New York Empire has the designation of 'Cutter' on his player page. The following is a function created to get that current team position text

In [78]:
def get_position_from_player_page(page_link: str, driver):

    position_xpath = '//*[@class="audl-player-current-team-position"]'

    if not driver:
        driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")
        
    driver.get(page_link)
    time.sleep(1)

    # Get table and list of rows
    position_element = driver.find_element(By.XPATH, position_xpath)
    position_text = position_element.text

    if position_text.__contains__('/'):
        team, position = position_text.split(' / ')
        return position
    
    return None
    

In [57]:
# Go to player link pages and get position if available
test_links = player_df['player_link'].values
driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")
count = 0

for row in range(2280,len(test_links)):
    link = player_df.at[row, 'player_link']
    position = get_position_from_player_page(page_link=link, driver=driver)
    
    if position:
        count+= 1
        player_df.at[row, 'position'] = position

driver.close()

print(f"TOTAL COUNT: {player_df[]} ({round(100*count/2699,2)})%")

  driver = webdriver.Chrome(".\\webdriver\\chromedriver_98.exe")


TOTAL COUNT: 77 (2.85)%


In [81]:
player_df[~pd.isna(player_df['position'])].sample(10)
player_df.to_csv(".\\DATA\\20220304_player_positions.csv")

Unnamed: 0.1,Unnamed: 0,player_link,position
761,Derek Alexander,https://theaudl.com/league/players/dalexande,Handler
899,Ethan Falat,https://theaudl.com/league/players/efalat,Defender
2539,Travis Dunn,https://theaudl.com/league/players/tdunn,Cutter
1842,Michael Jordan,https://theaudl.com/league/players/mjordan,Cutter
774,Dillon Tranquillo,https://theaudl.com/league/players/dtranquil,Defender
1977,Nick Ladas,https://theaudl.com/league/players/nladas,Defender
1748,Matt LeMar,https://theaudl.com/league/players/mlemar,Defender
842,Elijah Long,https://theaudl.com/league/players/elong,Handler
1099,Jackson Stearns,https://theaudl.com/league/players/jstearns,Cutter
1809,Max Sheppard,https://theaudl.com/league/players/msheppard,Cutter


In [62]:
# player_df.to_csv(".\\DATA\\20220304_player_positions.csv")
positions = pd.read_csv(".\\DATA\\20220304_player_positions.csv")
positions['position'].value_counts()

Cutter      161
Handler     137
Defender    120
Hybrid       71
Name: position, dtype: int64