In [12]:
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from deltalake.writer import write_deltalake
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

from selenium.webdriver.chrome.options import Options

# Chrome launch options. Flags that are currently switched off are kept
# underneath, commented out, so they can be re-enabled quickly.
options = Options()
options.add_argument("--disable-extensions")
options.add_argument("--disable-dev-shm-usage")
#options.add_argument("start-maximized")
#options.add_argument("disable-infobars")
#options.add_argument('--headless')
#options.add_argument("--no-sandbox")

# chromedriver binary shipped with the chromium snap; every
# webdriver.Chrome(...) call in this notebook is started through this service.
service = Service(executable_path="/snap/bin/chromium.chromedriver")


In [3]:
def get_ladder(driver, season: int = 2022, round: int = 1, comp: int = 111) -> pd.DataFrame:
    '''
    Scrape the nrl.com ladder as at the end of the specified round.

    driver: Selenium WebDriver used to load the page (it is not closed here)
    season: starting year of competition
    round: round of competition
    comp: competition (111 = NRL)

    Returns a DataFrame with the columns
    pos, team, played, points, wins, drawn, lost, byes, for, against, diff.

    Raises selenium's TimeoutException if the ladder container has not
    appeared in the page within 10 seconds.
    '''
    url = f'https://www.nrl.com/ladder/?competition={comp}&round={round}&season={season}'
    driver.get(url)
    # Wait for the ladder container explicitly rather than assuming it is
    # already in the DOM the moment get() returns (same approach as the
    # WebDriverWait used in get_round_games).
    table_elem = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "ladder-page__ladder-inner"))
    )
    ladder_html = table_elem.get_attribute('innerHTML')
    # pandas >= 2.1 deprecates passing a literal HTML string to read_html, so
    # hand it a file-like object instead. read_html returns a list of tables;
    # only the ladder container is passed in, so the 0-th entry is the ladder.
    ladder_df = pd.read_html(StringIO(ladder_html))[0]
    # Drop the columns that are not kept, then label the remaining ones
    # positionally with lower-case names.
    ladder_df_slim = ladder_df.drop(['Pos', 'Team', 'Next', 'home', 'away', 'form'], axis=1)
    ladder_df_slim.columns = ['pos', 'team', 'played', 'points', 'wins', 'drawn', 'lost',
                              'byes', 'for', 'against', 'diff.']
    return ladder_df_slim


In [4]:
# Smoke test: scrape the ladder with get_ladder's defaults (season 2022, round 1).
# `options` is passed so the Chrome flags configured at the top of the notebook
# actually take effect.
driver = webdriver.Chrome(service=service, options=options)
try:
    ldf = get_ladder(driver)
finally:
    # Always shut the browser down, even when the scrape raises, so no
    # orphaned Chrome/chromedriver processes are left behind.
    driver.quit()


In [5]:
# Display the ladder scraped in the previous cell.
ldf

Unnamed: 0,pos,team,played,points,wins,drawn,lost,byes,for,against,diff.
0,1,Panthers,1,2,1,0,0,0,28,6,22
1,2,Knights,1,2,1,0,0,0,20,6,14
2,3,Dragons,1,2,1,0,0,0,28,16,12
3,4,Storm,1,2,1,0,0,0,26,16,10
4,5,Broncos,1,2,1,0,0,0,11,4,7
5,6,Raiders,1,2,1,0,0,0,24,19,5
6,7,Eels,1,2,1,0,0,0,32,28,4
7,8,Bulldogs,1,2,1,0,0,0,6,4,2
8,9,Cowboys,1,0,0,0,1,0,4,6,-2
9,10,Titans,1,0,0,0,1,0,28,32,-4


In [7]:
def get_round_list(driver, season: int = 2022, comp: int = 111) -> pd.DataFrame:
    '''
    Scrape the entries of the round filter on the nrl.com draw page for a season.

    driver: Selenium WebDriver used to load the page (it is not closed here)
    season: starting year of competition
    comp: competition (111 = NRL)

    Returns a DataFrame with one row per filter entry and the columns
    round_name (label read from the page), season, and round_num (the number
    parsed from "Round N" labels; missing for any other label).

    Raises selenium's TimeoutException if no round entries have appeared in
    the page within 10 seconds.
    '''
    url = f"https://www.nrl.com/draw/?competition={comp}&season={season}"
    driver.get(url)
    # Explicit wait instead of driver.implicitly_wait(): the implicit timeout
    # is a driver-wide setting that would leak into every later lookup made
    # with this driver. until() returns as soon as the list is non-empty.
    rounds = WebDriverWait(driver, 10).until(
        lambda d: d.find_elements(by=By.CLASS_NAME, value="filter-dropdown-item--round")
    )
    # The label is taken from the second line of the inner div's HTML.
    round_names = [
        item.find_element(by=By.TAG_NAME, value='div').get_attribute('innerHTML').splitlines()[1].strip()
        for item in rounds
    ]
    round_list_df = pd.DataFrame({'round_name': round_names})
    round_list_df = round_list_df.assign(season=season)
    # Regex extraction yields NaN for labels that are not "Round N" instead of
    # raising when nothing matches; to_numeric gives an integer column when
    # every label matched and a float column (with NaN) otherwise.
    round_num = pd.to_numeric(round_list_df.round_name.str.extract(r'Round (\d+)', expand=False))
    round_list_df = round_list_df.assign(round_num=round_num)
    return round_list_df


def get_round_games(driver, season: int = 2022, round: int = 1, comp: int = 111) -> pd.DataFrame:
    '''
    Scrape the games listed on the nrl.com draw page for one round.

    driver: Selenium WebDriver used to load the page (it is not closed here)
    season: starting year of competition
    round: round of competition
    comp: competition (111 = NRL)

    Returns a DataFrame with one row per game and the columns
    season, round, venue, date, home_team, home_team_points, away_team,
    away_team_points. The first team listed for a game is recorded as the
    home team and the second as the away team. Points are the text read from
    the page, or None when the score element has no second line (presumably
    a game that has not been played yet). A round with no games gives an
    empty frame with the same columns.

    Raises selenium's TimeoutException if no "l-grid" elements have appeared
    in the page within 10 seconds.
    '''
    columns = ['season', 'round', 'venue', 'date',
               'home_team', 'home_team_points', 'away_team', 'away_team_points']
    wait = WebDriverWait(driver, 10)
    url = f"https://www.nrl.com/draw/?competition={comp}&round={round}&season={season}"
    driver.get(url)

    games = wait.until(lambda d: d.find_elements(by=By.CLASS_NAME, value="l-grid"))

    games_list = []
    # 0-th element of the list is the 'draw' text from the top of the page, so
    # skip it. The last two matches are skipped as well -- presumably they are
    # not games either; TODO confirm against the live page.
    for g in games[1:-2]:
        game_dict = {
            'season': season,
            'round': round,
            'venue': g.find_element(by=By.CLASS_NAME, value="match-venue").text.splitlines()[1],
            'date': g.find_element(by=By.CLASS_NAME, value="match-header__title").text,
        }
        teams = g.find_elements(by=By.CLASS_NAME, value='match-team')
        # Write the final column names directly (first team = home, second =
        # away) instead of renaming the columns positionally afterwards.
        for side, team in zip(('home_team', 'away_team'), teams):
            _label, team_name = team.find_element(by=By.CLASS_NAME, value="match-team__info").text.splitlines()[:2]
            try:
                points = team.find_element(by=By.CLASS_NAME, value="match-team__score").text.splitlines()[1]
            except IndexError:
                # The score element has no second line of text.
                points = None
            game_dict[side] = team_name
            game_dict[f'{side}_points'] = points
        games_list.append(game_dict)
    # Explicit columns keep the schema stable even when games_list is empty.
    games_df = pd.DataFrame(games_list, columns=columns)
    # The page's date text carries no year, so append the season before parsing.
    games_df = games_df.assign(date=pd.to_datetime(games_df.date + ' ' + games_df.season.astype(str)).dt.date)
    return games_df

In [10]:
# Collect the round list of every season from 2018 to 2023 with one browser session.
# `options` is passed so the Chrome flags configured at the top of the notebook
# actually take effect.
driver = webdriver.Chrome(service=service, options=options)
seasons = range(2018, 2024)
seasons_dfs = []
try:
    for s in seasons:
        seasons_dfs.append(get_round_list(driver, season=s))
finally:
    # Close the browser even if one of the seasons fails to scrape.
    driver.quit()
all_rounds = pd.concat(seasons_dfs)

In [14]:
# Scrape the games of every numbered round ("Round N" entries only) in all_rounds.
# `options` is passed so the Chrome flags configured at the top of the notebook
# actually take effect.
driver = webdriver.Chrome(service=service, options=options)
round_dfs = []
ladder_dfs = []
try:
    for y, r in tqdm(all_rounds.loc[all_rounds.round_name.str.contains("Round"), ['season', 'round_num']].values):
        # .values yields numpy scalars, so cast back to plain ints for the URL.
        round_dfs.append(get_round_games(driver, season=int(y), round=int(r)))
finally:
    # The loop runs for minutes; make sure a mid-run failure still closes the browser.
    driver.quit()

100%|██████████| 147/147 [01:58<00:00,  1.24it/s]


In [16]:
# Stack the per-round frames into a single table of all scraped games.
results = pd.concat(round_dfs)
# Delta Lake output is disabled: per the author's note, the current delta Python
# bindings don't support pandas datetime64[ns], and the only easy workaround
# found was converting the dates to strings.
#write_deltalake("./data/game_results.delta", results)

# Persist the raw results as Parquet instead.
results.to_parquet('../data/raw/game_results.parquet')

In [18]:
# Spot-check ten randomly chosen rows of the combined results.
results.sample(10)

Unnamed: 0,season,round,venue,date,home_team,home_team_points,away_team,away_team_points
6,2023,4,"McDonald Jones Stadium, Newcastle",2023-03-26,Knights,,Raiders,
0,2022,16,"4 Pines Park, Sydney",2022-06-30,Sea Eagles,36.0,Storm,30.0
4,2018,12,"Mt Smart Stadium, Auckland",2018-05-26,Warriors,10.0,Rabbitohs,30.0
5,2019,3,"Carrington Park, Bathurst",2019-03-30,Panthers,2.0,Storm,32.0
1,2018,16,"Mt Smart Stadium, Auckland",2018-06-29,Warriors,15.0,Sharks,18.0
0,2023,13,"Kayo Stadium, Redcliffe",2023-05-25,Dolphins,,Dragons,
6,2018,22,"GIO Stadium, Canberra",2018-08-12,Raiders,20.0,Wests Tigers,22.0
2,2021,12,"CommBank Stadium, Sydney",2021-05-28,Wests Tigers,34.0,Dragons,18.0
7,2022,21,"Campbelltown Sports Stadium, Sydney",2022-08-07,Wests Tigers,10.0,Knights,14.0
3,2023,10,"Suncorp Stadium, Brisbane",2023-05-06,Sharks,,Dolphins,
