In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import os
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import itertools
import time
import re
import logging
import json
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

In [2]:
class Parser(object):
    def __init__(self, driver):
        self.driver = driver

    def _get_element(self, e):
        xpath = self.xpath_soup(e)
        return self.driver.find_element_by_xpath(xpath)
    
    def xpath_soup(self, element):
        components = []
        child = element if element.name else element.parent
        for parent in child.parents:
            """
            @type parent: bs4.element.Tag
            """
            previous = itertools.islice(parent.children, 0, parent.contents.index(child))
            xpath_tag = child.name
            xpath_index = sum(1 for i in previous if i.name == xpath_tag) + 1
            components.append(xpath_tag if xpath_index == 1 else '%s[%d]' % (xpath_tag, xpath_index))
            child = parent
        components.reverse()
        return '/%s' % '/'.join(components)
    
    @property
    def soup(self):
        return BeautifulSoup(self.driver.page_source, "html.parser")


    

In [3]:
def create_driver():
    options = FirefoxOptions()
    driver = webdriver.Firefox(
        service_log_path=os.path.devnull,
        options=options,
        executable_path=os.path.join(
            "golfgenius", "drivers", "firefox", "0.28", "geckodriver"))
    driver.set_window_size(1920, 1080)
    return driver

def sign_in(parser, ggid="nzxmej"):
    parser.driver.get("https://www.golfgenius.com/golfgenius")
    time.sleep(2)
    login_button = parser._get_element(parser.soup.find('a', text='SIGN IN'))
    login_button.click()
    time.sleep(1)
    ggid_input = parser._get_element(parser.soup.find('input', {u"placeholder": u"Enter Your GGID", u"type": u"text"}))
    ggid_input.clear()
    ggid_input.send_keys(ggid)
    sign_in_button = parser._get_element(parser.soup.find('input', type="submit", value="Sign In"))
    sign_in_button.click()
    time.sleep(5)
    sign_in_button2 = parser._get_element(parser.soup.find('input', type="submit", value="Sign In"))
    sign_in_button2.click()

In [4]:
d = Parser(create_driver())

In [29]:
results_page = "https://www.golfgenius.com/leagues/7021866105153037134/widgets/tournament_results"
d.driver.get(results_page)
regex = re.compile('\/v2tournaments\/(\d+)')
links = []
for o in d.soup.find(id='round').find_all('option'):
    d._get_element(o).click()
    WebDriverWait(d.driver, 15).until(
        EC.visibility_of_element_located(
        (By.XPATH, "//a[@class='expand-tournament']")))
    for l in d.soup.find_all('a', {"class": "expand-tournament", "data-tournament-spec-id": True}):
        eid = regex.search(l.attrs["href"]).group(1)
        links.append("https://www.golfgenius.com/tournaments2/details?adjusting=false&event_id=%s" % eid)

results = []
for l in links:
    d.driver.get(l)
    WebDriverWait(d.driver, 15).until(
            EC.visibility_of_element_located(
                (By.XPATH, "//table[@class='scorecard']")))
    table = d.soup.find('table', {"class": "scorecard"})
    teams = [[x.strip() for x in tr.attrs["data-aggregate-name"].split("+")] for tr in table.find_all("tr", {"class": "aggregate_score", "data-aggregate-name": True})]
    results.append({"table": table, "teams": teams})


In [31]:
len(results)

92

In [33]:
results[0]['table']

<table class="scorecard" name="Regular">
<tbody><tr class="header_row" style="">
<th colspan="100">
Strokes
</th>
</tr>
<tr class="header_row tee_header_row" style="">
<td style=""></td>
<td class=""> 1</td>
<td class=""> 2</td>
<td class=""> 3</td>
<td class=""> 4</td>
<td class=""> 5</td>
<td class=""> 6</td>
<td class=""> 7</td>
<td class=""> 8</td>
<td class=""> 9</td>
<td style="">Out</td>
<td class="" style="text-align: center;">
10
</td>
<td class="" style="text-align: center;">
11
</td>
<td class="" style="text-align: center;">
12
</td>
<td class="" style="text-align: center;">
13
</td>
<td class="" style="text-align: center;">
14
</td>
<td class="" style="text-align: center;">
15
</td>
<td class="" style="text-align: center;">
16
</td>
<td class="" style="text-align: center;">
17
</td>
<td class="" style="text-align: center;">
18
</td>
<td style="">In</td>
<td style="">
Total
</td>
</tr>
<tr class="net-line" data-net-name="Alford, Sumner">
<td class="name left_aligned">
<a cla

In [36]:
d.driver.get("https://www.golfgenius.com/tournaments2/details?adjusting=false&event_id=7319983698856682065")
d.soup.find('table', {"class": "scorecard"})

<table class="scorecard" name="Regular">
<tbody><tr class="header_row" style="">
<th colspan="100">
Strokes
</th>
</tr>
<tr class="header_row tee_header_row" style="">
<td style=""></td>
<td class=""> 1</td>
<td class=""> 2</td>
<td class=""> 3</td>
<td class=""> 4</td>
<td class=""> 5</td>
<td class=""> 6</td>
<td class=""> 7</td>
<td class=""> 8</td>
<td class=""> 9</td>
<td style="">Out</td>
<td class="" style="text-align: center;">
10
</td>
<td class="" style="text-align: center;">
11
</td>
<td class="" style="text-align: center;">
12
</td>
<td class="" style="text-align: center;">
13
</td>
<td class="" style="text-align: center;">
14
</td>
<td class="" style="text-align: center;">
15
</td>
<td class="" style="text-align: center;">
16
</td>
<td class="" style="text-align: center;">
17
</td>
<td class="" style="text-align: center;">
18
</td>
<td style="">In</td>
<td style="">
Total
</td>
</tr>
<tr class="net-line" data-net-name="Alford, Sumner">
<td class="name left_aligned">
<a cla

In [6]:

    
def get_teams_and_tables(option, element):
    results = {}
    round_name, round_id = option.text.strip(), option.attrs["value"]
    results['round_name'] = round_name
    results['round_id'] = round_id
    results['tournaments'] = []
    element.click()
    tournament_links = ["https://www.golfgenius.com" + t.attrs["href"] for t in d.soup.find_all('a', {"class": "expand-tournament", "data-tournament-spec-id": True})]
    for link in tournament_links:
        d.driver.get(link)
        WebDriverWait(
            d.driver, 10).until(EC.visibility_of_element_located(
            (By.XPATH, "//a[@class='expand-all']")))
        d._get_element(d.soup.find('a', {"class": "expand-all"})).click()
        WebDriverWait(d.driver, 15).until(
            EC.visibility_of_element_located(
                (By.XPATH, "//table[@class='scorecard']")))
        table = d.soup.find('table', {"class": "scorecard"})
        teams = [[x.strip() for x in tr.attrs["data-aggregate-name"].split("+")] for tr in table.find_all("tr", {"class": "aggregate_score", "data-aggregate-name": True})]
        results['tournaments'].append({"teams": teams, "table": table})
    return results
    
    #tournaments = d.soup.find_all('a', {"class": "expand-tournament", "data-tournament-spec-id": True})
    #for tournament in tournaments:
    #    print("https://www.golfgenius.com" + tournament.attrs["href"])
    #    t_id = tournament.attrs["data-tournament-event-id"]
    #    d._get_element(tournament).click()
    #    WebDriverWait(
    #        d.driver, 10).until(EC.visibility_of_element_located(
    #            (By.XPATH, "//a[@class='expand-all']")))
    #    d._get_element(d.soup.find('a', {"class": "expand-all"})).click()
    #    WebDriverWait(d.driver, 15).until(
    #        EC.visibility_of_element_located(
    #            (By.XPATH, "//table[@class='scorecard']")))
    #    table = d.soup.find('table', {"class": "scorecard"})
    #    teams = [[x.strip() for x in tr.attrs["data-aggregate-name"].split("+")] for tr in table.find_all("tr", {"class": "aggregate_score", "data-aggregate-name": True})]
    #    results['tournaments'].append({"teams": teams, "table": table})
    #    d._get_element(d.soup.find('a', {"class": "collapse-all", "data-event-id": t_id})).click()
    #    WebDriverWait(
    #        d.driver, 10).until(EC.element_to_be_clickable(
    #            (By.XPATH, "//a[@class='expand-all']")))
    #    d._get_element(tournament).click()
    #    time.sleep(1.5)
    #return results
        


In [7]:
def get_rounds(d):
    results_link = d.soup.find('a', text=re.compile(r"\s*Results\s*"))
    d._get_element(results_link).click()
    time.sleep(2)
    d.soup.find('iframe', {"name": "page_iframe"}).attrs["src"]
    d.driver.switch_to.frame("page_iframe")
    round_options = d.soup.find(id='round').find_all('option')
    items = []
    for o in round_options:
        items.append((o, d._get_element(o)))
    
    results = []
    for option, element in items:
        print(option.text.strip(), option.attrs)
        results.append(get_teams_and_tables(option, element))
    return results

In [8]:
results = get_rounds(d)

Round 26 (Sun, April 11) {'selected': 'selected', 'value': '7319983582422803361'}
Round 25 (Fri, April  9) {'value': '7314458308088210270'}


StaleElementReferenceException: Message: The element reference of <option> is stale; either the element is no longer attached to the DOM, it is not in the current frame context, or the document has been refreshed


In [26]:
d.driver.get(links[0])
WebDriverWait(d.driver, 15).until(
            EC.visibility_of_element_located(
                (By.XPATH, "//table[@class='scorecard']")))
table = d.soup.find('table', {"class": "scorecard"})
teams = [[x.strip() for x in tr.attrs["data-aggregate-name"].split("+")] for tr in table.find_all("tr", {"class": "aggregate_score", "data-aggregate-name": True})]
        

In [28]:
teams

[['Alford, Sumner', 'Fish, Tony', 'Welton, Craig', 'Zogby, Kevin'],
 ['Smith, Brian', 'Samuel, Matthew', 'Beaird, Ray', 'Stefanacci, Michael'],
 ['Perry, Robbie', 'Whartenby, Robert', 'Capwell, Robert', 'Shoffner, Chris']]

In [21]:
import re
regex = re.compile('\/v2tournaments\/(\d+)')
regex.search(links[0]).group(1)


'7319983677415400015'

In [21]:
d.soup.find_all('a', {"class": "expand-tournament", "data-tournament-spec-id": True})

[<a class="expand-tournament" data-remote="true" data-tournament-event-id="7319983677415400015" data-tournament-spec-id="7319983601280394076" href="/v2tournaments/7319983677415400015?round_index=26">Two Best Ball Front
 </a>,
 <a class="expand-tournament" data-remote="true" data-tournament-event-id="7319983694024843856" data-tournament-spec-id="7319983618695144285" href="/v2tournaments/7319983694024843856?round_index=26">Two Best Ball Back
 </a>,
 <a class="expand-tournament" data-remote="true" data-tournament-event-id="7319983698856682065" data-tournament-spec-id="7319983625573802846" href="/v2tournaments/7319983698856682065?round_index=26">Two Best Ball Overall
 </a>,
 <a class="expand-tournament" data-remote="true" data-tournament-event-id="7319983699225780818" data-tournament-spec-id="7319983632687342431" href="/v2tournaments/7319983699225780818?round_index=26">Individual Skins
 </a>]

In [22]:
d.soup.find(id='round').find_all('option')

[<option selected="selected" value="7319983582422803361">Round 26 (Sun, April 11)</option>,
 <option value="7314458308088210270">Round 25 (Fri, April  9)</option>,
 <option value="7294138215475225239">Round 24 (Fri, April  2)</option>,
 <option value="7285571483303191080">Round 23 (Tue, March 30)</option>,
 <option value="7273395674543072110">Round 22 (Fri, March 26)</option>,
 <option value="7268048562833892177">Round 21 (Wed, March 24)</option>,
 <option value="7224730243670516250">Round 20 (Fri, March 12)</option>,
 <option value="7224730084958052887">Round 17 (Tue, March  9)</option>,
 <option value="7218556708849755121">Round 16 (Sun, March  7)</option>,
 <option value="7210116988792775655">Round 15 (Fri, March  5)</option>,
 <option value="7210116900846609382">Round 14 (Thu, March  4)</option>,
 <option value="7198509549899260679">Round 13 (Sun, February 28)</option>,
 <option value="7186573493494915771">Round 12 (Thu, February 25)</option>,
 <option value="7186573446753591994">R