<a href="https://colab.research.google.com/github/colinrsmall/ehm_roster_tools/blob/master/Draft_Sheet_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install beautifulsoup4 --upgrade



# Instructions:

To scrape leagues, add or remove links from the following "leagues" form field. The field should be a comma-separated string of links to either leagues or drafts from EliteProspects.

To change which season you're scraping for, change the "season" field following the list of leagues. The string should be of the format 'YYYY-YY' (such as '2019-20' or '2017-18').

The "contract_expiry_prefix" field should be set to the day and month on which you expect players' contracts to expire in-game. For example, if you want all players scraped with this notebook to have their contracts expire on June 1st of the respective year, set the string to "1.6.XXXX".

If you want the scraper to print out links for players who are missing information on their EP page, change show_error_links to True.

If include_nhl_signed is ticked, players who are signed to an NHL team but are on loan (as listed in EP) will be listed as playing for the team that they are on loan to (and thus will not be listed as playing for their signed NHL team).

In [22]:
leagues = "https://www.eliteprospects.com/draft/cjhl-draft/2011" #@param {type:"string"}
# Turn the comma-separated form value into a list of links, tolerating spaces
# around the commas and dropping empty entries (e.g. from a trailing comma).
leagues = [link.strip() for link in leagues.split(',') if link.strip()]

# When True, Google Drive is mounted and the zipped output is moved onto it.
use_google_drive = False #@param {type:"boolean"}

In [23]:
# Always define the flag so it can be tested even when Drive is not used;
# it reflects whether this cell mounted Drive on its most recent run.
drive_mounted = False
if use_google_drive:
  from google.colab import drive
  # Mount Google Drive at /content/drive.
  drive.mount('/content/drive')
  drive_mounted = True

# Expand this if you want to look at the code (optional)

In [24]:
!mkdir '/content/leagues/'

mkdir: cannot create directory ‘/content/leagues/’: File exists


In [25]:
import requests, random, csv, traceback, time, urllib.request
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from datetime import datetime, date, timedelta
from google.colab import files
from dateutil.relativedelta import relativedelta

In [26]:
def get_name(player_page):
  """Split the player's header name into a first name and a last name.

  The name is read from the ``h1`` element with class ``ep-entity-header__name``.
  If it contains a period, everything up to and including the last period is the
  first name; otherwise the first space-separated word is. The last name is the
  remainder, cut at the first newline and stripped of surrounding whitespace.

  Args:
    player_page: Parsed player page exposing a BeautifulSoup-style ``find``.

  Returns:
    A ``(first_name, last_name)`` tuple of strings.
  """
  header = player_page.find('h1', class_="ep-entity-header__name")
  full_name = header.text.strip()

  last_dot = full_name.rfind('.')
  if last_dot != -1:
    # Names written with initials: split right after the final period.
    first_name, remainder = full_name[:last_dot + 1], full_name[last_dot + 1:]
  else:
    # Plain names: the first word is the first name, the rest the last name.
    first_name, _, remainder = full_name.partition(' ')

  last_name = remainder.split('\n')[0].strip()
  return first_name, last_name


def get_dob(player_page):
  """Return the player's date of birth formatted for the draft-history sheet.

  The value is read from the element that follows the "Date of Birth" label on
  the player page and may come in three forms:

    * ``Mar 05, 1993`` -> ``5.3.1993``  (day.month.year, no zero padding)
    * ``1993``         -> ``1993``
    * ``Mar, 1993``    -> ``03.1993``   (zero-padded month)

  Args:
    player_page: Parsed player page exposing a BeautifulSoup-style ``find``.

  Returns:
    The formatted date string, or ``""`` when the label is missing or the text
    matches none of the formats above (a message naming the player is printed
    in that case).
  """
  dob_search_text = """
                                        Date of Birth
                                    """
  # Accepted source formats, most specific first, each paired with its renderer.
  # Day and month are rendered through attribute access instead of the
  # platform-specific '%-d'/'%-m' strftime flags, which are not supported
  # everywhere (e.g. on Windows).
  dob_formats = (
      ('%b %d, %Y', lambda parsed: f'{parsed.day}.{parsed.month}.{parsed.year}'),
      ('%Y', lambda parsed: f'{parsed.year}'),
      ('%b, %Y', lambda parsed: f'{parsed.month:02d}.{parsed.year}'),
  )

  try:
    dob_text = player_page.find('div', text=dob_search_text).next_element.next_element.next_element.text.strip()
  except Exception:
    # Label not found or page layout differs: treat as missing information.
    dob_text = None

  if dob_text is not None:
    for pattern, render in dob_formats:
      try:
        return render(datetime.strptime(dob_text, pattern))
      except ValueError:
        # Not this format; try the next one instead of aborting the scrape.
        continue

  # Reporting must never raise: the name lookup can fail on the same broken page.
  try:
    first_name, last_name = get_name(player_page)
    player = f'{first_name} {last_name}'
  except Exception:
    player = '<unknown player>'
  print(f'Missing dob information: {player}')
  return ""


def get_player_page(link, timeout=30):
    """Download a page with browser-like headers and return it parsed.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before requests raises a
            timeout error, so a single stalled connection cannot hang the
            whole scrape.

    Returns:
        A BeautifulSoup document built from the response body.
    """
    player_page = requests.get(link, headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        # HTTP spells this header "Referer" (single r); "referrer" is not a
        # recognised request header.
        'Referer': 'https://google.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # Only advertise encodings requests can always decode; "br" requires an
        # optional brotli package, without which the body would arrive undecoded.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'Pragma': 'no-cache',
    }, timeout=timeout)
    return BeautifulSoup(player_page.content)

In [27]:
def scrape():
  """Scrape every link in ``leagues`` and write one draft-history CSV per page.

  For each link the page is downloaded and its title becomes the CSV file name
  under ``/content/leagues/``. One row is written per drafted player: draft
  name, year (last four characters of the link), round index, overall pick,
  club, first name, last name and date of birth. Each player's own page is
  fetched for the name and date of birth, with a random pause of up to three
  seconds after every player.

  Rows that contain no player link are skipped with a printed notice.
  """
  for league in tqdm(leagues, desc='Leagues'):
    # Links come from a comma-separated form field, so trim stray whitespace.
    home_page_link = league.strip()
    team = "team" in home_page_link

    print(home_page_link)
    # Fetch through get_player_page so this request uses the same browser-like
    # headers as the player-page requests.
    home_page = get_player_page(home_page_link)

    title_selector = ".semi-logo" if team else ".plytitle"
    page_name = home_page.select(title_selector)[0].text.strip()
    page_name = ' '.join(page_name.replace('\n', '').split()).strip()

    # The draft year is taken from the end of the link; ignore a trailing slash.
    draft_year = home_page_link.rstrip('/')[-4:]

    with open(f'/content/leagues/{page_name}_draft_history.csv', 'w+', newline='', encoding='UTF-8') as draft_history_file:
      # Leading byte-order mark, written explicitly ahead of the CSV rows.
      draft_history_file.write('\ufeff')
      draft_history_writer = csv.writer(draft_history_file, delimiter=',')
      top_header = 'draft_history, dmy'
      second_header = 'Draft, Year, Round, Overall, Club, First Name, Second Name, Date of Birth'
      draft_history_writer.writerow(top_header.split(','))
      draft_history_writer.writerow(second_header.split(','))

      draft_round_tables = home_page.select('[data-sort-ajax-container="#drafted-players"] tbody')

      # round_number is the zero-based position of the table on the page.
      # NOTE(review): confirm the sheet does not expect rounds to start at 1.
      for round_number, table in enumerate(draft_round_tables):
        for pick in table.select("tr:not(.title)"):

          player_links = pick.select(".player a")
          if not player_links:
            # No player page to look up; skip the row instead of failing the
            # whole run with an IndexError.
            print(f'Skipping pick without a player link: {page_name}, round index {round_number}')
            continue

          player_page = get_player_page(player_links[0]["href"])
          first_name, last_name = get_name(player_page)
          dob = get_dob(player_page)

          overall = pick.select(".overall.sorted")[0].text.replace("#","").strip()
          club = pick.select(".team")[0].text.strip()

          draft_history_writer.writerow([page_name, draft_year, round_number, overall, club, first_name, last_name, dob])

          # Random pause between player requests to avoid hammering the site.
          time.sleep(random.random() * 3)

# Output

In [28]:
# Run the scraper for every link in `leagues`; CSV files are written under /content/leagues/.
scrape()

Leagues:   0%|          | 0/1 [00:00<?, ?it/s]

https://www.eliteprospects.com/draft/cjhl-draft/2011


In [29]:
# Pause for five seconds before archiving the output directory.
time.sleep(5)
# Recursively add everything under /content/leagues/ to leagues.zip.
!zip leagues.zip -r '/content/leagues/'

if use_google_drive:
  # Move the archive onto Google Drive (expects Drive mounted at /content/drive).
  !mv leagues.zip /content/drive/MyDrive/leagues.zip

  adding: content/leagues/ (stored 0%)
  adding: content/leagues/2011 CJHL Draft_draft_history.csv (deflated 68%)
