<a href="https://colab.research.google.com/github/colinrsmall/ehm_roster_tools/blob/master/EP_Roster_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORTANT DO NOT SKIP

If the below cell prints out in red text "YOU MUST RESTART YOUR RUNTIME", click on the Runtime drop down menu and click "Restart runtime" before re-running the scraper

In [1]:
# Upgrade beautifulsoup4 to the newest release pip can find; this runs before the
# cell further down that imports bs4.
!pip install beautifulsoup4 --upgrade

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: beautifulsoup4
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.11.2
    Uninstalling beautifulsoup4-4.11.2:
      Successfully uninstalled beautifulsoup4-4.11.2
Successfully installed beautifulsoup4-4.12.2


# Instructions:

To scrape leagues, add or remove links from the following "leagues" form field. The field should be a comma-separated string of links to either leagues or drafts from EliteProspects.

To change which season you're scraping for, change the "season" field following the list of leagues. The string should be of the format 'YYYY-YY' (such as '2019-20' or '2017-18').

The "contract_expiry_prefix" field should be set to the day and month on which you expect players' contracts to expire in-game. For example, if you want all players scraped with this notebook to have their contracts expire on June 1st of a respective year, set the string to "1.6.XXXX".

If you want the scraper to print out links for players who are missing information on their EP page, change show_error_links to True.

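The "nhl_contracts" dropdown controls how players who are signed to an NHL team are handled. If it is set to "Keep current team as playing", players who are signed to an NHL team but are on loan (as listed in EP) will be listed as playing for the team that they are on loan to (and thus will not be listed as playing for their signed NHL team). If it is set to "Include and set NHL team to playing", those players are listed as playing for their signed NHL team instead.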

In [2]:
# Comma-separated EliteProspects links to scrape.
leagues = "https://www.eliteprospects.com/team/803/univ.-of-michigan" #@param {type:"string"}
# Split into individual links. Surrounding whitespace and empty entries (for example
# from "a, b" or a trailing comma) are dropped so they cannot end up inside a URL.
leagues = [link.strip() for link in leagues.split(',') if link.strip()]

# Season in 'YYYY-YY' form, e.g. '2019-20'.
season = "2023-24" #@param {type:"string"}
# Day and month of contract expiry; the trailing four characters are replaced by a year.
contract_expiry_prefix = "30.4.XXXX" #@param {type:"string"}
show_error_links = True #@param {type:"boolean"}
make_junior_contracts_to_age_20 = True #@param {type:"boolean"}
scrape_international_games = True #@param {type:"boolean"}
skip_players_with_blank_dobs = True #@param {type:"boolean"}
use_google_drive = False #@param {type:"boolean"}
calculate_remaining_eligible_years = True #@param {type:"boolean"}
override_contract_for_nhl_prospects = True #@param {type:"boolean"}
# League name(s) whose seasons count against a player's remaining eligibility.
leagues_for_eligibility = "NCAA" #@param {type:"string"}
nhl_contracts = 'Include and set NHL team to playing' #@param ["Skip players with NHL contracts", "Include and set NHL team to playing", "Keep current team as playing"]


In [3]:
# Mount Google Drive at /content/drive, but only when the use_google_drive flag is set.
if use_google_drive:
  # Imported here so the drive helper is only loaded on this path.
  from google.colab import drive
  drive.mount('/content/drive')
  # NOTE(review): drive_mounted is only assigned when use_google_drive is True —
  # confirm that no later cell reads it on the False path.
  drive_mounted = True

# Expand this if you want to look at the code (optional)

In [4]:
!mkdir '/content/leagues/'

In [5]:
import requests, random, csv, traceback, time, urllib.request
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from datetime import datetime, date, timedelta
from google.colab import files
from dateutil.relativedelta import relativedelta

In [16]:
def get_full_team_name(team_link):
  """Download a team page and return the first line of its ".semi-logo" header.

  Args:
    team_link: URL of the team page to fetch.

  Returns:
    The stripped header text up to (not including) its first newline.

  Raises:
    IndexError: if the page contains no element matching ".semi-logo".
  """
  response = requests.get(team_link)
  soup = BeautifulSoup(response.content)
  header_text = soup.select(".semi-logo")[0].text.strip()
  # Only the first line of the header is the team name; anything after it is dropped.
  team_name, _, _ = header_text.partition('\n')
  return team_name

def correct_nation_text(nation_text):
  """Normalise a scraped nation label.

  The comparison ignores whitespace because get_nations() removes all whitespace
  from the scraped text before calling this function, so "Northern Ireland" arrives
  as "NorthernIreland" (and "Czech Rep." as "CzechRep.").

  Args:
    nation_text: nation label as scraped.

  Returns:
    "Great Britain" for the U.K. home nations, "Czech Republic" for "CzechRep.",
    otherwise nation_text unchanged.
  """
  if not isinstance(nation_text, str):
    return nation_text

  compact = "".join(nation_text.split())
  if compact in ("U.K.", "England", "Wales", "NorthernIreland"):
    return "Great Britain"
  if compact == "CzechRep.":
    return "Czech Republic"
  return nation_text

def get_name(player_page):
  """Split the player's header name into a first-name part and a last-name part.

  If the name contains a period, everything up to and including the last period is
  the first name (e.g. initials); otherwise the first space-separated word is.
  The last name is cut at its first newline and stripped.

  Args:
    player_page: parsed player page exposing find('h1', class_=...).

  Returns:
    Tuple (first_name, last_name).
  """
  header = player_page.find('h1', class_="ep-entity-header__name")
  full_name = header.text.strip()

  last_dot = full_name.rfind('.')
  if last_dot != -1:
    first_name = full_name[:last_dot + 1]
    remainder = full_name[last_dot + 1:]
  else:
    first_name, _, remainder = full_name.partition(' ')

  last_name = remainder.split('\n')[0].strip()
  return first_name, last_name


def find_season(tag):
  """Tag filter: True for a tag whose text contains the configured season and whose
  class list is exactly ['season', 'sorted']."""
  if season not in tag.text:
    return False
  if not tag.has_attr('class'):
    return False
  return tag['class'] == ['season', 'sorted']


def get_league(player_page):
  """Return the league text from the stats row of the configured season.

  The league is read four siblings after the tag matched by find_season.

  Args:
    player_page: parsed player page.

  Returns:
    The stripped league text, or "" (after printing a notice with the player's
    name) when the season row or the league cell cannot be reached.
  """
  try:
    season_cell = player_page.find(find_season)
    return season_cell.next_sibling.next_sibling.next_sibling.next_sibling.text.strip()
  except Exception:
    # Best effort: a player without a row for this season is reported, not fatal.
    first_name, last_name = get_name(player_page)
    print(f'Player not found in league: {first_name} {last_name}')
    return ""


def get_team_playing(player_page):
  """Return the team text from the stats row of the configured season.

  The team is looked for 4, then 6, then 8 elements after the tag matched by
  find_season; the first non-empty text wins. If the text contains a “A” or “C”
  marker its last four characters are removed.

  Args:
    player_page: parsed player page.

  Returns:
    The team text, or "" (after printing the last error and a notice with the
    player's name) when none of the three offsets yields a non-empty text.
  """
  last_error = None
  for hops in (4, 6, 8):
    try:
      node = player_page.find(find_season)
      for _ in range(hops):
        node = node.next_element
      team = node.text.strip()
      if team == "":
        raise Exception()
    except Exception as error:
      # Remember the failure and retry at the next offset.
      last_error = error
      continue

    if '“A”' in team or '“C”' in team:
      team = team[:-4]
    return team

  print(last_error)
  first_name, last_name = get_name(player_page)
  print(f'Missing team information: {first_name} {last_name}')
  return ""


def get_dob(player_page):
  """Return the player's date of birth as 'D.M.YYYY' (no zero padding).

  Partial dates are accepted: a bare year becomes '1.01.YYYYmissing' and a
  'Mon, YYYY' value becomes '1.MM.YYYYmissing'.

  Args:
    player_page: parsed player page.

  Returns:
    The formatted date, or "" (after printing a notice with the player's name)
    when the field is absent or its text matches none of the known formats.
  """
  dob_search_text = """
                                        Date of Birth
                                    """
  try:
    dob_text = player_page.find('div', text=dob_search_text).next_element.next_element.next_element.text.strip()
  except Exception:
    dob_text = None

  if dob_text is not None:
    try:
      parsed = datetime.strptime(dob_text, '%b %d, %Y')
      # Built by hand because the non-padded '%-d'/'%-m' directives are platform-specific.
      return f"{parsed.day}.{parsed.month}.{parsed.year}"
    except ValueError:
      pass

    # Partial dates are tagged with "missing" so callers can tell them apart.
    for partial_format, output_format in (('%Y', '1.01.%Y'), ('%b, %Y', '1.%m.%Y')):
      try:
        return datetime.strptime(dob_text, partial_format).strftime(output_format) + "missing"
      except ValueError:
        continue

  # Unparseable text used to escape as a ValueError from inside the error handler;
  # it is now reported the same way as an absent field.
  first_name, last_name = get_name(player_page)
  print(f'Missing dob information: {first_name} {last_name}')
  return ""


def get_birthplace(player_page):
  """Return the player's place of birth, lower-cased with ', ' replaced by ':'.

  Args:
    player_page: parsed player page.

  Raises:
    AttributeError: if the "Place of Birth" field is not present on the page.
  """
  place_of_birth_text = """
                                        Place of Birth
                                    """
  label_div = player_page.find('div', text=place_of_birth_text)
  value_node = label_div.next_element.next_element.next_element
  raw_place = value_node.text.strip()
  return raw_place.replace(', ', ':').lower()


def get_nations(player_page):
  """Return (primary_nation, secondary_nation, declared_nation) for a player.

  The "Nation" field is split on '/'. With two or more entries, the first two are
  returned and the declared nation is 'Primary Nationality'; with one entry the
  secondary and declared nations are both '[None]'.

  Args:
    player_page: parsed player page.

  Raises:
    AttributeError: if the "Nation" field is not present on the page.
  """
  nation_label = """
                                        Nation
                                    """
  raw_text = player_page.find('div', text=nation_label).next_element.next_element.next_element.text
  # NOTE(review): removing all whitespace also glues multi-word nations together
  # (e.g. "South Korea" becomes "SouthKorea") — confirm that is intended.
  nations = "".join(raw_text.split()).split('/')

  primary_nation = correct_nation_text(nations[0])
  if len(nations) > 1:
    return primary_nation, correct_nation_text(nations[1]), 'Primary Nationality'
  return primary_nation, '[None]', '[None]'


# def calculate_chl_eligibility(dob):

def get_years_remaining(player_page, leagues, remaining_years = 4):
  """Count down remaining eligibility: one year per stats row in an eligible league.

  Every matching row counts, regardless of games played.

  Args:
    player_page: parsed player page; rows come from "#league-stats tr".
    leagues: eligible league names, either a comma-separated string
      (e.g. "NCAA, USports") or an iterable of names.
    remaining_years: starting number of eligible years.

  Returns:
    remaining_years minus the number of matching rows (may go below zero).
  """
  if isinstance(leagues, str):
    candidates = leagues.split(',')
  else:
    candidates = [str(name) for name in leagues]
  # Whole-name matching: a substring test against the raw string would also count
  # rows whose league cell is empty or a fragment of a name.
  eligible = {name.strip() for name in candidates if name.strip()}

  for row in player_page.select("#league-stats tr"):
    league_cells = row.select(".league")
    if not league_cells:
      # Rows without a league cell cannot count against eligibility.
      continue
    if league_cells[0].text.strip() in eligible:
      remaining_years -= 1
  return remaining_years


def _parse_join_date(join_date_text):
  """Convert a transfer date string to 'DD.MM.YYYY'.

  Formats are tried in order: MM/DD/YYYY, MM-DD-YYYY, YYYY-MM-DD, DD/MM/YYYY.
  Because MM/DD/YYYY is tried first, DD/MM/YYYY only applies when the first
  field cannot be a month.

  Raises:
    ValueError: if the text matches none of the formats.
  """
  formats = ('%m/%d/%Y', '%m-%d-%Y', '%Y-%m-%d', '%d/%m/%Y')
  for date_format in formats[:-1]:
    try:
      return datetime.strptime(join_date_text, date_format).strftime('%d.%m.%Y')
    except Exception:
      continue
  return datetime.strptime(join_date_text, formats[-1]).strftime('%d.%m.%Y')


def get_contracted_team(player_page):
  """Work out a player's contract expiry, contracted team and join date.

  Reads the notebook settings calculate_remaining_eligible_years,
  leagues_for_eligibility, contract_expiry_prefix, season,
  make_junior_contracts_to_age_20 and override_contract_for_nhl_prospects.

  With calculate_remaining_eligible_years set, the expiry year is the season's end
  year plus the remaining eligible years, and the join date is 1 July of the first
  listed season in an eligible league. Otherwise the expiry comes from the page's
  "Contract" field and the join date from the transfer history.

  Args:
    player_page: parsed player page.

  Returns:
    Tuple (contract_expiry, contracted_team, join_date); join_date is "" when
    it cannot be determined.

  Raises:
    Exception: in the contract-field mode, when the "Contract" field is absent
      or the player is listed as retired.
  """
  if calculate_remaining_eligible_years:
    remaining_years = get_years_remaining(player_page, leagues_for_eligibility)
    contract_expiry = contract_expiry_prefix[:-4] + str(2000 + int(season[5:]) + remaining_years)

    try:
      # First transfer that is neither a loan nor a try-out.
      transfer = next(
        (t for t in player_page.select(".transfer:not(.loan)") if not t.select("[title=Try-out]")),
        None,
      )
      if transfer is None:
        contracted_team = get_team_playing(player_page)
        return contract_expiry, contracted_team, ""
      team_link = transfer.select(".to > a")[0]['href']
      contracted_team = get_full_team_name(team_link)
    except Exception:
      # Best effort: an unreadable transfer history leaves the team blank.
      contracted_team = ""

    # Get join date by finding the first year a player played in a league that is
    # subject to eligibility (NCAA, USports, etc.). Defaults to "" so the return
    # below cannot hit an unbound name when no such row exists.
    join_date = ""
    for row in player_page.select("#league-stats tr"):
      league_cells = row.select(".league")
      league_name = league_cells[0].text.strip() if league_cells else ""
      if league_name and league_name in leagues_for_eligibility:
        join_year = row.select(".season")[0].text.strip().split("-")[0]
        join_date = f"1.7.{join_year}"
        break
    return contract_expiry, contracted_team, join_date

  player_dob = get_dob(player_page)

  if "missing" in player_dob:
    # Partial date of birth: only the year is reliable.
    player_birth_year = player_dob.split("missing")[0][-4:]
    player_age = datetime.now().year - int(player_birth_year)
  else:
    player_dob = datetime.strptime(player_dob, '%d.%m.%Y')
    player_age = relativedelta(datetime.now(), player_dob).years

  contract_text = """
                                        Contract
                                    """
  try:
    contract_expiry_text = player_page.find('div', text=contract_text).next_element.next_element.next_element.text.strip()
  except Exception:
    raise Exception("Error finding contract expiry text - the player might be retired so this may not be an error.")

  if "Try-out" in contract_expiry_text:
    return '1.2.1900', "try-out", ""

  elif "Junior" in contract_expiry_text or (player_age <= 20 and "-" in contract_expiry_text):
    if make_junior_contracts_to_age_20:
      contract_expiry = contract_expiry_prefix[:-4] + "20" + str(int(season.split("-")[1]) + 20 - player_age)
    else:
      contract_expiry = "1.2.1900"

  elif "Retired" in contract_expiry_text:
    raise Exception("Player is retired - ignore this error.")
  elif "+" in contract_expiry_text:
    # Text of the form "<season>+<n>": n extra years on top of the listed end year.
    plus_year = contract_expiry_text.split("+")[1]
    contract_expiry_text = contract_expiry_text.split("+")[0]
    contract_expiry = contract_expiry_prefix[:-4] + "20" + str(int(contract_expiry_text.split("/")[1])+int(plus_year))
  elif "-" in contract_expiry_text:
    contract_expiry = contract_expiry_prefix[:-4] + "20" + season.split("-")[1]
  else:
    contract_expiry = contract_expiry_prefix[:-4] + "20" + contract_expiry_text.split("/")[1]

  if override_contract_for_nhl_prospects:
    try:
      nhl_rights_search_string = """
                                                NHL Rights
                                            """

      nhl_rights_text = player_page.find('div', text=nhl_rights_search_string).next_element.next_element.next_element.text.strip()

      if "Signed" in nhl_rights_text:
        return contract_expiry, nhl_rights_text.split("/")[0].strip(), ""
    except Exception:
      # Deliberate best effort: most players have no "NHL Rights" field.
      pass

  try: # Player has a team listed in their transfer history
    # NOTE(review): ":not(up-down)" has no leading dot, so it excludes an element
    # named <up-down>, not the class "up-down" — confirm which was intended.
    transfer = next(
      (t for t in player_page.select(".transfer:not(.loan):not(up-down)") if not t.select("[title=Try-out]")),
      None,
    )
    if transfer is None:
      contracted_team = get_team_playing(player_page)
      return contract_expiry, contracted_team, ""

    contracted_team = transfer.select(".to > a")[0]['href']
    join_date_text = transfer.select(".date")[0].text.strip()
    join_date = _parse_join_date(join_date_text)

  except IndexError: # Player does not have a team listed in their transfer history
    contracted_team = player_page.select(".ep-entity-header__text > a")[0]['href']
    join_date = ""
  contracted_team = get_full_team_name(contracted_team)

  return contract_expiry, contracted_team, join_date


def _metric_part(raw_text, unit_probe, unit_suffix):
  """Pick the metric half of an 'imperial / metric' value and strip its unit."""
  parts = raw_text.split('/')
  # Use the second half only when it exists and carries the metric unit; a value
  # without '/' is read as-is instead of being reported as missing.
  use_second = len(parts) > 1 and unit_probe in parts[1]
  chosen = parts[1] if use_second else parts[0]
  return chosen.split(unit_suffix)[0].strip()


def get_height_weight(player_page):
  """Return the player's (height, weight) as the text before ' cm' / ' kg'.

  Args:
    player_page: parsed player page.

  Returns:
    Tuple (height, weight) of strings; a value that cannot be read is "" and a
    notice with the player's name is printed for it.
  """
  height_text = """
                                        Height
                                    """
  weight_text = """
                                        Weight
                                    """
  try:
    raw_height = player_page.find('div', text=height_text).next_element.next_element.next_element.text
    height = _metric_part(raw_height, "cm", ' cm')
  except Exception:
    height = ""
    first_name, last_name = get_name(player_page)
    print(f'Missing height information: {first_name} {last_name}')

  try:
    raw_weight = player_page.find('div', text=weight_text).next_element.next_element.next_element.text
    weight = _metric_part(raw_weight, " kg", ' kg')
  except Exception:
    weight = ""
    first_name, last_name = get_name(player_page)
    print(f'Missing weight information: {first_name} {last_name}')
  return height, weight


def get_handedness(player_page):
  """Return the value of the "Shoots" field, falling back to "Catches".

  Args:
    player_page: parsed player page.

  Raises:
    AttributeError: if neither field is present on the page.
  """
  shoots_text = """
                                        Shoots
                                    """
  catches_text = """
                                        Catches
                                    """

  def value_after(label):
    # The value sits three elements after the label div.
    label_div = player_page.find('div', text=label)
    return label_div.next_element.next_element.next_element.text.strip()

  try:
    return value_after(shoots_text)
  except Exception:
    return value_after(catches_text)


def check_nhl_team(contracted_team):
  """Return True when contracted_team names an NHL club.

  Matching ignores letter case and surrounding whitespace, so both
  "Chicago BlackHawks" and "Chicago Blackhawks" are recognised.

  Args:
    contracted_team: team name; non-string values are never NHL teams.
  """
  nhl_teams = ['Anaheim Ducks', 'Arizona Coyotes', 'Boston Bruins', 'Buffalo Sabres', 'Calgary Flames', 'Carolina Hurricanes',
               'Chicago BlackHawks', 'Colorado Avalanche', 'Columbus Blue Jackets', 'Dallas Stars', 'Detroit Red Wings',
               'Edmonton Oilers', 'Florida Panthers', 'Los Angeles Kings', 'Minnesota Wild', 'Montreal Canadiens', 'Nashville Predators',
               'New Jersey Devils', 'New York Islanders', 'New York Rangers', 'Ottawa Senators', 'Philadelphia Flyers',
               'Pittsburgh Penguins', 'St. Louis Blues', 'San Jose Sharks', 'Tampa Bay Lightning', 'Toronto Maple Leafs',
               'Vancouver Canucks', 'Vegas Golden Knights', 'Washington Capitals', 'Winnipeg Jets',
               # Franchises and spellings missing from the original list.
               'Seattle Kraken', 'Utah Hockey Club', 'Utah Mammoth', 'Montréal Canadiens']

  if not isinstance(contracted_team, str):
    return False
  known_names = {team.casefold() for team in nhl_teams}
  return contracted_team.strip().casefold() in known_names


def get_internationals_skater(player_page):
  """Sum a skater's senior and junior international games, goals and assists.

  Each total is read at a fixed element offset from the "International" /
  "International-Jr" link: games at 8, goals at 15, assists at 19. A missing
  link contributes zeros.

  Args:
    player_page: parsed player page.

  Returns:
    Tuple (total_games, total_goals, total_assists) of ints.

  Raises:
    ValueError: if a total that is present is not an integer.
  """
  def hop(node, count):
    # Walk `count` elements forward in document order.
    for _ in range(count):
      node = node.next_element
    return node

  def totals_for(link_text):
    anchor = player_page.find('a', text=link_text)
    if not anchor:
      return 0, 0, 0
    games_raw = hop(anchor, 8).text
    assists_raw = hop(anchor, 19).text.strip()
    goals_raw = hop(anchor, 15).text
    games = int(games_raw.strip().replace("\n", ""))
    assists = int(assists_raw.strip())
    goals = int(goals_raw.strip())
    return games, goals, assists

  senior_text = """
                                                International
                                                                                            """
  senior_games, senior_goals, senior_assists = totals_for(senior_text)

  # Get intl jr
  junior_text = """
                                                International-Jr
                                                                                            """
  junior_games, junior_goals, junior_assists = totals_for(junior_text)

  return junior_games + senior_games, senior_goals + junior_goals, senior_assists + junior_assists


def get_internationals_goalie(player_page):
  """Return a goalie's international games total as stripped text.

  The value is read eight elements after the "International" link.

  Args:
    player_page: parsed player page.

  Returns:
    The games text, or '' when the page has no "International" link.
  """
  international_text = """
                                                International
                                                                                            """
  anchor = player_page.find('a', text=international_text)
  if anchor is None:
    return ''

  games_node = anchor
  for _ in range(8):
    games_node = games_node.next_element
  return games_node.text.strip()


def scrape_player_page(link):
  """Download one player page and extract every field needed for a roster row.

  Reads the notebook settings nhl_contracts and scrape_international_games.

  Args:
    link: URL of the player page.

  Returns:
    A 19-tuple: first_name, last_name, team_playing, league, dob, birth_place,
    primary_nation, secondary_nation, declared_nation, position, height, weight,
    handedness, contract_expiry, contracted_team, join_date, intl_games, intl_g,
    intl_a. team_playing is 'skip' for NHL-contracted players when nhl_contracts
    is 'Skip players with NHL contracts'.
  """
  response = requests.get(link, headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        # The HTTP header is spelled "Referer"; "referrer" is not a standard header.
        'Referer': 'https://google.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Pragma': 'no-cache',
    })
  player_page = BeautifulSoup(response.content)

  first_name, last_name = get_name(player_page)
  league = get_league(player_page)
  dob = get_dob(player_page)
  birth_place = get_birthplace(player_page)
  primary_nation, secondary_nation, declared_nation = get_nations(player_page)
  contract_expiry, contracted_team, join_date = get_contracted_team(player_page)

  # The nhl_contracts setting decides how NHL-contracted players are listed.
  signed_to_nhl = check_nhl_team(contracted_team)
  if signed_to_nhl and nhl_contracts == 'Skip players with NHL contracts':
    team_playing = 'skip'
  elif signed_to_nhl and nhl_contracts == 'Include and set NHL team to playing':
    team_playing = contracted_team
  else:
    team_playing = get_team_playing(player_page)

  position_text = """
                                        Position
                                    """
  position = player_page.find('div', text=position_text).next_element.next_element.next_element.text.strip()

  height, weight = get_height_weight(player_page)
  handedness = get_handedness(player_page)

  if not scrape_international_games:
    intl_games = ''
    intl_g = ''
    intl_a = ''
  elif position == 'G':
    # Goalies only get a games total; goals and assists stay blank.
    intl_games = get_internationals_goalie(player_page)
    intl_g = ''
    intl_a = ''
  else:
    intl_games, intl_g, intl_a  = get_internationals_skater(player_page)

  return first_name, last_name, team_playing, league, dob, birth_place, primary_nation, secondary_nation, declared_nation, position, height, weight, handedness, contract_expiry, contracted_team, join_date, intl_games, intl_g, intl_a

## Determine position

In [7]:
# Forward profiles as (center, left_wing, right_wing) specs. 'P' is replaced by the
# caller's primary rating, 'S' by a fresh random secondary rating; integers are
# used as-is.
_FORWARD_PROFILES = {
  'C': ('P', 12, 12),
  'LW': (12, 'P', 12),
  'RW': (12, 12, 'P'),
  'RW/C': ('S', 12, 'P'),
  'LW/C': ('S', 'P', 12),
  'C/LW': ('P', 'S', 12),
  'C/RW': ('P', 12, 'S'),
  'LW/RW': (12, 'P', 'S'),
  'RW/LW': (12, 'S', 'P'),
  'LW/RW/C': ('S', 'P', 'S'),
  'RW/LW/C': ('S', 'S', 'P'),
  'C/LW/RW': ('P', 'S', 'S'),
}

# Order matters: random.choice() indexes into this sequence.
_FORWARD_CHOICES = ('C', 'LW', 'RW', 'RW/C', 'LW/C', 'C/LW', 'C/RW', 'LW/RW', 'RW/LW', 'LW/RW/C', 'RW/LW/C', 'C/LW/RW')

# The fallback for unrecognised positions uses a fixed 17 for the 'LW/C' center.
_FALLBACK_OVERRIDES = {'LW/C': (17, 'P', 12)}

# Listed positions with one primary (20) and one secondary slot; the third forward
# slot keeps its default of 1.
_DUAL_FORWARD = {
  'C/RW': ('center', 'right_wing'),
  'C/LW': ('center', 'left_wing'),
  'LW/C': ('left_wing', 'center'),
  'LW/RW': ('left_wing', 'right_wing'),
  'RW/LW': ('right_wing', 'left_wing'),
  'RW/C': ('right_wing', 'center'),
}


def _secondary_rating():
  """Random rating for a position the player can cover but is not natural at."""
  return random.randint(15, 18)


def _forward_ratings(profile, primary, overrides=None):
  """Resolve a forward profile to (center, left_wing, right_wing) ratings."""
  spec = (overrides or {}).get(profile, _FORWARD_PROFILES[profile])
  resolved = []
  # Slots are resolved in center, left wing, right wing order so random draws
  # happen in a fixed sequence.
  for slot in spec:
    if slot == 'P':
      resolved.append(primary)
    elif slot == 'S':
      resolved.append(_secondary_rating())
    else:
      resolved.append(slot)
  return tuple(resolved)


def determine_position(pos, shoots):
  """Turn a listed position and handedness into six positional ratings.

  Every rating starts at 1. A natural position gets 20 (18 for the forward half of
  'D/F' and the defense half of 'C/D'), an off position 12, and a secondary
  position a random value from 15 to 18. Generic positions ('F', 'W', 'D/F', ...)
  and unrecognised ones are resolved with random picks.

  Args:
    pos: listed position, e.g. 'C', 'D', 'G', 'LW/C', 'F/D'.
    shoots: 'R' or 'L'; some branches treat any other value as left-handed,
      others print "UNKNOWN HANDEDNNESS" and leave the defense ratings at 1.

  Returns:
    Tuple (goaltender, left_defense, right_defense, left_wing, center, right_wing).
  """
  ratings = {
    'goaltender': 1,
    'left_defense': 1,
    'right_defense': 1,
    'left_wing': 1,
    'center': 1,
    'right_wing': 1,
  }

  def set_forward(profile, primary, overrides=None):
    center, left_wing, right_wing = _forward_ratings(profile, primary, overrides)
    ratings.update(center=center, left_wing=left_wing, right_wing=right_wing)

  def set_defense_pair(warn_on_unknown):
    # Natural side 20, off side 12; unknown handedness leaves both at 1.
    if shoots == 'R':
      ratings.update(left_defense=12, right_defense=20)
    elif shoots == 'L':
      ratings.update(left_defense=20, right_defense=12)
    elif warn_on_unknown:
      print("UNKNOWN HANDEDNNESS: " + shoots)

  def set_strong_side(value):
    # Only the natural side is rated; anything but 'R' counts as left.
    ratings['right_defense' if shoots == 'R' else 'left_defense'] = value

  if pos in ('C', 'LW', 'RW'):
    set_forward(pos, 20)

  elif pos == 'D':
    # 80% of defensemen are rated 12 on their off side, the rest 15-18.
    natural_only = random.random() < 0.8
    if shoots == 'R':
      ratings['left_defense'] = 12 if natural_only else _secondary_rating()
      ratings['right_defense'] = 20
    elif shoots == 'L':
      ratings['left_defense'] = 20
      ratings['right_defense'] = 12 if natural_only else _secondary_rating()
    else:
      print("UNKNOWN HANDEDNNESS: " + shoots)

  elif pos == 'F':
    set_forward(random.choice(_FORWARD_CHOICES), 20)

  elif pos == 'G':
    ratings['goaltender'] = 20

  elif pos == 'D/F':
    set_defense_pair(warn_on_unknown=True)
    set_forward(random.choice(_FORWARD_CHOICES), 18)

  elif pos in _DUAL_FORWARD:
    primary_slot, secondary_slot = _DUAL_FORWARD[pos]
    ratings[primary_slot] = 20
    ratings[secondary_slot] = _secondary_rating()

  elif pos == 'D/LW':
    set_defense_pair(warn_on_unknown=False)
    ratings['left_wing'] = _secondary_rating()

  elif pos == 'D/C':
    set_defense_pair(warn_on_unknown=False)
    ratings['center'] = _secondary_rating()

  elif pos == 'W/C':
    # Handedness picks the natural wing; the other wing and center are secondary.
    if shoots == 'R':
      ratings['right_wing'] = 20
      ratings['left_wing'] = _secondary_rating()
    else:
      ratings['right_wing'] = _secondary_rating()
      ratings['left_wing'] = 20
    ratings['center'] = _secondary_rating()

  elif pos == 'W':
    if random.random() > 0.5:
      ratings['right_wing'] = 20
      ratings['left_wing'] = _secondary_rating()
    else:
      ratings['right_wing'] = _secondary_rating()
      ratings['left_wing'] = 20

  elif pos == 'D/W':
    if random.random() > 0.5:
      ratings['right_wing'] = 12
      ratings['left_wing'] = _secondary_rating()
    else:
      ratings['right_wing'] = _secondary_rating()
      ratings['left_wing'] = 12
    set_strong_side(20)

  elif pos == 'C/W':
    ratings['center'] = 20
    ratings['left_wing'] = _secondary_rating()
    ratings['right_wing'] = _secondary_rating()

  elif pos == 'C/D':
    ratings['center'] = 20
    set_strong_side(18)

  elif pos == 'F/D':
    set_strong_side(_secondary_rating())
    if random.random() > 0.5:
      ratings['center'] = 20
    else:
      ratings['left_wing'] = 20

  elif pos == 'D/RW':
    set_strong_side(20)
    ratings['right_wing'] = _secondary_rating()

  elif pos == 'RW/D':
    ratings['right_wing'] = 20
    set_strong_side(_secondary_rating())

  elif pos == 'LW/D':
    ratings['left_wing'] = 20
    set_strong_side(_secondary_rating())

  elif pos == 'C/RW/D':
    ratings['right_wing'] = _secondary_rating()
    ratings['center'] = 20
    set_strong_side(_secondary_rating())

  else:
    # Unrecognised position: pick any forward profile or plain defense at random.
    profile = random.choice(_FORWARD_CHOICES + ('D',))
    if profile == 'D':
      if shoots == 'R':
        ratings['right_defense'] = 20
      elif shoots == "L":
        ratings['left_defense'] = 20
      else:
        ratings['right_defense'] = 20
        ratings['left_defense'] = _secondary_rating()
    else:
      set_forward(profile, 20, _FALLBACK_OVERRIDES)

  return (
    ratings['goaltender'],
    ratings['left_defense'],
    ratings['right_defense'],
    ratings['left_wing'],
    ratings['center'],
    ratings['right_wing'],
  )

In [8]:
def _collect_roster_players(roster_page, player_links, player_numbers):
  """Harvest player links and jersey numbers from one parsed roster page.

  Appends each roster row's player href to ``player_links`` and records
  ``player name -> jersey number`` in ``player_numbers`` (both mutated in place).
  A link is kept even when the jersey cell cannot be parsed; in that case the
  player simply has no entry in ``player_numbers``.
  """
  for roster_row in roster_page.select('[data-sort-ajax-container="#roster"] > tbody > tr'):
    try:
      player_anchor = roster_row.select('.txt-blue a[href]')[0]
      player_links.append(player_anchor['href'])
      player_name = roster_row.select('.txt-blue')[0].text.split('(')[0].strip()
      player_numbers[player_name] = roster_row.select('.jersey')[0].text.split("#")[1].strip()
    except IndexError:
      # Rows without a player anchor, name cell, or "#NN" jersey text are skipped.
      continue


def scrape():
  """Scrape every configured league/team/draft page and write the import CSVs.

  Reads the notebook-level settings ``leagues``, ``season``,
  ``skip_players_with_blank_dobs`` and ``show_error_links`` and relies on the
  ``scrape_player_page`` and ``determine_position`` helpers defined elsewhere
  in the notebook.

  For each link in ``leagues`` a ``/content/leagues/<page name>.csv`` roster
  file is written. Draft links additionally produce
  ``/content/leagues/<page name>_draft_history.csv``. Nothing is returned.
  """
  season_suffix = season[:5] + '20' + season[5:]
  for league_link in tqdm(leagues, desc='Leagues'):
    is_draft = "draft" in league_link
    is_team = "team" in league_link

    home_page_link = league_link if is_draft else league_link+'/'+season_suffix
    home_page_page = requests.get(home_page_link)
    print(home_page_link)
    home_page = BeautifulSoup(home_page_page.content)

    page_name = home_page.select(".semi-logo")[0].text.strip() if is_team else home_page.select(".plytitle")[0].text.strip()
    page_name = ' '.join(page_name.replace('\n', '').split()).strip()

    # Fresh per-league state. Initialising these for every mode means a draft
    # run neither hits an undefined name nor reuses jersey numbers left over
    # from a previously scraped league.
    player_links = []
    player_numbers = {}
    pick_dict = {}
    draft_history_file = None
    draft_history_writer = None

    try:
      with open(f'/content/leagues/{page_name}.csv', 'w+', newline='', encoding='UTF-8') as csvfile:
        csvfile.write('\ufeff')
        csvwriter = csv.writer(csvfile, delimiter=',')
        top_header = 'staff,dmy,metric,,|--,NATIONALITY,,--|,,|--,,,CLUB CONTRACT,,,,--|,|--,NATION CONTRACT,,--|,|--,NATIONAL TEAM,--|,|-- NHL,--|,|--,EDIT DETAILS,--|,|--,,,STAFF ATTRIBUTES,,,,--|,|-- PLAYER ABILITY,--|,|--,PLAYER REPUTATION,--|,|--,,POSITION,,,--|,|--,ROLE,--|,,|--,JERSEY #,--|,,,,|--,,,,,,MENTAL,,,,,,,--|,|--,,,,PHYSICAL,,,,,--|,|--,,,,,,TECHNICAL,,,,,,,--|,|--,,GOALIE,,--|,|-- NON-PLAYER ABILITY,--|,|--,NON-PLAYER REPUTATION,--|,|--,,,TECHNICAL ABILITY,,,,--|,|--,,,TECHNIQUE,,,--|,,|--,BUSINESS,--|,|--,,MENTAL,,--|,|--,,NOT IMPORTED,,--|'
        second_header = 'Mode (e),First Name,Second Name,Date of Birth,Nation,SecondNation,DeclaredNation,BirthTown,Classification,JobForClub,ClubContracted,ClubPlaying,DateJoinedClub,ContractExpiresClub,EstimatedWage,EstimatedWageWeekly,EstimatedValue,JobForNation,NationContracted,DateJoinedNation,ContractExpiresNation,InternationalApps,InternationalGoals,InternationalAssists,FirstNHLContract,StanleyCupsWon,New first name,New second name,New date of birth,Adaptability,Ambition,Determination,Loyalty,Pressure,Professionalism,Sportsmanship,Temperament,CurrentAbility,PotentialAbility,HomeReputation,CurrentReputation,WorldReputation,Goaltender,LeftDefense,RightDefense,LeftWing,Center,RightWing,DefensiveRole,OffensiveRole,Role,Hand,FavouriteNumber,SquadNumber,InternationalSquadNumber,HeightCm,WeightKg,JnrPreference,Aggression,Anticipation,Bravery,Consistency,Decisions,Dirtiness,Flair,ImportantMatches,Leadership,Morale,PassTendency,Teamwork,Creativity,WorkRate,Acceleration,Agility,Balance,Fighting,Hitting,InjuryProneness,NaturalFitness,Pace,Stamina,Strength,Agitation,Checking,Deflections,Deking,Faceoffs,Movement,OneOnOnes,Passing,Pokecheck,Positioning,Slapshot,Stickhandling,Versatility,Wristshot,Blocker,Glove,Rebounds,Recovery,Reflexes,CurrentAbility,PotentialAbility,HomeReputation,CurrentReputation,WorldReputation,PreferredJob,Attacking,Directness,FreeRoles,LineMatching,PenaltyKill,Physical,PowerPlay,CoachingGoaltenders,CoachingDefensemen,CoachingForwards,CoachingTechnique,Judgement,JudgingPotential,Tactics,Physiotherapy,Business,Patience,Resources,Discipline,Interference,ManHandling,Motivating,Youngsters,League contracted,League playing,Latest career history,NHL Draft Eligible,NHL Drafted'
        csvwriter.writerow(top_header.split(','))
        csvwriter.writerow(second_header.split(','))

        if is_draft:
          # Opened without `with` because it must outlive this branch; the
          # enclosing try/finally guarantees it is closed.
          draft_history_file = open(f'/content/leagues/{page_name}_draft_history.csv', 'w+', newline='', encoding='UTF-8')
          draft_history_file.write('\ufeff')
          draft_history_writer = csv.writer(draft_history_file, delimiter=',')
          top_header = 'draft_history, dmy'
          second_header = 'Draft, Year, Round, Overall, Club, First Name, Second Name, Date of Birth'
          draft_history_writer.writerow(top_header.split(','))
          draft_history_writer.writerow(second_header.split(','))

          draft_round_tables = home_page.select('[data-sort-ajax-container="#drafted-players"] tbody')

          # NOTE(review): enumerate() starts at 0, so picks from the first table
          # are recorded as round 0 - confirm whether the draft-history import
          # expects 1-based round numbers.
          for round_number, table in enumerate(draft_round_tables):
            for pick in table.select("tr:not(.title)"):
              try:
                player_link = pick.select(".player a")[0]["href"]
              except IndexError:
                # Pick row without a linked player; recorded with an empty link
                # and skipped later by the len(link) check.
                player_link = ""
              player_links.append(player_link)
              pick_number = pick.select(".overall.sorted")[0].text.replace("#","")
              picking_team = pick.select(".team")[0].text

              pick_dict[player_link] = {
                  "round" : round_number,
                  "overall" : pick_number,
                  "club" : picking_team
              }

        elif is_team:
          _collect_roster_players(home_page, player_links, player_numbers)

        else:
          # League page: visit every team's roster page for the chosen season.
          team_links = {(anchor['href']+'/'+season_suffix) for anchor in home_page.select('.inner-rtl .leg-home-inner .list-as-columns > .column-4 > li > a')}
          for team_link in team_links:
            team_page = BeautifulSoup(requests.get(team_link).content)
            _collect_roster_players(team_page, player_links, player_numbers)

        for link in tqdm(player_links, desc='Players', leave=False):
          if len(link) > 0:
            try:
              # Distinct names keep the per-player team/league from clobbering
              # the page-level flags and the outer loop variable.
              first_name, last_name, playing_team, player_league, dob, birth_place, primary_nation, secondary_nation, declared_nation, position, height, weight, shoots, contract_expiry, contracted_team, join_date, intl_games, intl_g, intl_a = scrape_player_page(link)

              # Ignore players on tryouts, fix contracted and playing team names for junior players
              if contracted_team == "":
                contracted_team = playing_team
              elif contracted_team == "try-out":
                continue

              if playing_team == 'skip':
                continue

              goaltender, left_defense, right_defense, left_wing, center, right_wing = determine_position(position, shoots)

              if goaltender==1 and left_defense==1 and right_defense==1 and left_wing==1 and center==1 and right_wing==1:
                print(f'{link} incomplete position information')
                continue

              # Best-effort jersey lookup; a missing entry is reported, not fatal.
              player_number = player_numbers.get(f"{first_name} {last_name}")
              if player_number is None:
                print(f"No player number for {first_name} {last_name}")
                player_number = ""

              if 'missing' not in dob or not skip_players_with_blank_dobs:
                if 'missing' in dob:
                  dob = dob[:9]
                csvwriter.writerow(['',first_name, last_name, dob, primary_nation, secondary_nation, declared_nation, birth_place, 'Player', 'Player', contracted_team, playing_team, join_date, contract_expiry, '', '', '', '', '', '', '', intl_games, intl_g, intl_a, ''] + ['']*17 + [goaltender, left_defense, right_defense, left_wing, center, right_wing, '', '', '', 'Left' if shoots == 'L' else 'Right',  '', player_number, '', height, weight] + [''] * 73 + [player_league, player_league] + ["", "", "",])

              # Write to draft sheet
              if is_draft:
                pick_info = pick_dict[link]
                draft_round = pick_info["round"]
                overall = pick_info["overall"].strip()
                club = pick_info["club"].strip()
                draft_history_writer.writerow([page_name, home_page_link[-4:], draft_round, overall, club, first_name, last_name, dob])

            except Exception:
              # Deliberate best-effort: one bad player page must not abort the run.
              if "team-captaincy" not in link and "nation?total" not in link and 'apple-touch-icon' not in link and 'player_page.find' not in link and show_error_links:
                traceback.print_exc()
                print(f'Missing player information for: {link}')
            # Randomised delay between player requests.
            time.sleep(random.random() * 5)
    finally:
      # Close the draft-history file even if scraping fails or is interrupted.
      if draft_history_file is not None:
        draft_history_file.close()


# Output

You should see two progress bars: one showing the progress through the leagues (or teams/drafts) you want to scrape, and one showing progress through all of the players found for the page currently being scraped. The link of each league, team, or draft page is also printed when it is fetched.

Some players (often in low-level or obscure leagues) will be missing information such as shooting hand, height, weight, or full date of birth. If you set show_error_links to be True in the first cell, and if the scraper comes across such a player, it will print out a small error message stating "Missing player information for: " followed by a link to the player's EP page. The scraper will still include the player in the output CSV files, although some fields for that player will be empty.

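To download the .zip, click the folder icon on the bar to the left of the screen, then right-click the file 'leagues.zip' and choose "Download". If use_google_drive is enabled, the archive is instead moved to your Google Drive as 'MyDrive/leagues.zip'.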

In [17]:
scrape()

Leagues:   0%|          | 0/1 [00:00<?, ?it/s]

https://www.eliteprospects.com/team/803/univ.-of-michigan/2023-2024


Players:   0%|          | 0/18 [00:00<?, ?it/s]

  dob_text = player_page.find('div', text=dob_search_text).next_element.next_element.next_element.text.strip()
  birth_place_text = player_page.find('div', text=place_of_birth_text).next_element.next_element.next_element.text.strip()
  nation_text = "".join(player_page.find('div', text=nation_text).next_element.next_element.next_element.text.split()).split('/')
  position = player_page.find('div', text=position_text).next_element.next_element.next_element.text.strip()
  heights = player_page.find('div', text=height_text).next_element.next_element.next_element.text.split('/')
  weights = player_page.find('div', text=weight_text).next_element.next_element.next_element.text.split('/')
  shoots = player_page.find('div', text=shoots_text).next_element.next_element.next_element.text.strip()
  shoots = player_page.find('div', text=catches_text).next_element.next_element.next_element.text.strip()
  intl_text = player_page.find('a', text=international_text)


No player number for Jacob Barczewski


  intl_text = player_page.find('a', text=international_text)
  intl_jr_text = player_page.find('a', text=international_text)


No player number for Tyler Duke
No player number for Joshua Orrico
No player number for Marshall Warren
No player number for Jack Willson
No player number for Michael Burchill
No player number for Charlie Cerrato
No player number for Joshua Eernisse
No player number for Nicholas Moldenhauer
No player number for Brian Nicholas
No player number for Chase Pletzke
No player number for Tanner Rowe
No player number for Garrett Schifsky


In [None]:
# Export cell: archive the scraped CSVs and optionally move the archive to Google Drive.
# Short pause before archiving - presumably to let the scraper's file writes settle; TODO confirm it is needed.
time.sleep(5)
# Recursively add everything under /content/leagues/ to leagues.zip in the current working directory.
!zip leagues.zip -r '/content/leagues/'

# `use_google_drive` is defined outside this cell. When it is truthy the archive is moved
# (not copied) to Drive - assumes Drive is already mounted at /content/drive; TODO confirm.
if use_google_drive:
  !mv leagues.zip /content/drive/MyDrive/leagues.zip

# Testing code, no need to look here

In [None]:
# Scratch cell: fetch one player profile with browser-like request headers so the
# selectors used by the scraper can be tried out interactively in the cells below.
player_page_link = 'https://www.eliteprospects.com/player/560848/marco-kasper'
player_page_response = requests.get(player_page_link, headers = {
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
      # The HTTP request header is spelled "Referer"; a "referrer" key is sent
      # as an unknown header and the intended referer never reaches the server.
      'Referer': 'https://google.com',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate, br',
      'Accept-Language': 'en-US,en;q=0.9',
      'Pragma': 'no-cache',
  })
# Keep the raw response and the parsed tree under separate names.
player_page = BeautifulSoup(player_page_response.content)

In [None]:
international_text = """
                                                International
                                                                                            """
# Scratch probe: locate the <a> element whose text equals `international_text`
# (whitespace included), then read values from nodes a fixed number of
# `.next_element` hops further along the parse tree.
# NOTE(review): the hop counts are tied to the page markup at the time of writing,
# and `intl_text` is None when no such link exists - verify against the current page.
intl_text = player_page.find('a', text=international_text)

# Each statistic is the `.text` of a later node; goals and assists sit further along than games.
intl_games = intl_text.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.text
intl_assists = intl_text.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.text
intl_goals = intl_text.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.text

In [None]:
intl_text.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.text.strip()

In [None]:
# Scratch probe: display the raw text of the same node that the `intl_goals` expression reads.
intl_text.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element.text

In [None]:
# Scratch cell: fetch a single team page and derive the name used for its output file.
# Unlike scrape(), a team link is fetched here without the season suffix appended.
league = 'https://www.eliteprospects.com/team/406/gks-tychy'
draft = "draft" in league
team = "team" in league

# Only plain league links get the season appended.
if draft or team:
  home_page_link = league
else:
  home_page_link = league+'/'+season_suffix
home_page_page = requests.get(home_page_link)
print(home_page_link)
home_page = BeautifulSoup(home_page_page.content)

# Team pages and league/draft pages keep their title in different elements.
if team:
  page_name = home_page.select(".semi-logo")[0].text.strip()
else:
  page_name = home_page.select(".plytitle")[0].text.strip()

In [None]:
# Scratch cell: pick the first listed transfer that is neither a loan nor a try-out,
# then read the destination team and the transfer date from it.
# NOTE(review): `:not(up-down)` has no leading dot, so it excludes an element *type*
# named "up-down" rather than a CSS class - presumably `.up-down` was intended; confirm.
for candidate in player_page.select(".transfer:not(.loan):not(up-down)"):
  if candidate.select("[title=Try-out]"):
    continue
  transfer = candidate
  break
destination_anchor = transfer.select(".to > a")[0]
contracted_team = destination_anchor['href']
contracted_team = get_full_team_name(contracted_team)
join_date_text = transfer.select(".date")[0].text.strip()

In [None]:
# Scratch probe: display the transfer date text extracted by the transfer lookup cell.
join_date_text

In [None]:
# Scratch probe: display `team_link`.
# NOTE(review): in the visible part of the notebook this name is only assigned inside
# scrape(), so this cell presumably relied on leftover kernel state - confirm or remove.
team_link

In [None]:
# Scratch probe: drop the last four characters of the configured prefix
# (e.g. "1.6.XXXX" -> "1.6."), leaving the day/month part.
contract_expiry_prefix[:-4]