<a href="https://colab.research.google.com/github/colinrsmall/ehm_roster_tools/blob/master/EP_Career_History_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instructions:

To add leagues to the list, copy and paste an entry in the following list and replace the league's name and EliteProspects link with the name and link of the league you want to scrape. Make sure that all entries except the last end with a comma (as you can see with the first entry). The name you choose for the entry only influences the name of the output file. You can get a league's URL by going to the league's homepage on EP and copying the URL for that page from your browser.

To change which season you're scraping for, change the season string following the list of leagues. The string should be of the format 'YYYY-YY' (such as '2019-20' or '2017-18'). This will be the season for which the scraper gets players. For example, setting this to 2020-21 will scrape all players that are contracted in the given league for that year.

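Change latest_season to the season at which you want the scraper to stop. The scraper stops as soon as it reaches this season in a player's history and does not include it. For example, if you want to get a player's history up to and including the 2007-08 season, set this to 2008-09.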

If you want the scraper to print out links for players who are missing information on their EP page, change show_error_links to True.

To run the scraper, click runtime -> run all.

In [16]:
# Comma-separated EliteProspects page URLs to scrape; split into a list below.
leagues = "https://www.eliteprospects.com/team/29511/cranbrook-bucks" #@param {type:"string"}
leagues = leagues.split(',')

# Season used when building league/team page URLs in scrape().
season = "2020-21" #@param {type:"string"}
# Career-history scraping stops at the first row whose season equals this value.
latest_season = "2021-22" #@param {type:"string"}
# When True, scrape() prints a traceback and the link for players that fail.
show_error_links = True #@param {type:"boolean"}
# When True, Google Drive is mounted and the output zip is moved there.
use_google_drive = False #@param {type:"boolean"}

In [17]:
# Optionally mount Google Drive so the zipped output can be moved onto it later.
if use_google_drive:
  from google.colab import drive
  drive.mount('/content/drive')
  # Flag recording that the mount was requested; not read in the visible cells.
  drive_mounted = True

# Expand this if you want to look at the code (optional)

In [18]:
!mkdir '/content/leagues/'

mkdir: cannot create directory ‘/content/leagues/’: File exists


In [19]:
# unidecode is used below to transliterate player and club names.
!pip install unidecode



In [20]:
import requests, random, csv, traceback, time, urllib.request
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from datetime import datetime
from google.colab import files
import unidecode

In [21]:
def get_name(player_page):
  """Return ``(first_name, last_name)`` read from a parsed player page.

  The stripped text of the ``ep-entity-header__name`` div is split at its
  first space: the part before it is the first name, and the remainder, cut
  at its first newline and stripped, is the last name. Both parts are passed
  through ``unidecode.unidecode`` before being returned.
  """
  header_text = player_page.find('div', class_='ep-entity-header__name').text.strip()
  given, _, remainder = header_text.partition(' ')
  family = remainder.split('\n')[0].strip()
  return unidecode.unidecode(given), unidecode.unidecode(family)


def get_dob(player_page):
  """Return the player's date of birth as a ``D.M.YYYY`` string.

  The value is read from the element three ``next_element`` steps after the
  div whose text is the "Date of Birth" label. Two formats are accepted:

  * ``'Mon D, YYYY'`` -> ``'D.M.YYYY'`` (no zero padding)
  * ``'YYYY'``        -> ``'1.1.YYYY'``

  If the label is not found or the text matches neither format, a
  "Missing dob information" message is printed and ``""`` is returned.
  """
  dob_search_text = """
                                        Date of Birth
                                    """
  dob_text = None
  try:
    dob_text = player_page.find('div', text=dob_search_text).next_element.next_element.next_element.text.strip()
  except AttributeError:
    # find() returned None or the next_element chain ran out: no DOB field.
    pass

  if dob_text is not None:
    try:
      born = datetime.strptime(dob_text, '%b %d, %Y')
      # Built by hand instead of strftime('%-d.%-m.%Y'): the '-' flag is a
      # platform-specific extension and is not available everywhere.
      return f'{born.day}.{born.month}.{born.year}'
    except ValueError:
      pass
    try:
      # Only the birth year is known; default to January 1st.
      return f"1.1.{datetime.strptime(dob_text, '%Y').year}"
    except ValueError:
      pass

  first_name, last_name = get_name(player_page)
  print(f'Missing dob information: {first_name} {last_name}')
  return ""

def scrape_history(player_page):
  """Collect a skater's career rows from a parsed player page.

  Walks the ``tr`` rows (class containing ``team-continent-``) of the first
  ``.player-statistics.regular.postseason`` table and stops at the first row
  whose season equals the module-level ``latest_season``; that season is not
  included.

  Returns a list of ``(regular_season, playoffs)`` tuples. Each element is a
  17-item list: loan flag ("y"/"n"), playoffs flag ("y"/"n"), season, club,
  competition, GP, G, A, PIM, +/- and seven empty strings. ``playoffs`` is
  ``None`` when the row has no playoff line.

  Raises IndexError if an expected cell is missing from a row.
  """
  def stat(row, selector):
    # "-" is the page's placeholder for "no value". Only a bare "-" is turned
    # into "0"; a blanket replace("-", "0") would turn a negative +/- such as
    # "-5" into "05".
    value = row.select(selector)[0].text.strip()
    return "0" if value == "-" else value

  stat_columns = ("gp", "g", "a", "pim", "pm")
  trailing_blanks = [""] * 7

  season_text = ""
  seasons = []
  stats_table = player_page.select(".player-statistics.regular.postseason")[0]
  for row in stats_table.select('tr[class*="team-continent-"]'):
    # A blank season cell keeps the season of the previous row.
    row_season = row.select(".season.sorted")[0].text.strip()
    if row_season != "":
      season_text = row_season

    if season_text == latest_season:
      break

    # NOTE(review): "fa.fa-loan" matches an <fa> element with class "fa-loan".
    # If the loan marker is an icon with classes "fa fa-loan", the selector
    # should be ".fa.fa-loan" - confirm against the live page.
    loan = "y" if len(row.select("fa.fa-loan")) > 0 else "n"

    club = row.select(".team")[0].text.strip()
    if '“A”' in club or '“C”' in club:
      # Drop the last five characters (presumably the captaincy marker).
      club = club[:-5]
    club = unidecode.unidecode(club)

    competition = row.select(".league")[0].text.strip()

    regular_season = (
      [loan, "n", season_text, club, competition]
      + [stat(row, f".regular.{column}") for column in stat_columns]
      + trailing_blanks
    )

    playoffs = None
    has_playoff_games = row.select(".postseason.gp")[0].text.strip() != "-"
    if has_playoff_games and row.select(".postseason")[0].text.strip() == "Playoffs":
      playoffs = (
        [loan, "y", season_text, club, competition]
        + [stat(row, f".postseason.{column}") for column in stat_columns]
        + trailing_blanks
      )

    seasons.append((regular_season, playoffs))

  return seasons


def scrape_history_goalie(player_page):
  """Collect a goalie's career rows from a parsed player page.

  Walks the same table rows as ``scrape_history`` but records only games
  played; every other stat column is left empty. Iteration stops at the first
  row whose season equals the module-level ``latest_season``; that season is
  not included.

  Returns a list of ``(regular_season, playoffs)`` tuples. Each element is a
  17-item list: loan flag, playoffs flag, season, club, competition, GP and
  eleven empty strings. ``playoffs`` is ``None`` when the row has no playoff
  line.

  Raises IndexError if an expected cell is missing from a row.
  """
  def games_played(row, selector):
    # A bare "-" is the page's placeholder for "no value".
    value = row.select(selector)[0].text.strip()
    return "0" if value == "-" else value

  trailing_blanks = [""] * 11

  season_text = ""
  seasons = []
  stats_table = player_page.select(".player-statistics.regular.postseason")[0]
  for row in stats_table.select('tr[class*="team-continent-"]'):
    # A blank season cell keeps the season of the previous row.
    row_season = row.select(".season.sorted")[0].text.strip()
    if row_season != "":
      season_text = row_season

    if season_text == latest_season:
      break

    # NOTE(review): "fa.fa-loan" matches an <fa> element with class "fa-loan".
    # If the loan marker is an icon with classes "fa fa-loan", the selector
    # should be ".fa.fa-loan" - confirm against the live page.
    loan = "y" if len(row.select("fa.fa-loan")) > 0 else "n"

    club = row.select(".team")[0].text.strip()
    if '“A”' in club or '“C”' in club:
      # Drop the last five characters (presumably the captaincy marker).
      club = club[:-5]
    club = unidecode.unidecode(club)

    competition = row.select(".league")[0].text.strip()
    r_gp = games_played(row, ".regular.gp")

    # The playoff GP used to be computed and then thrown away, so no playoff
    # row was ever returned for goalies. Build the row like the skater version.
    playoffs = None
    has_playoff_games = row.select(".postseason.gp")[0].text.strip() != "-"
    if has_playoff_games and row.select(".postseason")[0].text.strip() == "Playoffs":
      p_gp = games_played(row, ".postseason.gp")
      playoffs = [loan, "y", season_text, club, competition, p_gp] + trailing_blanks

    regular_season = [loan, "n", season_text, club, competition, r_gp] + trailing_blanks
    seasons.append((regular_season, playoffs))

  return seasons


def scrape_player_page(link, csvwr):
  """Download one player page and write the player's career rows to a CSV.

  Args:
    link: URL of the player's page.
    csvwr: csv writer (anything with ``writerow``) that receives the rows.

  Each row is first name, last name and date of birth followed by the season
  list from ``scrape_history_goalie`` (position "G") or ``scrape_history``
  (any other position). A playoff row is written right after its regular
  season row when one exists.

  Exceptions from the request (including a timeout) or from parsing are not
  handled here and propagate to the caller.
  """
  response = requests.get(
    link,
    headers={
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
      # The HTTP header is spelled "Referer"; "referrer" is not a header
      # that servers recognise.
      'Referer': 'https://google.com',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate, br',
      'Accept-Language': 'en-US,en;q=0.9',
      'Pragma': 'no-cache',
    },
    # Without a timeout a stalled connection would block the scrape forever.
    timeout=30,
  )

  player_page = BeautifulSoup(response.content)
  first_name, last_name = get_name(player_page)
  dob = get_dob(player_page)
  position_text = """
                                        Position
                                    """
  # The value sits three next_element steps after the label div.
  position = player_page.find('div', text=position_text).next_element.next_element.next_element.text.strip()

  if position == "G":
    seasons = scrape_history_goalie(player_page)
  else:
    seasons = scrape_history(player_page)

  identity = [first_name, last_name, dob]
  for regular_season, playoffs in seasons:
    csvwr.writerow(identity + regular_season)
    if playoffs is not None:
      csvwr.writerow(identity + playoffs)

In [22]:
def _full_season_suffix(season_label):
  """Expand a season label to the 'YYYY-YYYY' form appended to page URLs.

  Accepts 'YYYY-YY' (e.g. '1998-99') or 'YYYY-YYYY'. The end year is the
  start year plus one, so seasons before 2000 get the right century.
  """
  start_year = int(season_label[:4])
  return f'{start_year}-{start_year + 1}'


def _roster_player_links(roster_page):
  """Return the player hrefs in a page's roster table, in page order."""
  links = []
  for row in roster_page.select('[data-sort-ajax-container="#roster"] > tbody > tr'):
    anchors = row.select('.txt-blue a[href]')
    # Rows without a player link are skipped.
    if anchors:
      links.append(anchors[0]['href'])
  return links


def scrape():
  """Scrape every URL in ``leagues`` into one CSV per page.

  Reads the module-level ``leagues``, ``season`` and ``show_error_links``.
  A URL containing "draft" or "team" is fetched as given; any other URL is
  treated as a league page, gets the season suffix appended, and has each of
  its team rosters fetched for that season.

  For every page a file ``/content/leagues/<page name>.csv`` is written with
  a BOM, two header rows and the rows produced by ``scrape_player_page`` for
  each player link. A player whose page fails to scrape is skipped; when
  ``show_error_links`` is true the traceback and link are printed.
  """
  for league in tqdm(leagues, desc='Leagues'):
    draft = "draft" in league
    team = "team" in league

    if draft or team:
      season_suffix = None
      home_page_link = league
    else:
      # Only league pages need the season suffix, so it is only parsed here.
      season_suffix = _full_season_suffix(season)
      home_page_link = league + '/' + season_suffix

    # Without a timeout a stalled connection would block the scrape forever.
    home_page_page = requests.get(home_page_link, timeout=30)
    print(home_page_link)
    home_page = BeautifulSoup(home_page_page.content)

    title_selector = ".semi-logo" if team else ".plytitle"
    page_name = home_page.select(title_selector)[0].text.strip()
    page_name = ' '.join(page_name.replace('\n', '').split()).strip()
    # A "/" in the title would be read as a directory separator in the path.
    page_name = page_name.replace('/', '-')

    with open(f'/content/leagues/{page_name}.csv', 'w+', newline='', encoding='UTF-8') as csvfile:
      csvfile.write('\ufeff')
      csvwriter = csv.writer(csvfile, delimiter=',')
      top_header = 'staff_history, dmy'
      second_header = 'First name, Second name, Date of Birth, On loan, Playoffs, Year, Club, Competition, GP, G, A, PIM, +/-, Mins, GA, SO, W, L, T/OT, Saves, Edit Club, Edit Competition'
      csvwriter.writerow(top_header.split(','))
      csvwriter.writerow(second_header.split(','))

      if draft:
        # A set removes duplicate links.
        player_links = {
          player['href']
          for player in home_page.select('[data-sort-ajax-container="#drafted-players"] .player a')
        }
      elif team:
        player_links = _roster_player_links(home_page)
      else:
        player_links = []
        team_links = {
          team_anchor['href'] + '/' + season_suffix
          for team_anchor in home_page.select('.inner-rtl .leg-home-inner .list-as-columns > .column-4 > li > a')
        }
        for team_link in team_links:
          team_page = BeautifulSoup(requests.get(team_link, timeout=30).content)
          player_links.extend(_roster_player_links(team_page))

      for link in tqdm(player_links, desc='Players', leave=False):
        try:
          scrape_player_page(link, csvwriter)
        except Exception:
          # Best effort: one bad player page must not abort the whole run.
          if "team-captaincy" not in link and "nation?total" not in link and 'apple-touch-icon' not in link and 'player_page.find' not in link and show_error_links:
            traceback.print_exc()
            print(f'Missing player information for: {link}')

        # Random pause of up to three seconds between player requests.
        time.sleep(random.random() * 3)

# Output

You should see two progress bars: one showing the progress through the leagues (or team/draft pages) you want to scrape, and one showing progress through all of the players found for the current league or page.

Some players (often in low-level or obscure leagues) will be missing information such as shooting hand, height, weight, or full date of birth. If you set show_error_links to be True in the first cell, and if the scraper comes across such a player, it will print out a small error message stating "Missing player information for: " followed by a link to the player's EP page. The scraper will still include the player in the output CSV files, although some fields for that player will be empty.

To download the .zip, click the folder icon on the bar to the left of the screen, then right-click the file 'leagues.zip' and choose "Download".

In [23]:
# Run the scraper over every URL configured in `leagues`.
scrape()

HBox(children=(FloatProgress(value=0.0, description='Leagues', max=1.0, style=ProgressStyle(description_width=…

https://www.eliteprospects.com/team/29511/cranbrook-bucks


HBox(children=(FloatProgress(value=0.0, description='Players', max=26.0, style=ProgressStyle(description_width…




In [24]:
# Wait five seconds, then archive the output directory into leagues.zip.
time.sleep(5)
!zip leagues.zip -r '/content/leagues/'

# Optionally move the archive onto the mounted Google Drive.
if use_google_drive:
  !mv leagues.zip /content/drive/MyDrive/leagues.zip

updating: content/leagues/ (stored 0%)
updating: content/leagues/Cranbrook Bucks BCHL.csv (deflated 82%)


# Testing code, no need to look here

In [25]:
def scrape():
  """League-only variant of scrape(), kept as testing code.

  Running this cell rebinds the name ``scrape``. The function reads
  ``season_to_scrape``, which is not assigned in any cell visible in this
  notebook, and indexes each entry of ``leagues`` as ``league[0]`` (output
  file name) and ``league[1]`` (URL), i.e. it expects (name, url) pairs and
  not the URL strings produced by the configuration cell.
  NOTE(review): looks stale - confirm before calling.
  """
  season_suffix = season_to_scrape[:5] + '20' + season_to_scrape[5:]
  for league in tqdm(leagues, desc='Leagues'):
    # Fetch the league page for the season and parse it with BeautifulSoup
    home_page_link = league[1]+'/'+season_suffix
    home_page_page = requests.get(home_page_link)
    home_page = BeautifulSoup(home_page_page.content)

    with open(f'/content/leagues/{league[0]}.csv', 'w+', newline='', encoding='UTF-8') as csvfile:
      # BOM followed by the two header rows
      csvfile.write('\ufeff')
      csvwriter = csv.writer(csvfile, delimiter=',')
      top_header = 'staff_history, dmy'
      second_header = 'First name, Second name, Date of Birth, On loan, Playoffs, Year, Club, Competition, GP, G, A, PIM, +/-, Mins, GA, SO, W, L, T/OT, Saves, Edit Club, Edit Competition'
      csvwriter.writerow(top_header.split(','))
      csvwriter.writerow(second_header.split(','))

      player_links = []
      # Prefer team links from the standings table; fall back to the team list
      # (with the season suffix appended) when the standings table yields none.
      team_links = set([team['href'] for team in home_page.select('table.standings.table-sortable > tbody > tr > .team > a')])
      if not team_links:
        team_links = set([(team['href']+'/'+season_suffix) for team in home_page.select('.inner-rtl .leg-home-inner .list-as-columns > .column-4 > li > a')])
      for team_link in team_links:
        team_page = requests.get(team_link)
        team_page = BeautifulSoup(team_page.content)
        players = team_page.select('[data-sort-ajax-container="#roster"] > tbody > tr .txt-blue a[href]')
        player_links += [player['href'] for player in players]

      for link in tqdm(player_links, desc='Players', leave=False):
        try:
          scrape_player_page(link, csvwriter)  
        except Exception as e:
          # A failing player page is skipped; optionally report it.
          if "team-captaincy" not in link and "nation?total" not in link and 'apple-touch-icon' not in link and 'player_page.find' not in link and show_error_links:
            traceback.print_exc()
            print(f'Missing player information for: {link}')

        # Random pause of up to three seconds between player requests.
        time.sleep(random.random() * 3)

In [26]:
# Scratch cell: fetch and parse one player page for the manual experiments
# in the cells below.
player_page_link = 'https://www.eliteprospects.com/player/42871/carl-hudson'
player_page = requests.get(player_page_link, headers = {
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
      # NOTE(review): the standard HTTP header name is "Referer".
      'referrer': 'https://google.com',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate, br',
      'Accept-Language': 'en-US,en;q=0.9',
      'Pragma': 'no-cache',
  })
# Rebind the name from the HTTP response to the parsed document.
player_page = BeautifulSoup(player_page.content)

In [27]:
# Scratch: show every element with class "team-continent-NA" on the page.
player_page.select(".team-continent-NA")

[<tr class="team-continent-NA ">
 <td class="season sorted">
                                                                                     2001-02
                                                                                                     </td>
 <td class="team">
 <i><img src="//files.eliteprospects.com/layout/flags_s/3.png"/></i>
 <span class="txt-blue">
 <a href="https://www.eliteprospects.com/team/11930/timmins-steelers-u15-aaa/2001-2002?tab=stats"> Timmins Steelers U15 AAA </a>
 </span>
 </td>
 <td class="league"> <a href="https://www.eliteprospects.com/league/nohl-u15/stats/2001-2002"> NOHL U15 </a> </td>
 <td class="regular gp">32</td>
 <td class="regular g">10</td>
 <td class="regular a">21</td>
 <td class="regular tp">31</td>
 <td class="regular pim">47</td>
 <td class="regular pm"></td>
 <td class="separator"> | </td>
 <td class="postseason">
 <a href="https://www.eliteprospects.com/league/nohl-u15/stats/2001-2002"> </a>
 </td>
 <td class="postseason gp">
 </td

In [28]:
# Scratch version of the skater history loop. It only walks rows with class
# "team-continent-EU" and leaves its results in module-level variables;
# nothing is collected or written.
season_text = ""
for season in player_page.select(".player-statistics.regular.postseason")[0].select(".team-continent-EU"):
    p_gp = "0"
    p_g = "0"
    p_a = "0"
    p_pim = "0"
    p_pm = "0"

    # A blank season cell keeps the season of the previous row.
    if season.select(".season.sorted")[0].text.strip() != "":
      season_text = season.select(".season.sorted")[0].text.strip()

    if season_text == latest_season:
      break

    if len(season.select("fa.fa-loan")) > 0:
      loan = "y"
    else:
      loan = "n"

    club = season.select(".team")[0].text.strip()
    if '“A”' in club or '“C”' in club:
      club = club[:-5]

    competition = season.select(".league")[0].text.strip()
    r_gp = season.select(".regular.gp")[0].text.strip()
    r_g = season.select(".regular.g")[0].text.strip()
    r_a = season.select(".regular.a")[0].text.strip()
    r_pim = season.select(".regular.pim")[0].text.strip()
    r_pm = season.select(".regular.pm")[0].text.strip()

    playoffs = None
    # "and", not "or": `x != "" or x != "-"` is true for every x, so a playoff
    # row used to be built even when the cell was empty or "-".
    playoff_gp_text = season.select(".postseason.gp")[0].text.strip()
    if playoff_gp_text != "" and playoff_gp_text != "-":
      p_gp = playoff_gp_text
      p_g = season.select(".postseason.g")[0].text.strip()
      p_a = season.select(".postseason.a")[0].text.strip()
      p_pim = season.select(".postseason.pim")[0].text.strip()
      p_pm = season.select(".postseason.pm")[0].text.strip()
      # The goals column takes the playoff value p_g (was r_g by mistake).
      playoffs = [loan, "y", season_text, club, competition, p_gp, p_g, p_a, p_pim, p_pm, 0, 0, 0, 0, 0, 0, 0]

    regular_season = [loan, "n", season_text, club, competition, r_gp, r_g, r_a, r_pim, r_pm, 0, 0, 0, 0, 0, 0, 0]

In [29]:
# Scratch: show the stats-table rows whose class contains "team-continent-",
# the same selector the history scrapers iterate over.
player_page.select(".player-statistics.regular.postseason")[0].select('tr[class*="team-continent-"]')

[<tr class="team-continent-NA ">
 <td class="season sorted">
                                                                                     2001-02
                                                                                                     </td>
 <td class="team">
 <i><img src="//files.eliteprospects.com/layout/flags_s/3.png"/></i>
 <span class="txt-blue">
 <a href="https://www.eliteprospects.com/team/11930/timmins-steelers-u15-aaa/2001-2002?tab=stats"> Timmins Steelers U15 AAA </a>
 </span>
 </td>
 <td class="league"> <a href="https://www.eliteprospects.com/league/nohl-u15/stats/2001-2002"> NOHL U15 </a> </td>
 <td class="regular gp">32</td>
 <td class="regular g">10</td>
 <td class="regular a">21</td>
 <td class="regular tp">31</td>
 <td class="regular pim">47</td>
 <td class="regular pm"></td>
 <td class="separator"> | </td>
 <td class="postseason">
 <a href="https://www.eliteprospects.com/league/nohl-u15/stats/2001-2002"> </a>
 </td>
 <td class="postseason gp">
 </td