<a href="https://colab.research.google.com/github/colinrsmall/ehm_roster_tools/blob/master/EP_Career_History_Scraper_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instructions:

To add leagues to the list, copy and paste an entry in the following list and replace the league's name and EliteProspects link with the name and link of the league you want to scrape. Make sure that all entries except the last end with a comma (as you can see with the first entry). The name you choose for the entry only influences the name of the output file. You can get a league's URL by going to the league's homepage on EP and copying the URL for that page from your browser.

To change which season you're scraping for, change the season string following the list of leagues. The string should be of the format 'YYYY-YY' (such as '2019-20' or '2017-18'). This will be the season for which the scraper gets players. For example, setting this to 2020-21 will scrape all players that are contracted in the given league for that year.

Change latest_season to the season you want to stop scraping after. For example, if you want to get a player's history up to and including the 2007-08 season, set this to 2007-08.

If you want the scraper to print out links for players who are missing information on their EP page, change show_error_links to True.

To run the scraper, click runtime -> run all.

In [8]:
# Comma-separated EliteProspects page URLs to scrape. scrape() treats a URL
# containing "draft" as a draft page, one containing "team" as a team page,
# and anything else as a league page.
leagues = "https://www.eliteprospects.com/team/1580/anaheim-ducks" #@param {type:"string"}
# Split into a list, trimming whitespace around each URL and dropping empty
# entries (e.g. from "a, b" or a trailing comma) so no malformed URL is requested.
leagues = [url.strip() for url in leagues.split(',') if url.strip()]

# API key appended to every api.eliteprospects.com request.
api_key = "" #@param {type:"string"}

# Season whose roster pages are scraped, e.g. "2008-09".
season_year = "2008-09" #@param {type:"string"}
# Last season of each player's career history to include, same format.
latest_season = "2008-09" #@param {type:"string"}
# Optional comma-separated list of seasons; read by scrape_history().
seasons_to_scrape_text = "" #@param {type:"string"}
# When True, print a traceback and the player link for players that fail to scrape.
show_error_links = True #@param {type:"boolean"}
# When True, mount Google Drive and move the resulting zip there.
use_google_drive = False #@param {type:"boolean"}
# NOTE(review): not read by any of the visible cells -- confirm whether still needed.
only_goalies = False #@param {type:"boolean"}

In [9]:
# Mount Google Drive only when the use_google_drive setting is enabled.
if use_google_drive:
  # Colab-only module, imported inside the branch where it is used.
  from google.colab import drive
  drive.mount('/content/drive')
  # NOTE(review): this flag is not read by any of the visible cells.
  drive_mounted = True

# Expand this if you want to look at the code (optional)

In [10]:
!mkdir '/content/leagues/'

mkdir: cannot create directory ‘/content/leagues/’: File exists


In [11]:
import requests, random, csv, traceback, time, urllib.request
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from datetime import datetime
from google.colab import files

In [18]:
def get_season_records(player_id):
  """Fetch a player's season stat records from the EliteProspects API.

  Args:
    player_id: EliteProspects player id, interpolated into the request URL.

  Returns:
    The value stored under the "data" key of the JSON response.

  Raises:
    requests.HTTPError: if the API responds with an error status.
  """
  url = f"https://api.eliteprospects.com/v1/players/{player_id}/stats?sort=season&apiKey={api_key}"
  reply = requests.get(url)
  # Does nothing for a successful response and raises for an error status,
  # so the success path can simply follow it.
  reply.raise_for_status()
  payload = reply.json()
  return payload["data"]


def _season_stat_row(stats, position, loan, playoffs, year, club, competition):
  """Build one history row from a regular-season or postseason stats dict."""
  row = [loan, playoffs, year, club, competition, stats["GP"]]
  if position != "G":
    row += [stats["G"], stats["A"], stats["PIM"], stats["PM"]]
  # TODO: Scrape minutes, GA, SO, W, L, T, save records for goalies
  return row


def scrape_history(player_id, position):
  """Collect a player's career-history rows, season by season.

  Records come from get_season_records(). Records without a team are skipped.
  If seasons_to_scrape_text lists any seasons, only those seasons are kept;
  an empty setting keeps every season. Iteration stops once the records move
  past latest_season, so every record belonging to latest_season is included.

  Args:
    player_id: EliteProspects player id.
    position: Player position; "G" selects the shorter goalie row layout.

  Returns:
    A list of rows. Each row is
    [loan, playoffs, year, club, competition, GP], followed by
    [G, A, PIM, PM] for non-goalies. loan and playoffs are 'y'/'n' flags.
  """
  season_records = get_season_records(player_id)
  season_stats = []

  # "".split(",") yields [''], so blank entries are dropped explicitly; an
  # empty set then means "no filter".
  # Entries are compared against `year` below (e.g. "2008-09").
  wanted_seasons = {s.strip() for s in seasons_to_scrape_text.split(",") if s.strip()}
  reached_latest = False

  for season in season_records:
    # Shorten the slug by removing characters 5-6,
    # e.g. "2008-2009" -> "2008-09" (assumes slugs of that shape -- TODO confirm).
    slug = season["season"]["slug"]
    year = slug[:5] + slug[7:]

    # A season can have several records (e.g. more than one team), so stop only
    # when a record from a different season follows latest_season.
    if reached_latest and year != latest_season:
      break
    reached_latest = reached_latest or year == latest_season

    if not season["team"]:
      continue
    if wanted_seasons and year not in wanted_seasons:
      continue

    loan = 'y' if season["contractType"] == "loan" else 'n'
    club = season["team"]["name"]
    competition = season["league"]["name"]

    # Regular season row first, then the playoff row for the same season.
    if season["regularStats"]:
      season_stats.append(
        _season_stat_row(season["regularStats"], position, loan, "n", year, club, competition))
    if season["postseasonStats"]:
      season_stats.append(
        _season_stat_row(season["postseasonStats"], position, loan, "y", year, club, competition))

  return season_stats


def get_basic_player_info(player_id):
  """Fetch a player's name, position and date of birth from the EliteProspects API.

  Args:
    player_id: EliteProspects player id, interpolated into the request URL.

  Returns:
    A tuple (first_name, last_name, position, dob), where dob is formatted as
    day.month.year without zero padding (e.g. "5.3.1985").

  Raises:
    requests.HTTPError: if the API responds with an error status.
    ValueError / TypeError: if dateOfBirth is not a 'YYYY-MM-DD' string.
  """
  request_link = f"https://api.eliteprospects.com/v1/players/{player_id}?fields=firstName,lastName,position,dateOfBirth&apiKey={api_key}"
  response = requests.get(request_link)
  response.raise_for_status()

  player = response.json()["data"]
  first_name = player["firstName"]
  last_name = player["lastName"]
  position = player["position"]
  born = datetime.strptime(player["dateOfBirth"], '%Y-%m-%d')
  # The unpadded '%-d' / '%-m' strftime directives are a platform-specific
  # extension, so the string is assembled from the date fields instead.
  dob = f"{born.day}.{born.month}.{born.year}"
  return first_name, last_name, position, dob

def scrape_player_page(link, csvwr):
  """Write every career-history row for one player to the CSV writer.

  Args:
    link: Absolute EliteProspects player URL; the player id is the path
      segment right after ".../player/".
    csvwr: csv writer that receives one row per history record, each prefixed
      with the player's first name, last name and date of birth.
  """
  after_prefix = link.split("https://www.eliteprospects.com/player/")[1]
  player_id = after_prefix.split('/')[0]

  first_name, last_name, position, dob = get_basic_player_info(player_id)
  history = scrape_history(player_id, position)

  identity = [first_name, last_name, dob]
  csvwr.writerows(identity + record for record in history)

In [19]:
def _season_url_suffix(season):
  """Turn a season such as '2008-09' (or '2008-2009') into the 'YYYY-YYYY' URL suffix."""
  # Derive the end year from the start year instead of assuming the 2000s,
  # so e.g. '1998-99' becomes '1998-1999' rather than '1998-2099'.
  start_year = int(season[:4])
  return f"{start_year}-{start_year + 1}"


def _roster_player_links(page):
  """Return the player hrefs found in the roster table of a parsed team page."""
  links = []
  for row in page.select('[data-sort-ajax-container="#roster"] > tbody > tr'):
    anchors = row.select('.txt-blue a[href]')
    # Rows without a player link are skipped.
    if anchors:
      links.append(anchors[0]['href'])
  return links


def scrape():
  """Scrape every configured page and write one CSV per page to /content/leagues/.

  For each URL in `leagues`: a URL containing "draft" is scraped for its
  drafted players, a URL containing "team" for that team's roster in
  `season_year`, and any other URL is treated as a league page whose team
  rosters are all scraped. Each player's career history is then written via
  scrape_player_page(). Players that fail to scrape are skipped; when
  `show_error_links` is True the traceback and player link are printed.
  """
  season_suffix = _season_url_suffix(season_year)
  for league in tqdm(leagues, desc='Leagues'):
    draft = "draft" in league
    team = "team" in league

    home_page_link = league if draft else league+'/'+season_suffix
    home_page_page = requests.get(home_page_link)
    print(home_page_link)
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is installed.
    home_page = BeautifulSoup(home_page_page.content)

    # The page title (collapsed to single spaces) becomes the output file name.
    title_selector = ".semi-logo" if team else ".plytitle"
    page_name = home_page.select(title_selector)[0].text.strip()
    page_name = ' '.join(page_name.replace('\n', '').split()).strip()

    with open(f'/content/leagues/{page_name}.csv', 'w+', newline='', encoding='UTF-8') as csvfile:
      # Start the file with a UTF-8 byte-order mark.
      csvfile.write('\ufeff')
      csvwriter = csv.writer(csvfile, delimiter=',')
      top_header = 'staff_history, dmy'
      second_header = 'First name, Second name, Date of Birth, On loan, Playoffs, Year, Club, Competition, GP, G, A, PIM, +/-, Mins, GA, SO, W, L, T/OT, Saves, Edit Club, Edit Competition'
      csvwriter.writerow(top_header.split(','))
      csvwriter.writerow(second_header.split(','))

      if draft:
        player_links = {player['href'] for player in home_page.select('[data-sort-ajax-container="#drafted-players"] .player a')}
      elif team:
        player_links = _roster_player_links(home_page)
      else:
        # League page: follow every team link for the requested season.
        player_links = []
        team_links = {(team_anchor['href']+'/'+season_suffix) for team_anchor in home_page.select('.inner-rtl .leg-home-inner .list-as-columns > .column-4 > li > a')}
        for team_link in team_links:
          team_page = BeautifulSoup(requests.get(team_link).content)
          player_links.extend(_roster_player_links(team_page))

      # Links containing any of these markers are never reported as errors.
      ignored_markers = ("team-captaincy", "nation?total", 'apple-touch-icon', 'player_page.find')
      for link in tqdm(player_links, desc='Players', leave=False):
        try:
          scrape_player_page(link, csvwriter)
        except Exception:
          # Best effort: one failing player must not abort the whole run.
          if show_error_links and not any(marker in link for marker in ignored_markers):
            traceback.print_exc()
            print(f'Missing player information for: {link}')

        # Random pause of up to one second between players.
        time.sleep(random.random() * 1)

# Output

You should see three progress bars: one showing the progress through the leagues you want to scrape, one showing progress through all of the teams for a given league, and one showing progress through all of the players for a given team.

Some players (often in low-level or obscure leagues) will be missing information such as shooting hand, height, weight, or full date of birth. If you set show_error_links to be True in the first cell, and if the scraper comes across such a player, it will print out a small error message stating "Missing player information for: " followed by a link to the player's EP page. The scraper will still include the player in the output CSV files, although some fields for that player will be empty.

To download the .zip, you can click the folder icon on the bar to the left of the screen and right-click -> download the file 'leagues.zip'.

In [20]:
# Run the scraper over every URL configured in `leagues`.
scrape()

Leagues:   0%|          | 0/1 [00:00<?, ?it/s]

https://www.eliteprospects.com/team/1580/anaheim-ducks/2008-2009


Players:   0%|          | 0/39 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Pause briefly before archiving (presumably to let the CSV writes settle -- TODO confirm).
time.sleep(5)
# Recursively zip the output directory into leagues.zip in the working directory.
!zip leagues.zip -r '/content/leagues/'

# When Drive is enabled, move the archive into the mounted drive.
if use_google_drive:
  !mv leagues.zip /content/drive/MyDrive/leagues.zip

# Testing code, no need to look here

In [None]:
def scrape():
  """Earlier league-only variant of the scraper, kept under "Testing code".

  NOTE(review): executing this cell rebinds the name `scrape`, replacing the
  definition from the earlier cell.
  NOTE(review): `season_to_scrape` is not defined in any visible cell (the
  settings cell defines `season_year`) -- confirm before running.
  NOTE(review): each entry of `leagues` is indexed as league[0] (output file
  name) and league[1] (URL), i.e. this expects pairs, while the settings cell
  builds a list of URL strings -- confirm.
  """
  season_suffix = season_to_scrape[:5] + '20' + season_to_scrape[5:]
  for league in tqdm(leagues, desc='Leagues'):
    # Fetch the league page's HTML for the season and parse it with BeautifulSoup
    home_page_link = league[1]+'/'+season_suffix
    home_page_page = requests.get(home_page_link)
    home_page = BeautifulSoup(home_page_page.content)

    with open(f'/content/leagues/{league[0]}.csv', 'w+', newline='', encoding='UTF-8') as csvfile:
      # Start the file with a UTF-8 byte-order mark.
      csvfile.write('\ufeff')
      csvwriter = csv.writer(csvfile, delimiter=',')
      top_header = 'staff_history, dmy'
      second_header = 'First name, Second name, Date of Birth, On loan, Playoffs, Year, Club, Competition, GP, G, A, PIM, +/-, Mins, GA, SO, W, L, T/OT, Saves, Edit Club, Edit Competition'
      csvwriter.writerow(top_header.split(','))
      csvwriter.writerow(second_header.split(','))

      player_links = []
      # Prefer team links from the standings table; fall back to the team list
      # (with the season suffix appended) when the standings selector matches nothing.
      team_links = set([team['href'] for team in home_page.select('table.standings.table-sortable > tbody > tr > .team > a')])
      if not team_links:
        team_links = set([(team['href']+'/'+season_suffix) for team in home_page.select('.inner-rtl .leg-home-inner .list-as-columns > .column-4 > li > a')])
      for team_link in team_links:
        team_page = requests.get(team_link)
        team_page = BeautifulSoup(team_page.content)
        # Every player link in the team's roster table.
        players = team_page.select('[data-sort-ajax-container="#roster"] > tbody > tr .txt-blue a[href]')
        player_links += [player['href'] for player in players]

      for link in tqdm(player_links, desc='Players', leave=False):
        try:
          scrape_player_page(link, csvwriter)  
        except Exception as e:
          # Failures are swallowed; the traceback and link are printed only when
          # show_error_links is set and the link contains none of these markers.
          if "team-captaincy" not in link and "nation?total" not in link and 'apple-touch-icon' not in link and 'player_page.find' not in link and show_error_links:
            traceback.print_exc()
            print(f'Missing player information for: {link}')

        # Random pause of up to three seconds between players.
        time.sleep(random.random() * 3)

In [None]:
# Ad-hoc test: fetch one player's HTML page with browser-like request headers
# and parse it with BeautifulSoup.
player_page_link = 'https://www.eliteprospects.com/player/42871/carl-hudson'
player_page = requests.get(player_page_link, headers = {
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
      # NOTE(review): the standard HTTP header is spelled "Referer" -- confirm whether 'referrer' is intended.
      'referrer': 'https://google.com',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate, br',
      'Accept-Language': 'en-US,en;q=0.9',
      'Pragma': 'no-cache',
  })
# Rebinds player_page from the HTTP response to the parsed document.
player_page = BeautifulSoup(player_page.content)

In [None]:
# Show the elements on the parsed player page that carry the "team-continent-NA" CSS class.
player_page.select(".team-continent-NA")