<a href="https://colab.research.google.com/github/colinrsmall/ehm_roster_tools/blob/master/EP_Roster_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instructions:

To scrape leagues, add or remove links from the following "leagues" form field. The field should be a comma-separated string of links to either leagues or drafts from EliteProspects.

To change which season you're scraping for, change the "season" field following the list of leagues. The string should be of the format 'YYYY-YY' (such as '2019-20' or '2017-18').

The "contract_expiry_prefix" field should be set to the day and month on which you expect players' contracts to expire in-game, followed by "XXXX" as a placeholder for the year. For example, if you want all players scraped with this notebook to have their contracts expire on June 1st of the respective year, set the string to "1.6.XXXX".

If you want the scraper to print out links for players who are missing information on their EP page, change show_error_links to True.

If include_nhl_signed is ticked, players who are signed to an NHL team but are on loan (as listed in EP) will be listed as playing for the team that they are on loan to (and thus will not be listed as playing for their signed NHL team).

In [None]:
# Form fields (Colab "#@param" annotations) that configure the scraper.
leagues = "https://www.eliteprospects.com/league/france" #@param {type:"string"}
# Split the comma-separated form value into individual links. Whitespace
# around each link is stripped and empty entries (e.g. from a trailing comma)
# are dropped so that "a, b," yields two usable URLs.
leagues = [link.strip() for link in leagues.split(',') if link.strip()]

season = "2019-20" #@param {type:"string"}
contract_expiry_prefix = "30.4.XXXX" #@param {type:"string"}
show_error_links = True #@param {type:"boolean"}
include_nhl_signed = True #@param {type:"boolean"}

# Expand this if you want to look at the code (optional)

In [None]:
!mkdir '/content/leagues/'
!mkdir 'content/faces/'

mkdir: cannot create directory ‘/content/leagues/’: File exists
mkdir: cannot create directory ‘content/faces/’: No such file or directory


In [None]:
import requests, random, csv, traceback, time, urllib.request
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from datetime import datetime
from google.colab import files

In [None]:
def get_full_team_name(team_link):
  """Fetch a team page and return the team's displayed name.

  Downloads team_link, takes the stripped text of the first element matching
  the ".semi-logo" selector and returns only its first line.

  Raises IndexError when no ".semi-logo" element exists; errors raised by
  requests.get propagate unchanged.
  """
  response = requests.get(team_link)
  soup = BeautifulSoup(response.content)
  logo_text = soup.select(".semi-logo")[0].text.strip()
  first_line, _, _ = logo_text.partition('\n')
  return first_line

def correct_nation_text(nation_text):
  """Expand the abbreviated nation names this scraper knows about.

  "U.K." becomes "Great Britain" and "CzechRep." becomes "Czech Republic";
  every other value is returned unchanged.
  """
  replacements = (
    ("U.K.", "Great Britain"),
    ("CzechRep.", "Czech Republic"),
  )
  for abbreviation, full_name in replacements:
    if nation_text == abbreviation:
      return full_name
  return nation_text

def get_name(player_page):
  """Return (first_name, last_name) read from a player page header.

  The name is taken from the first line of the 'ep-entity-header__name' div.
  The first space-separated word is the first name; everything after it on
  that line is the last name (an empty string for single-word names).

  Raises AttributeError when the header div is not present on the page.
  """
  header = player_page.find('div', class_='ep-entity-header__name')
  # Cut at the first newline *before* splitting into words so that text on
  # the following lines can never leak into either part of the name.
  name_line = header.text.strip().split('\n')[0]
  first_name, _, remainder = name_line.partition(' ')
  return first_name, remainder.strip()


def find_season(tag):
  """Tag filter: match the table cell for the configured season.

  Returns True only when the tag's text contains the global `season` string
  and the tag's class list is exactly ['season', 'sorted'].
  """
  if season not in tag.text:
    return False
  if not tag.has_attr('class'):
    return False
  return tag['class'] == ['season', 'sorted']


def get_league(player_page):
  """Return the league text for the configured season, or "" if not found.

  Locates the season cell with find_season and reads the stripped text of the
  fourth sibling after it. Any failure along the way (no matching season
  cell, missing siblings) is reported with a printed message and yields "".
  """
  try:
    season_cell = player_page.find(find_season)
    league = season_cell.next_sibling.next_sibling.next_sibling.next_sibling.text.strip()
  except Exception:
    league = ""
    first_name, last_name = get_name(player_page)
    print(f'Player not found in league: {first_name} {last_name}')
  return league


def _strip_captaincy_marker(team):
  """Remove a trailing “A”/“C” marker, and the whitespace before it, from a team name."""
  if team.endswith(('“A”', '“C”')):
    return team[:-3].rstrip()
  return team


def get_team_playing(player_page):
  """Return the team listed for the configured season, or "" if unavailable.

  Starting from the season cell found by find_season, the team text is read
  three elements further on; if that fails, five elements further on is tried
  instead. A trailing “A” or “C” marker is removed from the result. When
  both attempts fail a message is printed and "" is returned.
  """
  try:
    team = player_page.find(find_season).next_element.next_element.next_element.text.strip()
    team = _strip_captaincy_marker(team)
  except Exception:
    try:
      team = player_page.find(find_season).next_element.next_element.next_element.next_element.next_element.text.strip()
      team = _strip_captaincy_marker(team)
    except Exception:
      team = ""
      first_name, last_name = get_name(player_page)
      print(f'Missing team information: {first_name} {last_name}')
  return team


def get_dob(player_page):
  """Return the player's date of birth as 'D.M.YYYY', or "" if unavailable.

  The value is read three elements after the "Date of Birth" label div. A
  full date such as 'Feb 27, 1994' becomes '27.2.1994' (no zero padding); a
  bare year such as '1994' becomes '1.1.1994'. If the field is missing or in
  any other format, a message is printed and "" is returned.
  """
  dob_search_text = """
                                    Date of Birth
                                """
  try:
    dob_text = player_page.find('div', text=dob_search_text).next_element.next_element.next_element.text.strip()
  except Exception:
    dob_text = None

  if dob_text is not None:
    try:
      born = datetime.strptime(dob_text, '%b %d, %Y')
      # Built from the date fields rather than strftime('%-d.%-m.%Y'): the
      # "%-d" style codes are a platform extension and not available everywhere.
      return f'{born.day}.{born.month}.{born.year}'
    except ValueError:
      pass
    try:
      born = datetime.strptime(dob_text, '%Y')
      return f'1.1.{born.year}'
    except ValueError:
      pass

  first_name, last_name = get_name(player_page)
  print(f'Missing dob information: {first_name} {last_name}')
  return ""


def get_birthplace(player_page):
  """Return the player's place of birth, lower-cased and colon-separated.

  The value is read three elements after the "Place of Birth" label div;
  every ', ' separator is replaced by ':' and the result is lower-cased.

  Raises AttributeError when the label div is not found.
  """
  place_of_birth_text = """
                                    Place of Birth
                                """
  label = player_page.find('div', text=place_of_birth_text)
  value_node = label.next_element.next_element.next_element
  raw_place = value_node.text.strip()
  return ':'.join(raw_place.split(', ')).lower()


def get_nations(player_page):
  """Return (primary_nation, secondary_nation, declared_nation) for a player.

  The value is read three elements after the "Nation" label div and split on
  '/'. With two or more nations the first two are returned and the declared
  nation is 'Primary Nationality'; with a single nation both the secondary
  and the declared nation are '[None]'.

  Raises AttributeError when the label div is not found.
  """
  nation_search_text = """
                                    Nation
                                """
  raw_text = player_page.find('div', text=nation_search_text).next_element.next_element.next_element.text

  def clean(part):
    # correct_nation_text recognises names with all whitespace removed
    # (e.g. "CzechRep."), so try that form first. For every other nation keep
    # single spaces between words so multi-word names are not run together.
    collapsed = "".join(part.split())
    corrected = correct_nation_text(collapsed)
    if corrected != collapsed:
      return corrected
    return " ".join(part.split())

  nations = [clean(part) for part in raw_text.split('/')]
  if len(nations) > 1:
    return nations[0], nations[1], 'Primary Nationality'
  return nations[0], '[None]', '[None]'


def get_contracted_team(player_page):
  """Return (contract_expiry, contracted_team, join_date) for a player.

  contract_expiry is the global contract_expiry_prefix with its last four
  characters replaced by "20" plus the part after "/" in the "Contract"
  field; when the field has a "+N" suffix, N is added to that year first.
  If the field is missing or cannot be parsed, '1.2.1900' is used.

  contracted_team is the full name of the team linked from the first
  confirmed transfer (looked up via get_full_team_name) and join_date is that
  transfer's date as 'DD.MM.YYYY'. If there is no confirmed transfer, or any
  step fails, both are "".
  """
  contract_text = """
                                    Contract
                                """
  try:
    contract_expiry_text = player_page.find('div', text=contract_text).next_element.next_element.next_element.text.strip()
    if "+" in contract_expiry_text:
      base_text, plus_year = contract_expiry_text.split("+")[:2]
      end_year = int(base_text.split("/")[1]) + int(plus_year)
      # Zero-pad so a single-digit result still yields a four-digit year.
      contract_expiry = contract_expiry_prefix[:-4] + "20" + f"{end_year:02d}"
    else:
      contract_expiry = contract_expiry_prefix[:-4] + "20" + contract_expiry_text.split("/")[1]
  except Exception:
    contract_expiry = '1.2.1900'

  try:
    team_href = player_page.select(".transfer.confirmed > .to > a")[0]['href']
    contracted_team = get_full_team_name(team_href)
    join_date_text = player_page.select(".transfer.confirmed > .date")[0].text.strip()
    try:
      join_date = datetime.strptime(join_date_text, '%m/%d/%Y').strftime('%d.%m.%Y')
    except ValueError:
      # Same fields, dash-separated.
      join_date = datetime.strptime(join_date_text, '%m-%d-%Y').strftime('%d.%m.%Y')
  except Exception:
    contracted_team = ""
    join_date = ""

  return contract_expiry, contracted_team, join_date


def _read_metric_value(player_page, label_text, unit):
  """Return the number in front of unit from a labelled field, or "" if absent."""
  try:
    raw_text = player_page.find('div', text=label_text).next_element.next_element.next_element.text
  except Exception:
    return ""
  # The field may hold several "/"-separated segments; use the one that
  # carries the metric unit, preferring the last, and never fall back to a
  # segment without it.
  for segment in reversed(raw_text.split('/')):
    if unit in segment:
      return segment.split(unit)[0].strip()
  return ""


def get_height_weight(player_page):
  """Return (height, weight) as strings holding the cm and kg values.

  Each value is read three elements after its "Height" / "Weight" label div.
  When a value is missing or has no metric segment, a message is printed and
  "" is returned for that value.
  """
  height_text = """
                                    Height
                                """
  weight_text = """
                                    Weight
                                """
  height = _read_metric_value(player_page, height_text, 'cm')
  if not height:
    first_name, last_name = get_name(player_page)
    print(f'Missing height information: {first_name} {last_name}')

  weight = _read_metric_value(player_page, weight_text, 'kg')
  if not weight:
    first_name, last_name = get_name(player_page)
    print(f'Missing weight information: {first_name} {last_name}')
  return height, weight


def get_handedness(player_page):
  """Return the player's handedness text.

  Reads the value three elements after the "Shoots" label div; if that fails
  for any reason, the "Catches" label is used instead. An error from the
  "Catches" lookup propagates to the caller.
  """
  shoots_text = """
                                    Shoots
                                """
  catches_text = """
                                    Catches
                                """

  def value_for(label_text):
    label = player_page.find('div', text=label_text)
    return label.next_element.next_element.next_element.text.strip()

  try:
    return value_for(shoots_text)
  except Exception:
    return value_for(catches_text)


def check_nhl_team(contracted_team):
  """Return True when contracted_team names one of the listed NHL teams.

  The comparison ignores letter case, so differences in capitalisation (for
  example "Chicago Blackhawks" vs "Chicago BlackHawks") still match. Values
  that are not strings are never considered NHL teams.
  """
  nhl_teams = ['Anaheim Ducks', 'Arizona Coyotes', 'Boston Bruins', 'Buffalo Sabres', 'Calgary Flames', 'Carolina Hurricanes',
               'Chicago BlackHawks', 'Colorado Avalanche', 'Columbus Blue Jackets', 'Dallas Stars', 'Detroit Red Wings',
               'Edmonton Oilers', 'Florida Panthers', 'Los Angeles Kings', 'Minnesota Wild', 'Montreal Canadiens', 'Nashville Predators',
               'New Jersey Devils', 'New York Islanders', 'New York Rangers', 'Ottawa Senators', 'Philadelphia Flyers',
               'Pittsburgh Penguins', 'St. Louis Blues', 'San Jose Sharks', 'Tampa Bay Lightning', 'Toronto Maple Leafs',
               'Vancouver Canucks', 'Vegas Golden Knights', 'Washington Capitals', 'Winnipeg Jets']

  if not isinstance(contracted_team, str):
    return False
  wanted = contracted_team.casefold()
  return any(team.casefold() == wanted for team in nhl_teams)

def scrape_player_page(link):
  """Download one player page and collect every field needed for a CSV row.

  Returns a 16-tuple: first_name, last_name, team_playing, league, dob,
  birth_place, primary_nation, secondary_nation, declared_nation, position,
  height, weight, handedness, contract_expiry, contracted_team, join_date.

  The team the player is listed as playing for is the contracted team only
  when that team is an NHL team and the global include_nhl_signed is False;
  otherwise it is read from the season row via get_team_playing.

  Errors from helpers that do not handle missing fields themselves (and from
  the "Position" lookup) propagate to the caller.
  """
  request_headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'referrer': 'https://google.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Pragma': 'no-cache',
  }
  response = requests.get(link, headers=request_headers)
  player_page = BeautifulSoup(response.content)

  # The helpers are called in a fixed order so their printed messages appear
  # in the same sequence for every player.
  first_name, last_name = get_name(player_page)
  league = get_league(player_page)
  dob = get_dob(player_page)
  birth_place = get_birthplace(player_page)
  primary_nation, secondary_nation, declared_nation = get_nations(player_page)
  contract_expiry, contracted_team, join_date = get_contracted_team(player_page)

  use_contracted_team = check_nhl_team(contracted_team) and not include_nhl_signed
  team_playing = contracted_team if use_contracted_team else get_team_playing(player_page)

  position_text = """
                                    Position
                                """
  position_label = player_page.find('div', text=position_text)
  position = position_label.next_element.next_element.next_element.text.strip()

  height, weight = get_height_weight(player_page)
  handedness = get_handedness(player_page)

  return (
    first_name, last_name, team_playing, league, dob, birth_place,
    primary_nation, secondary_nation, declared_nation, position,
    height, weight, handedness, contract_expiry, contracted_team, join_date,
  )

In [None]:
# (left_wing, center, right_wing) ratings for the forward profiles that are
# drawn at random. The order is significant: random.choice picks by index, so
# reordering would change which profile a given random state produces.
_RANDOM_FORWARD_PROFILES = (
  (12, 20, 12),  # C
  (20, 12, 12),  # LW
  (12, 12, 20),  # RW
  (12, 17, 20),  # RW/C
  (20, 17, 12),  # LW/C
  (17, 20, 12),  # C/LW
  (12, 20, 17),  # C/RW
  (20, 12, 17),  # LW/RW
  (17, 12, 20),  # RW/LW
  (20, 17, 17),  # LW/RW/C
  (17, 17, 20),  # RW/LW/C
  (17, 20, 17),  # C/LW/RW
)

# Positions whose ratings depend on neither handedness nor chance. Slots not
# listed for a position keep the default rating of 1.
_FIXED_POSITION_RATINGS = {
  'C': {'left_wing': 12, 'center': 20, 'right_wing': 12},
  'LW': {'left_wing': 20, 'center': 12, 'right_wing': 12},
  'RW': {'left_wing': 12, 'center': 12, 'right_wing': 20},
  'G': {'goaltender': 20},
  'C/RW': {'center': 20, 'right_wing': 17},
  'C/LW': {'center': 20, 'left_wing': 17},
  'LW/C': {'center': 17, 'left_wing': 20},
  'LW/RW': {'left_wing': 20, 'right_wing': 17},
  'RW/LW': {'right_wing': 20, 'left_wing': 17},
  'RW/C': {'right_wing': 20, 'center': 17},
  'C/W': {'center': 20, 'left_wing': 17, 'right_wing': 17},
}


def _set_forward(ratings, profile):
  """Apply a (left_wing, center, right_wing) profile to ratings."""
  ratings['left_wing'], ratings['center'], ratings['right_wing'] = profile


def _rate_both_defense_sides(ratings, shoots, off_side, warn_if_unknown):
  """Rate the shooting-hand defense side 20 and the other side off_side."""
  if shoots == 'R':
    ratings['left_defense'] = off_side
    ratings['right_defense'] = 20
  elif shoots == 'L':
    ratings['left_defense'] = 20
    ratings['right_defense'] = off_side
  elif warn_if_unknown:
    print("UNKNOWN HANDEDNNESS: " + shoots)


def _rate_strong_defense_side(ratings, shoots, rating):
  """Rate only one defense side: right for 'R' shooters, left for everyone else."""
  side = 'right_defense' if shoots == 'R' else 'left_defense'
  ratings[side] = rating


def _rate_wings_at_random(ratings):
  """Pick one wing as primary (20) and the other as secondary (17) by coin flip."""
  if random.random() > 0.5:
    ratings['right_wing'] = 20
    ratings['left_wing'] = 17
  else:
    ratings['right_wing'] = 17
    ratings['left_wing'] = 20


def determine_position(pos, shoots):
  """Turn a position string and handedness into six positional ratings.

  Returns (goaltender, left_defense, right_defense, left_wing, center,
  right_wing). Every slot starts at 1; primary positions are rated 20 and
  secondary ones 17 or 12 depending on the position string.

  Several positions involve chance ('D', 'F', 'D/F', 'W', 'D/W', 'F/D' and
  any unrecognised string), so repeated calls can differ unless the random
  module is seeded. For 'D' and 'D/F' an unrecognised `shoots` value prints a
  warning and leaves the defense slots at 1.
  """
  ratings = {
    'goaltender': 1,
    'left_defense': 1,
    'right_defense': 1,
    'left_wing': 1,
    'center': 1,
    'right_wing': 1,
  }

  if isinstance(pos, str) and pos in _FIXED_POSITION_RATINGS:
    ratings.update(_FIXED_POSITION_RATINGS[pos])

  elif pos == 'D':
    # The random draw happens before handedness is inspected, even when the
    # handedness turns out to be unknown.
    off_side = 12 if random.random() < 0.8 else 17
    _rate_both_defense_sides(ratings, shoots, off_side, warn_if_unknown=True)

  elif pos == 'F':
    _set_forward(ratings, random.choice(_RANDOM_FORWARD_PROFILES))

  elif pos == 'D/F':
    _rate_both_defense_sides(ratings, shoots, 12, warn_if_unknown=True)
    _set_forward(ratings, random.choice(_RANDOM_FORWARD_PROFILES))

  elif pos == 'D/LW':
    _rate_both_defense_sides(ratings, shoots, 12, warn_if_unknown=False)
    ratings['left_wing'] = 17

  elif pos == 'D/C':
    _rate_both_defense_sides(ratings, shoots, 12, warn_if_unknown=False)
    ratings['center'] = 17

  elif pos == 'W/C':
    if shoots == 'R':
      ratings['right_wing'] = 20
      ratings['left_wing'] = 17
    else:
      ratings['right_wing'] = 17
      ratings['left_wing'] = 20
    ratings['center'] = 17

  elif pos == 'W':
    _rate_wings_at_random(ratings)

  elif pos == 'D/W':
    _rate_wings_at_random(ratings)
    _rate_strong_defense_side(ratings, shoots, 20)

  elif pos == 'C/D':
    ratings['center'] = 20
    _rate_strong_defense_side(ratings, shoots, 20)

  elif pos == 'F/D':
    _rate_strong_defense_side(ratings, shoots, 17)
    if random.random() > 0.5:
      ratings['center'] = 20
    else:
      ratings['left_wing'] = 20

  elif pos == 'D/RW':
    _rate_strong_defense_side(ratings, shoots, 20)
    ratings['right_wing'] = 17

  elif pos == 'RW/D':
    ratings['right_wing'] = 20
    _rate_strong_defense_side(ratings, shoots, 17)

  elif pos == 'LW/D':
    ratings['left_wing'] = 20
    _rate_strong_defense_side(ratings, shoots, 17)

  elif pos == 'C/RW/D':
    ratings['right_wing'] = 17
    ratings['center'] = 20
    _rate_strong_defense_side(ratings, shoots, 17)

  else:
    # Unrecognised position: draw one of the forward profiles or, as the
    # thirteenth option (None), a defenseman.
    profile = random.choice(_RANDOM_FORWARD_PROFILES + (None,))
    if profile is not None:
      _set_forward(ratings, profile)
    elif shoots == 'R':
      ratings['right_defense'] = 20
    elif shoots == "L":
      ratings['left_defense'] = 20
    else:
      ratings['right_defense'] = 20
      ratings['left_defense'] = 17

  return (
    ratings['goaltender'],
    ratings['left_defense'],
    ratings['right_defense'],
    ratings['left_wing'],
    ratings['center'],
    ratings['right_wing'],
  )

In [None]:
def _season_url_suffix(season_text):
  """Expand a 'YYYY-YY' season into the 'YYYY-YYYY' form appended to league links."""
  try:
    start_year = int(season_text[:4])
  except ValueError:
    # No recognisable start year: keep the plain string-based expansion.
    return season_text[:5] + '20' + season_text[5:]
  # Derive the end year from the start year instead of assuming the 2000s.
  return f'{start_year}-{start_year + 1}'


def scrape():
  """Scrape every configured league or draft into its own CSV file.

  For each link in the global `leagues`, the league (or draft) page is
  downloaded, the player links are collected (from the draft table, or from
  each team's roster for a league) and every player is scraped with
  scrape_player_page. One row per player is written to
  /content/leagues/<page title>.csv underneath two fixed header rows.

  A player whose page raises any exception is skipped; when the global
  show_error_links is True the traceback and the link are printed, except for
  a few known non-player links. A random pause of up to five seconds follows
  each player.
  """
  season_suffix = _season_url_suffix(season)
  for league_link in tqdm(leagues, desc='Leagues'):
    draft = "draft" in league_link

    home_page_link = league_link if draft else league_link+'/'+season_suffix
    home_page_page = requests.get(home_page_link)
    home_page = BeautifulSoup(home_page_page.content)

    page_name = home_page.select(".plytitle")[0].text.strip()

    with open(f'/content/leagues/{page_name}.csv', 'w+', newline='', encoding='UTF-8') as csvfile:
      # Byte-order mark written ahead of the header rows.
      csvfile.write('\ufeff')
      csvwriter = csv.writer(csvfile, delimiter=',')
      top_header = 'staff,dmy,metric,,|--,NATIONALITY,,--|,,|--,,,CLUB CONTRACT,,,,--|,|--,NATION CONTRACT,,--|,|--,NATIONAL TEAM,--|,|-- NHL,--|,|--,EDIT DETAILS,--|,|--,,,STAFF ATTRIBUTES,,,,--|,|-- PLAYER ABILITY,--|,|--,PLAYER REPUTATION,--|,|--,,POSITION,,,--|,|--,ROLE,--|,,|--,JERSEY #,--|,,,,|--,,,,,,MENTAL,,,,,,,--|,|--,,,,PHYSICAL,,,,,--|,|--,,,,,,TECHNICAL,,,,,,,--|,|--,,GOALIE,,--|,|-- NON-PLAYER ABILITY,--|,|--,NON-PLAYER REPUTATION,--|,|--,,,TECHNICAL ABILITY,,,,--|,|--,,,TECHNIQUE,,,--|,,|--,BUSINESS,--|,|--,,MENTAL,,--|,|--,,NOT IMPORTED,,--|'
      second_header = 'Mode (e),First Name,Second Name,Date of Birth,Nation,SecondNation,DeclaredNation,BirthTown,Classification,JobForClub,ClubContracted,ClubPlaying,DateJoinedClub,ContractExpiresClub,EstimatedWage,EstimatedWageWeekly,EstimatedValue,JobForNation,NationContracted,DateJoinedNation,ContractExpiresNation,InternationalApps,InternationalGoals,InternationalAssists,FirstNHLContract,StanleyCupsWon,New first name,New second name,New date of birth,Adaptability,Ambition,Determination,Loyalty,Pressure,Professionalism,Sportsmanship,Temperament,CurrentAbility,PotentialAbility,HomeReputation,CurrentReputation,WorldReputation,Goaltender,LeftDefense,RightDefense,LeftWing,Center,RightWing,DefensiveRole,OffensiveRole,Role,Hand,FavouriteNumber,SquadNumber,InternationalSquadNumber,HeightCm,WeightKg,JnrPreference,Aggression,Anticipation,Bravery,Consistency,Decisions,Dirtiness,Flair,ImportantMatches,Leadership,Morale,PassTendency,Teamwork,Creativity,WorkRate,Acceleration,Agility,Balance,Fighting,Hitting,InjuryProneness,NaturalFitness,Pace,Stamina,Strength,Agitation,Checking,Deflections,Deking,Faceoffs,Movement,OneOnOnes,Passing,Pokecheck,Positioning,Slapshot,Stickhandling,Versatility,Wristshot,Blocker,Glove,Rebounds,Recovery,Reflexes,CurrentAbility,PotentialAbility,HomeReputation,CurrentReputation,WorldReputation,PreferredJob,Attacking,Directness,FreeRoles,LineMatching,PenaltyKill,Physical,PowerPlay,CoachingGoaltenders,CoachingDefensemen,CoachingForwards,CoachingTechnique,Judgement,JudgingPotential,Tactics,Physiotherapy,Business,Patience,Resources,Discipline,Interference,ManHandling,Motivating,Youngsters,League contracted,League playing,Latest career history,NHL Draft Eligible,NHL Drafted,Jersey Number'
      csvwriter.writerow(top_header.split(','))
      csvwriter.writerow(second_header.split(','))

      # Jersey numbers are only available from team rosters. The mapping is
      # created up front so draft pages simply end up with no numbers.
      player_numbers = {}
      if draft:
        player_links = set([player['href'] for player in home_page.select('[data-sort-ajax-container="#drafted-players"] .player a')])
      else:
        player_links = []
        team_links = set([team['href'] for team in home_page.select('table.standings.table-sortable > tbody > tr > .team > a')])
        if not team_links:
          # No standings table on the page: fall back to the list of team
          # links and add the season to each one.
          team_links = set([(team['href']+'/'+season_suffix) for team in home_page.select('.inner-rtl .leg-home-inner .list-as-columns > .column-4 > li > a')])
        for team_link in team_links:
          team_page = requests.get(team_link)
          team_page = BeautifulSoup(team_page.content)
          for player in team_page.select('[data-sort-ajax-container="#roster"] > tbody > tr'):
            try:
              player_link = player.select('.txt-blue a[href]')[0]
              player_links.append(player_link['href'])
              player_name = player.select('.txt-blue')[0].text.split('(')[0].strip()
              player_number = player.select('.jersey')[0].text.split("#")[1].strip()
              player_numbers[player_name] = player_number
            except IndexError:
              # Roster rows without a player link or a jersey number.
              continue

      for link in tqdm(player_links, desc='Players', leave=False):
        try:
          # The player's league gets its own name so it does not overwrite
          # the league link being iterated by the outer loop.
          first_name, last_name, team, player_league, dob, birth_place, primary_nation, secondary_nation, declared_nation, position, height, weight, shoots, contract_expiry, contracted_team, join_date = scrape_player_page(link)
          if contracted_team == "":
            contracted_team = team
          goaltender, left_defense, right_defense, left_wing, center, right_wing = determine_position(position, shoots)
          player_number = player_numbers.get(first_name + " " + last_name, "")
          csvwriter.writerow(['',first_name, last_name, dob, primary_nation, secondary_nation, declared_nation, birth_place, 'Player', 'Player', contracted_team, team, join_date, contract_expiry, '0', '0', '0', '', '', '1.2.1900', '1.2.1900', '0', '0', '0', '1.2.1900'] + ['']*17 + [goaltender, left_defense, right_defense, left_wing, center, right_wing, '', '', '', 'Left' if shoots == 'L' else 'Right',  '', '', '', height, weight] + [''] * 73 + [player_league, player_league] + ["", "", "", player_number])
        except Exception:
          if "team-captaincy" not in link and "nation?total" not in link and 'apple-touch-icon' not in link and 'player_page.find' not in link and show_error_links:
            traceback.print_exc()
            print(f'Missing player information for: {link}')
        time.sleep(random.random() * 5)

# Output

You should see two progress bars: one showing the progress through the leagues you want to scrape, and one showing progress through all of the players in the league currently being scraped.

Some players (often in low-level or obscure leagues) will be missing information such as shooting hand, height, weight, or full date of birth. If you set show_error_links to be True in the first cell, and if the scraper comes across such a player, it will print out a small error message stating "Missing player information for: " followed by a link to the player's EP page. The scraper will still include the player in the output CSV files, although some fields for that player will be empty.

To download the .zip, click the folder icon on the bar to the left of the screen, then right-click the file 'leagues.zip' and choose "Download".

In [None]:
scrape()

HBox(children=(FloatProgress(value=0.0, description='Leagues', max=1.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Players', max=242.0, style=ProgressStyle(description_widt…

KeyboardInterrupt: ignored

In [None]:
# Wait five seconds, then zip the /content/leagues/ directory (recursively)
# into leagues.zip in the current working directory.
time.sleep(5)
!zip leagues.zip -r '/content/leagues/'

# Testing code, no need to look here

In [None]:
# Scratch cell: download a single known player page, using the same request
# headers as scrape_player_page, so the helpers can be tried out by hand.
player_page_link = 'https://www.eliteprospects.com/player/76333/mike-matheson'
player_page = requests.get(player_page_link, headers = {
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
      'referrer': 'https://google.com',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate, br',
      'Accept-Language': 'en-US,en;q=0.9',
      'Pragma': 'no-cache',
  })
# Rebind the name from the HTTP response to the parsed document.
player_page = BeautifulSoup(player_page.content)

In [None]:
# Run the contract helper against the page fetched above.
get_contracted_team(player_page)

In [None]:

# Scratch copy of the body of get_contracted_team, run at top level against
# the player_page fetched earlier.
# NOTE(review): this search string appears to have less leading whitespace
# than the one inside get_contracted_team — confirm it still matches the page.
contract_text = """
                                  Contract
                              """
try:
  contract_expiry_text = player_page.find('div', text=contract_text).next_element.next_element.next_element.text.strip()
  if "+" in contract_expiry_text:
    plus_year = contract_expiry_text.split("+")[1]
    contract_expiry_text = contract_expiry_text.split("+")[0]
    contract_expiry = contract_expiry_prefix[:-4] + "20" + str(int(contract_expiry_text.split("/")[1])+int(plus_year))
  else:
    contract_expiry = contract_expiry_prefix[:-4] + "20" + contract_expiry_text.split("/")[1]
except Exception:
  contract_expiry = '1.2.1900'
  contracted_team = ""

# Unlike the function, the transfer lookup below is not wrapped in try/except,
# so a page without a confirmed transfer raises here.
contracted_team = player_page.select(".transfer.confirmed > .to > a")[0]['href']
contracted_team = get_full_team_name(contracted_team)
join_date_text = player_page.select(".transfer.confirmed > .date")[0].text.strip()
join_date = datetime.strptime(join_date_text, '%m/%d/%Y').strftime('%d.%m.%Y')

In [None]:
# Display the team name resolved above.
contracted_team

In [None]:
# Scratch cell: repeat the confirmed-transfer lookup, this time parsing the
# join date as 'YYYY-MM-DD' (get_contracted_team tries 'MM/DD/YYYY' and
# 'MM-DD-YYYY' instead).
contracted_team = player_page.select(".transfer.confirmed > .to > a")[0]['href']
contracted_team = get_full_team_name(contracted_team)
join_date_text = player_page.select(".transfer.confirmed > .date")[0].text.strip()
join_date = datetime.strptime(join_date_text, '%Y-%m-%d').strftime('%d.%m.%Y')

In [None]:
# Display only the first line of the resolved team name.
contracted_team.split('\n')[0]

In [None]:
# Display the raw "Contract" field text; relies on contract_text from an earlier cell.
player_page.find('div', text=contract_text).next_element.next_element.next_element.text.strip()

In [None]:
# Scratch cell: download a player image into the relative faces/ directory.
# NOTE(review): player_image is not assigned in any visible cell, and
# first_name, last_name and dob only exist after the scraping loop cell has
# run — confirm they are defined before executing this.
urllib.request.urlretrieve("https://"+player_image, f"faces/{first_name}_{last_name}_{dob}.jpg")

In [None]:
# Run the date-of-birth helper against the test page.
get_dob(player_page)

In [None]:
# Scratch variant of the contract lookup without the "+N" handling or any
# error handling; here the team name is the transfer link's own text rather
# than the result of get_full_team_name.
contract_expiry_text = player_page.find('div', text=contract_text).next_element.next_element.next_element.text.strip()
contract_expiry = contract_expiry_prefix[:-4] + "20" + contract_expiry_text.split("/")[1]
contracted_team = player_page.select(".transfer.confirmed > .to > a")[0].text.strip()
join_date_text = player_page.select(".transfer.confirmed > .date")[0].text.strip()
join_date = datetime.strptime(join_date_text, '%m/%d/%Y').strftime('%d.%m.%Y')

In [None]:
# Check that dropping the last four characters removes a trailing captaincy
# marker together with the space before it.
'North Central Predators Mdgt AAA “C”'[:-4]

In [None]:
# Scratch cell: fetch one draft page and display the set of player links
# found by the same selector scrape() uses for drafts.
league_page = requests.get('https://www.eliteprospects.com/draft/gmhl-draft/2020')
league_page = BeautifulSoup(league_page.content)
set([player['href'] for player in league_page.select('[data-sort-ajax-container="#drafted-players"] .player a')])

In [None]:
# Scratch copy of the per-player loop from scrape(). This version writes no
# jersey-number columns, filters fewer known non-player links and pauses for
# at most three seconds between players.
# NOTE(review): player_links and csvwriter must already exist at top level;
# csvwriter is only created inside scrape() in the visible code — confirm
# before running.
for link in tqdm(player_links, desc='Players', leave=False):
  try:
    first_name, last_name, team, league, dob, birth_place, primary_nation, secondary_nation, declared_nation, position, height, weight, shoots, contract_expiry, contracted_team, join_date = scrape_player_page(link)
    # Fall back to the team the player is listed as playing for.
    if contracted_team == "":
      contracted_team = team
    goaltender, left_defense, right_defense, left_wing, center, right_wing = determine_position(position, shoots)
    csvwriter.writerow(['',first_name, last_name, dob, primary_nation, secondary_nation, declared_nation, birth_place, 'Player', 'Player', contracted_team, team, join_date, contract_expiry, '0', '0', '0', '', '', '1.2.1900', '1.2.1900', '0', '0', '0', '1.2.1900'] + ['']*17 + [goaltender, left_defense, right_defense, left_wing, center, right_wing, '', '', '', 'Left' if shoots == 'L' else 'Right',  '', '', '', height, weight] + [''] * 73 + [league, league])
  except Exception as e:
    # Any failure skips the player; the link is reported when enabled.
    if "team-captaincy" not in link and "nation?total" not in link and show_error_links:
      traceback.print_exc()
      print(f'Missing player information for: {link}')
  time.sleep(random.random() * 3)

In [None]:
# Scratch cell: run the roster-collection part of scrape() for the first
# configured league only, leaving player_links and player_numbers at top
# level for inspection.
league = leagues[0]
draft = "draft" in league
season_suffix = season[:5] + '20' + season[5:]
home_page_link = league if draft else league+'/'+season_suffix
home_page_page = requests.get(home_page_link)
home_page = BeautifulSoup(home_page_page.content)
player_links = []
player_numbers = {}
team_links = set([team['href'] for team in home_page.select('table.standings.table-sortable > tbody > tr > .team > a')])
if not team_links:
  # No standings table on the page: fall back to the list of team links.
  team_links = set([(team['href']+'/'+season_suffix) for team in home_page.select('.inner-rtl .leg-home-inner .list-as-columns > .column-4 > li > a')])
for team_link in team_links:
  team_page = requests.get(team_link)
  team_page = BeautifulSoup(team_page.content)
  for player in team_page.select('[data-sort-ajax-container="#roster"] > tbody > tr'):
    try:
      player_link = player.select('.txt-blue a[href]')[0]
      # append, not "+=": extending a list with a string would add the link
      # one character at a time.
      player_links.append(player_link['href'])
      player_name = player.select('.txt-blue')[0].text.split('(')[0].strip()
      player_number = player.select('.jersey')[0].text.split("#")[1].strip()
      player_numbers[player_name] = player_number
    except IndexError:
      # Roster rows without a player link or a jersey number.
      continue

In [None]:
# Display the name -> jersey number mapping collected by the previous cell.
player_numbers