<a href="https://colab.research.google.com/github/frankwillard/NBA-Hall-Of-Fame-Model/blob/main/Player_Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Libraries

In [11]:
import string
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import bs4
import re
from datetime import date
import time

### Preliminary Feature Selection

Here, we remove statistics that are irrelevant, redundant, or perhaps unnecessary for our use

For accolades we remove 'NBA 75th Anniv. Team', 'ABA All-Time Team', 'Oscar', 'WCF MVP', 'DWTS Champ', 'ECF MVP', 'AS_MVP', 'All_Rookie', 'Most_Improved', and 'Sixth_Man'

Many tables hold data for G, GS, FG%, 3P%, 2P%, eFG%, FT% that encodes the same information so we use totals

We also do not include information such as MP, PF, TOV from the per-game and totals tables.

From the advanced table, we do not include information such as '3PAr_advanced', 'FTr_advanced', 'ORB%_advanced', 'DRB%_advanced', 'TRB%_advanced', 'AST%_advanced', 'STL%_advanced', 'BLK%_advanced', 'TOV%_advanced', and 
'USG%_advanced'

### Define Functions

In [12]:
def getEligible(table):
  """

  Determines if a player is eligible to be voted for the Hall of Fame based on
  the last year they played in the league


  Args:
    table: Table holding player statistics for final year to be extracted from

  
  Returns:
    [bool]: Returns a boolean indicating whether a player is eligible for the
      Hall of Fame

  """
  #TODO: Ensure year difference is consistent with reality

  years_in_html = table.findAll('a', string=re.compile(r'\d{4}-\d{2}'))
  
  #Lasy year we know of 2017-2018
  last_active_year = int(years_in_html[-1].getText()[:4])+1

  #May need to fix this
  if date.today().year - last_active_year < 4:
    return 0
  return 1

In [13]:
def get_table_dictionary(soup, table_id, peak_stats = []):
  """

  Extracts player statistics from a table and returns in the form of a
  dictionary

  Args:
    soup: Soup extracted from an individual player's website on Basketball
     Reference
    table_id: ID tag for table in HTML denoting the type of statistics table to
     extract from (e.g. per_game, advanced, totals)

  
  Returns:
    [dict]: Returns a dictionary with the name of the statistic as the key
      and its corresponding value (based on the table- can be per game or
      totaled)

  """

  table = soup.find(id=table_id)
  table_cols_uncleaned = table.find('thead').find('tr')
  table_vals = table.find('tfoot').find('tr')

  table_cols = [col for col in table_cols_uncleaned if type(col) != bs4.element.NavigableString]

  table_dict = {f"{col.getText().replace(' ', '_')}_{table_id}":val.getText() for col, val in zip(table_cols, table_vals)}

  keys_to_remove = ['Age', 'Lg', 'Pos', 'Season', 'Tm', '\xa0']

  for key in keys_to_remove:
    table_dict.pop(f'{key}_{table_id}', None)

  #Will only be for advanced- Win shares for now
  #Grab all stats from one column and get their max/peak
  for peak_stat in peak_stats:
    peak_stat_tds = table.find('tbody').findAll(attrs={'data-stat':peak_stat})
    peak_stat_values = [float(td.getText()) for td in peak_stat_tds if td.getText() != '']
    table_dict[f'peak_{peak_stat}_{table_id}'] = max(peak_stat_values) if len(peak_stat_values) > 0 else 0

  return table_dict

In [14]:
def multiple_replace(dict, text):
  """
  Applies multiple replaces in string based on dictionary
  Args:
      dict ([type]): Dictionary with keys as phrase to be replaced, vals as phrase to replace key
      text ([type]): Text to apply string replaces to
  Returns:
      [type]: [description]
  """
  # Create a regular expression  from the dictionary keys
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

  # For each match, look-up corresponding value in dictionary
  return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text) 

In [15]:
def get_accolade_dictionary(soup):
  """

  Extracts player accolades from their website and returns in the form of a
  dictionary


  Args:
    soup: Soup extracted from an individual player's website on Basketball
     Reference

  
  Returns:
    [dict]: Returns a dictionary with the name of the accolade as the key
      and the number of times the player achieved it

  """

  #Hard coded as will only use this function for accolades, as other data comes
  #in table form

  bling = soup.find(id='bling')

  soup_text = soup.get_text()

  #Find Position in text and get substring based on bullets, strip other symbols
  #If multiple positions, grab first
  start_index = soup_text.find('Position:')
  end_index = soup_text.find('▪', start_index)
  positions = re.sub(r"[\n\t\s]*", "", soup_text[start_index:end_index]).split(':')[1]
  position = re.split(r',|and',positions)[0]
  accolade_dict = {'Position': position}

  if bling == None:
    return accolade_dict

  hof_player = 'Inducted as Player' in soup_text

  #Couldn't do simple BAA -> NBA replace as often put BAA/NBA
  replacer_dict = {
    'All-BAA/NBA': 'All_NBA',
    'BAA/NBA Champ': 'NBA_Champ',
    'BAA Champ': 'NBA_Champ',
    'All-BAA': 'All_NBA',
    ' *': '',
    ' ' : '_',
    '-' : '_',
    '.': ''
  }

  #For now
  accolades_list = [li.getText() for li in bling.findAll('li')]

  #First two are unfair to use, Conference Finals is new, others irrelevant
  cols_to_skip = ['NBA 75th Anniv. Team', 'ABA All-Time Team', 'Oscar', 'WCF MVP', 'DWTS Champ', 'ECF MVP']

  for i, accolade in enumerate(accolades_list):
    
    #Had to do this as Oscar, DWTS, WCF/ECF are year-specific
    
    skip_accolade = False

    for col_to_skip in cols_to_skip:
      if col_to_skip in accolade:
        skip_accolade = True
        break
    
    
    if skip_accolade or (accolade == 'Hall of Fame' and not hof_player):
      continue
        
    accolade = accolade.replace('Hall of Fame', '1x Hall of Fame')

    #For cases:
    #1978-79 NBA Champ
    #1978 Scoring Leader
    accolade = re.sub(r'(\d{4}-\d{2})|(\d{4})', '1x', accolade)
    #accolade = re.sub(r'\d{4}-\d{2}', '1x', accolade)
    
    parts = accolade.split('x ')

    accolade_name = multiple_replace(replacer_dict, parts[1])

    accolade_dict[accolade_name] = parts[0]
  
  return accolade_dict

In [16]:
def dict_to_list(data_dict, cols_list, default_value):
  """

  Args:
    data_dict: Dictionary holding statistics/accolades corresponding to cols_list
    cols_list: List of columns in order for output DF
    default_value: Default value to fill if statistic/accolade not found


  Returns:
    List of accolade/statistics in order of columns in output DF

  """

  return [data_dict.get(col, default_value) for col in cols_list]

In [17]:
def get_seasonal_dictionary(soup):
  """

  Extracts seasonal rankings (in stats such as Pts, MVP voting) and weights them by ranking
  Returns in the form of a
  dictionary


  Args:
    soup: Soup extracted from an individual player's website on Basketball
     Reference

  
  Returns:
    [dict]: Returns a dictionary with the name of the statistic as the key
      and the value as the weighted seasonal stat for a given player 

  """

  #Not currently including but should consider
  #leaderboard_stl_per_g, leaderboard_blk_per_g, leaderboard_per, leaderboard_ws
  #leaderboard_ows, leaderboard_dws, leaderboard_bpm, leaderboard_vorp

  stat_ids = ['leaderboard_pts_per_g', 'leaderboard_mvp_shares', 'leaderboard_trb_per_g', 'leaderboard_ast_per_g', 'leaderboard_ws']

  all_leaderboard = soup.find(id="all_leaderboard")

  #Most players are not in top 10 for any statistics
  if all_leaderboard == None:
    return {}

  comment_replace_dict = {
    "<!--":"",
    "-->":""
  }

  #The leaderboard HTML elements are commented out so uncomment them
  uncommented_text = multiple_replace(comment_replace_dict, str(all_leaderboard))

  uncommented_soup = BeautifulSoup(uncommented_text, features="lxml")

  seasonal_dictionary = {}

  for stat_id in stat_ids:
    stat_name = stat_id[stat_id.find("_")+1:] + "_seasonal"
    seasonal_table = uncommented_soup.find(id=stat_id)

    #Occurs if player is on a leaderboard but not this specific one- common
    if seasonal_table == None:
      continue


    seasonal_list = [tr.getText().strip() for tr in seasonal_table.findAll('tr')]
    #Does not include career leaderboards, other data (Career, Active, Playoffs, etc)
    #May consider including career leaderboards
    yearly_filter = re.compile(r'(\d{4}-\d{2})|(\d{4}).*')

    filtered_seasonal_list = list(filter(yearly_filter.match, seasonal_list))

    top_ten_pts = 0

    for year in filtered_seasonal_list:
      year_sections = year.split(' ')
      #Grabbing placement in string format (9)- get just the integer
      #Could have just removed parentheses
      placement = int(re.search(r'\d+',  year_sections[3]).group())
      if placement <= 10:
        #If placement = 1, 10 points
        #If placement = 10, 1 point
        top_ten_pts += (11 - placement)

    seasonal_dictionary[stat_name] = top_ten_pts

    #Rather than add placement, adds actual percent of MVP vote share
    if stat_id == 'leaderboard_mvp_shares':
      seasonal_dictionary['accum_mvp_shares_seasonal'] = sum([float(year.split(' ')[2]) for year in filtered_seasonal_list])
  
  return seasonal_dictionary

In [18]:
#TODO: Address Year discontinuities in stats

In [23]:
def scrape_player_data():
  """

  Scrapes accolade and career total, advanced, and per game statistics
  from every NBA player and outputs it into a CSV

  """

  letters = list(string.ascii_lowercase)

  #If forbidden error when urlopen
  #https://stackoverflow.com/questions/20968562/how-to-convert-a-bs4-element-resultset-to-strings-python
  #hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
  #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  #     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
  #     'Accept-Encoding': 'none',
  #     'Accept-Language': 'en-US,en;q=0.8',
  #     'Connection': 'keep-alive'}

  #List of columns to collect
  #Need this as players from diff eras had diff stats
  other_cols = ['Player', 'Eligible']
  accolade_cols = ['Position', 'Hall_of_Fame', 'MVP', 'Finals_MVP', 'NBA_Champ', 'All_NBA', 'All_Defensive',  'Def_POY', 'All_Star', 'Scoring_Champ', 'TRB_Champ', 'AST_Champ',  'STL_Champ', 'BLK_Champ', 'All_ABA', 'ABA_Champ', 'ROY']
  per_game_cols = ['MP_per_game', 'FG_per_game', 'FGA_per_game', '3P_per_game', '3PA_per_game', '2P_per_game', '2PA_per_game',  'FT_per_game', 'FTA_per_game',  'ORB_per_game', 'DRB_per_game', 'TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game',  'PTS_per_game']
  totals_cols = ['MP_totals', 'G_totals', 'GS_totals', 'FG_totals', 'FGA_totals', 'FG%_totals', '3P_totals', '3PA_totals', '3P%_totals', '2P_totals', '2PA_totals', '2P%_totals', 'eFG%_totals', 'FT_totals', 'FTA_totals', 'FT%_totals', 'ORB_totals', 'DRB_totals', 'TRB_totals', 'AST_totals', 'STL_totals', 'BLK_totals', 'PTS_totals', 'Trp_Dbl_totals']
  seasonal_cols = ['pts_per_g_seasonal', 'mvp_shares_seasonal', 'trb_per_g_seasonal', 'ast_per_g_seasonal', 'ws_seasonal', 'accum_mvp_shares_seasonal']
  advanced_cols = ['PER_advanced', 'TS%_advanced', 'OWS_advanced', 'DWS_advanced', 'WS_advanced', 'WS/48_advanced', 'OBPM_advanced', 'DBPM_advanced', 'BPM_advanced', 'VORP_advanced', 'peak_ws_advanced']

  final_cols = other_cols + accolade_cols + per_game_cols + totals_cols + seasonal_cols + advanced_cols

  all_data = []

  #Stat to collect peak for, could make this its own list to add to final_cols
  peak_stats = ['ws']

  for letter in letters:

    #Keep track of progress
    print(letter)

    #If no forbidden error
    letter_url = f"https://www.basketball-reference.com/players/{letter}/"
    letter_html = urlopen(letter_url)

    #If get forbidden error
    #req_letter = Request(letter_url, headers=hdr)
    #letter_html = urlopen(req_letter)


    letter_soup = BeautifulSoup(letter_html, features="lxml")

    #Find all players at url

    ths = letter_soup.findAll('tbody')[0].findAll('th')
    player_dict = {th.getText().replace('*', ''):th.find('a')['href'] for th in ths}

    for player in player_dict:

      href = player_dict[player]

      #If no forbidden error
      
      #Find all players at url
      player_url = f"https://www.basketball-reference.com/{href}"
      player_html = urlopen(player_url)

      #If get forbidden error
      #req_player = Request(player_url, headers=hdr)
      #player_html = urlopen(req_player)

      player_soup = BeautifulSoup(player_html, features="lxml")


      #Accolades
      #May need something for if this is non-existent
      #Will also have to base it off of what is there- things there will vary
      accolade_dict = get_accolade_dictionary(player_soup)
      per_game_dict = get_table_dictionary(player_soup, 'per_game')
      totals_dict = get_table_dictionary(player_soup, 'totals')
      seasonal_dict = get_seasonal_dictionary(player_soup)

      #Advanced
      try:
        advanced_dict = get_table_dictionary(player_soup, 'advanced', peak_stats)
      except:
        print("Exception: ")
        print(player)
        continue

      #Will put 0s for accolades that did not exist- should look at how long
      # each has existed
      accolade_list = dict_to_list(accolade_dict, accolade_cols, 0)

      #That means they never ranked top 10- get 0, not an NA as stats existed
      seasonal_list = dict_to_list(seasonal_dict, seasonal_cols, 0)

      #NAs for stats not recorded
      per_game_list = dict_to_list(per_game_dict, per_game_cols, -999)
      totals_list = dict_to_list(totals_dict, totals_cols, -999)
      advanced_list = dict_to_list(advanced_dict, advanced_cols, -999)

      #Eligibility- If ended after 2018 then can join 2022
      #Active will be train/val and test split
      eligible = getEligible(player_soup.find(id='per_game'))

      other_list = [player, eligible]

      current_player_data = other_list + accolade_list + per_game_list + totals_list + seasonal_list + advanced_list

      all_data.append(current_player_data)

      time.sleep(5)
      
  output_df = pd.DataFrame(data=all_data, columns = final_cols)

  return output_df

In [25]:
player_df = scrape_player_data()

a


HTTPError: ignored

### Fill Data for Erroneous Player(s)

In [None]:
additional_players = pd.read_csv("https://raw.githubusercontent.com/frankwillard/NBA-Hall-Of-Fame-Model/main/Erroneous%20Players.csv")

In [None]:
final_df = pd.concat([player_df, additional_players], ignore_index=True)

In [None]:
final_df.tail()

Unnamed: 0,Player,Eligible,Position,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,...,TS%_advanced,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced,peak_ws_advanced
4972,Jim Zoet,1,Center,0,0,0,0,0,0,0,...,0.2,-0.1,0.0,-0.1,-0.123,-5.6,0.2,-5.4,-0.1,-0.1
4973,Bill Zopf,1,PointGuard,0,0,0,0,0,0,0,...,0.391,-0.5,0.4,-0.1,-0.011,-999.0,-999.0,-999.0,-999.0,-0.1
4974,Ivica Zubac,0,Center,0,0,0,0,0,0,0,...,0.636,16.4,9.6,26.1,0.183,0.3,0.4,0.6,4.5,7.2
4975,Matt Zunic,1,Guard/Forward,0,0,0,0,0,0,0,...,0.368,0.2,1.8,2.0,,-999.0,-999.0,-999.0,-999.0,2.0
4976,Ed Fleming,1,ShootingGuard,0,0,0,0,0,0,0,...,0.436,3.3,6.5,9.8,0.076,-999.0,-999.0,-999.0,-999.0,


In [None]:
len(final_df)

4977

In [None]:
final_df.to_csv('Scraped Player Data.csv')