<a href="https://colab.research.google.com/github/frankwillard/NBA-Hall-Of-Fame-Model/blob/main/Player_Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Libraries

In [22]:
import string
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import bs4
import re
from datetime import date

### Preliminary Feature Selection

Here, we remove statistics that are irrelevant, redundant, or perhaps unnecessary for our use

For accolades we remove 'NBA 75th Anniv. Team', 'ABA All-Time Team', 'Oscar', 'WCF MVP', 'DWTS Champ', 'ECF MVP', 'AS_MVP', 'All_Rookie', 'Most_Improved', and 'Sixth_Man'

Many tables hold data for G, GS, FG%, 3P%, 2P%, eFG%, FT% that encodes the same information so we use totals

We also do not include information such as MP, PF, TOV from the per-game and totals tables.

From the advanced table, we do not include information such as '3PAr_advanced', 'FTr_advanced', 'ORB%_advanced', 'DRB%_advanced', 'TRB%_advanced', 'AST%_advanced', 'STL%_advanced', 'BLK%_advanced', 'TOV%_advanced', and 
'USG%_advanced'

### Define Functions

In [23]:
def getEligible(table):
  """

  Determines if a player is eligible to be voted for the Hall of Fame based on
  the last year they played in the league


  Args:
    table: Table holding player statistics for final year to be extracted from

  
  Returns:
    [bool]: Returns a boolean indicating whether a player is eligible for the
      Hall of Fame

  """
  #TODO: Ensure year difference is consistent with reality

  years_in_html = table.findAll('a', string=re.compile(r'\d{4}-\d{2}'))
  
  #Lasy year we know of 2017-2018
  last_active_year = int(years_in_html[-1].getText()[:4])+1

  #May need to fix this
  if date.today().year - last_active_year < 4:
    return 0
  return 1

In [24]:
def get_table_dictionary(soup, table_id):
  """

  Extracts player statistics from a table and returns in the form of a
  dictionary

  Args:
    soup: Soup extracted from an individual player's website on Basketball
     Reference
    table_id: ID tag for table in HTML denoting the type of statistics table to
     extract from (e.g. per_game, advanced, totals)

  
  Returns:
    [dict]: Returns a dictionary with the name of the statistic as the key
      and its corresponding value (based on the table- can be per game or
      totaled)

  """

  table = soup.find(id=table_id)
  table_cols_uncleaned = table.find('thead').find('tr')
  table_vals = table.find('tfoot').find('tr')

  table_cols = [col for col in table_cols_uncleaned if type(col) != bs4.element.NavigableString]

  table_dict = {f"{col.getText().replace(' ', '_')}_{table_id}":val.getText() for col, val in zip(table_cols, table_vals)}

  keys_to_remove = ['Age', 'Lg', 'Pos', 'Season', 'Tm', '\xa0']

  for key in keys_to_remove:
    table_dict.pop(f'{key}_{table_id}', None)

  return table_dict

In [25]:
def multiple_replace(dict, text):
  """
  Applies multiple replaces in string based on dictionary
  Args:
      dict ([type]): Dictionary with keys as phrase to be replaced, vals as phrase to replace key
      text ([type]): Text to apply string replaces to
  Returns:
      [type]: [description]
  """
  # Create a regular expression  from the dictionary keys
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

  # For each match, look-up corresponding value in dictionary
  return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text) 

In [26]:
def get_accolade_dictionary(soup):
  """

  Extracts player accolades from their website and returns in the form of a
  dictionary


  Args:
    soup: Soup extracted from an individual player's website on Basketball
     Reference

  
  Returns:
    [dict]: Returns a dictionary with the name of the accolade as the key
      and the number of times the player achieved it

  """

  #Hard coded as will only use this function for accolades, as other data comes
  #in table form

  bling = soup.find(id='bling')

  if bling == None:
    return {}

  hof_player = 'Inducted as Player' in soup.get_text()

  #Couldn't do simple BAA -> NBA replace as often put BAA/NBA
  replacer_dict = {
    'All-BAA/NBA': 'All_NBA',
    'BAA/NBA Champ': 'NBA_Champ',
    'BAA Champ': 'NBA_Champ',
    'All-BAA': 'All_NBA',
    ' *': '',
    ' ' : '_',
    '-' : '_',
    '.': ''
  }

  #For now
  accolades_list = [li.getText() for li in bling.findAll('li')]
  accolade_dict = {}

  cols_to_skip = ['NBA 75th Anniv. Team', 'ABA All-Time Team', 'Oscar', 'WCF MVP', 'DWTS Champ', 'ECF MVP']

  for i, accolade in enumerate(accolades_list):
    
    #Had to do this as Oscar, DWTS, WCF/ECF are year-specific
    
    skip_accolade = False

    for col_to_skip in cols_to_skip:
      if col_to_skip in accolade:
        skip_accolade = True
        break
    

    if skip_accolade or (accolade == 'Hall of Fame' and not hof_player):
      continue
        
    accolade = accolade.replace('Hall of Fame', '1x Hall of Fame')

    #For cases:
    #1978-79 NBA Champ
    #1978 Scoring Leader
    accolade = re.sub(r'(\d{4}-\d{2})|(\d{4})', '1x', accolade)
    #accolade = re.sub(r'\d{4}-\d{2}', '1x', accolade)
    
    parts = accolade.split('x ')

    accolade_name = multiple_replace(replacer_dict, parts[1])

    accolade_dict[accolade_name] = parts[0]
  
  return accolade_dict

In [27]:
def dict_to_list(data_dict, cols_list, default_value):
  """

  Args:
    data_dict: Dictionary holding statistics/accolades corresponding to cols_list
    cols_list: List of columns in order for output DF
    default_value: Default value to fill if statistic/accolade not found


  Returns:
    List of accolade/statistics in order of columns in output DF

  """

  return [data_dict.get(col, default_value) for col in cols_list]

In [28]:
#Year discontinuities in stats
#Create column order to write stats in (probably want to rearrange for cohesion)
#Write something to add stats from dicts to DF in that order
  #Use dict.get('key', 0) for each col- 0 will be default

In [29]:
def scrape_player_data():
  """

  Scrapes accolade and career total, advanced, and per game statistics
  from every NBA player and outputs it into a CSV

  """

  all_accolades = []

  letters = list(string.ascii_lowercase)


  other_cols = ['Player', 'Eligible']
  accolade_cols = ['Hall_of_Fame', 'MVP', 'Finals_MVP', 'NBA_Champ', 'All_NBA', 'All_Defensive',  'Def_POY', 'All_Star', 'Scoring_Champ', 'TRB_Champ', 'AST_Champ',  'STL_Champ', 'BLK_Champ', 'All_ABA', 'ABA_Champ', 'ROY']
  per_game_cols = ['FG_per_game', 'FGA_per_game', '3P_per_game', '3PA_per_game', '2P_per_game', '2PA_per_game',  'FT_per_game', 'FTA_per_game',  'ORB_per_game', 'DRB_per_game', 'TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game',  'PTS_per_game']
  totals_cols = ['G_totals', 'GS_totals', 'FG_totals', 'FGA_totals', 'FG%_totals', '3P_totals', '3PA_totals', '3P%_totals', '2P_totals', '2PA_totals', '2P%_totals', 'eFG%_totals', 'FT_totals', 'FTA_totals', 'FT%_totals', 'ORB_totals', 'DRB_totals', 'TRB_totals', 'AST_totals', 'STL_totals', 'BLK_totals', 'PTS_totals', 'Trp_Dbl_totals']
  advanced_cols = ['PER_advanced', 'TS%_advanced', 'OWS_advanced', 'DWS_advanced', 'WS_advanced', 'WS/48_advanced', 'OBPM_advanced', 'DBPM_advanced', 'BPM_advanced', 'VORP_advanced']

  final_cols = other_cols + accolade_cols + per_game_cols + totals_cols + advanced_cols

  all_data = []

  for letter in letters:

    print(letter)

    letter_url = f"https://www.basketball-reference.com/players/{letter}/"

    letter_html = urlopen(letter_url)

    letter_soup = BeautifulSoup(letter_html, features="lxml")

    #Find all players at url

    ths = letter_soup.findAll('tbody')[0].findAll('th')
    player_dict = {th.getText().replace('*', ''):th.find('a')['href'] for th in ths}


    for player in player_dict:

      href = player_dict[player]

      player_url = f"https://www.basketball-reference.com/{href}"

      player_html = urlopen(player_url)

      player_soup = BeautifulSoup(player_html, features="lxml")

      #Find all players at url


      #Accolades
      #May need something for if this is non-existent
      #Will also have to base it off of what is there- things there will vary
      accolade_dict = get_accolade_dictionary(player_soup)
      per_game_dict = get_table_dictionary(player_soup, 'per_game')
      totals_dict = get_table_dictionary(player_soup, 'totals')

      #Advanced
      try:
        advanced_dict = get_table_dictionary(player_soup, 'advanced')
      except:
        print("Exception: ")
        print(player)
        continue

      #Will put 0s for accolades that did not exist- should look at how long
      # each has existed
      accolade_list = dict_to_list(accolade_dict, accolade_cols, 0)

      #NAs for stats not recorded
      per_game_list = dict_to_list(per_game_dict, per_game_cols, -999)
      totals_list = dict_to_list(totals_dict, totals_cols, -999)
      advanced_list = dict_to_list(advanced_dict, advanced_cols, -999)

      #Eligibility- If ended after 2018 then can join 2022
      #Active will be train/val and test split
      eligible = getEligible(player_soup.find(id='per_game'))

      other_list = [player, eligible]

      current_player_data = other_list + accolade_list + per_game_list + totals_list + advanced_list

      all_data.append(current_player_data)
      
  output_df = pd.DataFrame(data=all_data, columns = final_cols)

  return output_df

In [30]:
player_df = scrape_player_data()

a
b
c
d
e
f
Exception: 
Ed Fleming
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z


In [31]:
player_df.head()

Unnamed: 0,Player,Eligible,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,All_Star,...,PER_advanced,TS%_advanced,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced
0,Alaa Abdelnaby,1,0,0,0,0,0,0,0,0,...,13.0,0.532,0.7,4.1,4.8,0.072,-2.9,-0.9,-3.8,-1.5
1,Zaid Abdul-Aziz,1,0,0,0,0,0,0,0,0,...,15.1,0.479,5.9,11.6,17.5,0.076,0.6,-0.2,0.4,2.7
2,Kareem Abdul-Jabbar,1,1,6,2,6,15,11,0,19,...,24.6,0.592,178.9,94.5,273.4,0.228,4.1,1.6,5.7,85.7
3,Mahmoud Abdul-Rauf,1,0,0,0,0,0,0,0,0,...,15.4,0.506,16.7,8.4,25.2,0.077,0.7,-1.5,-0.8,4.5
4,Tariq Abdul-Wahad,1,0,0,0,0,0,0,0,0,...,11.4,0.467,-0.6,4.1,3.5,0.035,-2.6,-0.4,-3.0,-1.2


### Fill Data for Erroneous Player(s)

In [34]:
additional_players = pd.read_csv('https://raw.githubusercontent.com/frankwillard/NBA-Hall-Of-Fame-Model/main/Additional%20Players%20-%20Sheet1.csv')

In [36]:
final_df = pd.concat([player_df, additional_players], ignore_index=True)

In [37]:
final_df.tail()

Unnamed: 0,Player,Eligible,Hall_of_Fame,MVP,Finals_MVP,NBA_Champ,All_NBA,All_Defensive,Def_POY,All_Star,...,PER_advanced,TS%_advanced,OWS_advanced,DWS_advanced,WS_advanced,WS/48_advanced,OBPM_advanced,DBPM_advanced,BPM_advanced,VORP_advanced
4971,Jim Zoet,1,0,0,0,0,0,0,0,0,...,-0.8,0.2,-0.1,0.0,-0.1,-0.123,-5.6,0.2,-5.4,-0.1
4972,Bill Zopf,1,0,0,0,0,0,0,0,0,...,9.6,0.391,-0.5,0.4,-0.1,-0.011,-999.0,-999.0,-999.0,-999.0
4973,Ivica Zubac,0,0,0,0,0,0,0,0,0,...,19.2,0.636,16.4,9.6,26.1,0.183,0.3,0.4,0.6,4.5
4974,Matt Zunic,1,0,0,0,0,0,0,0,0,...,,0.368,0.2,1.8,2.0,,-999.0,-999.0,-999.0,-999.0
4975,Ed Fleming,1,0,0,0,0,0,0,0,0,...,14.1,0.436,3.3,6.5,9.8,0.076,-999.0,-999.0,-999.0,-999.0


In [43]:
#from google.colab import files
#final_df.to_csv('Scraped Player Data.csv')
#files.download('Scraped Player Data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>