<a href="https://colab.research.google.com/github/frankwillard/NBA-Hall-Of-Fame-Model/blob/main/Player_Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import string
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import bs4
import re

In [None]:
#Must decide variables

#Accolades:
#All_Star, MVP, All_NBA_1, All_NBA_2, All_NBA_3, DPOY, Champion
#Finals_MVP, All_D_1, All_D_2
#PTS_Champ, STL_Champ, REB_Champ, AST_Champ, BLK_Champ
#ROTY

#Average and Career Stats
#Pts, Reb, Assists, Steals, Blocks, FG%, 3P%, 3PM

#PER, VORP, BPM, WS, WS/48

#VORP_1?, WS_1?, PER_1?

In [None]:
#Could always add MVP vote shares, rankings in stats within a season, playoff performance

In [None]:
def getEligible(table, cutoff_year=2018):
  """Return whether the player's final listed season ended after ``cutoff_year``.

  Searches ``table`` for links whose text contains a season label of the
  form ``YYYY-YY`` (e.g. ``2018-19``) and treats the last one found as the
  final season. That season's ending year is its four-digit start year
  plus one.

  Args:
    table: Parsed stats table exposing ``findAll`` the way a BeautifulSoup
      tag does, or ``None`` when the page had no such table.
    cutoff_year: A final season must end strictly after this year to count.
      Defaults to 2018.

  Returns:
    True if the last season link ends after ``cutoff_year``. False
    otherwise, including when ``table`` is None or holds no season links.
  """
  if table is None:
    return False

  season_links = table.findAll('a', string=re.compile(r'\d{4}-\d{2}'))
  if not season_links:
    # No season rows to read a year from.
    return False

  # "2018-19" -> 2018 + 1 = 2019, the year the season ended.
  last_active_year = int(season_links[-1].getText()[:4]) + 1
  return last_active_year > cutoff_year

In [None]:
def get_table_dictionary(soup, table_id):
  """Map a stats table's column headers to the values in its footer row.

  Reads the first row of the table's thead for column names and the first
  row of its tfoot for values, pairing them positionally. Each key is the
  header text with spaces turned into underscores, suffixed with
  ``_{table_id}`` (e.g. ``Trp Dbl`` in table ``totals`` becomes
  ``Trp_Dbl_totals``). Values are kept as the raw cell text.

  Args:
    soup: Parsed page exposing ``find(id=...)`` like a BeautifulSoup object.
    table_id: HTML id of the table to read; also used as the key suffix.

  Returns:
    Dict of ``{column_tableid: footer_text}`` without the descriptive
    columns (Age, Lg, Pos, Season, Tm and the blank spacer column). Empty
    when the table, its header row or its footer row is missing.
  """
  table = soup.find(id=table_id)
  if table is None:
    return {}

  header_sections = table.findAll('thead')
  footer_sections = table.findAll('tfoot')
  if not header_sections or not footer_sections:
    return {}

  header_rows = header_sections[0].findAll('tr')
  footer_rows = footer_sections[0].findAll('tr')
  if not header_rows or not footer_rows:
    return {}

  # Select only the cell elements of each row. Iterating a row directly also
  # yields the text nodes between cells, which would shift the pairing.
  header_cells = header_rows[0].findAll(['th', 'td'], recursive=False)
  value_cells = footer_rows[0].findAll(['th', 'td'], recursive=False)

  columns_to_skip = {'Age', 'Lg', 'Pos', 'Season', 'Tm', '\xa0'}

  table_dict = {}
  for header_cell, value_cell in zip(header_cells, value_cells):
    column = header_cell.getText().replace(' ', '_')
    if column in columns_to_skip:
      continue
    table_dict[f"{column}_{table_id}"] = value_cell.getText()

  return table_dict

In [None]:
def multiple_replace(dict, text):
  """Replace every occurrence of each key of ``dict`` in ``text`` in one pass.

  All keys are matched literally (regex metacharacters are escaped) with a
  single alternation, so a replacement's output is never re-scanned. When
  several keys match at the same position, the one that comes first in the
  mapping's iteration order wins, so list longer keys before their prefixes.

  Args:
    dict: Mapping of literal substring -> replacement string. The name
      shadows the builtin but is kept so keyword callers keep working.
    text: String to rewrite.

  Returns:
    ``text`` with every matched key replaced by its mapped value. ``text``
    is returned unchanged when the mapping is empty.
  """
  if not dict:
    # An empty alternation matches the empty string at every position and
    # the lookup below would then fail on the '' key.
    return text

  # Create a regular expression from the dictionary keys
  regex = re.compile("|".join(map(re.escape, dict)))

  # For each match, look up the corresponding value in the dictionary
  return regex.sub(lambda mo: dict[mo.group(0)], text)

In [None]:
def get_accolade_dictionary(bling, hof_player, player, accolade_cols):
  """Parse a player's accolade list into ``{accolade_name: count}``.

  Every li element under ``bling`` is read as text. A leading year
  (``1978``) or season (``1978-79``) is rewritten to ``1x`` so that all
  entries share the ``"<count>x <name>"`` shape, and a bare
  ``Hall of Fame`` entry becomes ``1x Hall of Fame``. The name part is then
  normalised with ``multiple_replace`` (BAA variants folded into the NBA
  names, spaces and dashes to underscores, dots and `` *`` removed).

  Args:
    bling: Element holding the accolade li items; must expose ``findAll``.
    hof_player: Whether the Hall of Fame induction was as a player. When
      False, a ``Hall of Fame`` entry is ignored.
    player: Player name, used only in the diagnostic print below.
    accolade_cols: Known accolade column names. Names outside this
      collection are still recorded but are printed for review.

  Returns:
    Dict mapping normalised accolade name to the count text that preceded
    ``"x "`` (a string such as ``'1'`` or ``'5'``). If two entries normalise
    to the same name, the later one overwrites the earlier.
  """
  replacer_dict = {
    'All-BAA/NBA': 'All_NBA',
    'BAA/NBA Champ': 'NBA_Champ',
    'BAA Champ': 'NBA_Champ',
    'All-BAA': 'All_NBA',
    ' *': '',
    ' ' : '_',
    '-' : '_',
    '.': ''
  }

  #Had to do this as Oscar, DWTS, WCF/ECF are year-specific
  cols_to_skip = ['NBA 75th Anniv. Team', 'ABA All-Time Team', 'Oscar', 'WCF MVP', 'DWTS Champ', 'ECF MVP']

  accolade_dict = {}

  for li in bling.findAll('li'):
    accolade = li.getText()

    if any(col in accolade for col in cols_to_skip):
      continue

    if accolade == 'Hall of Fame' and not hof_player:
      continue

    accolade = accolade.replace('Hall of Fame', '1x Hall of Fame')

    #For cases:
    #1978-79 NBA Champ
    #1978 Scoring Leader
    accolade = re.sub(r'(\d{4}-\d{2})|(\d{4})', '1x', accolade)

    # Split once only, so a name that itself contains "x " stays intact.
    parts = accolade.split('x ', 1)

    if len(parts) < 2:
      # Not in "<count>x <name>" form: report it and move on rather than
      # failing on the missing name part.
      print(parts)
      continue

    accolade_name = multiple_replace(replacer_dict, parts[1])

    if accolade_name not in accolade_cols:
      print(player, accolade_name)

    accolade_dict[accolade_name] = parts[0]

  return accolade_dict

In [None]:
#Year discontinuities in stats
#Remove redundant columns
#Create column order and add stats in dicts to DF in that order
#Change dicts to defaultdict

In [None]:
def scrape_player_data():
  """Walk basketball-reference.com's a-z player index and parse accolades.

  For each letter, the index page is fetched and every player link found in
  the header cells of its first tbody is followed. On each player page the
  element with id ``bling`` (when present) is parsed with
  ``get_accolade_dictionary``. The current letter is printed as progress.

  Work in progress: the stat-table parsing, eligibility check and DataFrame
  export are still commented out below, so nothing is returned or saved yet.
  """
  # Not used yet; kept for the commented-out aggregation at the bottom.
  all_accolades = []

  #TODO: Determine columns of interest

  #All_BAA/NBA
  #BAA/NBA_Champ
  #BAA_Champ

  accolade_cols = ['Hall_of_Fame', 'All_Star', 'Scoring_Champ', 'TRB_Champ', 'BLK_Champ', 'NBA_Champ', 'All_NBA', 'All_Defensive', 'All_Rookie', 'ROY', 'Finals_MVP', 'MVP', 'Most_Improved', 'Def_POY', 'AS_MVP', 'AST_Champ', 'Sixth_Man', 'ABA_Champ', 'All_ABA', 'STL_Champ']

  #Feature Selection

  #Redundant variables:
  #G and GS- use totals
  #FG%, 3P%, 2P%, eFG%, FT%

  #Unnecessary variables:
  #Remove MP, PF, TOV

  #Advanced remove: ['3PAr_advanced', 'FTr_advanced', 'ORB%_advanced', 'DRB%_advanced', 'TRB%_advanced', 'AST%_advanced', 'STL%_advanced', 'BLK%_advanced', 'TOV%_advanced', 'USG%_advanced']

  # Column selections for the stat tables; not consumed until the
  # DataFrame assembly below is enabled.
  per_game_cols = ['FG_per_game', 'FGA_per_game', '3P_per_game', '3PA_per_game', '2P_per_game', '2PA_per_game',  'FT_per_game', 'FTA_per_game',  'ORB_per_game', 'DRB_per_game', 'TRB_per_game', 'AST_per_game', 'STL_per_game', 'BLK_per_game',  'PTS_per_game']
  totals_cols = ['G_totals', 'GS_totals', 'FG_totals', 'FGA_totals', 'FG%_totals', '3P_totals', '3PA_totals', '3P%_totals', '2P_totals', '2PA_totals', '2P%_totals', 'eFG%_totals', 'FT_totals', 'FTA_totals', 'FT%_totals', 'ORB_totals', 'DRB_totals', 'TRB_totals', 'AST_totals', 'STL_totals', 'BLK_totals', 'PTS_totals', 'Trp_Dbl_totals']
  advanced_cols = ['PER_advanced', 'TS%_advanced', 'OWS_advanced', 'DWS_advanced', 'WS_advanced', 'WS/48_advanced', 'OBPM_advanced', 'DBPM_advanced', 'BPM_advanced', 'VORP_advanced']

  #final_cols

  for letter in string.ascii_lowercase:

    print(letter)

    letter_url = f"https://www.basketball-reference.com/players/{letter}/"

    # Parse inside the ``with`` so the HTTP response is closed right away
    # instead of lingering for the rest of the crawl.
    with urlopen(letter_url) as letter_html:
      letter_soup = BeautifulSoup(letter_html, features="lxml")

    #Find all players at url
    tbodies = letter_soup.findAll('tbody')
    if not tbodies:
      # No player table on this index page; nothing to follow.
      continue

    # Collected as (name, href) pairs rather than a dict keyed by name:
    # display names are not guaranteed unique, and a dict would silently
    # keep only the last player for each repeated name.
    players = []
    for th in tbodies[0].findAll('th'):
      link = th.find('a')
      if link is None:
        # Header cell without a player link.
        continue
      players.append((th.getText().replace('*', ''), link['href']))

    #Old code
    #dict((th.getText(),th.find('a')['href'] ) for th in ths)
    #href_list = [th.find('a')['href'] for th in ths]

    for player, href in players:

      #print(player)

      # hrefs presumably start with "/" (TODO confirm); strip it so the URL
      # does not end up with a doubled slash after the host.
      player_url = f"https://www.basketball-reference.com/{href.lstrip('/')}"

      with urlopen(player_url) as player_html:
        player_soup = BeautifulSoup(player_html, features="lxml")

      #Accolades
      #May need something for if this is non-existent
      #Will also have to base it off of what is there- things there will vary
      bling = player_soup.find(id='bling')

      if bling is not None:
        hof_player = 'Inducted as Player' in player_soup.get_text()
        accolade_dict = get_accolade_dictionary(bling, hof_player, player, accolade_cols)
      else:
        accolade_dict = {}

      #Per Game
      #per_game_dict = get_table_dictionary(player_soup, 'per_game')

      #Totals
      #totals_dict = get_table_dictionary(player_soup, 'totals')

      #Advanced
      #advanced_dict = get_table_dictionary(player_soup, 'advanced')

      #Eligibility- If ended after 2018 then can join 2022
      #Active will be train/val and test split
      #eligible = getEligible(player_soup.find(id='per_game'))


      #all_data


  #print(all_accolades)

  #final_df = pd.DataFrame(data=all_data, columns = final_cols)

  #Sort if desired
  #final_df = final_df.sort_values(by=[])

  #final_df.to_csv('')

In [None]:
# Run the full scrape when this cell executes: one request per letter index
# page plus one per player page linked from it.
scrape_player_data()