<a href="https://colab.research.google.com/github/frankwillard/NBA-Rookie-Success-ML-Model/blob/main/Rookie_Success_Player_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# TODO: Consider moving these helper functions into a shared, more general script that each scraping notebook imports

In [10]:
import string
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import bs4
import re
from datetime import date

In [11]:
def multiple_replace(dict, text):
  """
  Applies multiple replaces in string based on dictionary, in a single pass

  Keys are matched literally (regex metacharacters are escaped). All keys are
  combined into one alternation and substituted in one left-to-right scan, so
  text produced by one replacement is never re-scanned by another.

  Args:
      dict (dict): Dictionary with keys as phrase to be replaced, vals as phrase to replace key.
        If several keys could match at the same position, the key that comes
        first in the dictionary's iteration order wins.
        (The name shadows the builtin; it is kept so existing callers still work.)
      text (str): Text to apply string replaces to
  Returns:
      str: Copy of text with every occurrence of each key replaced by its value
  """
  # An empty mapping would compile to the pattern "()", which matches the empty
  # string at every position and then fails the look-up with KeyError('')
  if not dict:
    return text

  # Create a regular expression from the dictionary keys
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

  # For each match, look-up corresponding value in dictionary
  return regex.sub(lambda mo: dict[mo.group(0)], text)

In [12]:
def dict_to_list(data_dict, cols_list, default_value):
  """
  Flattens a dictionary into a list that follows a given column order

  Args:
    data_dict: Dictionary of statistics/accolades keyed by column name
    cols_list: Column names, in the order the values should appear
    default_value: Value used for any column that is not a key of data_dict

  Returns:
    List with one entry per column in cols_list, in the same order

  """
  ordered_values = []

  for column in cols_list:
    # Columns absent from the dictionary fall back to the default
    ordered_values.append(data_dict.get(column, default_value))

  return ordered_values

In [13]:
def get_physical_dictionary(soup):
  """
  Extracts a player's position, height and weight from the text of their page

  Works on the text between 'Position:' and the following 'School:' (or the end
  of the text if 'School:' is absent). With whitespace removed, that text is
  assumed to look like 'Position:Guard6-5,190...': the position name(s), the
  height as feet-inches, a comma, then the weight.

  Args:
    soup: Object whose get_text() returns the page text (e.g. the soup of an
     individual player's page)

  Returns:
    [dict]: Dictionary with keys 'Position' (first listed position, str),
      'Height' (feet * 12 + inches, int) and 'Weight' (the integer that follows
      the height's comma, int)

  Raises:
    ValueError: If 'Position:', the height or the weight cannot be found or
      parsed

  """

  soup_text = soup.get_text()

  #Find Position in text and get substring up to School
  start_index = soup_text.find('Position:')
  if start_index == -1:
    raise ValueError("Could not find 'Position:' in the player page text")

  #A failed find returns -1, which as a slice end would silently drop the last
  #character, so fall back to the end of the text instead
  end_index = soup_text.find('School:', start_index)
  if end_index == -1:
    end_index = len(soup_text)

  physical_data = re.sub(r"\s+", "", soup_text[start_index:end_index])

  #Finds number- indicates height
  height_start = re.search(r"\d", physical_data)
  if height_start is None:
    raise ValueError("Could not find a height after 'Position:' in the player page text")
  end_position = height_start.start()

  #If multiple positions, grab first
  positions = physical_data[:end_position].split(':')[1]
  position = re.split(r',|and', positions)[0]

  #Search for the comma from the height onwards: a comma between multiple
  #positions comes earlier and must not be mistaken for the end of the height
  end_height = physical_data.find(',', end_position)
  if end_height == -1:
    raise ValueError("Could not find the comma separating height and weight in the player page text")

  height_string = physical_data[end_position:end_height]
  ft, inches = height_string.split('-')
  height = int(ft) * 12 + int(inches)

  #Weight is the run of digits right after the comma (any number of digits)
  weight_match = re.match(r"\d+", physical_data[end_height + 1:])
  if weight_match is None:
    raise ValueError("Could not find a weight after the height in the player page text")
  weight = int(weight_match.group())

  return {'Position': position, 'Height': height, 'Weight': weight}

In [14]:
def get_table_dictionary(soup, table_id, peak_stats = []):
  """

  Extracts player statistics from a table and returns in the form of a
  dictionary

  The values are read from the second-to-last row of the table (not from the
  career row in the footer), paired cell by cell with the first header row.

  Args:
    soup: Soup extracted from an individual player's website; the table is
     looked up by the id 'players_{table_id}'
    table_id: ID tag for table in HTML denoting the type of statistics table to
     extract from (e.g. per_game, advanced, totals)
    peak_stats: Currently unused (the peak-statistic extraction is disabled).
     The default list is never mutated.


  Returns:
    [dict]: Returns a dictionary with '{column name}_{table_id}' as the key
      (spaces in the column name replaced by underscores) and the text of the
      corresponding cell as the value. The Age, Lg, Pos, Tm and non-breaking
      space columns are dropped.

  Raises:
    ValueError: If the page has no element with id 'players_{table_id}'

  """

  table = soup.find(id=f"players_{table_id}")
  if table is None:
    raise ValueError(f"No table with id 'players_{table_id}' found on the page")

  # Only the th/td cells that are direct children of the row, so that
  # whitespace text nodes between cells cannot shift the header/value pairing
  header_row = table.find('thead').find('tr')
  table_cols = header_row.find_all(['th', 'td'], recursive=False)

  #No longer career per game- instead we are getting the final season
  # NOTE(review): assumes the last row is the single career row, so [-2] is the
  # final season; confirm for pages whose footer has more than one row
  final_season_row = table.find_all("tr")[-2]
  table_vals = final_season_row.find_all(['th', 'td'], recursive=False)

  table_dict = {
    f"{col.get_text().replace(' ', '_')}_{table_id}": val.get_text()
    for col, val in zip(table_cols, table_vals)
  }

  #TODO: Change this (can make this line better anyway)
  keys_to_remove = ['Age', 'Lg', 'Pos', 'Tm', '\xa0']

  for key in keys_to_remove:
    table_dict.pop(f'{key}_{table_id}', None)

  return table_dict

In [37]:
def get_accolade_dictionary(soup, accolade_cols):
  """

  Extracts player accolades from their website and returns in the form of a
  dictionary

  Each list item under the element with id 'bling' is normalised into a column
  style name:
    - a leading year or season ('1978' / '1978-79') is turned into a '1x' count
    - ACC / Big Ten / Pac-12 / Big 12 are all mapped to 'P4_Conference'
    - spaces and hyphens become underscores; ' *' and '.' are removed
  e.g. '2x All-ACC' becomes All_P4_Conference with a count of 2.


  Args:
    soup: Soup extracted from an individual player's website
    accolade_cols: Names of the accolades to keep; any other accolade is
     ignored


  Returns:
    [dict]: Returns a dictionary with the name of the accolade as the key
      and the number of times the player achieved it (int). Accolades that
      normalise to the same name (e.g. All-ACC and All-Big Ten) are added
      together. Empty if the page has no 'bling' element.

  """

  #Hard coded as will only use this function for accolades, as other data comes
  #in table form
  bling = soup.find(id='bling')

  accolade_dict = {}

  if bling is None:
    return accolade_dict

  #' *' has to come before ' ' so that the marker is removed as a whole instead
  #of leaving a stray '*' behind
  replacer_dict = {
    'ACC': 'P4_Conference',
    'Big Ten': 'P4_Conference',
    'Pac-12': 'P4_Conference',
    'Big 12': 'P4_Conference',
    ' *': '',
    ' ' : '_',
    '-' : '_',
    '.': ''
  }

  for li in bling.find_all('li'):
    #For cases:
    #1978-79 NBA Champ
    #1978 Scoring Leader
    accolade = re.sub(r'(\d{4}-\d{2})|(\d{4})', '1x', li.get_text().strip())
    accolade = multiple_replace(replacer_dict, accolade)

    #Split an optional '{count}x_' prefix from the accolade name. Anchoring on
    #the leading digits keeps names that merely contain 'x_' intact
    counted = re.fullmatch(r'(\d+)x_(.+)', accolade, flags=re.DOTALL)
    if counted:
      count = int(counted.group(1))
      accolade_name = counted.group(2)
    else:
      count = 1
      accolade_name = accolade

    if accolade_name not in accolade_cols:
      continue

    #Accumulate rather than overwrite, as several accolades can map to one name
    accolade_dict[accolade_name] = accolade_dict.get(accolade_name, 0) + count

  return accolade_dict

In [25]:
def scrape_player_data(combine_data, identifier_cols, physical_cols, per_game_cols, accolade_cols, advanced_cols):
  """

  Scrapes accolades, per game statistics, physical data and advanced
  statistics for every player in combine_data and returns them as one DataFrame

  The page of each player is requested from
  https://www.sports-reference.com/cbb/players/{name}-1.html, where {name} is
  the lower-cased player name with spaces replaced by hyphens.

  Args:
    combine_data: DataFrame with one row per player; the 'Player' and
     'Draft Year' columns are read
    identifier_cols: Names of the two identifier columns of the output (filled
     with the player name and the draft year, in that order)
    physical_cols: Keys of the physical dictionary to output
    per_game_cols: Keys of the per game dictionary to output
    accolade_cols: Accolade names to output
    advanced_cols: Keys of the advanced dictionary to output

  Returns:
    DataFrame with columns identifier_cols + accolade_cols + per_game_cols +
    physical_cols + advanced_cols and one row per scraped player. Accolades that
    were not found are 0; statistics that were not found are -999. Players whose
    advanced table cannot be read are skipped (their name is printed). Any
    other failure, including a failed page request, is raised to the caller.

  """

  final_cols = identifier_cols + accolade_cols + per_game_cols + physical_cols + advanced_cols
  all_data = []

  #For every player in draft:
  for _, row in combine_data.iterrows():
    current_name = row['Player']
    draft_class = row['Draft Year']

    # Open href:
    href_name = current_name.lower().replace(' ', '-')
    player_url = f"https://www.sports-reference.com/cbb/players/{href_name}-1.html"

    # The soup is built while the response is open; the with-block then closes
    # the connection instead of leaving it to the garbage collector
    with urlopen(player_url) as player_html:
      player_soup = BeautifulSoup(player_html, features="lxml")

    accolade_dict = get_accolade_dictionary(player_soup, accolade_cols)
    per_game_dict = get_table_dictionary(player_soup, 'per_game')
    physical_dict = get_physical_dictionary(player_soup)

    # Exception rather than a bare except, so that interrupting the kernel
    # (KeyboardInterrupt) stops the scrape instead of being treated as a
    # missing table
    try:
      advanced_dict = get_table_dictionary(player_soup, 'advanced')
    except Exception:
      print("Exception: ")
      print(current_name)
      continue

    # Will put 0s for accolades that did not exist- should look at how long
    # each has existed
    accolade_list = dict_to_list(accolade_dict, accolade_cols, 0)

    # NAs for stats not recorded
    per_game_list = dict_to_list(per_game_dict, per_game_cols, -999)
    physical_list = dict_to_list(physical_dict, physical_cols, -999)
    advanced_list = dict_to_list(advanced_dict, advanced_cols, -999)

    # Order has to match final_cols
    identifier_list = [current_name, draft_class]

    current_player_data = identifier_list + accolade_list + per_game_list + physical_list + advanced_list

    all_data.append(current_player_data)

  output_df = pd.DataFrame(data=all_data, columns = final_cols)

  return output_df

In [26]:
# Combine measurements CSV, read straight from the project's GitHub repository
combine_data = pd.read_csv(
  'https://raw.githubusercontent.com/frankwillard/NBA-Rookie-Success-ML-Model/main/NBA%20Combine%20Measurements%20-%202010-2022.csv'
)

In [42]:
# combine_data = combine_data.sort_values(by=['Player'])

In [27]:
# Columns identifying each output row
# (possible additions: college, rankings, or an "Eligible" flag so the last few
# draft classes can be held out as a test set)
identifier_cols = ['Player', 'Draft Year']

# Accolade names after normalisation, with the four conferences collapsed into P4_Conference
accolade_cols = [
  'All_P4_Conference',
  'P4_Conference_All_Freshman',
  'P4_Conference_POY',
  'P4_Conference_All_Defense',
  'P4_Conference_ROY',
  'Naismith_Award',
  'AP_POY',
  'NCAA_All_Region',
  'NCAA_Champion',
  'Wooden_Award',
  'Rupp_Trophy',
  'NCAA_Tourney_MOP',
]

physical_cols = ["Position", "Height", "Weight"]

# Per game table columns; SOS is included as the measure of strength of schedule
per_game_cols = [
  f'{stat}_per_game'
  for stat in (
    'Season', 'School', 'Conf', 'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '2P', '2PA', '2P%',
    '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'SOS',
  )
]

# Advanced table columns; other percentages (assist, eFG, ...) could be added later
advanced_cols = [
  f'{stat}_advanced'
  for stat in (
    'PER', 'TS%', 'PProd',
    'OWS', 'DWS', 'WS', 'WS/40',
    'OBPM', 'DBPM', 'BPM', 'VORP',
  )
]

In [28]:
# Scrape every player in the combine data; the resulting DataFrame is the cell's output
scrape_player_data(
  combine_data=combine_data,
  identifier_cols=identifier_cols,
  physical_cols=physical_cols,
  per_game_cols=per_game_cols,
  accolade_cols=accolade_cols,
  advanced_cols=advanced_cols,
)

https://www.sports-reference.com/cbb/players/solomon-alabi-1.html
https://www.sports-reference.com/cbb/players/cole-aldrich-1.html
https://www.sports-reference.com/cbb/players/al-farouq-aminu-1.html
https://www.sports-reference.com/cbb/players/james-anderson-1.html
https://www.sports-reference.com/cbb/players/luke-babbitt-1.html
https://www.sports-reference.com/cbb/players/eric-bledsoe-1.html
https://www.sports-reference.com/cbb/players/trevor-booker-1.html
https://www.sports-reference.com/cbb/players/craig-brackins-1.html
https://www.sports-reference.com/cbb/players/avery-bradley-1.html
https://www.sports-reference.com/cbb/players/derrick-caracter-1.html
https://www.sports-reference.com/cbb/players/sherron-collins-1.html
https://www.sports-reference.com/cbb/players/demarcus-cousins-1.html
https://www.sports-reference.com/cbb/players/jordan-crawford-1.html
https://www.sports-reference.com/cbb/players/ed-davis-1.html
https://www.sports-reference.com/cbb/players/devin-ebanks-1.html
https