# Data Scraping NBA (Player) Salary Data from espn.com
#### Author: Ethan Eason | Project completed as work for TrueHoop CEO and Founder Henry Abbott

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Some constants and scraping functions first.

In [2]:
# We start at page 0 so that looping over pages is more elegant (see below)
ROOT_URL = "http://www.espn.com/nba/salaries/_/page/0/season_type/3"

# We want salary data going 9 seasons into the past (including LATEST_SEASON).
LATEST_YEAR = 2022
YEARS_BACK = 1
EARLIEST_YEAR = LATEST_YEAR - (YEARS_BACK - 1)

SALARY_IDENTIFIER = "$"
ENDSALARY_IDENTIFIER = "</"
PRENAME_IDENTIFIER = "/id/"
NAME_IDENTIFIER = ">"
ENDNAME_IDENTIFIER = "<"

def build_url(year):
    return ROOT_URL + "/year/" + str(year)

def is_valid_response(parsed_response):
    page_numbers_str = str(parsed_response.find_all("td"))
    return True if SALARY_IDENTIFIER in page_numbers_str else False

def incr_page_number(url, cur_page_number):
    cur_page_number_str = str(cur_page_number)
    cur_page_number_digits = len(cur_page_number_str)
    next_page_number_str = str(cur_page_number + 1)
    page_number_index = url.index(cur_page_number_str)
    return url[:page_number_index] + next_page_number_str + url[(page_number_index + cur_page_number_digits):]
    
def get_next_page(url, cur_page_number):
    next_page_url = incr_page_number(url, cur_page_number)
    page_response = requests.get(url).text
    parsed_response = BeautifulSoup(page_response, "html.parser")
    page_nonempty = is_valid_response(parsed_response)
    return (next_page_url, parsed_response) if page_nonempty else (None, None)

p = get_next_page("http://www.espn.com/nba/salaries/_/page/0/seasontype/3", 0)
def extract_salaries(parsed_response, year):
    page_tables = parsed_response.find_all("td")
    players = []
    salaries = []
    item_idx = 0
    while (item_idx < len(page_tables)):
        item_str = str(page_tables[item_idx])
        if PRENAME_IDENTIFIER in item_str:
            # Player name extraction
            prename_idx = item_str.index(PRENAME_IDENTIFIER)
            trimmed_item_str = item_str[prename_idx:]
            startname_idx = trimmed_item_str.index(NAME_IDENTIFIER)
            trimmed_item_str = trimmed_item_str[startname_idx:]
            endname_idx = trimmed_item_str.index(ENDNAME_IDENTIFIER)
            player_name = trimmed_item_str[1:endname_idx]
            players.append(player_name)
            
            # Salary extraction (salary items are two indices ahead of player name items)
            item_idx += 2
            salary_item_str = str(page_tables[item_idx])
            startsalary_idx = salary_item_str.index(SALARY_IDENTIFIER)
            endsalary_idx = salary_item_str.index(ENDSALARY_IDENTIFIER)
            trimmed_salary_item_str = salary_item_str[(startsalary_idx + 1):endsalary_idx]
            player_salary = int(trimmed_salary_item_str.replace(",", ""))
            salaries.append(player_salary)
            
        else:    
            item_idx += 1
    return pd.DataFrame({"Year" : ([year] * len(players)), "Player" : players, "Salary" : salaries})

### Now we scrape the salary data we want into a dataframe. 

In [3]:
all_salaries_df = pd.DataFrame({"Year" : [], "Player" : [], "Salary" : []})
for year in range(EARLIEST_YEAR, (LATEST_YEAR + 1)):
    year_url = build_url(year)
    cur_page_number = 0
    while True:
        (next_page_url, year_parsed_response) = get_next_page(year_url, cur_page_number)
        if year_parsed_response is None:
            break
        cur_salary_df = extract_salaries(year_parsed_response, year)
        all_salaries_df = pd.concat([all_salaries_df, cur_salary_df], axis=0)
        cur_page_number += 1
        year_url = next_page_url
        
all_salaries_df

Unnamed: 0,Year,Player,Salary
0,2022.0,Stephen Curry,45780966.0
1,2022.0,James Harden,44310840.0
2,2022.0,John Wall,44310840.0
3,2022.0,Russell Westbrook,44211146.0
4,2022.0,Kevin Durant,42018900.0
...,...,...,...
16,2022.0,Juwan Morgan,19816.0
17,2022.0,Moses Brown,19186.0
18,2022.0,Xavier Sneed,8558.0
19,2022.0,Ish Wainright,5318.0


### Read in the base data set we will inject salary data into.

In [39]:
bdf = pd.read_csv("/Users/ethaneason/college-projects/nba_salary_scraper/base_sheet.csv")

In [27]:
bdf

Unnamed: 0,2015,4.40108,201166,Aaron Brooks,PG,CHI,1.09155,-0.604692,0.48686
0,2014,1.51368,201166,Aaron Brooks,PG,DEN,-0.326643,-1.2328,-1.55944
1,2016,1.3966,201166,Aaron Brooks,PG,CHI,-0.540226,-0.551815,-1.09204
2,2013,0.782241,201166,Aaron Brooks,PG,HOU,-0.291699,-1.5423,-1.834
3,2017,0.182092,201166,Aaron Brooks,PG,IND,-1.82871,-0.821934,-2.65065
4,2018,-0.147747,201166,Aaron Brooks,PG,MIN,-1.72914,-2.365,-4.09414
...,...,...,...,...,...,...,...,...,...
5297,2020,2.07335,1629627,Zion Williamson,PF,NOP,2.40089,-0.788767,1.61213
5298,2022,,1629627,Zion Williamson,,NOP,,,
5299,2015,-0.0136304,204054,Zoran Dragic,SG,MIA,-2.28535,-0.923961,-3.20931
5300,2022,-0.0000803876,1629597,Zylan Cheatham,SF,NOP,-2.66277,-0.419989,-3.08275
