# Data Scraping NBA (Player) Salary Data from espn.com
#### Author: Ethan Eason | Project completed as work for TrueHoop CEO and Founder Henry Abbott | June 2022

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Some constants and scraping functions first.

In [2]:
# We start at page 0 so that iterating over pages is more elegant (see below)
ROOT_URL = "http://www.espn.com/nba/salaries/_/page/0/season_type/3"

# We retrieve salary data going YEARS_BACK seasons into the past (including LATEST_SEASON).
LATEST_YEAR = 2022
YEARS_BACK = 10
EARLIEST_YEAR = LATEST_YEAR - (YEARS_BACK - 1)

SALARY_IDENTIFIER = "$"
ENDSALARY_IDENTIFIER = "</"
PRENAME_IDENTIFIER = "/id/"
NAME_IDENTIFIER = ">"
ENDNAME_IDENTIFIER = "<"

TYPES = {"season" : int, "player_name" : str, "salary" : float}

def build_url(year):
    return ROOT_URL + "/year/" + str(year)

def is_valid_response(parsed_response):
    page_numbers_str = str(parsed_response.find_all("td"))
    return True if SALARY_IDENTIFIER in page_numbers_str else False

def incr_page_number(url, cur_page_number):
    cur_page_number_str = str(cur_page_number)
    cur_page_number_digits = len(cur_page_number_str)
    next_page_number_str = str(cur_page_number + 1)
    page_number_index = url.index(cur_page_number_str)
    return url[:page_number_index] + next_page_number_str + url[(page_number_index + cur_page_number_digits):]
    
def get_next_page(url, cur_page_number):
    next_page_url = incr_page_number(url, cur_page_number)
    page_response = requests.get(url).text
    parsed_response = BeautifulSoup(page_response, "html.parser")
    page_nonempty = is_valid_response(parsed_response)
    return (next_page_url, parsed_response) if page_nonempty else (None, None)

def extract_salaries(parsed_response, year):
    page_tables = parsed_response.find_all("td")
    players = []
    salaries = []
    item_idx = 0
    
    while (item_idx < len(page_tables)):
        item_str = str(page_tables[item_idx])
        if PRENAME_IDENTIFIER in item_str:
            # Player name extraction
            prename_idx = item_str.index(PRENAME_IDENTIFIER)
            trimmed_item_str = item_str[prename_idx:]
            startname_idx = trimmed_item_str.index(NAME_IDENTIFIER)
            trimmed_item_str = trimmed_item_str[startname_idx:]
            endname_idx = trimmed_item_str.index(ENDNAME_IDENTIFIER)
            player_name = trimmed_item_str[1:endname_idx]
            players.append(player_name)
            
            # Salary extraction (salary items are two indices ahead of player name items)
            item_idx += 2
            salary_item_str = str(page_tables[item_idx])
            startsalary_idx = salary_item_str.index(SALARY_IDENTIFIER)
            endsalary_idx = salary_item_str.index(ENDSALARY_IDENTIFIER)
            trimmed_salary_item_str = salary_item_str[(startsalary_idx + 1):endsalary_idx]
            player_salary = int(trimmed_salary_item_str.replace(",", ""))
            salaries.append(player_salary)
        else:    
            item_idx += 1
            
    cur_page_salaries_df = pd.DataFrame({"season" : ([year] * len(players)), 
                                         "player_name" : players, 
                                         "salary" : salaries})
    return cur_page_salaries_df.astype(TYPES)

### Now we scrape the salary data we want into a dataframe. 

In [3]:
salaries_df = pd.DataFrame({"season" : [], "player_name" : [], "salary" : []})
salaries_df = salaries_df.astype(TYPES)
for year in range(EARLIEST_YEAR, (LATEST_YEAR + 1)):
    year_url = build_url(year)
    cur_page_number = 0
    while True:
        (next_page_url, year_parsed_response) = get_next_page(year_url, cur_page_number)
        if year_parsed_response is None:
            break
        cur_salary_df = extract_salaries(year_parsed_response, year)
        salaries_df = pd.concat([salaries_df, cur_salary_df], axis=0)
        cur_page_number += 1
        year_url = next_page_url
salaries_df

Unnamed: 0,season,player_name,salary
0,2013,Kobe Bryant,30453805.0
1,2013,Dirk Nowitzki,20907128.0
2,2013,Amar'e Stoudemire,19948799.0
3,2013,Joe Johnson,19752645.0
4,2013,Carmelo Anthony,19444503.0
...,...,...,...
16,2022,Juwan Morgan,19816.0
17,2022,Moses Brown,19186.0
18,2022,Xavier Sneed,8558.0
19,2022,Ish Wainright,5318.0


### Next, we read in the base data set we will augment.

In [4]:
base_df = pd.read_csv("/Users/ethaneason/college-projects/nba_salary_scraper/base_sheet.csv")
base_df = base_df.astype({"season" : int, "player_name" : str})
base_df

Unnamed: 0,season,ewins,nba_id,player_name,pos,team,oepm,depm,epm
0,2015,4.401080,201166,Aaron Brooks,PG,CHI,1.091550,-0.604692,0.48686
1,2014,1.513680,201166,Aaron Brooks,PG,DEN,-0.326643,-1.232800,-1.55944
2,2016,1.396600,201166,Aaron Brooks,PG,CHI,-0.540226,-0.551815,-1.09204
3,2013,0.782241,201166,Aaron Brooks,PG,HOU,-0.291699,-1.542300,-1.83400
4,2017,0.182092,201166,Aaron Brooks,PG,IND,-1.828710,-0.821934,-2.65065
...,...,...,...,...,...,...,...,...,...
5297,2020,2.073350,1629627,Zion Williamson,PF,NOP,2.400890,-0.788767,1.61213
5298,2022,,1629627,Zion Williamson,,NOP,,,
5299,2015,-0.013630,204054,Zoran Dragic,SG,MIA,-2.285350,-0.923961,-3.20931
5300,2022,-0.000080,1629597,Zylan Cheatham,SF,NOP,-2.662770,-0.419989,-3.08275


### Then we inject the scraped salary data into the base dataframe.

In [5]:
merge_cols = ['season', 'player_name']
base_with_salary_df = pd.merge(base_df, salaries_df, how='left', left_on=merge_cols, right_on=merge_cols)
base_with_salary_df

Unnamed: 0,season,ewins,nba_id,player_name,pos,team,oepm,depm,epm,salary
0,2015,4.401080,201166,Aaron Brooks,PG,CHI,1.091550,-0.604692,0.48686,915243.0
1,2014,1.513680,201166,Aaron Brooks,PG,DEN,-0.326643,-1.232800,-1.55944,884293.0
2,2016,1.396600,201166,Aaron Brooks,PG,CHI,-0.540226,-0.551815,-1.09204,2250000.0
3,2013,0.782241,201166,Aaron Brooks,PG,HOU,-0.291699,-1.542300,-1.83400,2400000.0
4,2017,0.182092,201166,Aaron Brooks,PG,IND,-1.828710,-0.821934,-2.65065,2700000.0
...,...,...,...,...,...,...,...,...,...,...
5696,2020,2.073350,1629627,Zion Williamson,PF,NOP,2.400890,-0.788767,1.61213,9757440.0
5697,2022,,1629627,Zion Williamson,,NOP,,,,10733400.0
5698,2015,-0.013630,204054,Zoran Dragic,SG,MIA,-2.285350,-0.923961,-3.20931,1706225.0
5699,2022,-0.000080,1629597,Zylan Cheatham,SF,NOP,-2.662770,-0.419989,-3.08275,


### Let's check to see how many rows are missing salaries. 

In [8]:
salaries_missing = base_with_salary_df['salary'].isna().sum()
print("Raw count: " + str(salaries_missing))
print("Percentage: " + str(salaries_missing / len(base_with_salary_df.index)))

Raw count: 684
Percentage: 0.11997895106121734


### Lastly, we write the augmented data to a csv.

In [7]:
base_with_salary_df.to_csv("/Users/ethaneason/college-projects/nba_salary_scraper/base_sheet_with_salaries.csv",
                           index=False)