# Appendix A - Scrape & Build NBA Salary Dataset
The goal of this notebook is to build the dataset that our course uses, ahead of time. The data cleaning is done in the course itself; this notebook only creates the raw dataset.

In [1]:
# %pip install requests requests-html matplotlib pandas

In [2]:
import datetime
from decimal import Decimal
import matplotlib.pyplot as plt
import requests
from requests_html import HTML
import pandas as pd
import pathlib
import time

In [3]:
# Two directory levels above the current working directory.
BASE_DIR = pathlib.Path('.').resolve().parent.parent
# Folder holding the course material, and the datasets folder inside it.
COURSES_DIR = BASE_DIR.joinpath('course')
DATASET_PATH = COURSES_DIR.joinpath('datasets')
# CSV file the finished salary dataset is written to.
OUTPUT_PATH = DATASET_PATH.joinpath('nba-historical-salaries.csv')
# Switch that enables the scraping loop further down.
PERFROM_SCRAPE = True

In [6]:
# Sanity check: the output CSV is written beneath this directory, so it should exist.
COURSES_DIR.exists()

True

For this dataset, we use `hoopshype.com`'s record of player salaries.

In [3]:
# Common URL prefix of the per-season salary pages; the season range
# (e.g. "1990-1991") is appended to it when building each lookup URL.
base_url = 'https://hoopshype.com/salaries/players/'

`hoopshype.com`'s salary data starts in the 1990-1991 season.

In [4]:
# Start year of the first season to scrape (1990 means the 1990-1991 season).
year_start = 1990

End scraping at last year's season (this year might not be available).

In [5]:
# Start year of the last season to scrape: the calendar year before today's.
year_end = datetime.date.today().year - 1
year_end

2020

In [7]:
# Pause between lookups so our scraping doesn't overload hoopshype.com.
REQUEST_DELAY_SECONDS = 1.2
# Abort a stalled request instead of letting the notebook hang forever.
REQUEST_TIMEOUT_SECONDS = 30


def _scrape_season(year):
    """Download and parse the salary table for the season starting in `year`.

    Parameters
    ----------
    year : int
        Calendar year in which the season starts (1990 -> "1990-1991").

    Returns
    -------
    pandas.DataFrame or None
        A frame with the columns ``player``, ``salary``, ``adj_salary``,
        ``year-start`` and ``year-end``; ``None`` when the page holds no
        usable salary table (no table, no data rows, or unexpected columns).

    Raises
    ------
    requests.RequestException
        On network failures, timeouts, or a non-success HTTP status.
    """
    # NBA season spans 2 different calendar years
    year_range = f"{year}-{year+1}"
    # the lookup salary url is based on the above range
    url = f"{base_url}{year_range}/"
    # print year and url for manual review
    print(year, url)
    # perform lookup; fail loudly on HTTP errors rather than parsing an error page
    r = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    r.raise_for_status()
    # Convert response html text to a parsable object
    html = HTML(html=r.text)
    # Find the first table on the page (expected to be the salary table)
    table = html.find('table', first=True)
    if table is None:
        return None
    # Walk the table's child sections and collect the stripped text of every
    # cell, one list per row.
    table_data = [
        [col.text_content().strip() for col in tr.getchildren()]
        for el in table.element.getchildren()
        for tr in el.getchildren()
    ]
    # We need a header row plus at least one data row.
    if len(table_data) < 2:
        return None
    init_df = pd.DataFrame(table_data)
    # use the first row as the header
    new_header = init_df.iloc[0]
    # the salary and adjusted-salary columns are read from positions 2 and 3
    if len(new_header) < 4:
        return None
    # use everything after the first row as our dataset
    init_df = init_df[1:]
    init_df.columns = new_header
    # The two salary column titles vary per page, so they are looked up by
    # position and mapped to fixed names.
    renamed_cols = {
        "Player": 'player',
        f"{new_header.iloc[2]}": "salary",
        f"{new_header.iloc[3]}": "adj_salary",
    }
    init_df = init_df.rename(columns=renamed_cols)
    wanted_cols = ['player', 'salary', 'adj_salary']
    if any(col not in init_df.columns for col in wanted_cols):
        return None
    # Copy after selecting so the year columns are added to an independent frame.
    df = init_df[wanted_cols].copy()
    # update dataset with year values
    df['year-start'] = year
    df['year-end'] = year + 1
    return df


# One dataframe per successfully scraped season.
dfs = []
if PERFROM_SCRAPE:
    for year in range(year_start, year_end+1):
        try:
            season_df = _scrape_season(year)
        except requests.RequestException as exc:
            # A single unavailable season should not abort the whole scrape.
            print(f"skipped {year}: request failed ({exc})")
        else:
            if season_df is None:
                print(f"skipped {year}: no usable salary table found")
            else:
                # append this dataset to our group of datasets
                dfs.append(season_df)
        finally:
            # Always throttle, including after a skipped or failed season.
            time.sleep(REQUEST_DELAY_SECONDS)

1990 https://hoopshype.com/salaries/players/1990-1991/
1991 https://hoopshype.com/salaries/players/1991-1992/
1992 https://hoopshype.com/salaries/players/1992-1993/
1993 https://hoopshype.com/salaries/players/1993-1994/
1994 https://hoopshype.com/salaries/players/1994-1995/
1995 https://hoopshype.com/salaries/players/1995-1996/
1996 https://hoopshype.com/salaries/players/1996-1997/
1997 https://hoopshype.com/salaries/players/1997-1998/
1998 https://hoopshype.com/salaries/players/1998-1999/
1999 https://hoopshype.com/salaries/players/1999-2000/
2000 https://hoopshype.com/salaries/players/2000-2001/
2001 https://hoopshype.com/salaries/players/2001-2002/
2002 https://hoopshype.com/salaries/players/2002-2003/
2003 https://hoopshype.com/salaries/players/2003-2004/
2004 https://hoopshype.com/salaries/players/2004-2005/
2005 https://hoopshype.com/salaries/players/2005-2006/
2006 https://hoopshype.com/salaries/players/2006-2007/
2007 https://hoopshype.com/salaries/players/2007-2008/
2008 https

Combine our list of dataframes (i.e., one per season) into a single dataset with `pandas.concat`.

In [12]:
if dfs:
    # Stack the per-season frames and renumber the rows 0..n-1 in one step.
    dataset_df = pd.concat(dfs, ignore_index=True)
elif OUTPUT_PATH.exists():
    # Nothing was scraped (scraping disabled or every season skipped):
    # reuse the previously saved dataset so the notebook still runs end to end.
    dataset_df = pd.read_csv(OUTPUT_PATH)
else:
    raise ValueError(
        "No season data was scraped and no saved dataset exists at "
        f"{OUTPUT_PATH}; enable scraping to build the dataset."
    )
dataset_df.shape

(14549, 5)

Save the dataset as a CSV file in our course's datasets directory.

In [14]:
# Create the datasets folder if it is missing. `parents` is deliberately left
# off so a wrong working directory (missing course folder) still fails loudly
# instead of creating a stray directory tree.
DATASET_PATH.mkdir(exist_ok=True)
# index=False: the row index is just 0..n-1 and carries no information.
dataset_df.to_csv(OUTPUT_PATH, index=False)