# 1 Purpose

[Salary Cap History](https://www.basketball-reference.com/contracts/salary-cap-history.html "Basketball Reference")

### 1.0.1 Imports

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# 2 Salary Cap History

### 2.0.1 From csv

In [3]:
# Load the raw salary-cap table; the cap amounts arrive as text such as
# "$3,600,000" and are converted to numbers in the cleaning step.
# NOTE(review): presumably this CSV is the export of the Basketball Reference
# salary cap history page linked in the Purpose section — confirm.
salary_cap = pd.read_csv('../data/interim/sportsref_download.csv', sep = ',')

# Preview the first rows to check the column layout.
salary_cap.head()

Unnamed: 0,year,salary_cap,2015_dollars
0,1984,"$3,600,000","$7,934,034"
1,1985,"$4,233,000","$9,153,509"
2,1986,"$4,945,000","$10,317,292"
3,1987,"$6,164,000","$12,354,015"
4,1988,"$7,232,000","$13,829,137"


## 2.1 Cleaning

The only cleaning needed here is removing the first character (the dollar sign) and the thousands separators from the 2nd and 3rd columns before converting them to a numeric type. We won't be accounting for inflation, so we can simply drop the 3rd column entirely.

In [4]:
# Build the cleaned cap table in one chain: discard the inflation-adjusted
# column, then strip the leading character and the comma separators from the
# nominal cap so it can be parsed as a number.
cap_clean = salary_cap.drop(columns='2015_dollars').assign(
    salary_cap=lambda frame: pd.to_numeric(
        frame['salary_cap'].str[1:].str.replace(',', '')
    )
)

# Confirm that both remaining columns are numeric and fully populated.
cap_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 2 columns):
year          37 non-null int64
salary_cap    37 non-null int64
dtypes: int64(2)
memory usage: 672.0 bytes


In [11]:
# Destination for the cleaned salary-cap table.
file_loc = '../data/interim/salary_cap_history.csv'

# `index` is left at pandas' default, so the row index is written as an
# unnamed first column of the CSV.
cap_clean.to_csv(file_loc)

# 3 Player Salaries

### 3.0.1 Imports

In [6]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

## 3.1 Webscrape

[Player Salaries](https://hoopshype.com/salaries/players/2018-2019)

In [7]:
def scrape_player_salaries(season_start):
    """Download one season of player salaries from hoopshype.

    Parameters
    ----------
    season_start : int
        First calendar year of the season; ``2018`` requests the
        ``2018-2019`` page.

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row with columns ``name`` (second cell),
        ``salary`` (third cell parsed as a number) and ``year``
        (``season_start``).

    Raises
    ------
    ValueError
        If the downloaded page contains no ``<table>`` element.
    IndexError
        If a data row has fewer than three ``<td>`` cells.
    """
    url = "https://hoopshype.com/salaries/players/{}-{}/".format(
        season_start, season_start + 1)

    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response, 'html.parser')

    # Fail loudly instead of silently returning an empty season when the
    # page layout is not what we expect.
    if soup.find('table') is None:
        raise ValueError("no salary table found at {}".format(url))

    # Skip the first <tr> (presumably the header row) and collect the text
    # of every cell in the remaining rows.
    cells = [[td.get_text() for td in tr.find_all('td')]
             for tr in soup.find_all('tr')[1:]]

    # strip() removes the layout newlines/tabs around each value regardless
    # of how many the site emits.
    names = [row[1].strip() for row in cells]
    amounts = [row[2].replace('$', '').replace(',', '').strip()
               for row in cells]
    money = list(pd.to_numeric(amounts))

    return pd.DataFrame({'name': names,
                         'salary': money,
                         'year': season_start})


# One frame per season, for the seasons starting in 2000 through 2019.
money_dfs = [scrape_player_salaries(season_start)
             for season_start in tqdm(range(2000, 2020))]

# The per-season row index is kept as-is, so index labels repeat across seasons.
final = pd.concat(money_dfs)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:48<00:00,  1.27s/it]


In [8]:
# (rows, columns) of the combined player-salary table across all scraped
# seasons; the three columns are name, salary and year.
final.shape

(9267, 3)

In [9]:
# Destination for the combined player-salary table.
file_loc = '../data/interim/salaries.csv'

# `index` is left at pandas' default, so the row index is written as an
# unnamed first column of the CSV.
final.to_csv(file_loc)

# 4 Team Spending

In [10]:
def scrape_team_payrolls(season_start):
    """Download one season of team payrolls from hoopshype.

    Parameters
    ----------
    season_start : int
        First calendar year of the season; ``2018`` requests the
        ``2018-2019`` page.

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row with columns ``Team`` (second cell),
        ``Payroll`` (third cell parsed as a number, NaN when the row has no
        third cell or the cell is empty) and ``year`` (``season_start``).

    Raises
    ------
    ValueError
        If the downloaded page contains no ``<table>`` element.
    IndexError
        If a data row has fewer than two ``<td>`` cells.
    """
    url = "https://hoopshype.com/salaries/{}-{}/".format(
        season_start, season_start + 1)

    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response, 'html.parser')

    # Fail loudly instead of silently returning an empty season when the
    # page layout is not what we expect.
    if soup.find('table') is None:
        raise ValueError("no payroll table found at {}".format(url))

    # Skip the first <tr> (presumably the header row) and collect the text
    # of every cell in the remaining rows.
    cells = [[td.get_text() for td in tr.find_all('td')]
             for tr in soup.find_all('tr')[1:]]

    # strip() removes the layout newlines/tabs around each value regardless
    # of how many the site emits.
    teams = [row[1].strip() for row in cells]

    # Rows without a payroll cell get an empty string, which
    # pd.to_numeric converts to NaN.
    amounts = [row[2].replace('$', '').replace(',', '').strip()
               if len(row) > 2 else ''
               for row in cells]
    payroll = list(pd.to_numeric(amounts))

    return pd.DataFrame({'Team': teams,
                         'Payroll': payroll,
                         'year': season_start})


# One frame per season, for the seasons starting in 2000 through 2018.
spending_dfs = [scrape_team_payrolls(season_start)
                for season_start in tqdm(range(2000, 2019))]

# The per-season row index is kept as-is, so index labels repeat across seasons.
spending = pd.concat(spending_dfs)

100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:18<00:00,  1.20it/s]


## 4.1 Wrangling and Cleaning

In [12]:
# Attach each season's salary cap to every team payroll row (inner join on
# the shared `year` column).
spend_df = spending.merge(cap_clean, on='year')

In [13]:
# Payroll expressed as a multiple of that season's salary cap, to two decimals.
payroll_to_cap = spend_df['Payroll'] / spend_df['salary_cap']
spend_df['portion'] = payroll_to_cap.round(2)

In [14]:
# Lookup from the team label used in the scraped payroll table to a
# three-letter team code.
# NOTE(review): each label maps to a single code for every season (e.g.
# 'Charlotte' -> 'CHH', 'New Orleans' -> 'NOH', 'Brooklyn' -> 'BRK'); confirm
# these match the codes used by whatever data this column is joined against.
team_codes = {
    'Portland': 'POR',
    'New York': 'NYK',
    'Miami': 'MIA',
    'Brooklyn': 'BRK',
    'Washington': 'WAS',
    'LA Lakers': 'LAL',
    'Milwaukee': 'MIL',
    'San Antonio': 'SAS',
    'Indiana': 'IND',
    'Phoenix': 'PHO',
    'Utah': 'UTA',
    'Dallas': 'DAL',
    'Denver': 'DEN',
    'Oklahoma City': 'OKC',
    'Boston': 'BOS',
    'Philadelphia': 'PHI',
    'Cleveland': 'CLE',
    'Houston': 'HOU',
    'Memphis': 'MEM',
    'Minnesota': 'MIN',
    'Charlotte': 'CHH',
    'Sacramento': 'SAC',
    'Golden State': 'GSW',
    'Detroit': 'DET',
    'Atlanta': 'ATL',
    'Toronto': 'TOR',
    'Orlando': 'ORL',
    'Chicago': 'CHI',
    'LA Clippers': 'LAC',
    'New Orleans': 'NOH',
}

# Labels missing from the lookup become NaN in `Tm`.
spend_df['Tm'] = spend_df['Team'].map(team_codes)

In [15]:
# Check dtypes and null counts. Payroll (and therefore portion) can be null
# for rows whose payroll cell was missing or empty when scraped.
spend_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 570 entries, 0 to 569
Data columns (total 6 columns):
Team          570 non-null object
Payroll       566 non-null float64
year          570 non-null int64
salary_cap    570 non-null int64
portion       566 non-null float64
Tm            570 non-null object
dtypes: float64(2), int64(2), object(2)
memory usage: 31.2+ KB


In [16]:
# Destination for the merged team-spending table.
file_loc = '../data/processed/spend_history.csv'

# `index` is left at pandas' default, so the row index is written as an
# unnamed first column of the CSV.
spend_df.to_csv(file_loc)