In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from urllib.request import urlopen
import time

import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
years = list(range(2004, 2022))

## Functions

In [4]:
def get_NBA_year_stats(year):

    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"

    r = requests.get(url)
    r_html = r.text
    soup = BeautifulSoup(r_html,'html.parser')
    table=soup.find_all(class_="full_table")
    
    
    # Extracting List of column names
    head=soup.find(class_="thead")
    column_names_raw=[head.text for item in head][0]
    column_names_polished=column_names_raw.replace("\n",",").split(",")[2:-1]
    
    
    players=[]
    
    for i in range(len(table)):

        player_=[]

        for td in table[i].find_all("td"):
            player_.append(td.text)

        players.append(player_)


    df=pd.DataFrame(players, columns=column_names_polished)
    #cleaning the player's name from occasional special characters
    df.Player=df.Player.str.replace('*', '')
    
    df = df[['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'OWS', 'DWS', 'WS', 'WS/48', 'VORP']]
    df.Age = df.Age.astype(float)
    df.G = df.G.astype(float)
    df.MP = df.MP.astype(float)
    df.OWS = df.OWS.astype(float)
    df.DWS = df.DWS.astype(float)
    df.WS = df.WS.astype(float)
    df['WS/48'] = df['WS/48']
    df.VORP = df.VORP
    
    return df





In [5]:
def scrape_draft_data(year):
    
    draft_columns=['Pk', 'Tm', 'Player', 'College', 'Yrs', 'G', 
                   'MP', 'PTS', 'TRB', 'AST', 'FG%', '3P%', 'FT%', 
                   'MP', 'PTS', 'TRB', 'AST', 'WS', 'WS/48', 'BPM', 'VORP']
    final_df=pd.DataFrame(columns=draft_columns)
    

    url = f"https://www.basketball-reference.com/draft/NBA_{year}.html"
    html = urlopen(url)
    soup= BeautifulSoup(html, features = 'lxml')
    # get rows from table
    rows = soup.findAll('tr')[0:]
    rows_data = [[td.getText() for td in rows[i].findAll('td')]
                 for i in range(len(rows))]
    #placing data in a DataFrame
    df_1 = pd.DataFrame(rows_data,columns=draft_columns)

    final_df=pd.concat([final_df,df_1])
    final_df.dropna(axis = 0, subset = ['Player'], inplace = True)
    final_df['Draft_Year'] = year
    return final_df

In [6]:
def get_NBA_year_salary(year):
    year2 = year+1
    url = f"https://hoopshype.com/salaries/players/{year}-{year2}/"
    html = urlopen(url)
    soup= BeautifulSoup(html, features = 'lxml')
    # get rows from table
    rows = soup.findAll('tr')[0:]
    rows_data = [[td.getText().strip() for td in rows[i].findAll('td')]
                 for i in range(len(rows))]
    headers = rows_data.pop(0)
    df = pd.DataFrame(rows_data, columns=headers)
    df.drop(columns=df.columns[[0, 3]], axis=1,  inplace=True)
    df.columns = ['Player', 'Salary']
    df['Salary'] = df['Salary'].str.replace(r'\$', '')
    df['Salary'] = df['Salary'].str.replace(r',', '')
    df['Salary'] = df['Salary'].astype(float)
    df['Year'] = year
    return df

In [7]:
all_data = pd.DataFrame()
draft_data = pd.DataFrame()
for i in years:
    print(i)
    data = get_NBA_year_stats(i).merge(get_NBA_year_salary(i), 
                                       how = 'outer', on = 'Player')
    draft = scrape_draft_data(i)
    all_data = pd.concat([all_data, data], ignore_index= True)
    draft_data = pd.concat([draft_data, draft], ignore_index= True)
    time.sleep(60)

all_data = all_data.merge(draft_data, how = 'outer', on = 'Player')

2004


  


2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021


In [8]:
all_data

Unnamed: 0,Player,Pos,Age,Tm_x,G_x,MP_x,OWS,DWS,WS_x,WS/48_x,VORP_x,Salary,Year,Pk,Tm_y,College,Yrs,G_y,MP_y,PTS,TRB,AST,FG%,3P%,FT%,MP_y.1,PTS.1,TRB.1,AST.1,WS_y,WS/48_y,BPM,VORP_y,Draft_Year
0,Shareef Abdur-Rahim,PF,27.0,TOT,85.0,2684.0,6.1,2.0,8.1,.145,2.4,14625000.0,2004.0,,,,,,,,,,,,,,,,,,,,,
1,Shareef Abdur-Rahim,PF,28.0,POR,54.0,1867.0,3.9,1.5,5.4,.139,1.4,5000000.0,2005.0,,,,,,,,,,,,,,,,,,,,,
2,Shareef Abdur-Rahim,PF,29.0,SAC,72.0,1961.0,4.0,2.2,6.2,.152,1.4,5400000.0,2006.0,,,,,,,,,,,,,,,,,,,,,
3,Shareef Abdur-Rahim,C,30.0,SAC,80.0,2015.0,1.1,1.7,2.8,.067,-0.2,5800000.0,2007.0,,,,,,,,,,,,,,,,,,,,,
4,Shareef Abdur-Rahim,PF,31.0,SAC,6.0,51.0,0.0,0.0,0.0,-0.005,-0.1,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11503,Marcus Zegarowski,,,,,,,,,,,,,49,BRK,Creighton,,,,,,,,,,,,,,,,,,2021.0
11504,Filip Petrušev,,,,,,,,,,,,,50,PHI,Gonzaga,,,,,,,,,,,,,,,,,,2021.0
11505,Brandon Boston Jr.,,,,,,,,,,,,,51,MEM,Kentucky,2,73,1008,486,141,71,.394,.329,.800,13.8,6.7,1.9,1.0,0.6,.030,-3.0,-0.3,2021.0
11506,Balša Koprivica,,,,,,,,,,,,,57,CHO,Florida State,,,,,,,,,,,,,,,,,,2021.0


In [9]:
all_data.dropna(subset=['Salary','Year']).reset_index(drop=True)

Unnamed: 0,Player,Pos,Age,Tm_x,G_x,MP_x,OWS,DWS,WS_x,WS/48_x,VORP_x,Salary,Year,Pk,Tm_y,College,Yrs,G_y,MP_y,PTS,TRB,AST,FG%,3P%,FT%,MP_y.1,PTS.1,TRB.1,AST.1,WS_y,WS/48_y,BPM,VORP_y,Draft_Year
0,Shareef Abdur-Rahim,PF,27.0,TOT,85.0,2684.0,6.1,2.0,8.1,.145,2.4,14625000.0,2004.0,,,,,,,,,,,,,,,,,,,,,
1,Shareef Abdur-Rahim,PF,28.0,POR,54.0,1867.0,3.9,1.5,5.4,.139,1.4,5000000.0,2005.0,,,,,,,,,,,,,,,,,,,,,
2,Shareef Abdur-Rahim,PF,29.0,SAC,72.0,1961.0,4.0,2.2,6.2,.152,1.4,5400000.0,2006.0,,,,,,,,,,,,,,,,,,,,,
3,Shareef Abdur-Rahim,C,30.0,SAC,80.0,2015.0,1.1,1.7,2.8,.067,-0.2,5800000.0,2007.0,,,,,,,,,,,,,,,,,,,,,
4,Shareef Abdur-Rahim,,,,,,,,,,,6600000.0,2009.0,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9221,Jordan Schakel,,,,,,,,,,,53176.0,2021.0,,,,,,,,,,,,,,,,,,,,,
9222,Jordan Goodwin,,,,,,,,,,,53176.0,2021.0,,,,,,,,,,,,,,,,,,,,,
9223,Craig Sword,,,,,,,,,,,53176.0,2021.0,,,,,,,,,,,,,,,,,,,,,
9224,Jaime Echenique,,,,,,,,,,,53176.0,2021.0,,,,,,,,,,,,,,,,,,,,,


In [10]:
9226/18

512.5555555555555

In [None]:
# http://bkref.com/pi/shareit/jjQt1