## Data from 3 different DataSets:

<ol>
    <li>Totals: basic player stats from season 1980-81 to 2019-20
        <ul>
            <li>e.g.: <a href="https://www.basketball-reference.com/leagues/NBA_2020_totals.html">Basketball Reference Season 2019-20 Player Totals</a></li>
        </ul>
    </li>
    <li>Advanced: advanced player stats from season 1980-81 to 2019-20
        <ul>
            <li>e.g.: <a href="https://www.basketball-reference.com/leagues/NBA_2020_advanced.html">Basketball Reference Season 2019-20 Advanced Stats</a></li>
        </ul>
    </li>
    <li>MVP Race: MVP voting list from season 1980-81 to 2019-20
        <ul>
            <li>e.g.: <a href="https://www.basketball-reference.com/awards/awards_2020.html">Basketball Reference Season 2019-20 MVP Race</a></li>
        </ul>
    </li>
</ol>


All three datasets will be merged into one for the analysis. Team standings for each season are scraped as well.

### Libs

In [1]:
from pathlib import Path
from urllib.request import urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

### Scraping

In [2]:
def scrap(url, head_flag=0):
    """Download a page and return the cells of its table rows as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to download.
    head_flag : int, default 0
        Position (0 or 1) of the header row among the first two ``<tr>``
        elements of the page. Use 1 when the table starts with an extra
        over-header row.

    Returns
    -------
    pandas.DataFrame
        One row for every ``<tr>`` after the first one on the page, holding
        the text of its ``<td>`` cells as strings. The columns are the header
        labels without the first one. A ``<tr>`` that has no ``<td>`` cells is
        kept and ends up as a row without values, so callers that do not want
        those rows have to drop them.

    Raises
    ------
    IndexError
        If the page has no ``<tr>`` at position ``head_flag``.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        # Name the parser explicitly (the same one get_teams uses) so the
        # result does not depend on which parsers happen to be installed.
        soup = BeautifulSoup(response, features="lxml")

    header_row = soup.find_all('tr', limit=2)[head_flag]
    # Drop the first header: the ranking column is not needed for the
    # analysis, and data rows only contribute their <td> cells anyway.
    headers = [th.get_text() for th in header_row.find_all('th')][1:]

    # Skip the first <tr> of the page, which is always a header row.
    rows = soup.find_all('tr')[1:]
    player_stats = [[td.get_text() for td in row.find_all('td')] for row in rows]

    return pd.DataFrame(player_stats, columns=headers)

def get_teams(url):
    """Return the team names found among the ``<th>`` labels of a standings page.

    The text of every ``<th>`` inside the first ``<tr>`` of the page is read;
    labels that are column headers, division names or conference names are
    discarded and whatever remains is treated as a team name.

    Parameters
    ----------
    url : str
        Address of the standings page to download.

    Returns
    -------
    list of str
        Team names in page order, each cut at its first ``*`` (presumably the
        playoff marker appended to qualified teams -- TODO confirm).

    Raises
    ------
    IndexError
        If the page contains no ``<tr>`` element.
    """
    # Every label on the page that is known not to be a team name.
    non_team_labels = {
        'Atlantic Division', 'Central Division',
        'Southeast Division', 'Northwest Division',
        'Pacific Division', 'Southwest Division',
        'Midwest Division',
        'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS',
        'Eastern Conference', 'Western Conference',
    }

    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, features="lxml")

    first_row = soup.find_all('tr', limit=1)[0]
    titles = [th.get_text() for th in first_row.find_all('th')]

    return [title.split(sep='*')[0] for title in titles
            if title not in non_team_labels]

def scrap_teams(url, season, head_flag=0):
    """Download a season's standings page and return one row per team.

    Parameters
    ----------
    url : str
        Address of the standings page to download.
    season : int
        Season the page belongs to. For seasons after 2015 only the first
        32 table rows and the first 30 team names are used.
    head_flag : int, default 0
        Position (0 or 1) of the header row among the first two ``<tr>``
        elements of the page.

    Returns
    -------
    pandas.DataFrame
        The ``<td>`` texts of each table row (as strings) under the header
        labels without the first one, with rows that contain missing values
        removed, plus a ``Teams`` column filled from ``get_teams(url)``.

    Raises
    ------
    IndexError
        If the page has no ``<tr>`` at position ``head_flag``.
    ValueError
        Raised by pandas when the number of team names differs from the
        number of remaining rows.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        # Name the parser explicitly (the same one get_teams uses) so the
        # result does not depend on which parsers happen to be installed.
        soup = BeautifulSoup(response, features="lxml")

    header_row = soup.find_all('tr', limit=2)[head_flag]
    # Drop the first header: data rows only contribute their <td> cells,
    # so the label of the leading <th> column has no matching values.
    headers = [th.get_text() for th in header_row.find_all('th')][1:]

    # Skip the first <tr> of the page, which is always a header row.
    rows = soup.find_all('tr')[1:]
    team_stats = [[td.get_text() for td in row.find_all('td')] for row in rows]

    recent_layout = season > 2015
    if recent_layout:
        # Truncate BEFORE building the frame so rows further down the page
        # never reach pandas. Presumably 32 = 30 teams + 2 header rows of the
        # conference standings -- TODO confirm against the page layout.
        team_stats = team_stats[:32]

    # Rows built from a <tr> without <td> cells hold no values; drop them.
    df = (
        pd.DataFrame(team_stats, columns=headers)
        .dropna()
        .reset_index(drop=True)
    )

    # NOTE(review): get_teams downloads the same URL a second time.
    teams = get_teams(url)
    if recent_layout:
        teams = teams[:30]

    df['Teams'] = teams

    return df

In [3]:
# Folders that receive the scraped CSV files. They are created up front so
# that a fresh checkout can run this cell without failing inside to_csv.
OUTPUT_DIR = Path("./basketball_reference_dbs")
MVP_DIR = OUTPUT_DIR / "mvp"
TEAMS_DIR = OUTPUT_DIR / "teams"
for directory in (OUTPUT_DIR, MVP_DIR, TEAMS_DIR):
    directory.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://www.basketball-reference.com"

# A season is identified by the year it ends in (1981 -> 1980-81, 2020 -> 2019-20).
# NOTE(review): four pages are requested per season with no pause in between;
# if the site throttles rapid requests a delay may be needed -- TODO confirm.
for season in range(1981, 2021):

    df_totals = scrap(f"{BASE_URL}/leagues/NBA_{season}_totals.html")
    df_totals.to_csv(OUTPUT_DIR / f"{season}_totals.csv", index=False)

    df_advanced = scrap(f"{BASE_URL}/leagues/NBA_{season}_advanced.html")
    df_advanced.to_csv(OUTPUT_DIR / f"{season}_advanced.csv", index=False)

    # The awards table has an over-header row, so the column labels sit in
    # the second <tr> (head_flag=1). scrap() then returns that label row as
    # its first, value-less row; iloc[1:] removes it.
    df_mvp = scrap(f"{BASE_URL}/awards/awards_{season}.html", head_flag=1)
    df_mvp = df_mvp.iloc[1:]
    df_mvp.to_csv(MVP_DIR / f"{season}_mvp.csv", index=False)

    df_teams = scrap_teams(f"{BASE_URL}/leagues/NBA_{season}.html", season, head_flag=1)
    df_teams.to_csv(TEAMS_DIR / f"{season}_teams.csv", index=False)

    print('{} success'.format(season))

1981 success
1982 success
1983 success
1984 success
1985 success
1986 success
1987 success
1988 success
1989 success
1990 success
1991 success
1992 success
1993 success
1994 success
1995 success
1996 success
1997 success
1998 success
1999 success
2000 success
2001 success
2002 success
2003 success
2004 success
2005 success
2006 success
2007 success
2008 success
2009 success
2010 success
2011 success
2012 success
2013 success
2014 success
2015 success
2016 success
2017 success
2018 success
2019 success
2020 success


## Dataset Previews

### Totals

In [4]:
# Preview the totals file written for the 2019-20 season.
display(pd.read_csv("./basketball_reference_dbs/2020_totals.csv"))

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Steven Adams,C,26.0,OKC,63.0,63.0,1680.0,283.0,478.0,0.592,...,0.582,207.0,376.0,583.0,146.0,51.0,67.0,94.0,122.0,684.0
1,Bam Adebayo,PF,22.0,MIA,72.0,72.0,2417.0,440.0,790.0,0.557,...,0.691,176.0,559.0,735.0,368.0,82.0,93.0,204.0,182.0,1146.0
2,LaMarcus Aldridge,C,34.0,SAS,53.0,53.0,1754.0,391.0,793.0,0.493,...,0.827,103.0,289.0,392.0,129.0,36.0,87.0,74.0,128.0,1001.0
3,Kyle Alexander,C,23.0,MIA,2.0,0.0,13.0,1.0,2.0,0.500,...,,2.0,1.0,3.0,0.0,0.0,0.0,1.0,1.0,2.0
4,Nickeil Alexander-Walker,SG,21.0,NOP,47.0,1.0,591.0,98.0,266.0,0.368,...,0.676,9.0,75.0,84.0,89.0,17.0,8.0,54.0,57.0,267.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,Trae Young,PG,21.0,ATL,60.0,60.0,2120.0,546.0,1249.0,0.437,...,0.860,32.0,223.0,255.0,560.0,65.0,8.0,289.0,104.0,1778.0
673,Cody Zeller,C,27.0,CHO,58.0,39.0,1341.0,251.0,479.0,0.524,...,0.682,160.0,251.0,411.0,88.0,40.0,25.0,75.0,140.0,642.0
674,Tyler Zeller,C,30.0,SAS,2.0,0.0,4.0,1.0,4.0,0.250,...,,3.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0
675,Ante Žižić,C,23.0,CLE,22.0,0.0,221.0,41.0,72.0,0.569,...,0.737,18.0,48.0,66.0,6.0,7.0,5.0,10.0,27.0,96.0


### Advanced

In [5]:
# Preview the advanced-stats file written for the 2019-20 season.
display(pd.read_csv("./basketball_reference_dbs/2020_advanced.csv"))

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,Unnamed: 12,OWS,DWS,WS,WS/48,.1,OBPM,DBPM,BPM,VORP
0,Steven Adams,C,26.0,OKC,63.0,1680.0,20.5,0.604,0.006,0.421,...,,3.8,2.7,6.5,0.185,,1.9,1.1,2.9,2.1
1,Bam Adebayo,PF,22.0,MIA,72.0,2417.0,20.3,0.598,0.018,0.484,...,,4.6,3.9,8.5,0.168,,1.4,2.0,3.4,3.3
2,LaMarcus Aldridge,C,34.0,SAS,53.0,1754.0,19.7,0.571,0.198,0.241,...,,3.0,1.4,4.5,0.122,,1.8,-0.5,1.4,1.5
3,Kyle Alexander,C,23.0,MIA,2.0,13.0,4.7,0.500,0.000,0.000,...,,0.0,0.0,0.0,-0.003,,-6.1,-3.5,-9.6,0.0
4,Nickeil Alexander-Walker,SG,21.0,NOP,47.0,591.0,8.9,0.473,0.500,0.139,...,,-0.7,0.4,-0.2,-0.020,,-3.2,-1.4,-4.6,-0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,Trae Young,PG,21.0,ATL,60.0,2120.0,23.9,0.595,0.455,0.448,...,,5.3,0.6,5.9,0.133,,6.2,-2.3,3.9,3.1
673,Cody Zeller,C,27.0,CHO,58.0,1341.0,18.8,0.576,0.157,0.374,...,,2.3,1.3,3.6,0.129,,0.2,-0.8,-0.6,0.5
674,Tyler Zeller,C,30.0,SAS,2.0,4.0,22.4,0.250,0.000,0.000,...,,0.0,0.0,0.0,-0.075,,-0.3,-22.1,-22.4,0.0
675,Ante Žižić,C,23.0,CLE,22.0,221.0,16.4,0.597,0.000,0.264,...,,0.3,0.2,0.5,0.106,,-1.7,-1.5,-3.2,-0.1


### MVP

In [6]:
# Preview the MVP voting file written for the 2019-20 season.
display(pd.read_csv("./basketball_reference_dbs/mvp/2020_mvp.csv"))

Unnamed: 0,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,Giannis Antetokounmpo,25.0,MIL,85.0,962.0,1010.0,0.952,63.0,30.4,29.5,13.6,5.6,1.0,1.0,0.553,0.304,0.633,11.1,0.279
1,LeBron James,35.0,LAL,16.0,753.0,1010.0,0.746,67.0,34.6,25.3,7.8,10.2,1.2,0.5,0.493,0.348,0.693,9.8,0.204
2,James Harden,30.0,HOU,0.0,367.0,1010.0,0.363,68.0,36.5,34.3,6.6,7.5,1.8,0.9,0.444,0.355,0.865,13.1,0.254
3,Luka Dončić,20.0,DAL,0.0,200.0,1010.0,0.198,61.0,33.6,28.8,9.4,8.8,1.0,0.2,0.463,0.316,0.758,8.8,0.207
4,Kawhi Leonard,28.0,LAC,0.0,168.0,1010.0,0.166,57.0,32.4,27.1,7.1,4.9,1.8,0.6,0.47,0.378,0.886,8.7,0.226
5,Anthony Davis,26.0,LAL,0.0,82.0,1010.0,0.081,62.0,34.4,26.1,9.3,3.2,1.5,2.3,0.503,0.33,0.846,11.1,0.25
6,Chris Paul,34.0,OKC,0.0,26.0,1010.0,0.026,70.0,31.5,17.6,5.0,6.7,1.6,0.2,0.489,0.365,0.907,8.9,0.193
7,Damian Lillard,29.0,POR,0.0,23.0,1010.0,0.023,66.0,37.5,30.0,4.3,8.0,1.1,0.3,0.463,0.401,0.888,11.6,0.225
8,Nikola Jokić,24.0,DEN,0.0,18.0,1010.0,0.018,73.0,32.0,19.9,9.7,7.0,1.2,0.6,0.528,0.314,0.817,9.8,0.202
9,Pascal Siakam,25.0,TOR,0.0,17.0,1010.0,0.017,60.0,35.2,22.9,7.3,3.5,1.0,0.9,0.453,0.359,0.792,5.4,0.123


### Teams

In [7]:
# Preview the team standings file written for the 2019-20 season.
display(pd.read_csv("./basketball_reference_dbs/teams/2020_teams.csv"))

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Teams
0,56,17,0.767,—,118.7,108.6,9.41,Milwaukee Bucks
1,53,19,0.736,2.5,112.8,106.5,5.97,Toronto Raptors
2,48,24,0.667,7.5,113.7,107.3,5.83,Boston Celtics
3,45,28,0.616,11.0,109.4,107.5,1.63,Indiana Pacers
4,44,29,0.603,12.0,112.0,109.1,2.59,Miami Heat
5,43,30,0.589,13.0,110.7,108.4,2.25,Philadelphia 76ers
6,35,37,0.486,20.5,111.8,112.3,-1.01,Brooklyn Nets
7,33,40,0.452,23.0,107.3,108.3,-0.93,Orlando Magic
8,23,42,0.354,29.0,102.9,109.6,-7.03,Charlotte Hornets
9,25,47,0.347,30.5,114.4,119.1,-5.24,Washington Wizards
