In [1]:
import re
import pandas as pd
from unidecode import unidecode
from tqdm.notebook import tqdm
from datetime import datetime
from collections import Counter
# Show up to 30 columns when a DataFrame is rendered in the notebook.
pd.set_option('display.max_columns', 30)

### Reading tables from Basketball Reference

The following cell scrapes the Per Game and Advanced stats tables from Basketball-reference.com. The raw tables contain blank columns and header rows that repeat multiple times, so the data is cleaned with the following steps.

* Remove columns whose name contains "Unnamed".
* Remove the repeated header rows, i.e. rows where the `Player` column holds the literal value `Player`.
* Some player names contain accented Latin characters, which makes it hard to join Basketball Reference data with NBA.com data. Convert these names from Unicode to ASCII.
* Replace `%` with `_perc` in column names.
* Add a `Season` column.

In [2]:
# Advanced and per-game stats.
# `years` holds the season label written to the `Season` column for each
# Basketball Reference end-year in `end_years`; the two lists are paired positionally.
years = ['2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21']
end_years = ['2015', '2016', '2017', '2018', '2019', '2020', '2021']
for stat in tqdm(['per_game', 'advanced']):
    season_frames = []
    for year, season in zip(end_years, years):
        # Read the first HTML table on the page for this stat type and season.
        # The stat name is part of the URL so that each output file gets its own table.
        df = pd.read_html(f"https://www.basketball-reference.com/leagues/NBA_{year}_{stat}.html")[0]
        # Drop the blank spacer columns, which pandas names "Unnamed: N".
        drop_cols = [col for col in df.columns if re.search('unnamed', col.lower())]
        df = df.drop(columns=drop_cols)
        # Drop the header rows that are repeated inside the table body.
        df = df[df['Player'] != 'Player'].copy()
        # '%' in column names becomes '_perc'.
        df.columns = [col.replace('%', '_perc') for col in df.columns]
        # Transliterate accented characters in player names to plain ASCII.
        df['Player'] = df['Player'].apply(unidecode)
        df['Season'] = season
        season_frames.append(df)
    # Write all seasons for this stat type in one go, with a single header row.
    pd.concat(season_frames, ignore_index=True).to_csv(f"{stat}.csv", index=False)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




### Schedule games

In [2]:
def remove_dow(val):
    """Return everything after the first comma in ``val``.

    Any leading space after the comma is kept. If ``val`` has no comma,
    an empty string is returned.
    """
    _, _, remainder = val.partition(',')
    return remainder

def dow_extract(val):
    """Return the part of ``val`` before the first comma (all of ``val`` if there is none)."""
    head, _, _ = val.partition(',')
    return head

def mdy_to_ymd(d):
    """Reformat a date such as ``'Dec 22, 2020'`` as ``'12/22/2020'``.

    Despite the function name, the output is month/day/year (``%m/%d/%Y``).
    Raises ValueError if ``d`` does not match ``'%b %d, %Y'``.
    """
    parsed = datetime.strptime(d, '%b %d, %Y')
    return parsed.strftime('%m/%d/%Y')

In [9]:
# Scrape each month of the 2020-21 schedule and write the combined table to one CSV.
monthly_frames = []
for month in tqdm(['december', 'january', 'february', 'march', 'april', 'may']):
    url = f'https://www.basketball-reference.com/leagues/NBA_2021_games-{month}.html'
    df = pd.read_html(url)[0]
    df.columns = [col.lower() for col in df.columns]
    # Drop spacer columns ("unnamed: N") and any column whose name contains whitespace.
    # The pattern is a raw string: '\s' in a plain literal is an invalid escape sequence.
    drop_cols = [col for col in df.columns if re.search(r'unnamed|\s', col)]
    drop_cols += ['notes', 'attend.']
    df = df.drop(columns=drop_cols)
    # NOTE: 'vistor_pts' (sic) is kept as-is because it is the column name written to the CSV.
    df = df.rename(columns={'visitor/neutral':'visitor', 'home/neutral':'home', 'pts':'vistor_pts', 'pts.1':'home_pts'})
    # Split the raw date ("<day of week>, <Mon> <d>, <yyyy>") into a day-of-week
    # column and a date reformatted by mdy_to_ymd.
    df['dow'] = df.date.apply(dow_extract)
    df['date'] = df.date.apply(remove_dow).str.lstrip().apply(mdy_to_ymd)
    monthly_frames.append(df)
# A single write produces one header row and aligns columns by name across months.
pd.concat(monthly_frames, ignore_index=True).to_csv('2021_schedule.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [14]:
df_t = pd.read_csv('2021_schedule.csv')
# One row per distinct Sunday, in file order; the n-th Sunday is labelled "Week n".
sundays = df_t.loc[df_t['dow'] == 'Sun', ['date']].drop_duplicates().reset_index(drop=True)
sundays['week'] = [f'Week {n}' for n in range(1, len(sundays) + 1)]
df_t = df_t.merge(sundays, on='date', how='left')
# Non-Sunday games take the label of the next Sunday that follows them in the file.
# NOTE(review): this assumes the rows are in chronological order — confirm.
df_t['week'] = df_t['week'].bfill()
# Games after the last Sunday have nothing to back-fill from; label them as the following week
# instead of leaving the week empty.
df_t['week'] = df_t['week'].fillna(f'Week {len(sundays) + 1}')
df_t.to_csv('final_2021_schedule.csv', index=False)

Unnamed: 0,date,visitor,vistor_pts,home,home_pts,dow,week
0,12/22/2020,Golden State Warriors,99.0,Brooklyn Nets,125.0,Tue,Week 1
1,12/22/2020,Los Angeles Clippers,116.0,Los Angeles Lakers,109.0,Tue,Week 1
2,12/23/2020,Charlotte Hornets,114.0,Cleveland Cavaliers,121.0,Wed,Week 1
3,12/23/2020,New York Knicks,107.0,Indiana Pacers,121.0,Wed,Week 1
4,12/23/2020,Miami Heat,107.0,Orlando Magic,113.0,Wed,Week 1


### Bar chart: games per team in Week 1

In [34]:
# Count each team's Week 1 appearances as visitor and as home team, then combine the tallies.
week_one = df_t[df_t['week'] == 'Week 1']
visitor = Counter(week_one['visitor'].values)
home = Counter(week_one['home'].values)
game_count = home + visitor

In [39]:
# Two-column frame with one row per team: team name and its number of games.
data = pd.DataFrame(list(game_count.items()), columns=['Team', 'n_games'])

In [40]:
import plotly.express as px

In [50]:
 color_discrete_map={"Brooklyn Nets": '#000000', "Los Angeles Lakers": '#552781',"Cleveland Cavaliers": '#6F263D',
                    "Indiana Pacers": '#041F43', "Orlando Magic": '#287DC5', "Philadelphia 76ers": '#1560BD',
                    "Toronto Raptors": '#B52F25', 'Boston Celtics': '#3D7B35', 'Chicago Bulls': '#D5392E',
                    "Memphis Grizzlies": '#05274A', "Minnesota Timberwolves": '#236193', 'Denver Nuggets': '#0F2340',
                    'Portland Trail Blazers': '#000000', 'Phoenix Suns': '#1F1861', 'Miami Heat': '#000000',
                    'Milwaukee Bucks': '#2D5234', 'Charlotte Hornets': '#3B8DAA', 'Detroit Pistons': '#0C519A',
                    'Washington Wizards': '#C73531', 'New York Knicks': '#0C54A0', 'San Antonio Spurs': '#000000',
                    'Utah Jazz': '#00275E', 'Sacramento Kings': '#393997', 'Los Angeles Clippers': '#D73932',
                    'New Orleans Pelicans': '#0C2340', 'Golden State Warriors': '#0D529C', 'Alanta Hawks':'#DD3C3D',
                    'Dallas Mavericks': '#0157B8', 'Oklahoma City Thunder': '#297CC2', 'Houston Rockets': '#DA3A2F'}

In [59]:
# Horizontal bar chart of games per team, one bar per team, coloured from the team colour map.
x_axis = dict(title_text='Number of Games', tickvals=[1, 2, 3, 4])
fig = px.bar(
    data,
    x='n_games',
    y='Team',
    color='Team',
    orientation='h',
    color_discrete_map=color_discrete_map,
)
fig.update_layout(width=1000, height=800, xaxis=x_axis)
fig.show()

### NBA Standings

In [103]:
# Parse every HTML table found on the 2021 standings page into a list of DataFrames.
url = "https://www.basketball-reference.com/leagues/NBA_2021_standings.html"
tables = pd.read_html(io=url)

In [104]:
# The first two tables are used as the Eastern and Western conference standings.
# Both frames are modified in place (they remain the same objects as tables[0] and tables[1]):
# the conference-named column becomes `Team` and a `Conference` label is added.
east, west = tables[0], tables[1]
for frame, header, label in (
    (east, 'Eastern Conference', 'East'),
    (west, 'Western Conference', 'West'),
):
    frame.rename(columns={header: 'Team'}, inplace=True)
    frame['Conference'] = label

In [105]:
def conference_seed_extract(val):
    """Return the conference seed from a value such as ``'Philadelphia 76ers (1)'``.

    The seed is the run of digits inside a trailing ``(n)`` suffix and is
    returned as a string (``'1'``). ``None`` is returned when ``val`` is not a
    string or has no such suffix, so a team name that was already cleaned is
    never mistaken for a seed.
    """
    if not isinstance(val, str):
        return None
    match = re.search(r'\((\d+)\)\s*$', val)
    return match.group(1) if match else None
def remove_seed(val):
    """Return the team name from a value such as ``'Philadelphia 76ers (1)'``.

    Only a trailing ``(n)`` seed suffix is removed, so a name without a seed
    is returned intact rather than losing its last word. Runs of whitespace
    in the result are collapsed to single spaces.
    """
    without_seed = re.sub(r'\s*\(\d+\)\s*$', '', val)
    return ' '.join(without_seed.split())

In [106]:
# Stack the two conference frames that were renamed and labelled above. They are referenced by
# name rather than as tables[0]/tables[1], so the dependency on that cell is explicit.
# ignore_index gives every team a unique row label instead of repeating each conference's own.
standing = pd.concat([east, west], ignore_index=True)

In [107]:
# Split the scraped `Team` value into a `Rank` column (the seed) and the bare team name.
# Both are derived from the original `Team` values before that column is overwritten.
team_with_seed = standing['Team']
standing = standing.assign(
    Rank=team_with_seed.apply(conference_seed_extract),
    Team=team_with_seed.apply(remove_seed),
)

In [108]:
# Display the combined standings table, now with the `Conference` and `Rank` columns.
standing

Unnamed: 0,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,Conference,Rank
0,Philadelphia 76ers,39,17,0.696,—,113.7,108.4,5.26,East,1
1,Brooklyn Nets,38,18,0.679,1.0,119.0,114.3,4.18,East,2
2,Milwaukee Bucks,35,20,0.636,3.5,119.2,112.7,6.22,East,3
3,Atlanta Hawks,30,26,0.536,9.0,113.2,111.3,1.91,East,4
4,Boston Celtics,30,26,0.536,9.0,112.6,110.5,2.4,East,5
5,New York Knicks,30,27,0.526,9.5,105.8,104.3,0.99,East,6
6,Miami Heat,28,28,0.5,11.0,106.3,107.7,-1.23,East,7
7,Charlotte Hornets,27,28,0.491,11.5,110.3,111.7,-1.32,East,8
8,Indiana Pacers,26,29,0.473,12.5,113.8,114.2,-0.01,East,9
9,Toronto Raptors,23,34,0.404,16.5,112.0,111.4,0.09,East,10


In [109]:
# Save the cleaned standings to disk without the row index.
standing.to_csv(path_or_buf='2021_standings.csv', index=False)

In [97]:
# Attach the standings columns to the per-team game counts; only teams present in both are kept.
merged = pd.merge(data, standing, on='Team', how='inner')

In [100]:
# Scatter plot of each team's game count against its SRS, coloured with the team colour map.
px.scatter(
    data_frame=merged,
    x='n_games',
    y='SRS',
    color='Team',
    color_discrete_map=color_discrete_map,
)

In [102]:
# NOTE(review): the saved output of this cell shows a lowercase `conference` column, while the
# cell that labels the conferences creates `Conference`; its execution count (102) also precedes
# the cells that build `standing`. The output appears to come from an earlier run — re-run the
# notebook from top to bottom to refresh it.
standing

Unnamed: 0,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,conference,Rank
0,Philadelphia 76ers,39,17,0.696,—,113.7,108.4,5.26,East,1
1,Brooklyn Nets,38,18,0.679,1.0,119.0,114.3,4.18,East,2
2,Milwaukee Bucks,35,20,0.636,3.5,119.2,112.7,6.22,East,3
3,Atlanta Hawks,30,26,0.536,9.0,113.2,111.3,1.91,East,4
4,Boston Celtics,30,26,0.536,9.0,112.6,110.5,2.4,East,5
5,New York Knicks,30,27,0.526,9.5,105.8,104.3,0.99,East,6
6,Miami Heat,28,28,0.5,11.0,106.3,107.7,-1.23,East,7
7,Charlotte Hornets,27,28,0.491,11.5,110.3,111.7,-1.32,East,8
8,Indiana Pacers,26,29,0.473,12.5,113.8,114.2,-0.01,East,9
9,Toronto Raptors,23,34,0.404,16.5,112.0,111.4,0.09,East,10
