In [1]:
import re
import pandas as pd
from unidecode import unidecode
from tqdm.notebook import tqdm
from datetime import datetime
from collections import Counter
import plotly.express as px
pd.options.display.max_columns = 30

### Reading tables from Basketball Reference

The following scrapes the Per Game and Advanced stats tables from Basketball-Reference.com. The raw tables contain blank columns and header rows that repeat multiple times. To clean the data, the following steps are performed.

* Remove columns whose name contains "unnamed".
* Remove rows where the `Player` column contains the literal value `Player` (repeated header rows).
* Certain names contain accented Latin characters, which makes it hard to join Basketball Reference data to NBA.com data. Convert the Unicode names to ASCII.
* Replace `%` with `_perc` in column names.
* Add a `Season` column.

In [2]:
# Advanced and per-game player stats, one CSV per stat type covering all seasons.
# `years` holds the season label written to the `Season` column; each label is
# paired with the season-ending year that Basketball Reference uses in its URLs.
years = ['2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21']
season_end_years = ['2015', '2016', '2017', '2018', '2019', '2020', '2021']
for stat in tqdm(['per_game', 'advanced']):
    season_frames = []
    for season, year in zip(years, season_end_years):
        # Read the HTML table for this stat type (the page name must follow `stat`,
        # otherwise both output files end up holding the advanced table).
        df = pd.read_html(f"https://www.basketball-reference.com/leagues/NBA_{year}_{stat}.html")[0]
        # Drop the blank spacer columns that pandas names "Unnamed: N".
        drop_cols = [col for col in df.columns if re.search('unnamed', col.lower())]
        df = df.drop(columns=drop_cols)
        # Drop the header rows that are repeated inside the table body.
        df = df.loc[df['Player'] != 'Player'].copy()
        df.columns = [re.sub('%', '_perc', col) for col in df.columns]
        # Transliterate accented characters so names can be matched against ASCII sources.
        df['Player'] = df['Player'].apply(unidecode)
        df['Season'] = season
        season_frames.append(df)
    # A single concat aligns columns by name across seasons; appending header-less
    # rows to the CSV would silently misalign them if a season's columns differed.
    pd.concat(season_frames, ignore_index=True).to_csv(f"{stat}.csv", index=False)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




### Schedule games

In [20]:
def remove_dow(val):
    """Return everything after the first comma of ``val``.

    For a value such as ``'Tue, Dec 22, 2020'`` this strips the leading
    day-of-week token and keeps the rest (including the leading space).
    Returns an empty string when ``val`` contains no comma.
    """
    _, _, remainder = val.partition(',')
    return remainder

def dow_extract(val):
    """Return the part of ``val`` before its first comma.

    For ``'Tue, Dec 22, 2020'`` this is the day-of-week token ``'Tue'``.
    When there is no comma the whole string is returned unchanged.
    """
    head, _, _ = val.partition(',')
    return head

def mdy_to_ymd(d):
    """Reformat a date like ``'Dec 22, 2020'`` as ``'12/22/2020'``.

    Despite the function name, the output is month/day/year (``%m/%d/%Y``).
    Raises ``ValueError`` if ``d`` does not match ``'%b %d, %Y'``.
    """
    parsed = datetime.strptime(d, '%b %d, %Y')
    return parsed.strftime('%m/%d/%Y')

In [151]:
# Scrape the 2020-21 schedule one month at a time, normalise the columns and
# dates, tag every game with a week label, and write the result to CSV.
monthly_frames = []
for month in tqdm(['december', 'january', 'february', 'march', 'april', 'may']):
    url = f'https://www.basketball-reference.com/leagues/NBA_2021_games-{month}.html'
    df = pd.read_html(url)[0]
    df.columns = [col.lower() for col in df.columns]
    # Drop spacer columns ("unnamed: N") and any column whose name contains whitespace.
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    drop_cols = [col for col in list(df.columns) if re.search(r'unnamed|\s', col.lower())]
    drop_cols += ['notes', 'attend.']
    df = df.drop(columns=drop_cols)
    # 'pts' / 'pts.1' are the first and second points columns of the table.
    df = df.rename(columns={'visitor/neutral':'visitor', 'home/neutral':'home', 'pts':'vistor_pts', 'pts.1':'home_pts'})
    # The raw date looks like "<dow>, <Mon> <day>, <year>": split off the
    # day-of-week, then reformat the remainder as MM/DD/YYYY.
    df['dow'] = df.date.apply(dow_extract)
    df['date'] = df.date.apply(remove_dow).str.lstrip().apply(mdy_to_ymd)
    monthly_frames.append(df)
# Concatenate once instead of growing a DataFrame inside the loop.
data = pd.concat(monthly_frames)

# Each distinct Sunday (in order of appearance) closes a week: "Week 1", "Week 2", ...
temp = data.loc[data['dow'] == 'Sun', ['dow', 'date']].drop_duplicates().reset_index(drop=True)
temp['week'] = [f'Week {number}' for number in range(1, len(temp) + 1)]
data = data.merge(temp[['date', 'week']], on='date', how='left')
# Non-Sunday games take the label of the next Sunday below them; games after
# the last Sunday stay NaN. `Series.bfill()` replaces the deprecated
# `fillna(method='bfill')`.
data['week'] = data['week'].bfill()
data.to_csv('2021_schedule.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [89]:
# Build a two-sided matchup list for Week 17: every game appears twice, once as
# played and once with the teams swapped, so each team shows up in the
# 'visitor' column for every game it plays and its opponent in 'home'.
# NOTE(review): `df_t` is not defined anywhere in the visible notebook -
# presumably the schedule frame with a 'week' column; confirm before Run All.
temp = df_t[df_t['week'] == 'Week 17']
t1 = temp.loc[:, ['visitor', 'home']]
t2 = temp.loc[:, ['home', 'visitor']].rename(columns={'visitor': 'home', 'home': 'visitor'})
test = pd.concat([t1, t2])

In [147]:
# Per-team point differential (points scored per game minus points allowed per game).
# NOTE(review): `standing` is built in a cell that appears later in this file -
# this cell depends on that one having been run first.
standing['diff'] = standing['PS/G'] - standing['PA/G']

# Attach the opponent's ('home' column) differential to every matchup row, then
# summarise per team: number of games and mean opponent differential.
merged = test.merge(standing[['Team', 'diff']], left_on='home', right_on='Team', how='inner')
merged = (
    merged[['visitor', 'Team', 'diff']]
    .groupby(by='visitor')
    .agg({'Team': 'count', 'diff': 'mean'})
    # The game count is cast to str so it can be used as a discrete value.
    .astype({'Team': 'str'})
    .reset_index()
    .sort_values('visitor')
    .assign(visitor=lambda frame: frame['visitor'].astype('category'))
)

In [116]:
# Team name -> hex colour string, one entry per team.
# NOTE(review): not referenced by the plotting cell visible in this file.
color_discrete_map = {
    'Brooklyn Nets': '#000000',
    'Los Angeles Lakers': '#552781',
    'Cleveland Cavaliers': '#6F263D',
    'Indiana Pacers': '#F6BA33',
    'Orlando Magic': '#287DC5',
    'Philadelphia 76ers': '#1560BD',
    'Toronto Raptors': '#B52F25',
    'Boston Celtics': '#55AA62',
    'Chicago Bulls': '#D5392E',
    'Memphis Grizzlies': '#05274A',
    'Minnesota Timberwolves': '#236193',
    'Denver Nuggets': '#F7C133',
    'Portland Trail Blazers': '#000000',
    'Phoenix Suns': '#1F1861',
    'Miami Heat': '#000000',
    'Milwaukee Bucks': '#2D5234',
    'Charlotte Hornets': '#3B8DAA',
    'Detroit Pistons': '#0C519A',
    'Washington Wizards': '#C73531',
    'New York Knicks': '#EE8133',
    'San Antonio Spurs': '#000000',
    'Utah Jazz': '#00275E',
    'Sacramento Kings': '#393997',
    'Los Angeles Clippers': '#D73932',
    'New Orleans Pelicans': '#0C2340',
    'Golden State Warriors': '#0D529C',
    'Atlanta Hawks': '#DD3C3D',
    'Dallas Mavericks': '#0157B8',
    'Oklahoma City Thunder': '#297CC2',
    'Houston Rockets': '#DA3A2F',
}

In [149]:
# One horizontal bar per team. Bar length is the mean point differential of the
# team's opponents (`diff`); bar colour is the number of games (`Team` holds the
# game count as a string after the aggregation step).
fig = px.bar(merged, x='diff', y='visitor', color='Team', orientation='h', text='diff')
# The x-axis shows the differential, not a game count, so "Number of Games"
# labels the colour legend instead of the axis.
fig.update_layout(width=800, height=650,
                  xaxis=dict(title_text='Average Opponent Point Differential'),
                  yaxis=dict(title_text='Team'),
                  legend_title_text='Number of Games',
                  plot_bgcolor='white', margin={'l': 0, 'b': 0, 't': 30, 'r': 0},
                  title='Weekly Team Matchups Difficulty')

### NBA Standing Data
* Used by the schedule tab to show how good teams are and where they stand.

In [2]:
# Download every HTML table on the 2020-21 standings page; `tables` is the
# list of DataFrames returned by pandas, in page order.
url = 'https://www.basketball-reference.com/leagues/NBA_2021_standings.html'
tables = pd.read_html(io=url)

In [3]:
# The first two tables are used as the Eastern and Western conference standings.
# `east` / `west` are aliases of tables[0] / tables[1], and the edits below are
# deliberately in place: the later standings cell concatenates tables[0] and
# tables[1] directly and relies on their 'Team' column.
east, west = tables[0], tables[1]
for frame, header, label in (
    (east, 'Eastern Conference', 'East'),
    (west, 'Western Conference', 'West'),
):
    # The team-name column is headed by the conference name; normalise it to 'Team'.
    frame.rename(columns={header: 'Team'}, inplace=True)
    frame['Conference'] = label

In [4]:
def conference_seed_extract(val):
    """Return the last whitespace-separated token of ``val`` without parentheses.

    For ``'Utah Jazz (1)'`` this yields ``'1'``. Raises ``IndexError`` when
    ``val`` is empty or whitespace-only.
    """
    tokens = val.split()
    last_token = tokens[-1]
    return last_token.strip('()')
def remove_seed(val):
    """Return ``val`` with its last whitespace-separated token removed.

    For ``'Utah Jazz (1)'`` this yields ``'Utah Jazz'``. Runs of whitespace
    between the remaining tokens collapse to single spaces; a single-token or
    empty input yields ``''``.
    """
    tokens = val.split()
    name_tokens = tokens[:-1]
    return ' '.join(name_tokens)

In [18]:
# Stack the two conference tables, split the "<team name> (<seed>)" value of
# 'Team' into a clean team name plus a 'Rank' column, and save to CSV.
# Relies on tables[0] / tables[1] already carrying the 'Team' column added by
# the in-place rename in the conference-labelling cell.
standing = pd.concat(tables[:2]).assign(
    # Rank is computed first, while 'Team' still contains the seed suffix.
    Rank=lambda frame: frame['Team'].apply(conference_seed_extract),
    Team=lambda frame: frame['Team'].apply(remove_seed),
)
standing.to_csv('2021_standings.csv', index=False)