In [1]:
import os
import time
import urllib.request
from collections import defaultdict

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
def clean_bkref_dataframe(df, year=2019):
    """Clean a DataFrame loaded from a bkref CSV file.

    The ``Unnamed: 29`` and ``Rk`` columns are dropped when present, only the
    first row for each ``Player`` value is kept, a constant ``Year`` column is
    added, and the ``Player`` field is split at its first backslash into
    ``Player`` (text before it) and ``player_id`` (text after it). The input
    frame is not modified.

    Args:
        df: Frame with at least a string ``Player`` column.
        year: Value written to the ``Year`` column of every row (default 2019).

    Returns:
        A new DataFrame with a fresh 0..n-1 index. ``player_id`` is NaN for
        rows whose ``Player`` value contains no backslash.
    """
    cleaned = (
        # errors='ignore' lets exports that lack either bookkeeping column through.
        df.drop(columns=['Unnamed: 29', 'Rk'], errors='ignore')
        .drop_duplicates(subset=['Player'])
        .reset_index(drop=True)
    )
    cleaned['Year'] = year

    # n=1 caps the split at two pieces; reindex guarantees both pieces exist as
    # columns even when no row contains a backslash at all.
    name_and_id = (
        cleaned['Player']
        .str.split('\\', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    cleaned['Player'] = name_and_id[0]
    cleaned['player_id'] = name_and_id[1]
    return cleaned

In [12]:
# Load the team table, discard the 'Unnamed: 0' column (presumably an index
# saved with the CSV -- TODO confirm), and keep only seasons from 1980 onward.
team_df = (
    pd.read_csv("data/nba-api-team-stats.csv")
    .drop(columns=['Unnamed: 0'])
    .loc[lambda frame: frame.YEAR >= 1980]
)

# Relabel every column by position. The first two become 'Tm' and 'Year', the
# keys the later merges join on.
team_df.columns = [
    'Tm', 'Year', 'GP', 'WINS', 'LOSSES', 'WIN_PCT', 'CONF_RANK', 'DIV_RANK',
    'PO_WINS', 'PO_LOSSES', 'CONF_COUNT', 'DIV_COUNT', 'NBA_FINALS_APPEARANCE',
    'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
    'OREB', 'DREB', 'REB', 'AST', 'PF', 'STL', 'TOV', 'BLK', 'PTS', 'PTS_RANK',
]
team_df.to_json("data/team-stats.json")

In [13]:
# Inner-join each player row to the row of the matching team and season,
# then persist the combined table.
player_df = pd.read_json("data/1980-2018-per100-labeled.json")
player_team_combined = player_df.merge(team_df, on=['Tm', 'Year'])
player_team_combined.to_json("data/combined-player-team-stats.json")

Using BeautifulSoup to parse basketball-reference.com per-possession stats

In [21]:
def get_df_from_html(html_file):
    """Parse the first HTML table in ``html_file`` into a DataFrame.

    Each ``<tr>`` in the table's ``<tbody>`` becomes one row built from the
    stripped text of its ``<td>`` cells; rows without any ``<td>`` are skipped.
    Column names are the stripped text of the ``<th>`` cells in ``<thead>``,
    minus the ``Rk`` label. Every value is left as a string.

    Args:
        html_file: Path to a saved HTML page.

    Returns:
        A DataFrame of strings, one row per data row of the table.

    Raises:
        ValueError: If the page has no ``<table>``, or the table lacks a
            ``<thead>`` or ``<tbody>``.
    """
    # Decode explicitly rather than with the platform default so non-ASCII
    # names read the same on every OS. Assumes the saved page is UTF-8 --
    # TODO confirm.
    with open(html_file, encoding='utf-8') as html_doc:
        soup = BeautifulSoup(html_doc, 'html.parser')

    table = soup.find("table")
    if table is None:
        raise ValueError(f"no <table> element found in {html_file!r}")
    table_body = table.find('tbody')
    table_head = table.find('thead')
    if table_body is None or table_head is None:
        raise ValueError(f"table in {html_file!r} is missing <thead> or <tbody>")

    table_data = []
    for row in table_body.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        # Rows with no <td> cells (e.g. header rows repeated inside the body)
        # would otherwise turn into rows made entirely of None.
        if cells:
            table_data.append(cells)

    header_data = [cell.text.strip() for cell in table_head.find_all('th')]
    # Only <td> cells are collected above, so the 'Rk' header has no matching
    # data column -- presumably the rank cell of each body row is a <th>.
    if 'Rk' in header_data:
        header_data.remove('Rk')

    if not table_data:
        # Nothing to label by position; return an empty, correctly named frame.
        return pd.DataFrame(columns=header_data)

    df = pd.DataFrame(table_data)
    df.columns = header_data
    return df

def process_soupy_df(df, yr, min_minutes=300):
    """Tag a scraped stats frame with its season, then filter and fill it.

    The frame gets a ``Year`` column, is written to
    ``data/scraped-player-data-{yr}.json`` and read straight back, has empty
    strings replaced by NaN, is reduced to rows with ``MP`` above
    ``min_minutes``, and finally has every NaN filled with 0. The caller's
    frame is not modified.

    Args:
        df: Scraped frame; must contain an ``MP`` column.
        yr: Season written to the ``Year`` column and used in the file name.
        min_minutes: Rows are kept only when ``MP`` is strictly greater than
            this value (default 300).

    Returns:
        The filtered frame with no missing values.
    """
    scratch_path = f"data/scraped-player-data-{yr}.json"

    # assign() works on a copy, so the caller's frame keeps its columns.
    tagged = df.assign(Year=yr)

    # Round-trip through JSON: presumably this is here so pd.read_json's
    # default dtype inference turns the scraped text (MP in particular) into
    # numbers that can be compared below.
    tagged.to_json(scratch_path)
    cleaned = pd.read_json(scratch_path)

    # replace() returns a new frame; the result has to be kept, otherwise the
    # blanks survive and fillna(0) below never touches them.
    cleaned = cleaned.replace('', np.nan)
    cleaned = cleaned.loc[cleaned.MP > min_minutes]
    return cleaned.fillna(0)

Business happens down here. We scrape 2019 stats and combine them with 2019 team stats.

In [22]:
yr = 2019
team_df = pd.read_json("data/team-stats.json")

# Save this season's per-possession page to disk, then parse the saved copy.
stats_url = f"https://www.basketball-reference.com/leagues/NBA_{yr}_per_poss.html"
stats_fname = f"data/{yr}_per_poss.html"
urllib.request.urlretrieve(stats_url, filename=stats_fname)
scraped_player_df = process_soupy_df(get_df_from_html(stats_fname), yr)

# Inner-join the scraped player rows to the team rows for the same team and year.
combined_19 = scraped_player_df.merge(team_df, on=['Tm', 'Year'])
combined_19.to_json("data/2019-combined.json")

Get advanced stat HTML files

In [23]:
# Make sure an advanced-stats page is on disk for every season from 1980 up to
# `yr`, since the step that builds `adv_stat_df` opens one file per season.
# Pages already saved are reused; missing ones are downloaded, pausing before
# each request. The loop variable is deliberately not `yr`, so the current
# season stays bound for the cells that follow.
adv_url_template = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html"
for season in range(1980, yr):
    adv_fname = f"data/{season}_adv.html"
    if not os.path.exists(adv_fname):
        time.sleep(5)
        urllib.request.urlretrieve(adv_url_template.format(season), filename=adv_fname)

# Always re-download the current season so its page is up to date.
urllib.request.urlretrieve(adv_url_template.format(yr), filename=f"data/{yr}_adv.html")

('data/2019_adv.html', <http.client.HTTPMessage at 0x1178a2668>)

In [24]:
# Parse every season's saved advanced-stats page and stack the results.
# Frames are collected first and concatenated once: DataFrame.append was
# removed in pandas 2.0, and appending inside the loop re-copies all the rows
# gathered so far on every iteration.
season_frames = []
for y in range(1980, yr+1):
    curr_df = get_df_from_html(f"data/{y}_adv.html")
    curr_df['Year'] = y
    season_frames.append(curr_df)

# pd.concat rejects an empty list, so fall back to an empty frame when the
# season range is empty.
adv_stat_df = pd.concat(season_frames, sort=False) if season_frames else pd.DataFrame()

In [25]:
# Columns to carry forward from the advanced table: the three merge keys
# followed by the advanced metrics.
adv_columns = [
    'Player', 'Year', 'Tm',
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
]
adv_stat_df = adv_stat_df[adv_columns]
# Optional snapshot of the trimmed advanced table (left disabled):
#adv_stat_df.to_json("data/advanced-stats.json")

# Inner-join onto the player+team table. The result is written to the same
# path as that table, so the file on disk is replaced by the wider one.
adv_combined = player_team_combined.merge(adv_stat_df, on=['Player', 'Year', 'Tm'])
adv_combined.to_json("data/combined-player-team-stats.json")

In [26]:
merge_keys = ['Player', 'Year', 'Tm']
pdf_19 = pd.read_json("data/2019-combined.json")

# This cell overwrites the file it reads. If it has already run, the file
# already holds the advanced columns and merging again would duplicate them
# with _x/_y suffixes, so drop any that are present before merging.
# NOTE(review): on a first run none are expected to be present -- confirm the
# per-possession and team tables share no column names with the advanced table.
stale_adv_cols = [
    col for col in adv_stat_df.columns
    if col not in merge_keys and col in pdf_19.columns
]
pdf_19 = pdf_19.drop(columns=stale_adv_cols)

adv_combined_19 = pd.merge(pdf_19, adv_stat_df, on=merge_keys)
adv_combined_19.to_json("data/2019-combined.json")