# Scrape season totals for every player from Basketball Reference (basketball-reference.com)

In [7]:
import pandas as pd
import requests

from bs4 import BeautifulSoup

In [8]:
def parse_html_table(html_table):
    '''Group a flat sequence of stats-table cells into one row per player.

    A cell whose ``data-stat`` attribute is ``'player'`` starts a new row;
    every following cell is appended to that row until the next player cell.

    Parameters
    ----------
    html_table : iterable
        Cell elements that support ``ele['data-stat']``, ``ele['csk']`` (read
        on player cells only) and ``ele.string``.

    Returns
    -------
    (data, col_names)
        ``data`` is a list of rows, each a list of cell values whose first
        entry is the player cell's ``csk`` attribute.  ``col_names`` holds the
        ``data-stat`` names collected for the most recent row.  Both are
        empty lists when ``html_table`` yields no cells.
    '''
    data = []
    cur_row = []
    col_names = []  # bound up front so empty input returns ([], []) instead of raising
    for ele in html_table:
        stat_name = ele['data-stat']
        if stat_name == 'player':
            # A player cell closes the previous row (if any) and opens a new one.
            if cur_row:
                data.append(cur_row)
            # Use the 'csk' attribute rather than the cell text
            # (original note: "fixes weird asterisk error").
            cur_row = [ele['csk']]
            col_names = [stat_name]
            continue
        cur_row.append(ele.string)
        col_names.append(stat_name)
    # Flush the final row: no later player cell arrives to trigger the append above.
    if cur_row:
        data.append(cur_row)
    return data, col_names

In [9]:
# Loop through each season (1980 through 2016 inclusive), download that
# season's totals page, and collect one DataFrame per season in `dfs`.
dfs = []
for yr in range(1980, 2016 + 1):
    url = 'http://www.basketball-reference.com/leagues/NBA_{yr}_totals.html'.format(yr=yr)
    # A timeout keeps one stalled connection from hanging the whole loop.
    r = requests.get(url, timeout=30)
    # Stop on HTTP errors instead of parsing an error page as if it were data.
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')
    yr_data, col_names = parse_html_table(soup.find_all('td'))
    df = pd.DataFrame(yr_data, columns=col_names)
    df['yr'] = yr  # tag every row with the season it came from
    dfs.append(df)

In [10]:
# Stack the per-season frames into one table. ignore_index=True gives the
# result a fresh 0..N-1 index; otherwise each season's own 0-based index is
# kept and the combined frame (and the CSV's index column) has duplicate labels.
all_seasons = pd.concat(dfs, ignore_index=True)
# Write the combined table to a CSV file at this relative path.
all_seasons.to_csv('data/season_totals.csv')