In [2]:
import io
import os
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

Get links to every team's page

In [6]:
%%time
def get_teams(soup):
    '''Yield the href of every team-related link on the NBA teams page.

    Walks the anchors returned by ``soup.find_all('a')`` in order and yields
    each href that contains the substring 'teams'. Iteration stops right
    after the anchor whose text is 'Washington Wizards' has been handled
    (its own href is still yielded when it matches).

    Anchors that carry no ``href`` attribute are skipped instead of raising
    ``TypeError``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed teams page; only ``find_all('a')`` is used.

    Yields
    ------
    str
        Matching href values (presumably of the form '/teams/ATL/' --
        confirm against the live page).
    '''
    for link in soup.find_all('a'):
        href = link.get('href')
        # Named anchors have no href; `'teams' in None` would raise TypeError.
        if href is not None and 'teams' in href:
            yield href
        if link.text == 'Washington Wizards':
            break

# Download and parse the teams index page. The HTTP response is closed as
# soon as BeautifulSoup has consumed it instead of being left open.
site = 'https://www.basketball-reference.com/teams/'
with urllib.request.urlopen(site) as page:
    soup = BeautifulSoup(page, 'lxml')

# The first match is dropped.
# NOTE(review): presumably it is the navigation link to the teams index
# itself rather than a franchise page -- confirm against the live page.
teamlinks = tuple(get_teams(soup))[1:]

CPU times: user 77.1 ms, sys: 4.1 ms, total: 81.2 ms
Wall time: 1.16 s


From `teamlinks`, get every rostered player in the league.

In [17]:
%%time
def get_players(teams, year):
    '''Yield ``(name, href)`` for every player listed on each team's season page.

    For each entry of ``teams`` the page
    ``https://www.basketball-reference.com<team><year>.html`` is downloaded
    and every anchor inside the first ``<table>`` whose href contains
    'players' is yielded. Duplicates are not removed.

    Three franchise codes are remapped depending on the season:

    * NJN -> BRK for years after 2012
    * CHA -> CHO for years after 2014, CHH for years before 2003
    * NOH -> NOP for years after 2013

    This is only set up to work on seasons from 2009-10 on. There may be
    errors going further back.

    Parameters
    ----------
    teams : iterable of str
        Team hrefs as produced by ``get_teams`` (assumed to look like
        '/teams/ATL/').
    year : int or str
        Season year used in the page name, e.g. 2015.

    Yields
    ------
    tuple of (str, str)
        Anchor text and href of each player link.

    Raises
    ------
    ValueError
        If ``year`` cannot be interpreted as an integer.
    '''
    year = str(year)
    # Compare seasons numerically; comparing the strings only happens to
    # work while every year has exactly four digits.
    season = int(year)
    base = 'https://www.basketball-reference.com'

    for team in teams:
        link = base + team + year + '.html'

        # The team href carries one code per franchise, while the season
        # page uses whichever code the branches below select for that year.
        if 'NJN' in link:
            code = 'BRK' if season > 2012 else 'NJN'
            link = base + '/teams/' + code + '/' + year + '.html'
        if 'CHA' in link:
            if season > 2014:
                code = 'CHO'
            elif season < 2003:
                code = 'CHH'
            else:
                code = 'CHA'
            # The original `else` branch was a bare string expression with
            # no assignment; every branch now sets the link explicitly.
            link = base + '/teams/' + code + '/' + year + '.html'
        if 'NOH' in link:
            code = 'NOP' if season > 2013 else 'NOH'
            link = base + '/teams/' + code + '/' + year + '.html'

        # Close the response once the page has been parsed.
        with urllib.request.urlopen(link) as page:
            soup = BeautifulSoup(page, 'lxml')

        for anchor in soup.find_all('table')[0].find_all('a'):
            href = anchor.get('href')
            # Skip anchors without an href instead of raising KeyError.
            if href is not None and 'players' in href:
                yield (anchor.text, href)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.2 µs


In [18]:
%%timeit
player_set = set()
for year in range(2010,2020):
    new_set = set(get_players(teamlinks, year))
    if len(player_set) == 0:
        player_set = new_set
    else:
        player_set.update(new_set)
    print(year, len(player_set))

2010 442
2011 523
2012 611
2013 692
2014 770
2015 852
2016 926
2017 1014
2018 1133
2019 1238
CPU times: user 23.8 s, sys: 1.99 s, total: 25.8 s
Wall time: 15min 40s


Get season-by-season stats (2009-10 onward) for each player

In [23]:
def get_stats(player_set, start_year):
    """Scrape the season rows of every player and return them as one frame.

    For each ``(name, href)`` pair the player's page is downloaded once, the
    first table on the page is loaded with ``pd.read_html`` and then:

    * the 'Career' row is removed,
    * rows where ``Tm`` equals ``Lg`` are removed (seasons the player did
      not play),
    * rows without an ``Age`` are removed (per-team total rows),
    * ``Season`` is trimmed to its first four characters (the starting
      year), and
    * seasons whose starting year is earlier than ``start_year`` are dropped.

    Players whose page contains no table are skipped (the original author's
    note says these are rookies).

    Parameters
    ----------
    player_set : iterable of (str, str)
        ``(player name, href)`` pairs; the href is appended to
        'https://www.basketball-reference.com'.
    start_year : int or str
        Earliest season start year to keep, e.g. 2009 keeps 2009-10 onward.

    Returns
    -------
    pandas.DataFrame
        All kept rows with the extra columns 'Height' (total inches,
        assuming the page gives height as 'feet-inches'), 'href' and
        'Player'. An empty frame when no player produced any rows.
    """
    first_season = str(start_year)
    frames = []

    for player in player_set:
        reference_site = 'https://www.basketball-reference.com' + player[1]
        # Fetch the page once and reuse the bytes for both parsers; the
        # response is closed as soon as it has been read.
        with urllib.request.urlopen(reference_site) as page:
            html = page.read()
        soup = BeautifulSoup(html, 'lxml')

        # No table on the page -> nothing to collect for this player.
        if soup.table is None:
            continue

        info = soup.find_all('div', {'id': 'info'})[0]
        raw_height = str(info.find_all('span', {'itemprop': 'height'})[0].string)
        height_parts = raw_height.split('-')
        height = int(height_parts[0]) * 12 + int(height_parts[1])

        player_df = pd.read_html(io.BytesIO(html))[0]
        player_df['Height'] = height
        player_df['href'] = player[1]
        player_df['Player'] = player[0]

        keep = (
            (player_df['Season'] != 'Career')        # career totals
            & (player_df['Tm'] != player_df['Lg'])   # seasons not played
            & player_df['Age'].notna()               # per-team total rows
        )
        # copy() so the Season assignment below acts on an independent frame.
        player_df = player_df[keep].copy()
        # Keep only the starting year of the season label.
        player_df['Season'] = player_df['Season'].str[:4]
        player_df = player_df[player_df['Season'] >= first_season]

        frames.append(player_df)
        # Progress report every 50 scraped players.
        if len(frames) % 50 == 0:
            print(len(frames), player_df.Player.unique())

    if not frames:
        return pd.DataFrame()
    # A single concat instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [56]:
%%time
all_season_df = pd.DataFrame()
# Per-player frames are collected here and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call.
season_frames = []
x = 1  # one more than the number of players whose page had a stats table
for player in player_set:
    reference_site = 'https://www.basketball-reference.com' + player[1]
    # Download the page once, close the response, and reuse the bytes for
    # both BeautifulSoup and pandas.
    with urllib.request.urlopen(reference_site) as page:
        html = page.read()
    soup = BeautifulSoup(html, 'lxml')
    per_game_table = soup.table

    # Excludes rookies from the table (their page has no table at all)
    if per_game_table is None:
        continue

    info = soup.find_all('div', {'id': 'info'})[0]
    raw_height = str(info.find_all('span', {'itemprop': 'height'})[0].string)
    height_parts = raw_height.split('-')
    # Assumes a 'feet-inches' string; stored as total inches.
    height = int(height_parts[0]) * 12 + int(height_parts[1])

    player_df = pd.read_html(io.BytesIO(html))[0]
    player_df['Height'] = height
    player_df['href'] = player[1]
    player_df['Player'] = player[0]

    keep = (
        (player_df['Season'] != 'Career')        # removes career totals
        & (player_df['Tm'] != player_df['Lg'])   # removes seasons where players did not play
        & player_df['Age'].notna()               # remove totals for each team player played on
    )
    # copy() so the Season assignment below acts on an independent frame
    # rather than on a slice of the raw table.
    player_df = player_df[keep].copy()
    # trim season stat to be the starting year
    player_df['Season'] = player_df['Season'].str[:4]
    # removes seasons before 2009-10
    player_df = player_df[player_df['Season'] > '2008']

    season_frames.append(player_df)
    x += 1

# pd.concat raises on an empty list, so keep the empty frame in that case.
if season_frames:
    all_season_df = pd.concat(season_frames)

1 ['Isaiah Taylor']
2 ['Frank Jackson']
3 ['Oleksiy Pecherov']
4 ['Maxi Kleber']
5 ['Steven Adams']
6 ['Sasha Kaun']
7 ['Grant Jerrett']
8 ['Johan Petro']
9 ['Deron Williams']
10 ['Ed Davis']
11 ['Andre Drummond']
12 ['Marc Gasol']
13 ['Joe Young']
14 ['Tahjere McCall']
15 ['D.J. White']
16 ['Shai Gilgeous-Alexander']
17 ['Tim Frazier']
18 ['Mirza Teletović']
19 ['Nenad Krstić']
20 ['Sherron Collins']
21 ['Zhaire Smith']
22 ['Tyler Davis']
23 ['Ersan İlyasova']
24 ['Earl Boykins']
25 ['Allen Crabbe']
26 ['Gary Payton']
27 ['Juan Hernangómez']
28 ['Willie Reed']
29 ["De'Aaron Fox"]
30 ['DeShawn Stevenson']
31 ['Larry Owens']
32 ['Austin Daye']
33 ['Antoine Wright']
34 ['Jae Crowder']
35 ['Kay Felder']
36 ['Christian Wood']
37 ['Keita Bates-Diop']
38 ['Mickaël Gelabale']
39 ['Marvin Williams']
40 ['Derrick Rose']
41 ['Will Bynum']
42 ['Tyus Jones']
43 ['Raul Neto']
44 ['Lauri Markkanen']
45 ['Andrés Nocioni']
46 ['Vernon Macklin']
47 ['Derrick Brown']
48 ['Matthew Dellavedova']
49 ['Jame

384 ['Ricky Rubio']
385 ['Jermaine Taylor']
386 ['Jared Sullinger']
387 ['Jeremy Lamb']
388 ['Mike Bibby']
389 ['Dwayne Jones']
390 ['Edwin Ubiles']
391 ['Lance Stephenson']
392 ['Jerel McNeal']
393 ['Bam Adebayo']
394 ['Matt Costello']
395 ['Henry Walker']
396 ['Kristaps Porziņģis']
397 ['Chris Kaman']
398 ['Mike Scott']
399 ['Spencer Dinwiddie']
401 ['Trenton Hassell']
402 ['Karl-Anthony Towns']
403 ['Mitchell Robinson']
404 ['Khem Birch']
405 ['Coby Karl']
406 ['James Anderson']
407 ['Nemanja Nedović']
408 ['Royce White']
409 ['Tomáš Satoranský']
410 ['Channing Frye']
411 ['Joel Embiid']
412 ['Johnathan Williams']
413 ['Jorge Gutiérrez']
414 ['Trae Young']
415 ['Rodney Carney']
416 ['Jamil Wilson']
417 ['Jeremy Lin']
418 ['Melvin Frazier']
419 ['Sonny Weems']
420 ['Kevon Looney']
421 ['Gerald Wallace']
422 ['Yogi Ferrell']
423 ['Xavier Silas']
424 ['Draymond Green']
425 ['Sviatoslav Mykhailiuk']
426 ['Tyshawn Taylor']
427 ['Nigel Hayes']
428 ['Ben Gordon']
429 ['Pops Mensah-Bonsu']


766 ['Marcus Camby']
767 ['Courtney Lee']
768 ['Nerlens Noel']
769 ['Jakob Poeltl']
770 ['Marcin Gortat']
771 ['John Wall']
772 ['Chandler Parsons']
773 ['Ryan Reid']
774 ['Wendell Carter']
775 ['Wesley Matthews']
776 ['Jerryd Bayless']
777 ['Joe Smith']
778 ['Blake Griffin']
779 ['Josh Jackson']
780 ['Jarell Martin']
781 ['Earl Watson']
782 ['Hasheem Thabeet']
783 ['Isaiah Briscoe']
784 ['Kurt Thomas']
785 ['Jerome Robinson']
786 ['Elie Okobo']
787 ['Corey Maggette']
788 ['Cedric Jackson']
789 ['Derrick Williams']
790 ['Jamal Murray']
791 ['Will Conroy']
792 ['Kawhi Leonard']
793 ['Alex Len']
794 ['Erik McCree']
795 ['Brad Miller']
796 ['Bryce Cotton']
797 ['Darius Miller']
798 ['Jawun Evans']
799 ['Mike Harris']
801 ['Josh Boone']
802 ['Gerald Green']
803 ['Mario West']
804 ['Toney Douglas']
805 ['Mike Miller']
806 ['Ben Bentil']
807 ['David Wear']
808 ['Jarrett Allen']
809 ['T.J. Warren']
810 ['DeMar DeRozan']
811 ['Andrew Harrison']
812 ['Dirk Nowitzki']
813 ['Malik Beasley']
814 [

1143 ['Trey Lyles']
1144 ['Paul George']
1145 ['Nate Wolters']
1146 ['Pat Connaughton']
1147 ['Omari Spellman']
1148 ['Diante Garrett']
1149 ['Dwight Buycks']
1151 ['Jordan Sibert']
1152 ['Alonzo Gee']
1153 ['Nick Collison']
1154 ['Arinze Onuaku']
1155 ['Dionte Christmas']
1156 ['Reggie Jackson']
1157 ['Luc Mbah a Moute']
1158 ['Luigi Datome']
1159 ['Jack Cooley']
1160 ['Jamaal Franklin']
1161 ['Tyler Johnson']
1162 ['Pero Antić']
1163 ['Amir Johnson']
1164 ['Chris Johnson']
1165 ['Kyle Wiltjer']
1166 ['Álex Abrines']
1167 ['Rodney Stuckey']
1168 ['Wayne Ellington']
1169 ['Marcus Cousin']
1170 ['Gary Harris']
1171 ['Emmanuel Mudiay']
1172 ['Terrence Ross']
1173 ['Tony Parker']
1174 ['Eddy Curry']
1175 ['James Harden']
1176 ['Trevon Duval']
1177 ['Langston Galloway']
1178 ['Darrun Hilliard']
1179 ['Aaron Jackson']
1180 ['Marcus Derrickson']
1181 ['Anderson Varejão']
1182 ['Jarron Collins']
1183 ["Amar'e Stoudemire"]
1184 ['Dion Waiters']
1185 ['Keith Appling']
1186 ['Jerome Dyson']
1187

In [57]:
# Rebind instead of mutating in place: the per-player row labels are replaced
# by a fresh 0..n-1 index and the old labels are discarded (drop=True).
all_season_df = all_season_df.reset_index(drop=True)

In [58]:
# to_csv does not create missing directories, so make sure Data/ exists.
os.makedirs('Data', exist_ok=True)
all_season_df.to_csv('Data/all_season.csv')

In [68]:
# Write one "(name, href)" tuple per line.
# The encoding is fixed to UTF-8 because player names contain non-ASCII
# characters, which the platform default encoding may be unable to write.
# Sorting makes the file identical from run to run (set order is arbitrary).
with open('players.txt', 'w', encoding='utf-8') as f:
    for player in sorted(player_set):
        f.write("%s\n" % str(player))