In [9]:
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request

In [10]:
# Access Basketball Reference to get league leaders in win shares for each season
#specify the url
site = "https://www.basketball-reference.com/leaders/hof_prob.html"
#Query the website and return the html to the variable 'page'
page = urllib.request.urlopen(site)
#Parse the html in the 'page' variable and store it in Beautiful Soup format
soup = BeautifulSoup(page, 'lxml')

In [11]:
# Save two Hall of Fame Probability leader tables (all time and active)
alltime_table, active_table = soup.find_all('table')

In [12]:
#create list tuples of players and href 
alltime_list = []
active_list = []
for line in alltime_table('td'):
    try:
        alltime_list.append((str(line.a.string), str(line.a.get('href'))))
    except:
        pass
    
for line in active_table('td'):
    try:
        active_list.append((str(line.a.string), str(line.a.get('href'))))
    except:
        pass

In [13]:
len(alltime_list), alltime_list[:10]

(250,
 [('Kareem Abdul-Jabbar', '/players/a/abdulka01.html'),
  ('Michael Jordan', '/players/j/jordami01.html'),
  ('Bill Russell', '/players/r/russebi01.html'),
  ('Kobe Bryant', '/players/b/bryanko01.html'),
  ('Wilt Chamberlain', '/players/c/chambwi01.html'),
  ('LeBron James', '/players/j/jamesle01.html'),
  ('Tim Duncan', '/players/d/duncati01.html'),
  ("Shaquille O'Neal", '/players/o/onealsh01.html'),
  ('John Havlicek', '/players/h/havlijo01.html'),
  ('Oscar Robertson', '/players/r/roberos01.html')])

In [14]:
len(active_list), active_list[:10]

(100,
 [('LeBron James', '/players/j/jamesle01.html'),
  ('Dwyane Wade', '/players/w/wadedw01.html'),
  ('Dirk Nowitzki', '/players/n/nowitdi01.html'),
  ('Kevin Durant', '/players/d/duranke01.html'),
  ('Chris Paul', '/players/p/paulch01.html'),
  ('Stephen Curry', '/players/c/curryst01.html'),
  ('Russell Westbrook', '/players/w/westbru01.html'),
  ('James Harden', '/players/h/hardeja01.html'),
  ('Dwight Howard', '/players/h/howardw01.html'),
  ('Carmelo Anthony', '/players/a/anthoca01.html')])

In [15]:
# Create list of players on active_list but not alltime_list
# and add to alltime_list
active_list = list(set(active_list).difference(set(alltime_list)))
alltime_list += active_list
# Print new list length and last 10 players of list
print(len(alltime_list))
print(alltime_list[-10:])

310
[('CJ McCollum', '/players/m/mccolcj01.html'), ('Jamal Crawford', '/players/c/crawfja01.html'), ('JaVale McGee', '/players/m/mcgeeja01.html'), ('Gordon Hayward', '/players/h/haywago01.html'), ('Devin Harris', '/players/h/harride01.html'), ('Serge Ibaka', '/players/i/ibakase01.html'), ('Andrew Bogut', '/players/b/bogutan01.html'), ('Mike Conley', '/players/c/conlemi01.html'), ('Danny Green', '/players/g/greenda02.html'), ('Thaddeus Young', '/players/y/youngth01.html')]


In [8]:
# This for loop gets the season by season stats for each player in the 
# alltime_list. First it turns the player page html into a Beautiful Soup
# object to make the stats table. I also pull the player's height from
# the top of the page, and convert it from a string 'x-xx' to inches.
all_seasons = [] 
for player in alltime_list[:20]:
    reference_site = 'https://www.basketball-reference.com' + player[1]
    page = urllib.request.urlopen(reference_site)
    soup = BeautifulSoup(page, 'lxml')
    per_game_table = soup.table
    
    height = str(soup.find_all('div', {'id':'info'})[0].find_all('span', {'itemprop':'height'})[0].string)
    height = height.split('-')
    height = int(height[0])*12 + int(height[1])
    
    # This section gets stats values for a single season and 
    # makes a single list
    values = []
    length = 0
    for row in (per_game_table('tr')):
        for num, column in enumerate(row):
            # Players from different decades have different
            # available stats, but PTS is always the last column
            if column.string == 'PTS':
                length = int((num+1) / 2)
            if column != '\n':
                values.append(str(column.string))
    categories = values[:length]
    # Ignores category names
    values = values[length:]

    # Create list of individual season stats lists
    player_career = [['Player', 'href', 'Height'] + categories]
    season = [player[0], player[1], height]
    for value in values:
        if value.startswith('Did'):
            # Find players who took time off mid-career
            print(player[0], value)
        season.append(str(value))
        #added and (season[3] != None)
        if (len(season) == length+3) and (season[4] != 'None'):
            player_career.append(season)
            season = [player[0], player[1], height]
        # Player must have played at least 12 seasons
    #if (len(player_career) > 12):# and (len(player_career[0]) == 33):
    all_seasons.append(player_career)
    print(str((len(player_career)-1)), ' seasons of ', player[0], 'added')


20  seasons of  Kareem Abdul-Jabbar added
Michael Jordan Did Not Play (retired)
Michael Jordan Did Not Play (retired)
Michael Jordan Did Not Play (retired)
17  seasons of  Michael Jordan added
13  seasons of  Bill Russell added
20  seasons of  Kobe Bryant added
16  seasons of  Wilt Chamberlain added
16  seasons of  LeBron James added
19  seasons of  Tim Duncan added
21  seasons of  Shaquille O'Neal added
16  seasons of  John Havlicek added
14  seasons of  Oscar Robertson added
Bob Cousy Did Not Play (retired)
Bob Cousy Did Not Play (retired)
Bob Cousy Did Not Play (retired)
Bob Cousy Did Not Play (retired)
Bob Cousy Did Not Play (retired)
Bob Cousy Did Not Play (retired)
15  seasons of  Bob Cousy added
23  seasons of  Kevin Garnett added
14  seasons of  Jerry West added
Magic Johnson Did Not Play (illness—HIV)
Magic Johnson Did Not Play (illness—HIV)
Magic Johnson Did Not Play (illness—HIV)
Magic Johnson Did Not Play (illness—HIV)
14  seasons of  Magic Johnson added
19  seasons of  Kar

In [9]:
# Create pandas DataFrame of all data
category_list= ['Player', 'href', 'Height', 'Season', 'Age', 'Tm', 'Lg', 
                'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', 
                '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 
                'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'
                ]

In [10]:
# CREATE LIST OF DATAFRAMES AND MERGE AT END
df_list = []
season_df_list = []
master_df = pd.DataFrame(columns=category_list)
for career in all_seasons:
    labels=career[0]
    for season in career[1:]:
        season_df = pd.DataFrame(data=season, index=labels)
        season_df_list.append(season_df.transpose())
    print(career[1][0] + ' added')
career_df = pd.concat(season_df_list, ignore_index=True)
    

career_df.to_csv('all-stats-sample.csv', columns = category_list)

Kareem Abdul-Jabbar added
Michael Jordan added
Bill Russell added
Kobe Bryant added
Wilt Chamberlain added
LeBron James added
Tim Duncan added
Shaquille O'Neal added
John Havlicek added
Oscar Robertson added
Bob Cousy added
Kevin Garnett added
Jerry West added
Magic Johnson added
Karl Malone added
Hakeem Olajuwon added
Dwyane Wade added
Dirk Nowitzki added
Larry Bird added
Elvin Hayes added


In [11]:
# This is a potential alternative to my current way of working through the players in
# the HOF list. I probably don't need to investigate since I got what I needed, but understanding
# how this works would be beneficial
all_rows = {}
for player in alltime_list[:20]:
    reference_site = 'https://www.basketball-reference.com' + player[1]
    page = urllib.request.urlopen(reference_site)
    soup = BeautifulSoup(page, 'lxml')
    per_game_table = soup.table
    
    output_rows = []
    for table_row in per_game_table.findAll('tr'):
        columns = table_row.findAll('td')
        output_row = []
        for column in columns:
            output_row.append(column.text)
        output_rows.append(output_row)
    all_rows[player[0]] = output_rows


In [12]:
for x in per_game_table.th('th'):
    print(x.text)
len(per_game_table.findAll('th'))

50

In [13]:
all_rows['Bill Russell']

[[],
 ['22',
  'BOS',
  'NBA',
  'C',
  '48',
  '',
  '35.3',
  '5.8',
  '13.5',
  '.427',
  '3.2',
  '6.4',
  '.492',
  '19.6',
  '1.8',
  '3.0',
  '14.7'],
 ['23',
  'BOS',
  'NBA',
  'C',
  '69',
  '',
  '38.3',
  '6.6',
  '15.0',
  '.442',
  '3.3',
  '6.4',
  '.519',
  '22.7',
  '2.9',
  '2.6',
  '16.6'],
 ['24',
  'BOS',
  'NBA',
  'C',
  '70',
  '',
  '42.6',
  '6.5',
  '14.2',
  '.457',
  '3.7',
  '6.1',
  '.598',
  '23.0',
  '3.2',
  '2.3',
  '16.7'],
 ['25',
  'BOS',
  'NBA',
  'C',
  '74',
  '',
  '42.5',
  '7.5',
  '16.1',
  '.467',
  '3.2',
  '5.3',
  '.612',
  '24.0',
  '3.7',
  '2.8',
  '18.2'],
 ['26',
  'BOS',
  'NBA',
  'C',
  '78',
  '',
  '44.3',
  '6.8',
  '16.0',
  '.426',
  '3.3',
  '6.0',
  '.550',
  '23.9',
  '3.4',
  '2.0',
  '16.9'],
 ['27',
  'BOS',
  'NBA',
  'C',
  '76',
  '',
  '45.2',
  '7.6',
  '16.6',
  '.457',
  '3.8',
  '6.3',
  '.595',
  '23.6',
  '4.5',
  '2.7',
  '18.9'],
 ['28',
  'BOS',
  'NBA',
  'C',
  '78',
  '',
  '44.9',
  '6.6',
  '15.2',
 

In [63]:
# This is a potential alternative to my current way of working through the players in
# the HOF list. I probably don't need to investigate since I got what I needed, but understanding
# how this works would be beneficial

# THIS WORKS FOR PLAYERS THAT DO NOT MISS ANY YEARS 

# Need to figure out how to handle 
all_rows = {}

for player in alltime_list[:5]:
    print(player)
    reference_site = 'https://www.basketball-reference.com' + player[1]
    page = urllib.request.urlopen(reference_site)
    soup = BeautifulSoup(page, 'lxml')
    per_game_table = soup.table
    
    output_rows = []
    headers = per_game_table.findAll('th')
    headtext = []
    for header in headers:
        headtext.append(header.text)
    for table_row in per_game_table.findAll('tr'):
        columns = table_row.findAll('td')
        output_row = []
        for column in columns:
            output_row.append(column.text)           
        output_rows.append(output_row)
    #print(headtext[30:])
    # take headers and insert values into player season stats lists
    output_rows[0] = headtext[:headtext.index('PTS')+1]
    #print(output_rows)
    #print(len(headtext[headtext.index('PTS')+1:]))
    for i, row in enumerate(output_rows[1:]):
        if 'Did Not Play' in row[2]:
            try:
                output_rows[i+1].insert(0, headtext[headtext.index('PTS')+1+i])
            except:
                pass
        else:
            try:
                output_rows[i+1].insert(0, headtext[headtext.index('PTS')+1+i])
            except:
                pass
    all_rows[player[0]] = output_rows

('Kareem Abdul-Jabbar', '/players/a/abdulka01.html')
('Michael Jordan', '/players/j/jordami01.html')
('Bill Russell', '/players/r/russebi01.html')
('Kobe Bryant', '/players/b/bryanko01.html')
('Wilt Chamberlain', '/players/c/chambwi01.html')


<th class="left " data-stat="season" scope="row">2 seasons</th>

In [56]:
for i in per_game_table.findAll('th'):
    if len(i.text) == 7:
        print(i.text)

1984-85
1985-86
1986-87
1987-88
1988-89
1989-90
1990-91
1991-92
1992-93
1994-95
1995-96
1996-97
1997-98
2001-02
2002-03


In [71]:
new_df = pd.DataFrame()

for player, seasons in all_rows.items():
    playdf = pd.DataFrame(columns=seasons[0])
    for season in seasons[1:]:
        print(player)
        season_list= [player] + season
        playdf = pd.concat([playdf, pd.DataFrame(season_list).T])
    

Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Kareem Abdul-Jabbar
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Michael Jordan
Bill Russell
Bill Russell
Bill Russell
Bill Russell
Bill Russell
Bill Russell
Bill Russell
Bill Russell
Bill Russell
Bill Russell
Bill Russell
Bill Russell
Bill Russell
Bill R

  result = result.union(other)
  result = result.union(other)


Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain
Wilt Chamberlain


In [90]:
for tag in soup.find('table', id='per_game').find('tr'):
    print(tag)



<th aria-label="If listed as single number, the year the season ended.★ - Indicates All-Star for league.Only on regular season tables." class=" poptip sort_default_asc center" data-stat="season" data-tip="If listed as single number, the year the season ended.&lt;br&gt;★ - Indicates All-Star for league.&lt;br&gt;Only on regular season tables." scope="col">Season</th>


<th aria-label="Age of Player at the start of February 1st of that season." class=" poptip sort_default_asc center" data-stat="age" data-tip="Age of Player at the start of February 1st of that season." scope="col">Age</th>


<th aria-label="Team" class=" poptip sort_default_asc center" data-stat="team_id" data-tip="Team" scope="col">Tm</th>


<th aria-label="League" class=" poptip sort_default_asc center" data-stat="lg_id" data-tip="League" scope="col">Lg</th>


<th aria-label="Position" class=" poptip sort_default_asc center" data-stat="pos" data-tip="Position" scope="col">Pos</th>


<th aria-label="Games" class=" popt

In [58]:
# This for loop gets the season by season stats for each player in the 
# alltime_list. First it turns the player page html into a Beautiful Soup
# object to make the stats table. I also pull the player's height from
# the top of the page, and convert it from a string 'x-xx' to inches.
all_seasons = [] 
for player in alltime_list[:2]:
    reference_site = 'https://www.basketball-reference.com' + player[1]
    page = urllib.request.urlopen(reference_site)
    soup = BeautifulSoup(page, 'lxml')
    per_game_table = soup.table
    
    height = str(soup.find_all('div', {'id':'info'})[0].find_all('span', {'itemprop':'height'})[0].string)
    height = height.split('-')
    height = int(height[0])*12 + int(height[1])
    
    # This section gets stats values for a single season and 
    # makes a single list
    values = []
    length = 0
    for row in (per_game_table('tr')):
        for num, column in enumerate(row):
            # Players from different decades have different
            # available stats, but PTS is always the last column
            if column.string == 'PTS':
                length = int((num+1) / 2)
            if column != '\n':
                values.append(str(column.string))
    categories = values[:length]
    # Ignores category names
    values = values[length:]

    # Create list of individual season stats lists
    player_career = [['Player', 'href', 'Height'] + categories]
    season = [player[0], player[1], height]
    for value in values:
        if value.startswith('Did'):
            # Find players who took time off mid-career
            print(player[0], value)
        season.append(str(value))
        #added and (season[3] != None)
        if (len(season) == length+3) and (season[4] != 'None'):
            player_career.append(season)
            season = [player[0], player[1], height]
        # Player must have played at least 12 seasons
    #if (len(player_career) > 12):# and (len(player_career[0]) == 33):
    all_seasons.append(player_career)
    print(str((len(player_career)-1)), ' seasons of ', player[0], 'added')



20  seasons of  Kareem Abdul-Jabbar added
Michael Jordan Did Not Play (retired)
Michael Jordan Did Not Play (retired)
Michael Jordan Did Not Play (retired)
17  seasons of  Michael Jordan added


In [65]:
per_game_table

<table class="row_summable sortable stats_table" data-cols-to-freeze="1" id="per_game"><caption>Per Game Table</caption>
<colgroup><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col></colgroup>
<thead>
<tr>
<th aria-label="If listed as single number, the year the season ended.★ - Indicates All-Star for league.Only on regular season tables." class=" poptip sort_default_asc center" data-stat="season" data-tip="If listed as single number, the year the season ended.&lt;br&gt;★ - Indicates All-Star for league.&lt;br&gt;Only on regular season tables." scope="col">Season</th>
<th aria-label="Age of Player at the start of February 1st of that season." class=" poptip sort_default_asc center" data-stat="age" data-tip="Age

In [38]:
column.string

'21.2'