In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np
import re
import ast
pd.set_option('display.max_columns', 100)
pd.options.display.max_rows = 4000
import seaborn as sns

In [2]:
#Letters to iterate through
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']
#Empty lists to store our data
names_list = []
links_list = []
start_list = []
end_list = []


In [3]:
for letter in alphabet:
    #Link to webpage using requests and BeautifulSoup
    page = requests.get('https://www.basketball-reference.com/players/' + letter)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    #Select The table then the rows
    table = soup.find('table', class_ = 'sortable stats_table')
    rows = table.findAll('tr')
    
    #Iterate through the rows and get all player names
    new_names_list = [row.find('th') for row in rows]
    new_names_list = [name.find('a') for name in new_names_list]
    new_names_list = [name for name in new_names_list if name]
    new_names_list = [name.string for name in new_names_list]
    
    #Iterate through the rows and get links to all player pages
    new_links_list = [row.find('th') for row in rows]
    new_links_list = [link.find('a') for link in new_links_list]
    new_links_list = [link for link in new_links_list if link]
    new_links_list = [link['href'] for link in new_links_list]
    
    #Iterate and get first year of a players career
    new_start_list = [row.findAll('td') for row in rows]
    new_start_list = [year for year in new_start_list if year]
    new_start_list = [year[0].string for year in new_start_list]
    
    #Iterate and get last year
    new_end_list = [row.findAll('td') for row in rows]
    new_end_list = [year for year in new_end_list if year]
    new_end_list = [year[1].string for year in new_end_list]
    
    #Add the new data to our lists
    names_list = names_list + new_names_list
    links_list = links_list + new_links_list
    start_list = start_list + new_start_list
    end_list = end_list + new_end_list

#Turn our lists into a dataframe and name the columns    
df = pd.DataFrame([names_list, links_list, start_list, end_list]).transpose()
df.columns = ['Player', 'Link', 'StartYear', 'EndYear']
df

Unnamed: 0,Player,Link,StartYear,EndYear
0,Alaa Abdelnaby,/players/a/abdelal01.html,1991,1995
1,Zaid Abdul-Aziz,/players/a/abdulza01.html,1969,1978
2,Kareem Abdul-Jabbar,/players/a/abdulka01.html,1970,1989
3,Mahmoud Abdul-Rauf,/players/a/abdulma02.html,1991,2001
4,Tariq Abdul-Wahad,/players/a/abdulta01.html,1998,2003
...,...,...,...,...
4795,Ante Žižić,/players/z/zizican01.html,2018,2020
4796,Jim Zoet,/players/z/zoetji01.html,1983,1983
4797,Bill Zopf,/players/z/zopfbi01.html,1971,1971
4798,Ivica Zubac,/players/z/zubaciv01.html,2017,2020


In [4]:
#Change data types of years to int64 and creating a column for amount of time in league
df.StartYear = df.StartYear.astype('int64')
df.EndYear = df.EndYear.astype('int64')
df['Tenure'] = df.EndYear - df.StartYear

In [5]:
df_eligible = df[(df.EndYear <= 2016) & (df.Tenure > 4)]
df_eligible.reset_index(inplace = True)
#Limiting our data to players who retired in 2016 or earlier so that all players are hall of fame eligible
#Also removed any player who didn't last more than 4 seasons

In [6]:

def get_player_info(href1):
    page = requests.get('https://www.basketball-reference.com'+str(href1))
    #Name of Player
    #soup = BS(page.content, 'html.parser')
    #table_body=soup.find_all(itemprop= "name")
    #table_body
    #name= re.findall('<h1 itemprop=\"name\">(.*)</h1>', str(table_body))
    #if len(name)==0:
    #    name= re.findall('<span>(.*)</span>', str(table_body))
    #else:
    #    pass
    
    #Bio
    soup = BeautifulSoup(page.content, 'html.parser')
    table_body=soup.find_all('p')
    work=table_body[0:10]
    #print(work)
    height= re.findall('(\d*cm)', str(work))
    weight= re.findall('(\d*kg)', str(work))
    bday= re.findall('data-birth=\"(\d*\-\d*\-\d*)', str(work))
    #accolades
    accolades_body=soup.find(id="bling")
    All_Star_apps= re.findall('(\d*)x All Star', str(accolades_body))
    All_NBA_apps= re.findall('(\d*)x All-NBA', str(accolades_body))
    All_Def_apps= re.findall('(\d*)x All-Defensive', str(accolades_body))
    HOF= re.findall('(Hall of Fame)', str(accolades_body))
    empty_list=0
    if len(All_Star_apps)==empty_list:
        All_Star_apps='0'
    else:
        All_Star_apps=All_Star_apps[0]
    if len(All_NBA_apps)==empty_list:
        All_NBA_apps='0'
    else:
        All_NBA_apps=All_NBA_apps[0]
    if len(All_Def_apps)==empty_list:
        All_Def_apps='0'
    else:
        All_Def_apps= All_Def_apps[0]
    if len(HOF)==empty_list:
        HOF='0'
    else:
        HOF= HOF[0]
    bio= [height[0],weight[0],bday[0],All_Star_apps,All_NBA_apps,All_Def_apps,HOF]
    
    #Find Stats overview, then pull out individual stats then extract the stats and make them a list
    stats = soup.find('div', class_ = "stats_pullout")
    cells = stats.find_all('p')
    stats_list = list(map(lambda x : x.string, cells[3:23:2]))
    
    return bio + stats_list

In [7]:
info_list = [get_player_info(link) for link in df_eligible.Link]
#Very slow

In [8]:
df_info = pd.DataFrame(info_list, columns=['Height','Weight','Birthday','All_Star_apps','All_NBA_apps','All_Def_apps','HOF', 'Games', 'PPG', 'TRPG', 'APG', 'FG_pct', '3_pt_pct', 'FT_pct', 'eFG_pct', 'PER', 'WS'])
df_eligible = pd.concat([df_eligible, df_info], axis = 1)
df_eligible
#Concat the data about the players from info_list and the list of players

Unnamed: 0,index,Player,Link,StartYear,EndYear,Tenure,Height,Weight,Birthday,All_Star_apps,All_NBA_apps,All_Def_apps,HOF,Games,PPG,TRPG,APG,FG_pct,3_pt_pct,FT_pct,eFG_pct,PER,WS
0,1,Zaid Abdul-Aziz,/players/a/abdulza01.html,1969,1978,9,206cm,106kg,1946-04-07,0,0,0,0,505,9.0,8.0,1.2,42.8,72.8,15.1,17.5,,
1,2,Kareem Abdul-Jabbar,/players/a/abdulka01.html,1970,1989,19,218cm,102kg,1947-04-16,19,15,11,Hall of Fame,1560,24.6,11.2,3.6,55.9,5.6,72.1,55.9,24.6,273.4
2,3,Mahmoud Abdul-Rauf,/players/a/abdulma02.html,1991,2001,10,185cm,73kg,1969-03-09,0,0,0,0,586,14.6,1.9,3.5,44.2,35.4,90.5,47.2,15.4,25.2
3,4,Tariq Abdul-Wahad,/players/a/abdulta01.html,1998,2003,5,198cm,101kg,1974-11-03,0,0,0,0,236,7.8,3.3,1.1,41.7,23.7,70.3,42.2,11.4,3.5
4,5,Shareef Abdur-Rahim,/players/a/abdursh01.html,1997,2008,11,206cm,102kg,1976-12-11,1,0,0,0,830,18.1,7.5,2.5,47.2,29.7,81.0,47.9,19.0,71.2
5,12,Mark Acres,/players/a/acresma01.html,1988,1993,5,211cm,99kg,1962-11-15,0,0,0,0,375,3.6,4.1,0.5,50.6,53.8,66.5,50.9,9.0,9.3
6,15,Alvan Adams,/players/a/adamsal01.html,1976,1988,12,206cm,95kg,1954-07-19,1,0,0,0,988,14.1,7.0,4.1,49.8,13.3,78.8,49.8,18.3,73.5
7,16,Don Adams,/players/a/adamsdo01.html,1971,1977,6,198cm,95kg,1947-11-27,0,0,0,0,523,8.8,5.6,1.9,40.2,0.0,74.1,40.2,10.7,11.7
8,21,Michael Adams,/players/a/adamsmi01.html,1986,1996,10,178cm,73kg,1963-01-19,1,0,0,0,653,14.7,2.9,6.4,41.5,33.2,84.9,47.5,16.6,46.9
9,23,Rafael Addison,/players/a/addisra01.html,1987,1997,10,201cm,97kg,1964-07-22,0,0,0,0,379,5.8,2.1,0.9,44.9,28.2,77.2,46.6,10.5,4.9


In [9]:
df_eligible.shape

(1458, 23)

In [10]:
page = requests.get("https://www.basketball-reference.com/players/a/abdulka01.html")
soup = BeautifulSoup(page.content, 'html.parser')

In [11]:
stats = soup.find('div', class_ = "stats_pullout")

In [12]:
cells = stats.find_all('p')

In [13]:
cells

[<p><strong></strong></p>,
 <p><strong>Career</strong></p>,
 <p></p>,
 <p>1560</p>,
 <p></p>,
 <p>24.6</p>,
 <p></p>,
 <p>11.2</p>,
 <p></p>,
 <p>3.6</p>,
 <p></p>,
 <p>55.9</p>,
 <p></p>,
 <p>5.6</p>,
 <p></p>,
 <p>72.1</p>,
 <p></p>,
 <p>55.9</p>,
 <p></p>,
 <p>24.6</p>,
 <p></p>,
 <p>273.4</p>]

In [14]:
list(map(lambda x : x.string, cells[3:23:2]))

['1560', '24.6', '11.2', '3.6', '55.9', '5.6', '72.1', '55.9', '24.6', '273.4']

In [15]:
cells[3].string

'1560'

In [16]:
df_eligible

Unnamed: 0,index,Player,Link,StartYear,EndYear,Tenure,Height,Weight,Birthday,All_Star_apps,All_NBA_apps,All_Def_apps,HOF,Games,PPG,TRPG,APG,FG_pct,3_pt_pct,FT_pct,eFG_pct,PER,WS
0,1,Zaid Abdul-Aziz,/players/a/abdulza01.html,1969,1978,9,206cm,106kg,1946-04-07,0,0,0,0,505,9.0,8.0,1.2,42.8,72.8,15.1,17.5,,
1,2,Kareem Abdul-Jabbar,/players/a/abdulka01.html,1970,1989,19,218cm,102kg,1947-04-16,19,15,11,Hall of Fame,1560,24.6,11.2,3.6,55.9,5.6,72.1,55.9,24.6,273.4
2,3,Mahmoud Abdul-Rauf,/players/a/abdulma02.html,1991,2001,10,185cm,73kg,1969-03-09,0,0,0,0,586,14.6,1.9,3.5,44.2,35.4,90.5,47.2,15.4,25.2
3,4,Tariq Abdul-Wahad,/players/a/abdulta01.html,1998,2003,5,198cm,101kg,1974-11-03,0,0,0,0,236,7.8,3.3,1.1,41.7,23.7,70.3,42.2,11.4,3.5
4,5,Shareef Abdur-Rahim,/players/a/abdursh01.html,1997,2008,11,206cm,102kg,1976-12-11,1,0,0,0,830,18.1,7.5,2.5,47.2,29.7,81.0,47.9,19.0,71.2
5,12,Mark Acres,/players/a/acresma01.html,1988,1993,5,211cm,99kg,1962-11-15,0,0,0,0,375,3.6,4.1,0.5,50.6,53.8,66.5,50.9,9.0,9.3
6,15,Alvan Adams,/players/a/adamsal01.html,1976,1988,12,206cm,95kg,1954-07-19,1,0,0,0,988,14.1,7.0,4.1,49.8,13.3,78.8,49.8,18.3,73.5
7,16,Don Adams,/players/a/adamsdo01.html,1971,1977,6,198cm,95kg,1947-11-27,0,0,0,0,523,8.8,5.6,1.9,40.2,0.0,74.1,40.2,10.7,11.7
8,21,Michael Adams,/players/a/adamsmi01.html,1986,1996,10,178cm,73kg,1963-01-19,1,0,0,0,653,14.7,2.9,6.4,41.5,33.2,84.9,47.5,16.6,46.9
9,23,Rafael Addison,/players/a/addisra01.html,1987,1997,10,201cm,97kg,1964-07-22,0,0,0,0,379,5.8,2.1,0.9,44.9,28.2,77.2,46.6,10.5,4.9


In [17]:
df_eligible

Unnamed: 0,index,Player,Link,StartYear,EndYear,Tenure,Height,Weight,Birthday,All_Star_apps,All_NBA_apps,All_Def_apps,HOF,Games,PPG,TRPG,APG,FG_pct,3_pt_pct,FT_pct,eFG_pct,PER,WS
0,1,Zaid Abdul-Aziz,/players/a/abdulza01.html,1969,1978,9,206cm,106kg,1946-04-07,0,0,0,0,505,9.0,8.0,1.2,42.8,72.8,15.1,17.5,,
1,2,Kareem Abdul-Jabbar,/players/a/abdulka01.html,1970,1989,19,218cm,102kg,1947-04-16,19,15,11,Hall of Fame,1560,24.6,11.2,3.6,55.9,5.6,72.1,55.9,24.6,273.4
2,3,Mahmoud Abdul-Rauf,/players/a/abdulma02.html,1991,2001,10,185cm,73kg,1969-03-09,0,0,0,0,586,14.6,1.9,3.5,44.2,35.4,90.5,47.2,15.4,25.2
3,4,Tariq Abdul-Wahad,/players/a/abdulta01.html,1998,2003,5,198cm,101kg,1974-11-03,0,0,0,0,236,7.8,3.3,1.1,41.7,23.7,70.3,42.2,11.4,3.5
4,5,Shareef Abdur-Rahim,/players/a/abdursh01.html,1997,2008,11,206cm,102kg,1976-12-11,1,0,0,0,830,18.1,7.5,2.5,47.2,29.7,81.0,47.9,19.0,71.2
5,12,Mark Acres,/players/a/acresma01.html,1988,1993,5,211cm,99kg,1962-11-15,0,0,0,0,375,3.6,4.1,0.5,50.6,53.8,66.5,50.9,9.0,9.3
6,15,Alvan Adams,/players/a/adamsal01.html,1976,1988,12,206cm,95kg,1954-07-19,1,0,0,0,988,14.1,7.0,4.1,49.8,13.3,78.8,49.8,18.3,73.5
7,16,Don Adams,/players/a/adamsdo01.html,1971,1977,6,198cm,95kg,1947-11-27,0,0,0,0,523,8.8,5.6,1.9,40.2,0.0,74.1,40.2,10.7,11.7
8,21,Michael Adams,/players/a/adamsmi01.html,1986,1996,10,178cm,73kg,1963-01-19,1,0,0,0,653,14.7,2.9,6.4,41.5,33.2,84.9,47.5,16.6,46.9
9,23,Rafael Addison,/players/a/addisra01.html,1987,1997,10,201cm,97kg,1964-07-22,0,0,0,0,379,5.8,2.1,0.9,44.9,28.2,77.2,46.6,10.5,4.9
