In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np
import re
import ast
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn import tree
from sklearn.model_selection import GridSearchCV
#Notebook display limits: show up to 100 columns and up to 4000 rows of a dataframe
pd.options.display.max_columns = 100
pd.set_option('display.max_rows', 4000)

In [2]:
#Letters to iterate through: 'a' to 'z' with 'x' left out
#NOTE(review): presumably 'x' is skipped because the site has no usable index page for it - confirm
alphabet = list('abcdefghijklmnopqrstuvwyz')
#Empty lists to store our data (one independent list per scraped column)
names_list, links_list, start_list, end_list, pos_list = [], [], [], [], []

In [3]:
#Scrape every letter's player index page into one record per player.
#Each table row is parsed as a whole, so a player's name, link, years and
#position always land on the same record, and the records are collected in a
#fresh list so re-running this cell does not append duplicates.
#NOTE(review): the pause below assumes the site's published bot policy of roughly
#20 requests per minute - confirm the current limit before changing it.
INDEX_PAGE_DELAY_SECONDS = 3.1
player_records = []
for letter in alphabet:
    #Link to webpage using requests and BeautifulSoup
    page = requests.get('https://www.basketball-reference.com/players/' + letter)
    #Fail loudly on an HTTP error (e.g. throttling) instead of parsing an error page
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    #Select the table then the rows
    table = soup.find('table', class_ = 'sortable stats_table')
    if table is None:
        raise ValueError("No player table found on the index page for letter '" + letter + "'")

    for row in table.find_all('tr'):
        header = row.find('th')
        anchor = header.find('a') if header is not None else None
        cells = row.find_all('td')
        #Rows without a player link or without the three data cells read below
        #(first year, last year, position) are not player rows: skip them
        if anchor is None or len(cells) < 3:
            continue
        player_records.append([
            anchor.get_text(),                      #player name
            anchor['href'],                         #link to the player page
            cells[0].get_text(),                    #first year of the career
            cells[1].get_text(),                    #last year of the career
            cells[2].get_text().replace('-', ''),   #position, dashes removed
        ])

    #Pause between index pages so the requests are not sent back-to-back
    time.sleep(INDEX_PAGE_DELAY_SECONDS)


#Turn our records into a dataframe and name the columns
df = pd.DataFrame(player_records, columns=['Player', 'Link', 'StartYear', 'EndYear', 'Position'])
#Keep the per-column lists in step with the dataframe for any code that still reads them
names_list, links_list, start_list, end_list, pos_list = (df[col].tolist() for col in df.columns)

In [4]:
#Cast both year columns to int64, then store the gap between last and first year as Tenure
for year_col in ('StartYear', 'EndYear'):
    df[year_col] = df[year_col].astype('int64')
df['Tenure'] = df['EndYear'] - df['StartYear']

In [5]:
#Limit the data to players whose EndYear is 2016 or earlier (the intent being that
#every remaining player is hall of fame eligible) and whose Tenure
#(EndYear - StartYear) is greater than 4; the index is renumbered from 0
retired_by_2016 = df['EndYear'].le(2016)
long_enough = df['Tenure'].gt(4)
df_eligible = df.loc[retired_by_2016 & long_enough].reset_index(drop = True)

In [6]:
def _first_match(pattern, text):
    """Return the first regex match of `pattern` in `text`, or None if there is none."""
    found = re.findall(pattern, text)
    return found[0] if found else None


def _count_accolade(label, accolades_html):
    """Return, as a string, how many times the accolade `label` was earned.

    'Nx <label>' yields 'N'. A label that appears without an 'Nx' multiplier
    is counted as '1'; a label that does not appear at all yields '0'.
    """
    multiplied = re.search(r'(\d+)x ' + re.escape(label), accolades_html)
    if multiplied:
        return multiplied.group(1)
    #NOTE(review): the site appears to show a single selection as e.g.
    #'2010-11 All-NBA' with no 'Nx' prefix - confirm against a live page
    return '1' if label in accolades_html else '0'


def get_player_info(href1):
    """Scrape one player page and return its bio, accolades and stats overview.

    Parameters
    ----------
    href1 : str
        Site-relative link to the player page; it is appended to
        'https://www.basketball-reference.com'.

    Returns
    -------
    list
        [height, weight, birthday, All_Star_apps, All_NBA_apps, All_Def_apps, HOF]
        followed by the `.string` of every second <p> of the 'stats_pullout'
        div (cells[3:23:2]). Height and weight keep their unit suffix
        (e.g. '198cm', '98kg'); a bio field that is not found is None. The
        accolade counts are strings, and HOF is 'Hall of Fame' or '0'.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    ValueError
        If the page has no 'stats_pullout' section.
    """
    page = requests.get('https://www.basketball-reference.com'+str(href1))
    #Fail loudly on an HTTP error (e.g. throttling) instead of parsing an error page
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    #Bio: only the first ten paragraphs of the page are searched.
    #At least one digit is required so 'cm'/'kg' inside ordinary words cannot match.
    bio_html = str(soup.find_all('p')[0:10])
    height = _first_match(r'(\d+cm)', bio_html)
    weight = _first_match(r'(\d+kg)', bio_html)
    bday = _first_match(r'data-birth="(\d+-\d+-\d+)', bio_html)

    #Accolades: a page without the 'bling' element simply yields '0' everywhere
    accolades_html = str(soup.find(id="bling"))
    All_Star_apps = _count_accolade('All Star', accolades_html)
    All_NBA_apps = _count_accolade('All-NBA', accolades_html)
    All_Def_apps = _count_accolade('All-Defensive', accolades_html)
    HOF = 'Hall of Fame' if 'Hall of Fame' in accolades_html else '0'
    bio = [height, weight, bday, All_Star_apps, All_NBA_apps, All_Def_apps, HOF]

    #Find Stats overview, then pull out individual stats then extract the stats and make them a list
    stats = soup.find('div', class_ = "stats_pullout")
    if stats is None:
        raise ValueError('No stats_pullout section found for ' + str(href1))
    cells = stats.find_all('p')
    stats_list = [cell.string for cell in cells[3:23:2]]

    return bio + stats_list

In [7]:
#Fetch the detail page of every eligible player, one HTTP request per player.
#Very Slow: the pause keeps the requests from being sent back-to-back.
#NOTE(review): 3.1s assumes the site's published bot policy of roughly 20 requests
#per minute - confirm the current limit before changing it.
PLAYER_PAGE_DELAY_SECONDS = 3.1
info_list = []
for link in df_eligible.Link:
    info_list.append(get_player_info(link))
    time.sleep(PLAYER_PAGE_DELAY_SECONDS)

In [8]:
#Column names for the values returned by get_player_info, in the order it returns them
info_columns = ['Height','Weight','Birthday','All_Star_apps','All_NBA_apps','All_Def_apps','HOF', 'Games', 'PPG', 'TRPG', 'APG', 'FG_pct', '3_pt_pct', 'FT_pct', 'eFG_pct', 'PER', 'WS']
df_info = pd.DataFrame(info_list, columns=info_columns)
#Add new columns to our dataframe. Info columns left over from an earlier run of
#this cell are dropped first, so re-running the cell does not duplicate them.
df_eligible = pd.concat([df_eligible.drop(columns=info_columns, errors='ignore'), df_info], axis = 1)

In [9]:
#Export the dataframe to a csv; the row index is written as the first column
df_eligible.to_csv(path_or_buf = 'bbal_scraped_data.csv', index = True)