In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
import time
sns.set_style("whitegrid")
sns.set_context("poster")
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from string import ascii_lowercase
import os

This opens a browser that selenium can control (i.e., it will load the pages for us) so that we can scrape websites.  Here we are using Safari, but you can also use Chrome, Firefox, and a few other browsers.

In [7]:
# Register the location of the selenium standalone server jar in the environment
# (presumably read by selenium's Safari driver -- confirm for your selenium version),
# then open the Safari window that the scraping cells drive through `browser`.
os.environ.update(SELENIUM_SERVER_JAR="selenium-server-standalone-2.48.0.jar")
browser = webdriver.Safari()

## Functions for Scraping  

These two functions were written to help clean up the code for scraping.  

table_fill: takes all the data from one of the desired tables and replaces empty entries with 'NA'.  This will be helpful later on to make sure the tables merge together seamlessly.

table_to_list: iterates through the flat list of table values and returns a list of lists (one per table row, each starting with the player's name) sized to match the header.

In [8]:
def table_fill(elements):
    """Return the text of every element, substituting 'NA' for empty text.

    elements: iterable of objects exposing a ``.text`` attribute (the scraping
        cells pass the list returned by ``find_elements_by_xpath``).

    Returns a list of strings, one per element, in the same order.
    """
    vals = []
    for elm in elements:
        # Read .text once per element: on a selenium WebElement every access is a
        # separate request to the browser, so reading it twice doubles the traffic.
        text = elm.text
        vals.append('NA' if text == "" else text)
    return vals

def table_to_list(statstable,name,headers):
    """Cut a flat list of cell values into rows and prefix each row with ``name``.

    statstable: flat list of cell values, one table row after another.
    name: value placed at the front of every returned row.
    headers: column headers including the leading entry for ``name``; rows are
        cut every ``len(headers) - 1`` values.

    Returns a list of lists ``[name, v1, v2, ...]``. The final row is shorter
    when ``len(statstable)`` is not a multiple of the row width. Returns ``[]``
    when ``headers`` has fewer than two entries.
    """
    headLen = len(headers) - 1
    if headLen <= 0:
        return []
    # range (rather than xrange) keeps this runnable on both Python 2 and Python 3.
    return [[name] + statstable[start:start + headLen]
            for start in range(0, len(statstable), headLen)]


# Get Ex-Player Stats

This works by iterating through each letter of the alphabet and going to the page that lists the players whose last name starts with that letter.  Selenium loads the page and then gets the names of the players and the links to their respective stats pages.  We then iterate through the players' names, go to each player's stats page, and scrape the data from their 'totals' table.  We add each row to the dict list and then turn that into a DataFrame once we have collected data from all the players.

Two notes:
Because of the way basketball-reference is structured, we have to scrape current players (bolded) and Ex-Players (not bolded) separately.  This section is for Ex-Players and we'll show how to do the other players later in the notebook.

Additionally, depending on whether Selenium wants to work well or not, you may have to split the alphabet into a few different groups, run this loop a few times, and merge the DataFrames later.  We had to do that and will describe how to merge them later in the Notebook.

In [14]:
# Scrape the "totals" table of every player linked (without bold) from the A-Z index pages.
link = 'http://www.basketball-reference.com/players/%s/'
dict_list = []
for letter in ascii_lowercase:
    cur_link = link % letter
    browser.get(cur_link)
    # Gather names and profile URLs before the browser leaves the index page.
    names = [elm.text for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/a")]
    listplayers = [elm.get_attribute('href') for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/a")]
    for name, player_url in zip(names, listplayers):
        browser.get(player_url)
        header_cells = browser.find_elements_by_xpath('//*[@id="totals"]/thead/tr/th')
        headers = ['Player'] + [cell.text for cell in header_cells]
        table = browser.find_elements_by_xpath('//*[@id="totals"]/tbody/tr/td')
        vals = table_fill(table)
        # One dict per table row, keyed by column header.
        dict_list.extend(dict(zip(headers, row)) for row in table_to_list(vals, name, headers))

# Build the frame once from all collected rows, index it by player and save it.
stats = pd.DataFrame(dict_list)
new_stats = stats.set_index(stats.Player)
new_stats.to_csv('aiData.csv',encoding='utf-8')

# Get Ex-Player Salary Data

This works very similarly to the previous section, but we use a different xpath to get the data from the salary table. Again, you may have to split up the alphabet, run it in chunks, and merge the DataFrames.  

In [15]:
# Scrape the "salaries" table of every player linked (without bold) from the A-Z index pages.
link = 'http://www.basketball-reference.com/players/%s/'
dict_list = []
for letter in ascii_lowercase:
    cur_link = link % letter
    browser.get(cur_link)
    # Gather names and profile URLs before the browser leaves the index page.
    names = [elm.text for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/a")]
    listplayers = [elm.get_attribute('href') for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/a")]
    for name, player_url in zip(names, listplayers):
        browser.get(player_url)
        headers = ['Player'] + [elm.text for elm in browser.find_elements_by_xpath('//*[@id="salaries"]/thead/tr/th')]
        # Keep only the first two headers and the last one.
        headers = headers[:2] + headers[-1:]
        vals = [[name] + elm.text.split() for elm in browser.find_elements_by_xpath('//*[@id="salaries"]/tbody/tr')]
        if not headers or not vals:
            continue
        # Each row is whitespace-split text; keep the same three positions as the headers.
        dict_list.extend(dict(zip(headers, row[:2] + row[-1:])) for row in vals)

# Build the frame once from all collected rows, index it by player and save it.
income = pd.DataFrame(dict_list)
new_income = income.set_index(income.Player)
new_income.to_csv('Income.csv',encoding='utf-8')

# Get Ex-Player College Stats

Next we got the college data for all the Ex-Players.

In [16]:
# Scrape the "college" table of every player linked (without bold) from the A-Z index pages.
link = 'http://www.basketball-reference.com/players/%s/'
dict_list = []
for letter in ascii_lowercase:
    cur_link = link % letter
    browser.get(cur_link)
    # Gather names and profile URLs before the browser leaves the index page.
    names = [elm.text for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/a")]
    listplayers = [elm.get_attribute('href') for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/a")]
    for name, player_url in zip(names, listplayers):
        browser.get(player_url)
        result = browser.find_elements_by_xpath('//*[@id="college"]/tbody/tr/td')
        # Column names are taken from the second header row of the college table.
        headers = ['Player'] + [elm.text for elm in browser.find_elements_by_xpath('//*[@id="college"]/thead/tr[2]/th')]
        vals = table_fill(result)
        if not headers or not vals:
            continue
        # One dict per table row, keyed by column header.
        dict_list.extend(dict(zip(headers, row)) for row in table_to_list(vals, name, headers))

# Build the frame once from all collected rows, index it by player and save it.
college_stats = pd.DataFrame(dict_list)
new_college_stats = college_stats.set_index(college_stats.Player)
new_college_stats.to_csv('PastCollegeData.csv',encoding='utf-8')

# Get Current Player Stats

Similar to Ex-Player Stats.  The main difference is that we use an xpath that selects the "strong" elements in the HTML, i.e. the bolded text.  Thus, we can rerun the earlier code with the new list of names and links to get all the current player stats.

In [18]:
# Scrape the "totals" table of every current player (bold links) from the A-Z index pages.
link = 'http://www.basketball-reference.com/players/%s/'
dict_list = []
for letter in ascii_lowercase:
    cur_link = link % letter
    browser.get(cur_link)
    # Current players are the links wrapped in <strong>; gather names and URLs
    # before the browser leaves the index page.
    bold_names = [elm.text for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/strong/a")]
    listplayersbold = [elm.get_attribute('href') for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/strong/a")]
    for name, player_url in zip(bold_names, listplayersbold):
        browser.get(player_url)
        headers = ['Player'] + [elm.text for elm in browser.find_elements_by_xpath('//*[@id="totals"]/thead/tr/th')]
        table = browser.find_elements_by_xpath('//*[@id="totals"]/tbody/tr/td')
        vals = table_fill(table)
        val_list = table_to_list(vals,name,headers)
        # One dict per table row, keyed by column header.
        for val in val_list:
            dict_list.append(dict(zip(headers,val)))

cur_stats = pd.DataFrame(dict_list)
# Index the frame built in this cell (cur_stats), not the ex-player `stats` frame:
# using `stats` here would save the ex-player data to CurData.csv.
new_cur_stats = cur_stats.set_index(cur_stats.Player)
new_cur_stats.to_csv('CurData.csv',encoding='utf-8')

# Get Current Player Salary Data

In [19]:
# Scrape the "salaries" table of every current player (bold links) from the A-Z index pages.
link = 'http://www.basketball-reference.com/players/%s/'
dict_list = []
for letter in ascii_lowercase:
    cur_link = link % letter
    browser.get(cur_link)
    # Current players are the links wrapped in <strong>; gather names and URLs
    # before the browser leaves the index page.
    bold_names = [elm.text for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/strong/a")]
    listplayersbold = [elm.get_attribute('href') for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/strong/a")]
    for name, player_url in zip(bold_names, listplayersbold):
        browser.get(player_url)
        headers = ['Player'] + [elm.text for elm in browser.find_elements_by_xpath('//*[@id="salaries"]/thead/tr/th')]
        # Keep only the first two headers and the last one.
        headers = headers[:2] + headers[-1:]
        vals = [[name] + elm.text.split() for elm in browser.find_elements_by_xpath('//*[@id="salaries"]/tbody/tr')]
        if not headers or not vals:
            continue
        # Each row is whitespace-split text; keep the same three positions as the headers.
        dict_list.extend(dict(zip(headers, row[:2] + row[-1:])) for row in vals)

# Build the frame once from all collected rows, index it by player and save it.
income = pd.DataFrame(dict_list)
new_income = income.set_index(income.Player)
new_income.to_csv('CurPlayerIncome.csv',encoding='utf-8')

# Get Current Player College Stats

In [20]:
# Scrape the "college" table of every current player (bold links) from the A-Z index pages.
link = 'http://www.basketball-reference.com/players/%s/'
dict_list = []
for letter in ascii_lowercase:
    cur_link = link % letter
    browser.get(cur_link)
    # Current players are the links wrapped in <strong>; gather names and URLs
    # before the browser leaves the index page.
    bold_names = [elm.text for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/strong/a")]
    listplayersbold = [elm.get_attribute('href') for elm in browser.find_elements_by_xpath("//*[@id='players']/tbody/tr/td[1]/strong/a")]
    for name, player_url in zip(bold_names, listplayersbold):
        browser.get(player_url)
        result = browser.find_elements_by_xpath('//*[@id="college"]/tbody/tr/td')
        # Column names are taken from the second header row of the college table.
        headers = ['Player'] + [elm.text for elm in browser.find_elements_by_xpath('//*[@id="college"]/thead/tr[2]/th')]
        vals = table_fill(result)
        if headers != [] and vals != []:
            val_list = table_to_list(vals,name,headers)
            # One dict per table row, keyed by column header.
            for elm in val_list:
                dict_list.append(dict(zip(headers,elm)))

college_stats = pd.DataFrame(dict_list)
new_college_stats = college_stats.set_index(college_stats.Player)
# Save under a separate file name: writing to 'PastCollegeData.csv' would overwrite
# the ex-player college data saved by the ex-player college cell.
new_college_stats.to_csv('CurCollegeData.csv',encoding='utf-8')