# How to Use Selenium to Web-Scrape with Example
Scraping NBA Player Names and Salaries from Hoopshype.com Using Selenium

https://towardsdatascience.com/how-to-use-selenium-to-web-scrape-with-example-80f9b23a843a

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd

In [2]:
# access webdriver = - this will be used to automatically open browser to access website of choice
driver = webdriver.Chrome('C://Users//Rach//Documents//chromedriver//chromedriver')

In [3]:
# open the website you're attempting to scrape
driver.get('https://hoopshype.com/salaries/players/')

In [4]:
# to extra the information you're looking to scrape, you need to locate the element's XPath - this is a syntax used for finding any element on a webpage
# to locate the element's XPath, first highlight the first in the list of what you're looking for (in our case player names), right click, and select inspect.  Then highlight the second in the list and check for commonality - in this case <td class="name">
# this translates into an XPath as //td(@class="name")
players = driver.find_elements_by_xpath('//td[@class="name"]')

In [5]:
# populate a list will each player name
players_list = []
for p in range(len(players)):
    players_list.append(players[p].text)

In [6]:
# follow the same process to acquire the player salaries
salaries = driver.find_elements_by_xpath('//td[@class="hh-salaries-sorted"]')
salaries_list = []
for s in range(len(salaries)):
    salaries_list.append(salaries[s].text)

In [7]:
# create a master datagframe
df = pd.DataFrame(columns=['Player','Salary','Year'])

In [8]:
# often, when using Selenium, you'll be attempting to retrieve data that is located on multiple pages from the same website
# in our example, the difference between the URL of each season is just a matter of the years being included at the end
# we can therefore create a function that loops through each year and accesses each URL individually
for yr in range(1990,2019):
    page_num = str(yr) + '-' + str(yr+1) +'/'
    url = 'https://hoopshype.com/salaries/players/' + page_num
    driver.get(url)
    
    players = driver.find_elements_by_xpath('//td[@class="name"]')
    salaries = driver.find_elements_by_xpath('//td[@class="hh-salaries-sorted"]') 
    
    players_list = []
    for p in range(len(players)):
        players_list.append(players[p].text)
    
    salaries_list = []
    for s in range(len(salaries)):
        salaries_list.append(salaries[s].text)
    
    # pair each player with their salary for that season
    data_tuples = list(zip(players_list[1:],salaries_list[1:]))
    # place player and salary into a temporary dataframe, and add the year for this season
    temp_df = pd.DataFrame(data_tuples, columns=['Player','Salary']) 
    temp_df['Year'] = yr
    # append this year's season to the master dataframe
    df = df.append(temp_df) 
    
driver.close()

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=88.0.4324.104)


In [9]:
df.head()

Unnamed: 0,Player,Salary,Year
0,Patrick Ewing,"$4,250,000",1990
1,Hot Rod Williams,"$3,785,000",1990
2,Hakeem Olajuwon,"$3,175,000",1990
3,Charles Barkley,"$2,900,000",1990
4,Chris Mullin,"$2,850,000",1990


In [10]:
df.tail()

Unnamed: 0,Player,Salary,Year
571,Michael Frazier,"$47,370",2018
572,Tahjere McCall,"$47,370",2018
573,Mitchell Creek,"$47,370",2018
574,Isaac Humphries,"$47,370",2018
575,Jordan Sibert,"$47,370",2018
