# Notebook to webscrape and clean stats.nba.com information
### Information to gather:
- Player Name
- NBA.com Player ID 
- NBA.com Player Page URL 
- Player Usage Stats from 2013-2020

### Libraries Utilized for webscraping
- Selenium

### Why Selenium instead of BeautifulSoup?
Selenium was used because the stats.nba.com site is built with a dynamic JavaScript component: the page source that BeautifulSoup scrapes does not include the information needed, because that information is pulled in afterwards via JavaScript. Driving a real browser with Selenium is the workaround to get the additional information. 

In [2]:
# import libraries
import time
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

## 1. Gathering all names available on stats.nba.com

Using Selenium on the stats.nba.com player index, scrape all player names and player page URLs. All historical players are included, since the current view only shows active players and some athletes might have dropped out or come in during the current season. 


In [2]:
# URL of the stats.nba.com player index (the Historic=Y query is used so that
# historical players are listed as well as active ones)
urlpage = 'https://stats.nba.com/players/list/?Historic=Y' 
print(urlpage)
# start a Firefox WebDriver session; no executable path is passed here, so
# presumably the Firefox driver binary is found on PATH — confirm locally
driver = webdriver.Firefox()

https://stats.nba.com/players/list/?Historic=Y


In [3]:
# Open the player index in the running browser session
driver.get(urlpage)
# Scroll to the bottom of the document; the script also returns the page height
scroll_to_bottom_js = (
    "window.scrollTo(0, document.body.scrollHeight);"
    "var lenOfPage=document.body.scrollHeight;"
    "return lenOfPage;"
)
driver.execute_script(scroll_to_bottom_js)
# Fixed 30 s pause before the page is queried
time.sleep(30)
# The driver is deliberately left open: later cells read elements from this page

In [4]:
# Locate every player-name element on the index page.
# Uses the generic find_elements(By.XPATH, ...) call because the
# find_elements_by_xpath helper was removed in Selenium 4.3; the generic form
# works on both older and current Selenium releases.
player_name_xpath = (
    "//*[@class='stats-player-list players-list']"
    "//*[@class='row collapse players-list__section']"
    "//*[@class='players-list__names']"
    "//*[@class='players-list__name']"
)
results = driver.find_elements(By.XPATH, player_name_xpath)
# check number of results
print('Number of results', len(results))

Number of results 4503


In [5]:
# Build one {"player", "link"} record per name element found on the page.
# "player" is the element's visible text and "link" is the href of the <a> tag
# nested inside it.  find_element(By.TAG_NAME, ...) replaces
# find_element_by_tag_name, which was removed in Selenium 4.3.
data = []
for result in results:
    anchor = result.find_element(By.TAG_NAME, 'a')
    data.append({"player": result.text, "link": anchor.get_attribute("href")})

In [7]:
# close driver: the names and links have already been copied into `data`,
# so the browser session is no longer needed
driver.quit()

In [8]:
# save to pandas dataframe: one row per scraped record, with the dict keys
# ("player", "link") becoming the columns
df = pd.DataFrame(data)
# display the frame
df

Unnamed: 0,player,link
0,"Abdelnaby, Alaa",https://stats.nba.com/player/76001/
1,"Abdul-Aziz, Zaid",https://stats.nba.com/player/76002/
2,"Abdul-Jabbar, Kareem",https://stats.nba.com/player/76003/
3,"Abdul-Rauf, Mahmoud",https://stats.nba.com/player/51/
4,"Abdul-Wahad, Tariq",https://stats.nba.com/player/1505/
...,...,...
4498,"Zizic, Ante",https://stats.nba.com/player/1627790/
4499,"Zoet, Jim",https://stats.nba.com/player/78647/
4500,"Zopf, Bill",https://stats.nba.com/player/78648/
4501,"Zubac, Ivica",https://stats.nba.com/player/1627826/


In [113]:
# save to csv in the current working directory
# (the row index is written too, as an unnamed first column)
df.to_csv('stats_nba_links.csv')

In [None]:
# check length of the URL prefix that precedes the player id in a player page
# url; this is the number of leading characters to strip to get the id
len('https://stats.nba.com/player/')

In [None]:
# get player id
# The id is the part of the player page url between the fixed prefix and the
# trailing '/', e.g. 'https://stats.nba.com/player/76001/' -> '76001'.
# The column is assigned in a single vectorised step: iterating over
# df['stats_id'] before the column exists raises KeyError, and the loop only
# repeated the same whole-column assignment on every pass.
player_url_prefix = 'https://stats.nba.com/player/'
df['stats_id'] = df['link'].str[len(player_url_prefix):-1]

In [13]:
# display the frame, now including the stats_id column
df

Unnamed: 0,player,link,stats_id
0,"Abdelnaby, Alaa",https://stats.nba.com/player/76001/,76001
1,"Abdul-Aziz, Zaid",https://stats.nba.com/player/76002/,76002
2,"Abdul-Jabbar, Kareem",https://stats.nba.com/player/76003/,76003
3,"Abdul-Rauf, Mahmoud",https://stats.nba.com/player/51/,51
4,"Abdul-Wahad, Tariq",https://stats.nba.com/player/1505/,1505
...,...,...,...
4498,"Zizic, Ante",https://stats.nba.com/player/1627790/,1627790
4499,"Zoet, Jim",https://stats.nba.com/player/78647/,78647
4500,"Zopf, Bill",https://stats.nba.com/player/78648/,78648
4501,"Zubac, Ivica",https://stats.nba.com/player/1627826/,1627826


# 2. Match names to our current active player list

Because the active player lists on basketball-reference.com and stats.nba.com are different, we need to do the following steps:

- Match the name format
- Merge the two csvs on name 

This cuts the list down from about 4500 URLs to about 1200. 

In [None]:
# Convert the site's "Last, First" listing into "First Last" so the names can
# be matched against the active-player list.
#
# The split is done on the first ", " only.  Splitting on every space broke
# surnames and suffixes that contain a space ("Van Exel, Nick" came out as
# "Exel, Va") and turned single-name players into NaN.  Names without a comma
# are passed through unchanged, which also makes re-running this cell harmless.
def _to_first_last(listed_name):
    """Return "First Last" for a "Last, First" string; pass anything else through."""
    if not isinstance(listed_name, str) or ', ' not in listed_name:
        return listed_name
    last, first = listed_name.split(', ', 1)
    return first + ' ' + last


df['player'] = df['player'].map(_to_first_last)
df

In [None]:
# save update to csv: writes the frame with the reformatted names and the
# stats_id column to the same file name used for the earlier save
df.to_csv('stats_nba_links.csv')

In [None]:
# cull down number of players for the years we want to match
# load active list created from basketball-reference.com
# NOTE(review): csv is imported here but not used in this cell; pandas does the reading
import csv
# relative path, resolved against the notebook's working directory
path = 'Project-3/Resources/active_players.csv'
active_players = pd.read_csv(path)
# display the loaded list
active_players

In [None]:
# Join the scraped links to the active-player list by display name
# (df.player == active_players.name), then discard the columns that are not
# needed afterwards.
merged_on_name = df.merge(active_players, left_on='player', right_on='name')
active_stat_nba = merged_on_name.drop(
    ['Unnamed: 0', 'player', 'Lineup_name'],
    axis='columns',
)
active_stat_nba

## 3. Get individual player usage by season

The player usage URL format will look like https://stats.nba.com/player/203507/usage/?Season=2015-16&SeasonType=Regular%20Season. 

We will need to pass the Player ID and the Season to get the last 7 years of usage. These two variables determine the correct page to load, so that the data is scraped correctly for each player. 

In [14]:
# Season suffixes in the "YY-YY" form that the usage url expects after "20",
# covering 2013-14 through 2019-20
seasons = ['%02d-%02d' % (start, start + 1) for start in range(13, 20)]

In [None]:
# loop the player usage page per season to get the overall usage stats
#
# One Firefox session is reused for every player/season page and is always
# closed in `finally`; launching a new browser per page was slow and left the
# browser running whenever a request failed.  The per-page frames are collected
# in a list and concatenated once, because DataFrame.append copied the whole
# table on every call and was removed in pandas 2.0.
# find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
# removed in Selenium 4.3.
usage_cells_xpath = "/html/body/main/div[2]/div/div/div[3]/div/div/div/nba-stat-table[1]/div[2]/div[1]/table/tbody/tr/td"
scroll_to_bottom_js = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"

usage_frames = []
all_usage = pd.DataFrame()
player_usage = pd.DataFrame()

driver = webdriver.Firefox()
try:
    for x in active_stat_nba['stats_id']:
        for y in seasons:
            urlpage = 'https://stats.nba.com/player/'+ str(x) + '/usage/?Season=20' + str(y) + '&SeasonType=Regular%20Season'
            driver.get(urlpage)
            driver.execute_script(scroll_to_bottom_js)
            # fixed pause before the table cells are read
            time.sleep(5)
            results = driver.find_elements(By.XPATH, usage_cells_xpath)
            usage = [use.text for use in results]
            # all matched cell texts become a single row, tagged with the player id
            player_usage = pd.DataFrame([usage])
            player_usage['stats_id'] = x
            usage_frames.append(player_usage)
finally:
    driver.quit()
    # built here so rows scraped before an error are still available
    if usage_frames:
        all_usage = pd.concat(usage_frames)

In [None]:
# display the combined usage rows collected by the scraping loop
all_usage

In [None]:
# persist the scraped usage table to the current working directory
all_usage.to_csv('all_usage.csv')

In [5]:
# NOTE(review): csv is imported here but not used in this cell
import csv
# NOTE(review): the scrape above is saved as 'all_usage.csv', while this reads
# '../Resources/usage.csv' — presumably the file was renamed/moved by hand; confirm
path = '../Resources/usage.csv'
active_usage = pd.read_csv(path)
# display the loaded usage table
active_usage

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,12,13,14,15,16,17,18,19,20,stats_id
0,0,2013-14,63.0,848.0,9.9,10.7,9.8,3.4,4.3,9.0,...,27.8,9.8,12.0,20.9,33.3,10.0,27.1,12.4,9.8,203112
1,0,2014-15,68.0,1287.0,15.1,16.9,15.2,9.0,10.5,20.9,...,26.9,12.4,14.4,12.4,18.6,10.3,26.5,22.6,16.8,203112
2,0,2015-16,59.0,876.0,12.8,16.7,13.5,13.3,11.8,19.2,...,23.5,6.5,8.5,16.3,28.9,6.5,28.1,20.6,16.8,203112
3,0,2016-17,38.0,558.0,16.5,17.1,17.7,29.1,24.9,17.2,...,24.3,7.9,11.4,15.4,32.6,10.0,25.8,22.8,18.4,203112
4,0,2017-18,70.0,1359.0,14.3,12.3,15.3,25.6,26.5,10.3,...,21.0,8.6,13.8,19.4,23.8,9.5,24.5,14.7,13.7,203112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7842,0,2019-20,54.0,966.0,16.6,21.1,15.8,0.0,0.3,23.2,...,37.8,11.8,16.3,6.6,47.4,32.9,29.4,20.0,19.1,1627826
7843,0,2016-17,38.0,609.0,20.1,24.8,21.4,0.0,1.0,13.4,...,29.9,10.9,17.2,13.5,51.6,34.3,27.4,21.4,21.1,1627826
7844,0,2017-18,43.0,410.0,17.2,18.9,16.3,0.0,0.4,30.7,...,28.8,13.2,19.1,13.3,34.9,24.0,30.1,26.9,18.5,1627826
7845,0,2018-19,59.0,1039.0,19.9,23.7,20.0,0.0,0.0,25.6,...,35.0,11.1,20.1,8.9,49.0,36.4,29.3,23.3,21.8,1627826


In [7]:
# load the saved player name / link / stats_id table
path = '../Resources/stats_nba_links.csv'
players_links = pd.read_csv(path)
# display the loaded table
players_links

Unnamed: 0.1,Unnamed: 0,player,link,stats_id
0,0,Alaa Abdelnaby,https://stats.nba.com/player/76001/,76001
1,1,Zaid Abdul-Aziz,https://stats.nba.com/player/76002/,76002
2,2,Kareem Abdul-Jabbar,https://stats.nba.com/player/76003/,76003
3,3,Mahmoud Abdul-Rauf,https://stats.nba.com/player/51/,51
4,4,Tariq Abdul-Wahad,https://stats.nba.com/player/1505/,1505
...,...,...,...,...
4498,4498,Ante Zizic,https://stats.nba.com/player/1627790/,1627790
4499,4499,Jim Zoet,https://stats.nba.com/player/78647/,78647
4500,4500,Bill Zopf,https://stats.nba.com/player/78648/,78648
4501,4501,Ivica Zubac,https://stats.nba.com/player/1627826/,1627826


In [8]:
# Attach each player's name and page link to the usage rows, matching on the
# stats_id column that both tables share
active_usage = active_usage.merge(players_links, on='stats_id')
active_usage

Unnamed: 0,Unnamed: 0_x,0,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,stats_id,Unnamed: 0_y,player,link
0,0,2013-14,63.0,848.0,9.9,10.7,9.8,3.4,4.3,9.0,...,20.9,33.3,10.0,27.1,12.4,9.8,203112,14,Quincy Acy,https://stats.nba.com/player/203112/
1,0,2014-15,68.0,1287.0,15.1,16.9,15.2,9.0,10.5,20.9,...,12.4,18.6,10.3,26.5,22.6,16.8,203112,14,Quincy Acy,https://stats.nba.com/player/203112/
2,0,2015-16,59.0,876.0,12.8,16.7,13.5,13.3,11.8,19.2,...,16.3,28.9,6.5,28.1,20.6,16.8,203112,14,Quincy Acy,https://stats.nba.com/player/203112/
3,0,2016-17,38.0,558.0,16.5,17.1,17.7,29.1,24.9,17.2,...,15.4,32.6,10.0,25.8,22.8,18.4,203112,14,Quincy Acy,https://stats.nba.com/player/203112/
4,0,2017-18,70.0,1359.0,14.3,12.3,15.3,25.6,26.5,10.3,...,19.4,23.8,9.5,24.5,14.7,13.7,203112,14,Quincy Acy,https://stats.nba.com/player/203112/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7842,0,2019-20,54.0,966.0,16.6,21.1,15.8,0.0,0.3,23.2,...,6.6,47.4,32.9,29.4,20.0,19.1,1627826,4501,Ivica Zubac,https://stats.nba.com/player/1627826/
7843,0,2016-17,38.0,609.0,20.1,24.8,21.4,0.0,1.0,13.4,...,13.5,51.6,34.3,27.4,21.4,21.1,1627826,4501,Ivica Zubac,https://stats.nba.com/player/1627826/
7844,0,2017-18,43.0,410.0,17.2,18.9,16.3,0.0,0.4,30.7,...,13.3,34.9,24.0,30.1,26.9,18.5,1627826,4501,Ivica Zubac,https://stats.nba.com/player/1627826/
7845,0,2018-19,59.0,1039.0,19.9,23.7,20.0,0.0,0.0,25.6,...,8.9,49.0,36.4,29.3,23.3,21.8,1627826,4501,Ivica Zubac,https://stats.nba.com/player/1627826/


In [9]:
# Remove the two leftover index columns carried in from the source tables
leftover_index_columns = ['Unnamed: 0_x', 'Unnamed: 0_y']
active_usage = active_usage.drop(leftover_index_columns, axis='columns')

In [10]:
# display the merged table after the leftover index columns were dropped
active_usage

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,stats_id,player,link
0,2013-14,63.0,848.0,9.9,10.7,9.8,3.4,4.3,9.0,10.3,...,12.0,20.9,33.3,10.0,27.1,12.4,9.8,203112,Quincy Acy,https://stats.nba.com/player/203112/
1,2014-15,68.0,1287.0,15.1,16.9,15.2,9.0,10.5,20.9,20.6,...,14.4,12.4,18.6,10.3,26.5,22.6,16.8,203112,Quincy Acy,https://stats.nba.com/player/203112/
2,2015-16,59.0,876.0,12.8,16.7,13.5,13.3,11.8,19.2,18.3,...,8.5,16.3,28.9,6.5,28.1,20.6,16.8,203112,Quincy Acy,https://stats.nba.com/player/203112/
3,2016-17,38.0,558.0,16.5,17.1,17.7,29.1,24.9,17.2,18.6,...,11.4,15.4,32.6,10.0,25.8,22.8,18.4,203112,Quincy Acy,https://stats.nba.com/player/203112/
4,2017-18,70.0,1359.0,14.3,12.3,15.3,25.6,26.5,10.3,10.0,...,13.8,19.4,23.8,9.5,24.5,14.7,13.7,203112,Quincy Acy,https://stats.nba.com/player/203112/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7842,2019-20,54.0,966.0,16.6,21.1,15.8,0.0,0.3,23.2,26.0,...,16.3,6.6,47.4,32.9,29.4,20.0,19.1,1627826,Ivica Zubac,https://stats.nba.com/player/1627826/
7843,2016-17,38.0,609.0,20.1,24.8,21.4,0.0,1.0,13.4,16.3,...,17.2,13.5,51.6,34.3,27.4,21.4,21.1,1627826,Ivica Zubac,https://stats.nba.com/player/1627826/
7844,2017-18,43.0,410.0,17.2,18.9,16.3,0.0,0.4,30.7,30.2,...,19.1,13.3,34.9,24.0,30.1,26.9,18.5,1627826,Ivica Zubac,https://stats.nba.com/player/1627826/
7845,2018-19,59.0,1039.0,19.9,23.7,20.0,0.0,0.0,25.6,24.0,...,20.1,8.9,49.0,36.4,29.3,23.3,21.8,1627826,Ivica Zubac,https://stats.nba.com/player/1627826/


In [14]:
# Replace the positional column labels with readable names.  The assignment is
# by position, so the list must match the current column order exactly:
# season, the 20 usage stats, then the three identifying columns.
usage_stat_names = [
    'GP', 'MIN', 'USG%',
    '%FGM', '%FGA', '%3PM', '%3PA', '%FTM', '%FTA',
    '%OREB', '%DREB', '%REB',
    '%AST', '%TOV', '%STL', '%BLK', '%BLKA',
    '%PF', '%PFD', '%PTS',
]
active_usage.columns = ['season'] + usage_stat_names + ['stats_id', 'player', 'link']


In [15]:
# display the table with the named columns
active_usage

Unnamed: 0,season,GP,MIN,USG%,%FGM,%FGA,%3PM,%3PA,%FTM,%FTA,...,%TOV,%STL,%BLK,%BLKA,%PF,%PFD,%PTS,stats_id,player,link
0,2013-14,63.0,848.0,9.9,10.7,9.8,3.4,4.3,9.0,10.3,...,12.0,20.9,33.3,10.0,27.1,12.4,9.8,203112,Quincy Acy,https://stats.nba.com/player/203112/
1,2014-15,68.0,1287.0,15.1,16.9,15.2,9.0,10.5,20.9,20.6,...,14.4,12.4,18.6,10.3,26.5,22.6,16.8,203112,Quincy Acy,https://stats.nba.com/player/203112/
2,2015-16,59.0,876.0,12.8,16.7,13.5,13.3,11.8,19.2,18.3,...,8.5,16.3,28.9,6.5,28.1,20.6,16.8,203112,Quincy Acy,https://stats.nba.com/player/203112/
3,2016-17,38.0,558.0,16.5,17.1,17.7,29.1,24.9,17.2,18.6,...,11.4,15.4,32.6,10.0,25.8,22.8,18.4,203112,Quincy Acy,https://stats.nba.com/player/203112/
4,2017-18,70.0,1359.0,14.3,12.3,15.3,25.6,26.5,10.3,10.0,...,13.8,19.4,23.8,9.5,24.5,14.7,13.7,203112,Quincy Acy,https://stats.nba.com/player/203112/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7842,2019-20,54.0,966.0,16.6,21.1,15.8,0.0,0.3,23.2,26.0,...,16.3,6.6,47.4,32.9,29.4,20.0,19.1,1627826,Ivica Zubac,https://stats.nba.com/player/1627826/
7843,2016-17,38.0,609.0,20.1,24.8,21.4,0.0,1.0,13.4,16.3,...,17.2,13.5,51.6,34.3,27.4,21.4,21.1,1627826,Ivica Zubac,https://stats.nba.com/player/1627826/
7844,2017-18,43.0,410.0,17.2,18.9,16.3,0.0,0.4,30.7,30.2,...,19.1,13.3,34.9,24.0,30.1,26.9,18.5,1627826,Ivica Zubac,https://stats.nba.com/player/1627826/
7845,2018-19,59.0,1039.0,19.9,23.7,20.0,0.0,0.0,25.6,24.0,...,20.1,8.9,49.0,36.4,29.3,23.3,21.8,1627826,Ivica Zubac,https://stats.nba.com/player/1627826/


In [16]:
# save the cleaned, named usage table
# NOTE(review): this is the same file name the scraped table was saved under
# earlier in the notebook, so in the same working directory it replaces that file
active_usage.to_csv('all_usage.csv')