## Web Scrapping using BeautifulSoup Package

##### url : https://www.basketball-reference.com/leagues/NBA_2019_per_game.html

In [6]:
#Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re  #For Manipulating Regular Expression

In [8]:
#Open the url using urlopen function
url = "https://www.basketball-reference.com/leagues/NBA_2019_per_game.html" #Open URL
html = urlopen(url) 

In [9]:
#Object Creation (Soup)
soup = BeautifulSoup(html)     #All website (html) information stored in soup object

In [10]:
#To Display the title of the website
title = soup.title
print(title)       #It shows the title with tag
print(title.text)  #It shows the title alone

<title>2018-19 NBA Player Stats: Per Game | Basketball-Reference.com</title>
2018-19 NBA Player Stats: Per Game | Basketball-Reference.com


In [108]:
#Look in the children of this PageElement and find all PageElements that match the given criteria.
PageElements that match the given criteria.
links = soup.find_all('a')     #A filter on tag name.
print(links)

[<a href="https://www.sports-reference.com/"><svg height="15px" width="20px"><use xlink:href="#ic-sr-pennant"></use></svg> Sports Reference</a>, <a href="https://www.baseball-reference.com/">Baseball</a>, <a href="https://www.pro-football-reference.com/">Football</a>, <a href="https://www.sports-reference.com/cfb/">(college)</a>, <a href="https://www.basketball-reference.com/">Basketball</a>, <a href="https://www.sports-reference.com/cbb/">(college)</a>, <a href="https://www.hockey-reference.com/">Hockey</a>, <a href="https://fbref.com/en/">Soccer</a>, <a href="https://www.sports-reference.com/blog/">Blog</a>, <a href="https://stathead.com/?utm_source=web&amp;utm_medium=bbr&amp;utm_campaign=sr-nav-bar-top-link">Stathead</a>, <a href="https://widgets.sports-reference.com/">Widgets</a>, <a href="#" onclick="FreshworksWidget('open'); return false;">Questions or Comments?</a>, <a href="https://stathead.com/profile/?utm_source=web&amp;utm_medium=bbr&amp;utm_campaign=sr-nav-bar-top-account">

In [13]:
#To return the href links of the current webpage by using find_all function
links = soup.find_all('a',href = True)
for link in links:
    print(link['href'])

https://www.sports-reference.com/
https://www.baseball-reference.com/
https://www.pro-football-reference.com/
https://www.sports-reference.com/cfb/
https://www.basketball-reference.com/
https://www.sports-reference.com/cbb/
https://www.hockey-reference.com/
https://fbref.com/en/
https://www.sports-reference.com/blog/
https://stathead.com/?utm_source=web&utm_medium=bbr&utm_campaign=sr-nav-bar-top-link
https://widgets.sports-reference.com/
#
https://stathead.com/profile/?utm_source=web&utm_medium=bbr&utm_campaign=sr-nav-bar-top-account
https://stathead.com/profile/?do=logout
https://stathead.com/users/login.cgi?token=1
https://stathead.com/users/signup.cgi
/
#site_menu_link
/players/
/teams/
/leagues/
/leaders/
/boxscores/
/playoffs/
/draft/
https://stathead.com/basketball/
https://stathead.com/newsletter.cgi?site_sub=bbr
#site_menu_link
/
/leagues/
/leagues/NBA_2019.html
https://stathead.com/profile/?utm_source=web&utm_medium=bbr&utm_campaign=sr-nav-bar-top-account
https://stathead.com/

/contracts/
/contracts/
/contracts/ATL.html
/contracts/players.html
/contracts/glossary.html
/playoffs/
/playoffs/NBA_2020.html
/playoffs/NBA_2019.html
/playoffs/NBA_2018.html
/playoffs/NBA_2017.html
/playoffs/
/allstar/
/allstar/NBA_2021.html
/allstar/NBA_2020.html
/allstar/NBA_2019.html
/allstar/NBA_2018.html
/draft/
/draft/NBA_2020.html
/draft/NBA_2019.html
/draft/NBA_2018.html
/draft/NBA_2017.html
/draft/NBA_2016.html
/friv/
/friv/birthdays.fcgi
/friv/colleges.fcgi
/friv/high_schools.fcgi
/friv/milestones.fcgi
/executives/
/executives/buforrc99x.html
/executives/embrywa01x.html
/executives/kastest99x.html
/executives/aingeda01x.html
/executives/nelsodo01x.html
/referees/
/referees/fortejo99r.html
/referees/brothto99r.html
/referees/crawfda99r.html
/referees/olesiro99r.html
/referees/jonesda99r.html
/gleague/
/gleague/players/
/gleague/teams/
/gleague/years/
/gleague/leaders/
/gleague/awards/
/international/
/international/players/
/international/teams/
/international/years/
/intern

In [14]:
#Lookin for each row using 'tr' tag
allrows = soup.find_all('tr')
print(allrows)

[<tr>
<th aria-label="Rank" class="ranker poptip sort_default_asc show_partial_when_sorting center" data-stat="ranker" data-tip="Rank" scope="col">Rk</th>
<th aria-label="Player" class="poptip sort_default_asc center" data-stat="player" scope="col">Player</th>
<th aria-label="Position" class="poptip sort_default_asc center" data-stat="pos" data-tip="Position" scope="col">Pos</th>
<th aria-label="Player's age on February 1 of the season" class="poptip sort_default_asc center" data-stat="age" data-tip="Player's age on February 1 of the season" scope="col">Age</th>
<th aria-label="Team" class="poptip sort_default_asc center" data-stat="team_id" data-tip="Team" scope="col">Tm</th>
<th aria-label="Games" class="poptip center" data-stat="g" data-tip="Games" scope="col">G</th>
<th aria-label="Games Started" class="poptip center" data-stat="gs" data-tip="Games Started" scope="col">GS</th>
<th aria-label="Minutes Played Per Game" class="poptip hide_non_quals center" data-stat="mp_per_g" data-ti

In [15]:
allrows = soup.find_all('tr')
print(allrows[1])   #This line will print first row of the webpage

<tr class="full_table"><th class="right" csk="1" data-stat="ranker" scope="row">1</th><td class="left" csk="Abrines,Álex" data-append-csv="abrinal01" data-stat="player"><a href="/players/a/abrinal01.html">Álex Abrines</a></td><td class="center" data-stat="pos">SG</td><td class="right" data-stat="age">25</td><td class="left" data-stat="team_id"><a href="/teams/OKC/2019.html">OKC</a></td><td class="right" data-stat="g">31</td><td class="right" data-stat="gs">2</td><td class="right non_qual" data-stat="mp_per_g">19.0</td><td class="right non_qual" data-stat="fg_per_g">1.8</td><td class="right non_qual" data-stat="fga_per_g">5.1</td><td class="right non_qual" data-stat="fg_pct">.357</td><td class="right non_qual" data-stat="fg3_per_g">1.3</td><td class="right non_qual" data-stat="fg3a_per_g">4.1</td><td class="right non_qual" data-stat="fg3_pct">.323</td><td class="right non_qual" data-stat="fg2_per_g">0.5</td><td class="right non_qual" data-stat="fg2a_per_g">1.0</td><td class="right non_q

In [17]:
#To print the last row of the webpage
allrows = soup.find_all('tr')
for row in allrows:
    row_list = row.find_all('td')
print(row_list)

[<td class="left" csk="Zubac,Ivica" data-append-csv="zubaciv01" data-stat="player"><a href="/players/z/zubaciv01.html">Ivica Zubac</a></td>, <td class="center" data-stat="pos">C</td>, <td class="right" data-stat="age">21</td>, <td class="left" data-stat="team_id"><a href="/teams/LAC/2019.html">LAC</a></td>, <td class="right" data-stat="g">26</td>, <td class="right" data-stat="gs">25</td>, <td class="right non_qual" data-stat="mp_per_g">20.2</td>, <td class="right non_qual" data-stat="fg_per_g">3.8</td>, <td class="right non_qual" data-stat="fga_per_g">7.2</td>, <td class="right non_qual" data-stat="fg_pct">.538</td>, <td class="right non_qual iz" data-stat="fg3_per_g">0.0</td>, <td class="right non_qual iz" data-stat="fg3a_per_g">0.0</td>, <td class="right non_qual iz" data-stat="fg3_pct"></td>, <td class="right non_qual" data-stat="fg2_per_g">3.8</td>, <td class="right non_qual" data-stat="fg2a_per_g">7.2</td>, <td class="right non_qual" data-stat="fg2_pct">.538</td>, <td class="right

In [113]:
#To print the last row in the text format
for cell in row_list:
    print(cell.text)

Ivica Zubac
C
21
LAC
26
25
20.2
3.8
7.2
.538
0.0
0.0

3.8
7.2
.538
.538
1.7
2.3
.733
2.3
5.3
7.7
1.5
0.4
0.9
1.4
2.5
9.4


In [114]:
#To print all the data of the webpage using cell command
data = []
allrows = soup.find_all("tr")
for row in allrows:
    row_list = row.find_all("td")
    dataRow = []
    for cell in row_list:
        dataRow.append(cell.text)
    data.append(dataRow)
print(data)

[[], ['Álex Abrines', 'SG', '25', 'OKC', '31', '2', '19.0', '1.8', '5.1', '.357', '1.3', '4.1', '.323', '0.5', '1.0', '.500', '.487', '0.4', '0.4', '.923', '0.2', '1.4', '1.5', '0.6', '0.5', '0.2', '0.5', '1.7', '5.3'], ['Quincy Acy', 'PF', '28', 'PHO', '10', '0', '12.3', '0.4', '1.8', '.222', '0.2', '1.5', '.133', '0.2', '0.3', '.667', '.278', '0.7', '1.0', '.700', '0.3', '2.2', '2.5', '0.8', '0.1', '0.4', '0.4', '2.4', '1.7'], ['Jaylen Adams', 'PG', '22', 'ATL', '34', '1', '12.6', '1.1', '3.2', '.345', '0.7', '2.2', '.338', '0.4', '1.1', '.361', '.459', '0.2', '0.3', '.778', '0.3', '1.4', '1.8', '1.9', '0.4', '0.1', '0.8', '1.3', '3.2'], ['Steven Adams', 'C', '25', 'OKC', '80', '80', '33.4', '6.0', '10.1', '.595', '0.0', '0.0', '.000', '6.0', '10.1', '.596', '.595', '1.8', '3.7', '.500', '4.9', '4.6', '9.5', '1.6', '1.5', '1.0', '1.7', '2.6', '13.9'], ['Bam Adebayo', 'C', '21', 'MIA', '82', '28', '23.3', '3.4', '5.9', '.576', '0.0', '0.2', '.200', '3.4', '5.7', '.588', '.579', '2.0',

In [21]:
data = []
allrows = soup.find_all("tr")
for row in allrows:
    row_list = row.find_all("td")
    dataRow = []
    for cell in row_list:
        dataRow.append(cell.text)
    data.append(dataRow)

data = data[1:]
print(data[-2:])

[['Ivica Zubac', 'C', '21', 'LAL', '33', '12', '15.6', '3.4', '5.8', '.580', '0.0', '0.0', '', '3.4', '5.8', '.580', '.580', '1.7', '2.0', '.864', '1.6', '3.3', '4.9', '0.8', '0.1', '0.8', '1.0', '2.2', '8.5'], ['Ivica Zubac', 'C', '21', 'LAC', '26', '25', '20.2', '3.8', '7.2', '.538', '0.0', '0.0', '', '3.8', '7.2', '.538', '.538', '1.7', '2.3', '.733', '2.3', '5.3', '7.7', '1.5', '0.4', '0.9', '1.4', '2.5', '9.4']]


In [23]:
data = []
allrows = soup.find_all("tr")
for row in allrows:
    row_list = row.find_all("td")
    dataRow = []
    for cell in row_list:
        dataRow.append(cell.text)
    data.append(dataRow)

    
data = data[1:]
print(data[-2:])  #To print the first two line of the webpage

[['Ivica Zubac', 'C', '21', 'LAL', '33', '12', '15.6', '3.4', '5.8', '.580', '0.0', '0.0', '', '3.4', '5.8', '.580', '.580', '1.7', '2.0', '.864', '1.6', '3.3', '4.9', '0.8', '0.1', '0.8', '1.0', '2.2', '8.5'], ['Ivica Zubac', 'C', '21', 'LAC', '26', '25', '20.2', '3.8', '7.2', '.538', '0.0', '0.0', '', '3.8', '7.2', '.538', '.538', '1.7', '2.3', '.733', '2.3', '5.3', '7.7', '1.5', '0.4', '0.9', '1.4', '2.5', '9.4']]


In [76]:
# Converting the data to dataframe
nba = pd.DataFrame(data,index = None)

In [77]:
#Converting dataframe to excel sheet
nba.to_excel('nba_data.xlsx')

In [78]:
#to print the top of the dataframe
nba.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,0.357,...,0.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,0.222,...,0.7,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,0.345,...,0.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,0.595,...,0.5,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,0.576,...,0.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 698 entries, 0 to 733
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       675 non-null    object
 1   1       675 non-null    object
 2   2       675 non-null    object
 3   3       675 non-null    object
 4   4       675 non-null    object
 5   5       675 non-null    object
 6   6       675 non-null    object
 7   7       675 non-null    object
 8   8       675 non-null    object
 9   9       675 non-null    object
 10  10      675 non-null    object
 11  11      675 non-null    object
 12  12      675 non-null    object
 13  13      675 non-null    object
 14  14      675 non-null    object
 15  15      675 non-null    object
 16  16      675 non-null    object
 17  17      675 non-null    object
 18  18      675 non-null    object
 19  19      675 non-null    object
 20  20      675 non-null    object
 21  21      675 non-null    object
 22  22      675 non-null    ob

In [56]:
df.columns

RangeIndex(start=0, stop=29, step=1)

In [62]:
#Extract column headers
col_headers = soup.find_all('th')
print(col_headers)

[<th aria-label="Rank" class="ranker poptip sort_default_asc show_partial_when_sorting center" data-stat="ranker" data-tip="Rank" scope="col">Rk</th>, <th aria-label="Player" class="poptip sort_default_asc center" data-stat="player" scope="col">Player</th>, <th aria-label="Position" class="poptip sort_default_asc center" data-stat="pos" data-tip="Position" scope="col">Pos</th>, <th aria-label="Player's age on February 1 of the season" class="poptip sort_default_asc center" data-stat="age" data-tip="Player's age on February 1 of the season" scope="col">Age</th>, <th aria-label="Team" class="poptip sort_default_asc center" data-stat="team_id" data-tip="Team" scope="col">Tm</th>, <th aria-label="Games" class="poptip center" data-stat="g" data-tip="Games" scope="col">G</th>, <th aria-label="Games Started" class="poptip center" data-stat="gs" data-tip="Games Started" scope="col">GS</th>, <th aria-label="Minutes Played Per Game" class="poptip hide_non_quals center" data-stat="mp_per_g" data-

In [63]:
col = pd.DataFrame(col_headers,index=None)
col2 = col.head(30)
print(col2)

         0
0       Rk
1   Player
2      Pos
3      Age
4       Tm
5        G
6       GS
7       MP
8       FG
9      FGA
10     FG%
11      3P
12     3PA
13     3P%
14      2P
15     2PA
16     2P%
17    eFG%
18      FT
19     FTA
20     FT%
21     ORB
22     DRB
23     TRB
24     AST
25     STL
26     BLK
27     TOV
28      PF
29     PTS


In [64]:
df.columns = ['Player','Pos','Age','Tm','G','GS','MP','FG','FGA','FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%','FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','PTS']

In [65]:
df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,0.357,...,0.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,0.222,...,0.7,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,0.345,...,0.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,0.595,...,0.5,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,0.576,...,0.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9


In [74]:
df.shape

(698, 29)

## Web Scrapping Using read_html

In [79]:
html_url = "https://www.basketball-reference.com/leagues/NBA_2019_per_game.html"

In [80]:
nba_tables = pd.read_html(html_url)

In [81]:
len(nba_tables)

1

In [82]:
nba_t = nba_tables[0]

In [83]:
nba_t

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,...,.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,2,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,...,.700,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,3,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,...,.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,4,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,...,.500,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,5,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,...,.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,528,Tyler Zeller,C,29,MEM,4,1,20.5,4.0,7.0,...,.778,2.3,2.3,4.5,0.8,0.3,0.8,1.0,4.0,11.5
730,529,Ante Žižić,C,22,CLE,59,25,18.3,3.1,5.6,...,.705,1.8,3.6,5.4,0.9,0.2,0.4,1.0,1.9,7.8
731,530,Ivica Zubac,C,21,TOT,59,37,17.6,3.6,6.4,...,.802,1.9,4.2,6.1,1.1,0.2,0.9,1.2,2.3,8.9
732,530,Ivica Zubac,C,21,LAL,33,12,15.6,3.4,5.8,...,.864,1.6,3.3,4.9,0.8,0.1,0.8,1.0,2.2,8.5


In [84]:
nba_t.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,...,0.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,2,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,...,0.7,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,3,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,...,0.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,4,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,...,0.5,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,5,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,...,0.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9
