In [1]:
# Web Scraping using Beautiful Soup 

In [2]:
# install Beautiful Soup 
%pip install bs4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
# for web scraping 
from bs4 import BeautifulSoup 

# for downloading a web page 
import requests 

In [4]:
# Beautiful Soup Objects 
# Beautiful Soup can be used to extract data from HTML and XML files 
# the HTML is represented as a set of objects 
# objects have methods used to parse the HTML 
# the key is to navigate the HTML tree 

In [5]:
# Let's look at the following basic HTML showing top 3
# NBA salaries from the 2024-2025 Season 

In [6]:
%%html
<!DOCTYPE html>
<html>
<head>
<title>NBA Salaries</title>
</head>
<body>
<!-- each player is an <h3> heading followed by a <p> holding the salary -->
<h3> Stephen Curry</h3>
<p> Salary: $55,761,216 </p>
<h3> Joel Embiid </h3>
<p> Salary: $51,415,938</p>
<h3> Nikola Jokić </h3>
<p> Salary: $51,415,938</p>
</body>
</html>

In [7]:
# keep the sample HTML document in a variable
# (adjacent string literals are joined into one string with no separators)
nba_html = (
    "<!DOCTYPE html><html><head><title>NBA Salaries</title></head><body>"
    "<h3> Stephen Curry</h3><p> Salary: $55,761,216 </p>"
    "<h3> Joel Embiid </h3><p> Salary: $51,415,938</p>"
    "<h3> Nikola Jokić </h3><p> Salary: $51,415,938</p>"
    "</body></html>"
)
nba_html

'<!DOCTYPE html><html><head><title>NBA Salaries</title></head><body><h3> Stephen Curry</h3><p> Salary: $55,761,216 </p><h3> Joel Embiid </h3><p> Salary: $51,415,938</p><h3> Nikola Jokić </h3><p> Salary: $51,415,938</p></body></html>'

In [8]:
# Build a BeautifulSoup object from nba_html.
# The constructor parses the markup into a tree of Python objects.

# "html.parser" selects Python's built-in HTML parser
nba_soup = BeautifulSoup(nba_html, features="html.parser")

In [9]:
# prettify()
# Returns the parsed document as an indented string, which makes the
# nesting of the tags easy to see when printed.
pretty_html = nba_soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<html>
 <head>
  <title>
   NBA Salaries
  </title>
 </head>
 <body>
  <h3>
   Stephen Curry
  </h3>
  <p>
   Salary: $55,761,216
  </p>
  <h3>
   Joel Embiid
  </h3>
  <p>
   Salary: $51,415,938
  </p>
  <h3>
   Nikola Jokić
  </h3>
  <p>
   Salary: $51,415,938
  </p>
 </body>
</html>



In [10]:
# TAGS
# A Tag object corresponds to an HTML tag in the parsed document.
# Accessing a tag name as an attribute of the soup (here `.title`)
# returns the first tag with that name.

# get the <title> element of the HTML document
tag_title = nba_soup.title
print(tag_title)

<title>NBA Salaries</title>


In [11]:
# show the object type of tag_title (a bs4.element.Tag)
print(type(tag_title))

<class 'bs4.element.Tag'>


In [12]:
# use tag access to retrieve the highest-paid player, stored in the first <h3>

tag_player = nba_soup.h3
print(tag_player)

# If there is more than one tag with the same name,
# attribute access returns only the first one.
# Here that is the highest-paid player.

<h3> Stephen Curry</h3>


In [13]:
# get the parent of the player tag
# <body> is the parent of this <h3>, so the entire body content is printed

tag_parent = tag_player.parent
print(tag_parent)

<body><h3> Stephen Curry</h3><p> Salary: $55,761,216 </p><h3> Joel Embiid </h3><p> Salary: $51,415,938</p><h3> Nikola Jokić </h3><p> Salary: $51,415,938</p></body>


In [14]:
# get the siblings of the player tag
# NOTE: next_sibling returns the very next node, which may be a text node.
# It lands on the <p> and <h3> tags here only because nba_html has no
# whitespace between its tags.
tag_sibling1 = tag_player.next_sibling
print(tag_sibling1)

tag_sibling2 = tag_sibling1.next_sibling
print(tag_sibling2)

<p> Salary: $55,761,216 </p>
<h3> Joel Embiid </h3>


In [15]:
# get the salary of Joel Embiid: the <p> that directly follows his <h3>

embiid_salary = tag_sibling2.next_sibling
print(embiid_salary)

<p> Salary: $51,415,938</p>


In [12]:
# What if we just want the string inside the tag and not the tag itself?
# Navigable Strings
# A NavigableString holds the text content within a tag.
# The .string attribute of a Tag returns that text as a NavigableString.

# get the string of the first player we found from <h3>
# (tag_player was assigned from nba_soup.h3 in an earlier cell)

player_nav_string = tag_player.string
print(player_nav_string)
print(type(player_nav_string))

 Stephen Curry
<class 'bs4.element.NavigableString'>


In [13]:
# convert the NavigableString to a plain Python str
player_string = str(player_nav_string)
print(type(player_string))

<class 'str'>


In [38]:
# Filtering - allows us to find complex patterns in HTML


In [39]:
%%html
<!DOCTYPE html>
<html lang="en">
<head>
  <title>NBA Player Salaries 2024–25</title> 
</head>
<body>
  <h1>NBA Player Salaries (2024–2025 Season)</h1>
  <table>
    <thead>
      <tr>

        <!-- <tr> is a table row -->

        <th>Player</th>
        <th>Team</th>
        <th>Position</th>
        <th>Salary</th>
      </tr>
    </thead>
    <tbody>
      <tr>
        <td>Stephen Curry</td>
        <td>Golden State Warriors</td>
        <td>G</td>
        <td>$55,761,216</td>
      </tr>
      <tr>
        <td>Joel Embiid</td>
        <td>Philadelphia 76ers</td>
        <td>C</td>
        <td>$51,415,938</td>
      </tr>
      <tr>
        <td>Nikola Jokić</td>
        <td>Denver Nuggets</td>
        <td>C</td>
        <td>$51,415,938</td>
      </tr>
      <tr>
        <td>Kevin Durant</td>
        <td>Phoenix Suns</td>
        <td>F</td>
        <td>$51,179,021</td>
      </tr>
      <tr>
        <td>Bradley Beal</td>
        <td>Phoenix Suns</td>
        <td>G</td>
        <td>$50,203,930</td>
      </tr>
      <tr>
        <td>Kawhi Leonard</td>
        <td>LA Clippers</td>
        <td>F</td>
        <td>$49,350,000</td>
      </tr>
      <tr>
        <td>Devin Booker</td>
        <td>Phoenix Suns</td>
        <td>G</td>
        <td>$49,205,800</td>
      </tr>
      <tr>
        <td>Paul George</td>
        <td>Philadelphia 76ers</td>
        <td>F</td>
        <td>$49,205,800</td>
      </tr>
      <tr>
        <td>Karl-Anthony Towns</td>
        <td>New York Knicks</td>
        <td>C</td>
        <td>$49,205,800</td>
      </tr>
      <tr>
        <td>Jaylen Brown</td>
        <td>Boston Celtics</td>
        <td>G</td>
        <td>$49,205,800</td>
      </tr>
    </tbody>
  </table>
</body>
</html>

Player,Team,Position,Salary
Stephen Curry,Golden State Warriors,G,"$55,761,216"
Joel Embiid,Philadelphia 76ers,C,"$51,415,938"
Nikola Jokić,Denver Nuggets,C,"$51,415,938"
Kevin Durant,Phoenix Suns,F,"$51,179,021"
Bradley Beal,Phoenix Suns,G,"$50,203,930"
Kawhi Leonard,LA Clippers,F,"$49,350,000"
Devin Booker,Phoenix Suns,G,"$49,205,800"
Paul George,Philadelphia 76ers,F,"$49,205,800"
Karl-Anthony Towns,New York Knicks,C,"$49,205,800"
Jaylen Brown,Boston Celtics,G,"$49,205,800"


In [14]:
# The salary table as one HTML string: adjacent literals are concatenated with
# no separators, so the value contains no newlines or indentation.
nba_table_html = (
    '<!DOCTYPE html><html lang="en"><head><title>NBA Player Salaries 2024–25</title></head>'
    '<body><h1>NBA Player Salaries (2024–2025 Season)</h1>'
    '<table><thead><tr><th>Player</th><th>Team</th><th>Position</th><th>Salary</th></tr></thead>'
    '<tbody>'
    '<tr><td>Stephen Curry</td><td>Golden State Warriors</td><td>G</td><td>$55,761,216</td></tr>'
    '<tr><td>Joel Embiid</td><td>Philadelphia 76ers</td><td>C</td><td>$51,415,938</td></tr>'
    '<tr><td>Nikola Jokić</td><td>Denver Nuggets</td><td>C</td><td>$51,415,938</td></tr>'
    '<tr><td>Kevin Durant</td><td>Phoenix Suns</td><td>F</td><td>$51,179,021</td></tr>'
    '<tr><td>Bradley Beal</td><td>Phoenix Suns</td><td>G</td><td>$50,203,930</td></tr>'
    '<tr><td>Kawhi Leonard</td><td>LA Clippers</td><td>F</td><td>$49,350,000</td></tr>'
    '<tr><td>Devin Booker</td><td>Phoenix Suns</td><td>G</td><td>$49,205,800</td></tr>'
    '<tr><td>Paul George</td><td>Philadelphia 76ers</td><td>F</td><td>$49,205,800</td></tr>'
    '<tr><td>Karl-Anthony Towns</td><td>New York Knicks</td><td>C</td><td>$49,205,800</td></tr>'
    '<tr><td>Jaylen Brown</td><td>Boston Celtics</td><td>G</td><td>$49,205,800</td></tr>'
    '</tbody></table></body></html>'
)

In [15]:
# parse the table markup into a BeautifulSoup object using Python's built-in parser
nba_table = BeautifulSoup(nba_table_html, features="html.parser")

In [16]:
# find_all()
# the find_all() method searches through all of a tag's descendants
# and returns every result that matches the filter

In [17]:
# collect every table row (<tr>) in the document
table_rows = nba_table.find_all(name="tr")
table_rows

[<tr><th>Player</th><th>Team</th><th>Position</th><th>Salary</th></tr>,
 <tr><td>Stephen Curry</td><td>Golden State Warriors</td><td>G</td><td>$55,761,216</td></tr>,
 <tr><td>Joel Embiid</td><td>Philadelphia 76ers</td><td>C</td><td>$51,415,938</td></tr>,
 <tr><td>Nikola Jokić</td><td>Denver Nuggets</td><td>C</td><td>$51,415,938</td></tr>,
 <tr><td>Kevin Durant</td><td>Phoenix Suns</td><td>F</td><td>$51,179,021</td></tr>,
 <tr><td>Bradley Beal</td><td>Phoenix Suns</td><td>G</td><td>$50,203,930</td></tr>,
 <tr><td>Kawhi Leonard</td><td>LA Clippers</td><td>F</td><td>$49,350,000</td></tr>,
 <tr><td>Devin Booker</td><td>Phoenix Suns</td><td>G</td><td>$49,205,800</td></tr>,
 <tr><td>Paul George</td><td>Philadelphia 76ers</td><td>F</td><td>$49,205,800</td></tr>,
 <tr><td>Karl-Anthony Towns</td><td>New York Knicks</td><td>C</td><td>$49,205,800</td></tr>,
 <tr><td>Jaylen Brown</td><td>Boston Celtics</td><td>G</td><td>$49,205,800</td></tr>]

In [18]:
# find_all() returns a list-like collection whose elements are Tag objects,
# so it can be iterated directly; print each row on its own line
for tr_tag in table_rows:
    print(tr_tag)

<tr><th>Player</th><th>Team</th><th>Position</th><th>Salary</th></tr>
<tr><td>Stephen Curry</td><td>Golden State Warriors</td><td>G</td><td>$55,761,216</td></tr>
<tr><td>Joel Embiid</td><td>Philadelphia 76ers</td><td>C</td><td>$51,415,938</td></tr>
<tr><td>Nikola Jokić</td><td>Denver Nuggets</td><td>C</td><td>$51,415,938</td></tr>
<tr><td>Kevin Durant</td><td>Phoenix Suns</td><td>F</td><td>$51,179,021</td></tr>
<tr><td>Bradley Beal</td><td>Phoenix Suns</td><td>G</td><td>$50,203,930</td></tr>
<tr><td>Kawhi Leonard</td><td>LA Clippers</td><td>F</td><td>$49,350,000</td></tr>
<tr><td>Devin Booker</td><td>Phoenix Suns</td><td>G</td><td>$49,205,800</td></tr>
<tr><td>Paul George</td><td>Philadelphia 76ers</td><td>F</td><td>$49,205,800</td></tr>
<tr><td>Karl-Anthony Towns</td><td>New York Knicks</td><td>C</td><td>$49,205,800</td></tr>
<tr><td>Jaylen Brown</td><td>Boston Celtics</td><td>G</td><td>$49,205,800</td></tr>


In [19]:
# take the first row (index 0) of the result, which here is the header row,
# and display it
first_row = table_rows[0]
first_row

<tr><th>Player</th><th>Team</th><th>Position</th><th>Salary</th></tr>

In [22]:
# return the first <th> child of the first row
# (<th> is a table header cell)
print(first_row.th)

<th>Player</th>


In [21]:
# iterate over the rows while keeping track of each row's position:
# enumerate() pairs every element with its index

for idx, tr_tag in enumerate(table_rows):
    print(f"row {idx} is {tr_tag!s}")

row 0 is <tr><th>Player</th><th>Team</th><th>Position</th><th>Salary</th></tr>
row 1 is <tr><td>Stephen Curry</td><td>Golden State Warriors</td><td>G</td><td>$55,761,216</td></tr>
row 2 is <tr><td>Joel Embiid</td><td>Philadelphia 76ers</td><td>C</td><td>$51,415,938</td></tr>
row 3 is <tr><td>Nikola Jokić</td><td>Denver Nuggets</td><td>C</td><td>$51,415,938</td></tr>
row 4 is <tr><td>Kevin Durant</td><td>Phoenix Suns</td><td>F</td><td>$51,179,021</td></tr>
row 5 is <tr><td>Bradley Beal</td><td>Phoenix Suns</td><td>G</td><td>$50,203,930</td></tr>
row 6 is <tr><td>Kawhi Leonard</td><td>LA Clippers</td><td>F</td><td>$49,350,000</td></tr>
row 7 is <tr><td>Devin Booker</td><td>Phoenix Suns</td><td>G</td><td>$49,205,800</td></tr>
row 8 is <tr><td>Paul George</td><td>Philadelphia 76ers</td><td>F</td><td>$49,205,800</td></tr>
row 9 is <tr><td>Karl-Anthony Towns</td><td>New York Knicks</td><td>C</td><td>$49,205,800</td></tr>
row 10 is <tr><td>Jaylen Brown</td><td>Boston Celtics</td><td>G</td><td

In [23]:
# output the data cells of every row, skipping rows that have no <td> cells
# (the header row contains only <th> cells, so it is excluded)
for i, row in enumerate(table_rows):
    # <td> is a table data cell
    cells = row.find_all('td')
    if not cells:
        # nothing to show for this row; do not print an empty "row i" heading
        continue
    print("row", i)
    for j, cell in enumerate(cells):
        print('column', j, "cell", cell)

row 0
row 1
colunm 0 cell <td>Stephen Curry</td>
colunm 1 cell <td>Golden State Warriors</td>
colunm 2 cell <td>G</td>
colunm 3 cell <td>$55,761,216</td>
row 2
colunm 0 cell <td>Joel Embiid</td>
colunm 1 cell <td>Philadelphia 76ers</td>
colunm 2 cell <td>C</td>
colunm 3 cell <td>$51,415,938</td>
row 3
colunm 0 cell <td>Nikola Jokić</td>
colunm 1 cell <td>Denver Nuggets</td>
colunm 2 cell <td>C</td>
colunm 3 cell <td>$51,415,938</td>
row 4
colunm 0 cell <td>Kevin Durant</td>
colunm 1 cell <td>Phoenix Suns</td>
colunm 2 cell <td>F</td>
colunm 3 cell <td>$51,179,021</td>
row 5
colunm 0 cell <td>Bradley Beal</td>
colunm 1 cell <td>Phoenix Suns</td>
colunm 2 cell <td>G</td>
colunm 3 cell <td>$50,203,930</td>
row 6
colunm 0 cell <td>Kawhi Leonard</td>
colunm 1 cell <td>LA Clippers</td>
colunm 2 cell <td>F</td>
colunm 3 cell <td>$49,350,000</td>
row 7
colunm 0 cell <td>Devin Booker</td>
colunm 1 cell <td>Phoenix Suns</td>
colunm 2 cell <td>G</td>
colunm 3 cell <td>$49,205,800</td>
row 8
colun

In [24]:
# find_all() also accepts a list of tag names:
# gather every <tr> and <td> tag into one list

nba_list = nba_table.find_all(["tr", "td"])
nba_list

[<tr><th>Player</th><th>Team</th><th>Position</th><th>Salary</th></tr>,
 <tr><td>Stephen Curry</td><td>Golden State Warriors</td><td>G</td><td>$55,761,216</td></tr>,
 <td>Stephen Curry</td>,
 <td>Golden State Warriors</td>,
 <td>G</td>,
 <td>$55,761,216</td>,
 <tr><td>Joel Embiid</td><td>Philadelphia 76ers</td><td>C</td><td>$51,415,938</td></tr>,
 <td>Joel Embiid</td>,
 <td>Philadelphia 76ers</td>,
 <td>C</td>,
 <td>$51,415,938</td>,
 <tr><td>Nikola Jokić</td><td>Denver Nuggets</td><td>C</td><td>$51,415,938</td></tr>,
 <td>Nikola Jokić</td>,
 <td>Denver Nuggets</td>,
 <td>C</td>,
 <td>$51,415,938</td>,
 <tr><td>Kevin Durant</td><td>Phoenix Suns</td><td>F</td><td>$51,179,021</td></tr>,
 <td>Kevin Durant</td>,
 <td>Phoenix Suns</td>,
 <td>F</td>,
 <td>$51,179,021</td>,
 <tr><td>Bradley Beal</td><td>Phoenix Suns</td><td>G</td><td>$50,203,930</td></tr>,
 <td>Bradley Beal</td>,
 <td>Phoenix Suns</td>,
 <td>G</td>,
 <td>$50,203,930</td>,
 <tr><td>Kawhi Leonard</td><td>LA Clippers</td><td>F</

In [25]:
# search for text instead of tags:
# find every string in the table equal to "Phoenix Suns"
suns_strings = nba_table.find_all(string="Phoenix Suns")
suns_strings

['Phoenix Suns', 'Phoenix Suns', 'Phoenix Suns']

In [84]:
#####################################

In [16]:
# Downloading and Scraping the Contents of a Web Page
url = "https://www.nike.com"

# requests has no default timeout; set one so the cell cannot hang
# indefinitely if the server never responds
r = requests.get(url, timeout=10)
r.status_code

200

In [17]:
# get the page source from the response `r` fetched in the previous cell
# (reusing it avoids downloading the same page a second time)
nike_data = r.text

In [18]:
# preview only the first 500 characters; the full page source can be very
# long and would swamp the notebook output
nike_data[:500]



In [19]:
# create a soup object from the downloaded page source held in nike_data
nike_soup = BeautifulSoup(nike_data, features="html.parser")

In [20]:
# find all links on the page
# an HTML anchor/link is the <a> tag; href=True keeps only the anchors
# that carry an href attribute

for anchor in nike_soup.find_all("a", href=True):
    print(anchor.get("href"))

#skip-to-content
https://www.nike.com/accessibility#introduction
https://www.nike.com/jordan
https://www.nike.com/w/converse-akmjx
https://www.nike.com/retail
https://www.nike.com/help
https://www.nike.com/help
https://www.nike.com/orders/details/
https://www.nike.com/help/a/shipping-delivery
https://www.nike.com/help/a/returns-policy
https://www.nike.com/help/a/change-cancel-order
https://www.nike.com/help/a/size-charts
https://www.nike.com/help/#contact
https://www.nike.com/membership
https://www.nike.com/promo-code
https://www.nike.com/product-advice
#site-feedback
https://www.nike.com/membership
https://www.nike.com/register
https://www.nike.com
https://www.nike.com/w/black-friday-2phlm
https://www.nike.com/w/black-friday-2phlm
https://www.nike.com/w/black-friday-2phlm
https://www.nike.com/w/promotion-best-sellers-cmvh
https://www.nike.com/w/mens-black-friday-2phlmznik1
https://www.nike.com/w/mens-black-friday-shoes-2phlmznik1zy7ok
https://www.nike.com/w/mens-black-friday-clothing-

In [31]:
# scrape the page for all images
# an HTML image is represented by the <img> tag
for img_tag in nike_soup.find_all("img"):
    print(img_tag)

<img alt="Nike. Just Do It" class="_32IPZERI _3jm9Bm_E" data-image-loaded-class="guL_1FMX" data-landscape-url="https://static.nike.com/a/images/f_auto,cs_srgb/w_1920,c_limit/14798427-08c1-467b-8e68-f544edb3674d/nike-just-do-it.jpg" data-portrait-url="https://static.nike.com/a/images/f_auto,cs_srgb/w_1536,c_limit/7711f056-5038-415b-a4a7-0b5afc4f6ed6/nike-just-do-it.jpg" data-qa="image-media-img" src=""/>
<img alt="Nike. Just Do It" class="_32IPZERI _3jm9Bm_E" data-image-loaded-class="guL_1FMX" data-landscape-url="https://static.nike.com/a/images/f_auto,cs_srgb/w_1920,c_limit/14798427-08c1-467b-8e68-f544edb3674d/nike-just-do-it.jpg" data-portrait-url="https://static.nike.com/a/images/f_auto,cs_srgb/w_1536,c_limit/7711f056-5038-415b-a4a7-0b5afc4f6ed6/nike-just-do-it.jpg" data-qa="image-media-img" src=""/>
<img alt="Nike. Just Do It" class="_32IPZERI _3jm9Bm_E" data-image-loaded-class="guL_1FMX" data-landscape-url="https://static.nike.com/a/images/f_auto,cs_srgb/w_1920,c_limit/ab09ddc5-f35

In [122]:
##########################

In [32]:
# Scrape Data from HTML Tables

# This page contains an HTML table of colors and their color codes.
# (host and path are adjacent literals, joined into one URL string)
colors_url = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud"
    "/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"
)

In [33]:
# get the contents of the webpage in text format and store in a variable called data
# (requests has no default timeout; set one so the cell cannot hang indefinitely)
data = requests.get(colors_url, timeout=10).text

In [34]:
# parse the downloaded page into a BeautifulSoup object using Python's built-in parser
color_soup = BeautifulSoup(data, features="html.parser")

In [35]:
# locate the first <table> element on the page
# (an HTML table is represented by the <table> tag)

table = color_soup.find(name="table")
table

<table border="1" class="main-table">
<tr>
<td>Number </td>
<td>Color</td>
<td>Color Name</td>
<td>Hex Code<br/>#RRGGBB</td>
<td>Decimal Code<br/>(R,G,B)</td>
</tr>
<tr>
<td>1</td>
<td style="background:lightsalmon;"> </td>
<td>lightsalmon</td>
<td>#FFA07A</td>
<td>rgb(255,160,122)</td>
</tr>
<tr>
<td>2</td>
<td style="background:salmon;"> </td>
<td>salmon</td>
<td>#FA8072</td>
<td>rgb(250,128,114)</td>
</tr>
<tr>
<td>3</td>
<td style="background:darksalmon;"> </td>
<td>darksalmon</td>
<td>#E9967A</td>
<td>rgb(233,150,122)</td>
</tr>
<tr>
<td>4</td>
<td style="background:lightcoral;"> </td>
<td>lightcoral</td>
<td>#F08080</td>
<td>rgb(240,128,128)</td>
</tr>
<tr>
<td>5</td>
<td style="background:coral;"> </td>
<td>coral</td>
<td>#FF7F50</td>
<td>rgb(255,127,80)</td>
</tr>
<tr>
<td>6</td>
<td style="background:tomato;"> </td>
<td>tomato</td>
<td>#FF6347</td>
<td>rgb(255,99,71)</td>
</tr>
<tr>
<td>7</td>
<td style="background:orangered;"> </td>
<td>orangered</td>
<td>#FF4500</td>
<td>rgb

In [37]:
# Get all rows from the table
# in HTML a table row is represented by the tag <tr>

for row in table.find_all('tr'):
    # Get all columns in each row.
    # a column cell is represented by the tag <td>
    cols = row.find_all('td')

    # skip rows that lack the cells read below (e.g. a row made only of <th>
    # cells); indexing them would raise IndexError
    if len(cols) < 4:
        continue

    # store the value at index 2 (the third cell) as color_name
    color_name = cols[2].string

    # store the value at index 3 (the fourth cell) as color_code
    color_code = cols[3].string

    # .string is None when a cell has more than one child (e.g. the header's
    # "Hex Code<br/>#RRGGBB"), so fall back to "" to keep format() from failing
    print("{:<15} == {:>10}".format(color_name or "", color_code or ""))

Color Name      ==           
lightsalmon     ==    #FFA07A
salmon          ==    #FA8072
darksalmon      ==    #E9967A
lightcoral      ==    #F08080
coral           ==    #FF7F50
tomato          ==    #FF6347
orangered       ==    #FF4500
gold            ==    #FFD700
orange          ==    #FFA500
darkorange      ==    #FF8C00
lightyellow     ==    #FFFFE0
lemonchiffon    ==    #FFFACD
papayawhip      ==    #FFEFD5
moccasin        ==    #FFE4B5
peachpuff       ==    #FFDAB9
palegoldenrod   ==    #EEE8AA
khaki           ==    #F0E68C
darkkhaki       ==    #BDB76B
yellow          ==    #FFFF00
lawngreen       ==    #7CFC00
chartreuse      ==    #7FFF00
limegreen       ==    #32CD32
lime            ==    #00FF00
forestgreen     ==    #228B22
green           ==    #008000
powderblue      ==    #B0E0E6
lightblue       ==    #ADD8E6
lightskyblue    ==    #87CEFA
skyblue         ==    #87CEEB
deepskyblue     ==    #00BFFF
lightsteelblue  ==    #B0C4DE
dodgerblue      ==    #1E90FF
