In [1]:
# Web scraping is the process of extracting data directly from a website.
# The requests and Beautiful Soup (bs4) modules are used here.

In [24]:
from bs4 import BeautifulSoup
import requests

In [25]:
# Sample document used throughout the navigation examples: three players, each an
# <h3> name followed by a <p> salary. Adjacent string literals are concatenated by
# Python, so the value is one unbroken line of HTML.
html = (
    "<!DOCTYPE html>"
    "<html>"
    "<head><title>Page Title</title></head>"
    "<body>"
    "<h3><b id='boldest'>Lebron James</b></h3>"
    "<p> Salary: $ 92,000,000 </p>"
    "<h3> Stephen Curry</h3>"
    "<p> Salary: $85,000, 000 </p>"
    "<h3> Kevin Durant </h3>"
    "<p> Salary: $73,200, 000</p>"
    "</body>"
    "</html>"
)

In [26]:
# Name the parser explicitly. Without it bs4 picks whichever parser happens to be
# installed (and emits GuessedAtParserWarning), so the parsed tree can differ from
# one machine to the next. "html.parser" ships with Python itself.
soup = BeautifulSoup(html, "html.parser")

In [27]:
# prettify() returns the parsed document as a string with one tag per line,
# indented to reflect nesting; print() renders its newlines instead of showing the repr.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Page Title
  </title>
 </head>
 <body>
  <h3>
   <b id="boldest">
    Lebron James
   </b>
  </h3>
  <p>
   Salary: $ 92,000,000
  </p>
  <h3>
   Stephen Curry
  </h3>
  <p>
   Salary: $85,000, 000
  </p>
  <h3>
   Kevin Durant
  </h3>
  <p>
   Salary: $73,200, 000
  </p>
 </body>
</html>



In [28]:
# Attribute-style access (soup.<tag name>) returns the first tag with that name - here <title>.
tag_object=soup.title
print("tag object:",tag_object)

tag object: <title>Page Title</title>


In [30]:
# If there is more than one tag with the same name, attribute access returns the first one.
# NOTE: tag_object is rebound here from the <title> tag to the <body> tag.
tag_object=soup.body
tag_object

<body><h3><b id="boldest">Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body>

In [31]:
# A child tag can be reached the same way, navigating down the branch:
# tag_object is <body>, so .h3 is the first <h3> inside it.
tag_child =tag_object.h3
tag_child

<h3><b id="boldest">Lebron James</b></h3>

In [32]:
# .parent moves back up one level; for the first <h3> that is the enclosing <body>.
parent_tag=tag_child.parent
parent_tag

<body><h3><b id="boldest">Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body>

In [33]:
# .next_sibling is the node that follows tag_child under the same parent:
# here the <p> holding the first salary.
sibling_1=tag_child.next_sibling
print(sibling_1)

<p> Salary: $ 92,000,000 </p>


In [34]:
# Stepping once more along the siblings reaches the second player's <h3>.
sibling_2= sibling_1.next_sibling
print(sibling_2)

<h3> Stephen Curry</h3>


In [37]:
# The <b> tag carries an attribute `id` whose value is "boldest". A tag's attributes
# can be read by treating the tag like a dictionary.
# Navigate from the document root rather than reassigning tag_child from itself:
# `tag_child = tag_child.b` gives None when the cell is run a second time (a <b> tag
# has no <b> child), and the lookup below then raises TypeError.
tag_child = soup.body.h3.b
tag_child['id']

'boldest'

In [38]:
# .attrs holds all of the tag's attributes as a dictionary.
tag_child.attrs

{'id': 'boldest'}

In [39]:
# .get() reads a single attribute by name, like dict.get().
tag_child.get('id')

'boldest'

In [40]:
# .string gives the text contained in the tag.
tag_string=tag_child.string
tag_string

'Lebron James'

In [23]:
# The find_all() method looks through a tag's descendants and retrieves all descendants that match your filters.
# Method signature: find_all(name, attrs, recursive, string, limit, **kwargs)

In [41]:
# Sample table for the find_all() examples: a header row plus three flights.
# Adjacent string literals are concatenated by Python, so the value is one line of HTML.
# NOTE(review): the two Florida anchors end with "<a>" rather than "</a>"; the recorded
# outputs show the nested empty <a> tags this produces. Left as-is to keep them in sync.
table = (
    "<table>"
    "<tr><td id='flight'>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>"
    "<tr> <td>1</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a></td><td>300 kg</td></tr>"
    "<tr><td>2</td><td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td><td>94 kg</td></tr>"
    "<tr><td>3</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td><td>80 kg</td></tr>"
    "</table>"
)

In [42]:
# Name the parser explicitly so the tree does not depend on which optional parsers
# are installed (bs4 otherwise guesses and emits GuessedAtParserWarning).
table_bs = BeautifulSoup(table, "html.parser")

In [43]:
# find_all('tr') collects every <tr> (table row) in the document into a list-like result.
table_rows=table_bs.find_all('tr')
table_rows

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td><td>300 kg</td></tr>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td><td>80 kg</td></tr>]

In [44]:
# Each result is itself a tag, so attribute access works on it: first <td> of the first row.
table_rows[0].td

<td id="flight">Flight No</td>

In [45]:
# A list as the name filter matches any of the listed tag names, so rows and
# cells come back together in document order.
list_input = table_bs.find_all(["tr", "td"])
list_input

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <td id="flight">Flight No</td>,
 <td>Launch site</td>,
 <td>Payload mass</td>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td><td>300 kg</td></tr>,
 <td>1</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td>,
 <td>300 kg</td>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <td>2</td>,
 <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>,
 <td>94 kg</td>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td><td>80 kg</td></tr>,
 <td>3</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td>,
 <td>80 kg</td>]

In [46]:
# Keyword arguments filter on attributes: this matches tags whose id is "flight".
table_bs.find_all(id="flight")

[<td id="flight">Flight No</td>]

In [47]:
# Filter on an exact href value: only the anchors linking to the Florida article match.
# NOTE: list_input is rebound here; it previously held the tr/td search result.
list_input=table_bs.find_all(href="https://en.wikipedia.org/wiki/Florida")
list_input

[<a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a>]

In [50]:
# href=True matches every tag that has an href attribute, regardless of its value.
table_bs.find_all(href=True)

[<a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a>,
 <a href="https://en.wikipedia.org/wiki/Texas">Texas</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a>]

In [51]:
# find_all() scans the entire document and returns every match. When only one element
# is wanted, find() returns just the first match in the document.
table_bs.find("a")

<a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a>

In [53]:
# Downloading and scraping the contents of a web page
url = "http://www.ibm.com"
# requests has no default timeout, so an unresponsive server would hang the kernel.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently treating an error page as the content.
response.raise_for_status()
data = response.text

In [55]:
# Peek at the first 100 characters of the downloaded HTML rather than showing the whole page.
data[:100] 

'\n<!DOCTYPE HTML>\n<html lang="en-in">\n<head>\n    \n    \n    \n    \n    <meta charset="UTF-8"/>\n    <met'

In [2]:
# For extracting tabular data from a web page, you may also use the `read_html()` function of the pandas library.
# read_html() returns a list of DataFrames, one for each table it finds.
import pandas as pd
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"
# NOTE(review): no header= is passed, and the recorded output shows the header row
# (Number, Color, Color Name, ...) as data row 0 - consider header=0; confirm against the page.
tables = pd.read_html(url)
tables

[         0      1               2                 3                     4
 0   Number  Color      Color Name  Hex Code #RRGGBB  Decimal Code (R,G,B)
 1        1    NaN     lightsalmon           #FFA07A      rgb(255,160,122)
 2        2    NaN          salmon           #FA8072      rgb(250,128,114)
 3        3    NaN      darksalmon           #E9967A      rgb(233,150,122)
 4        4    NaN      lightcoral           #F08080      rgb(240,128,128)
 5        5    NaN           coral           #FF7F50       rgb(255,127,80)
 6        6    NaN          tomato           #FF6347        rgb(255,99,71)
 7        7    NaN       orangered           #FF4500         rgb(255,69,0)
 8        8    NaN            gold           #FFD700        rgb(255,215,0)
 9        9    NaN          orange           #FFA500        rgb(255,165,0)
 10      10    NaN      darkorange           #FF8C00        rgb(255,140,0)
 11      11    NaN     lightyellow           #FFFFE0      rgb(255,255,224)
 12      12    NaN    lem