In [1]:
### Using BeautifulSoup for web scraping ####

In [3]:
# Demo using a sample HTML document
# Once the HTML is parsed, there are several ways to navigate it:
    # by tag name
    # using find() - returns one matching tag
    # using find_all() - returns a list of matching tags
# We can also navigate using CSS selectors

In [14]:
from bs4 import BeautifulSoup

# Sample document that gets parsed below.
html = """
<html>
    <head>
        <meta charset="UTF-8">
        <title>Fist HTML Page</title>
    </head>
    <body>
        <div id="first">
            <h3 data-example="yes">hi</h3>
            <p>more text</p>
        </div>
        <ol>
            <li class="special">This list items is special</li>
            <li class="special">This list items is also special</li>
            <li class="special">This list item is not so special</li>
        </ol>
        <div>bye</div>
    </body>
</html>
"""

# Build the parse tree with the "html.parser" backend.
soup = BeautifulSoup(markup=html, features="html.parser")

# Tag-name navigation: soup.body is the <body> element and everything inside it.
body_tag = soup.body
print(body_tag)

<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text</p>
</div>
<ol>
<li class="special">This list items is special</li>
<li class="special">This list items is also special</li>
<li class="special">This list item is not so special</li>
</ol>
<div>bye</div>
</body>


In [15]:
# Tag-name navigation stops at the first match, so only the first <div> is shown.
first_div_in_body = soup.body.div
print(first_div_in_body)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text</p>
</div>


In [16]:
# find() hands back a single matching tag: here, the first <div>.
first_div = soup.find("div")
print(first_div)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text</p>
</div>


In [17]:
# find_all() gathers every <div> element into a list.
all_divs = soup.find_all("div")
print(all_divs)

[<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text</p>
</div>, <div>bye</div>]


In [18]:
# Same idea for the list items: every <li> in the document, as a list.
list_items = soup.find_all("li")
print(list_items)

[<li class="special">This list items is special</li>, <li class="special">This list items is also special</li>, <li class="special">This list item is not so special</li>]


In [19]:
# Selecting by attribute (id and class): look the element up through its id.
element_with_id = soup.find(id="first")
element_with_id

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text</p>
</div>

In [22]:
# `class` is a reserved word in Python, hence the trailing underscore in `class_`.
first_special = soup.find(class_="special")
first_special

<li class="special">This list items is special</li>

In [24]:
# Every element carrying the "special" class, as a list.
all_special = soup.find_all(class_="special")
all_special

[<li class="special">This list items is special</li>,
 <li class="special">This list items is also special</li>,
 <li class="special">This list item is not so special</li>]

In [25]:
# Selecting by a data-* attribute. "data-example" contains a hyphen, so it
# cannot be written as a keyword argument; pass the filter through `attrs`.
wanted_attrs = {"data-example": "yes"}
d = soup.find_all(attrs=wanted_attrs)
print(d)

[<h3 data-example="yes">hi</h3>]


In [26]:
#### Selecting using CSS selectors #### always gives a list of elements

In [27]:
# select() takes a CSS selector; the result is a list even for a single match.
id_matches = soup.select("#first")
id_matches

[<div id="first">
 <h3 data-example="yes">hi</h3>
 <p>more text</p>
 </div>]

In [28]:
# Index into the list returned by select() to get the tag itself.
matches_for_id = soup.select("#first")
matches_for_id[0]

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text</p>
</div>

In [29]:
# CSS class selector: every element whose class list contains "special".
soup.select(".special")

[<li class="special">This list items is special</li>,
 <li class="special">This list items is also special</li>,
 <li class="special">This list item is not so special</li>]

In [30]:
# A bare tag name is a valid CSS selector too: all <div> elements.
div_matches = soup.select("div")
div_matches

[<div id="first">
 <h3 data-example="yes">hi</h3>
 <p>more text</p>
 </div>, <div>bye</div>]

In [31]:
# CSS attribute selector: elements that have a data-example attribute.
data_attr_matches = soup.select("[data-example]")
data_attr_matches

[<h3 data-example="yes">hi</h3>]

In [32]:
### Accessing data with beautiful soup ###

In [33]:
#get_text - access the inner text in an element
#name - tag name
#attrs - dictionary of attributes
# you can also access attribute values using brackets

In [38]:
from bs4 import BeautifulSoup

# Sample document that gets parsed below.
html = """
<html>
    <head>
        <meta charset="UTF-8">
        <title>Fist HTML Page</title>
    </head>
    <body>
        <div id="first">
            <h3 data-example="yes">hi</h3>
            <p>more text</p>
        </div>
        <ol>
            <li class="special">This list items is special</li>
            <li class="special">This list items is also special</li>
            <li class="special">This list item is not so special</li>
        </ol>
        <div>bye</div>
    </body>
</html>
"""

soup = BeautifulSoup(markup=html, features="html.parser")

# For each element with class "special", print one value per line:
#   inner text, tag name, attribute dictionary, value of the class attribute.
for el in soup.select(".special"):
    print(el.get_text(), el.name, el.attrs, el.attrs["class"], sep="\n")

# Attribute values can also be read with brackets directly on the tag.
h3_tag = soup.find("h3")
attr = h3_tag["data-example"]
print(attr)  # yes

This list items is special
li
{'class': ['special']}
['special']
This list items is also special
li
{'class': ['special']}
['special']
This list item is not so special
li
{'class': ['special']}
['special']
yes


In [None]:
##### Navigating with Beautiful Soup #####

In [None]:
"""
via Tags
* parent / parents
* contents
* next_sibling / next_siblings
* previous_sibling / previous_siblings
"""
"""
via Searching
* find_parent() / find_parents()
* find_next_sibling() / find_next_siblings()
* find_previous_sibling() / find_previous_siblings()
"""

In [46]:
from bs4 import BeautifulSoup

# Sample document for the navigation cells: the first <li> has two classes
# and the last <li> has none.
html = """
<html>
    <head>
        <meta charset="UTF-8">
        <title>Fist HTML Page</title>
    </head>
    <body>
        <div id="first">
            <h3 data-example="yes">hi</h3>
            <p>more text</p>
        </div>
        <ol>
            <li class="special super-special">This list items is special</li>
            <li class="special">This list items is also special</li>
            <li>This list item is not so special</li>
        </ol>
        <div>bye</div>
    </body>
</html>
"""

# Build the parse tree with the "html.parser" backend.
soup = BeautifulSoup(markup=html, features="html.parser")

In [47]:
# .contents lists the children of <body>; the "\n" entries are the line
# breaks between the tags.
body_tag = soup.body
data = body_tag.contents
print(data)

['\n', <div id="first">
<h3 data-example="yes">hi</h3>
<p>more text</p>
</div>, '\n', <ol>
<li class="special super-special">This list items is special</li>
<li class="special">This list items is also special</li>
<li>This list item is not so special</li>
</ol>, '\n', <div>bye</div>, '\n']


In [48]:
# Index 0 of the children is the leading "\n", so index 1 is the first <div>.
body_children = soup.body.contents
data = body_children[1]
print(data)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text</p>
</div>


In [49]:
# One level deeper: the children of the first <div> only.
first_div = soup.body.contents[1]
data = first_div.contents
print(data)

['\n', <h3 data-example="yes">hi</h3>, '\n', <p>more text</p>, '\n']


In [50]:
# .next_sibling moves one node at a time: the first hop lands on the "\n"
# after the <div>, the second hop reaches the <ol>.
first_div = soup.body.contents[1]
newline_after_div = first_div.next_sibling
data = newline_after_div.next_sibling
print(data)

<ol>
<li class="special super-special">This list items is special</li>
<li class="special">This list items is also special</li>
<li>This list item is not so special</li>
</ol>


In [53]:
# .parent climbs one level: from the "super-special" <li> up to its <ol>.
super_special_item = soup.find(class_="super-special")
data = super_special_item.parent
print(data)

<ol>
<li class="special super-special">This list items is special</li>
<li class="special">This list items is also special</li>
<li>This list item is not so special</li>
</ol>


In [54]:
# Navigating via searching: find_next_sibling() skips the "\n" entry and
# goes straight to the next tag.
first_div = soup.find(id="first")
data = first_div.find_next_sibling()
print(data)

<ol>
<li class="special super-special">This list items is special</li>
<li class="special">This list items is also special</li>
<li>This list item is not so special</li>
</ol>


In [55]:
# Two hops along the sibling tags: first <div> -> <ol> -> last <div>.
first_div = soup.find(id="first")
ordered_list = first_div.find_next_sibling()
data = ordered_list.find_next_sibling()
print(data)

<div>bye</div>


In [60]:
# Pitfall demo: select() returns a list of matches, not a single tag, so
# tag-navigation methods such as find_previous_sibling() cannot be called on
# its result. The error is caught and printed so the lesson stays visible
# without aborting a "Restart & Run All".
# To navigate from the match, index the list (soup.select(...)[0]) or use
# soup.select_one(...) to get a single tag.
try:
    data = soup.select("[data-example]").find_previous_sibling()
    print(data)
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'list' object has no attribute 'find_previous_sibling'

In [62]:
# Step forward to the <ol>, then back again with find_previous_sibling():
# we end up on the first <div> where we started.
ordered_list = soup.find(id="first").find_next_sibling()
data = ordered_list.find_previous_sibling()
print(data)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text</p>
</div>


In [63]:
# The sibling search accepts filters: next sibling that has class "special".
super_special_item = soup.find(class_="super-special")
data = super_special_item.find_next_sibling(class_="special")
print(data)

<li class="special">This list items is also special</li>


In [65]:
# find_parent() with no arguments: the element directly enclosing the <h3>.
heading = soup.find("h3")
data = heading.find_parent()
print(data)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text</p>
</div>


In [66]:
# Naming a tag makes find_parent() look for that specific ancestor:
# here the enclosing <body>.
heading = soup.find("h3")
data = heading.find_parent("body")
print(data)

<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text</p>
</div>
<ol>
<li class="special super-special">This list items is special</li>
<li class="special">This list items is also special</li>
<li>This list item is not so special</li>
</ol>
<div>bye</div>
</body>


In [1]:
################### BLANK #######################

In [2]:
# Scraping https://www.rithmschool.com/blog

In [16]:
import requests
from bs4 import BeautifulSoup
from csv import writer

# Fetch the blog index page. The timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() stops us from parsing (and
# saving) an HTTP error page as if it were the blog.
response = requests.get("https://www.rithmschool.com/blog", timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")
articles = soup.find_all("article")

# Write one CSV row per article: title, link, publication date.
# newline="" lets the csv module control line endings; the explicit UTF-8
# encoding avoids failures on non-ASCII titles where the platform default
# encoding is narrower.
skipped = 0
with open("blog_data.csv", "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = writer(csv_file)
    csv_writer.writerow(["title", "link", "date"])
    for article in articles:
        a_tag = article.find("a")
        time_tag = article.find("time")
        url = a_tag.get("href") if a_tag is not None else None
        date = time_tag.get("datetime") if time_tag is not None else None
        # An article without a link or a machine-readable date cannot fill a
        # row; skip it instead of crashing halfway through the file.
        if url is None or date is None:
            skipped += 1
            continue
        title = a_tag.get_text()
        csv_writer.writerow([title, url, date])

# Report skipped articles so missing rows are not silent.
if skipped:
    print(f"Skipped {skipped} article(s) without a link or a <time datetime> value")
