<a href="https://colab.research.google.com/github/ecomunick/random/blob/main/scraping_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from bs4 import BeautifulSoup
import requests

# tutorial from here: https://scrapfly.io/blog/web-scraping-with-python-beautifulsoup/#how-is-html-parsed

In [2]:
html = """
<div class="product">
  <h2>Product Title</h2>
  <div class="price">
    <span class="discount">12.99</span>
    <span class="full">19.99</span>
  </div>
</div>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html)
product = {
    "title": soup.find(class_="product").find("h2").text,
    "full_price": soup.find(class_="product").find(class_="full").text,
    "price": soup.select_one(".price .discount").text,
}
print(product)
{
    "title": "Product Title",
    "full_price": "19.99",
    "price": "12.99",
}

{'title': 'Product Title', 'full_price': '19.99', 'price': '12.99'}


{'title': 'Product Title', 'full_price': '19.99', 'price': '12.99'}

In [3]:
# this is our HTML page:
html = """
<head>
  <title class="page-title">Hello World!</title>
</head>
<body>
  <div id="content">
    <h1>Title</h1>
    <p>first paragraph</p>
    <p>second paragraph</p>
    <h2>Subtitle</h2>
    <p>first paragraph of subtitle</p>
  </div>
</body>
"""

# 1. build soup object from html text
soup = BeautifulSoup(html, 'lxml')

# then we can navigate the html tree via python API:
# for example title is under `head` node:
print(soup.head.title)
'<title class="page-title">Hello World!</title>'

# this gives us a whole HTML node but we can also just select the text:
print(soup.head.title.text)
"Hello World!"

# or it's other attributes:
print(soup.head.title["class"])
"page-title"

<title class="page-title">Hello World!</title>
Hello World!
['page-title']


'page-title'

In [4]:
#soup.body.div.div.div.p.a['href']

if soup.body and soup.body.div and soup.body.div.div and soup.body.div.div.div and soup.body.div.div.div.p and soup.body.div.div.div.p.a:
    href = soup.body.div.div.div.p.a['href']
else:
    href = None  # or handle the case when the element doesn't exist


In [5]:
print(soup)  # Print the entire soup object to inspect the parsed HTML

# Print specific elements to check their existence and structure
if soup.body:
    print(soup.body)
    if soup.body.div:
        print(soup.body.div)
        if soup.body.div.div:
            print(soup.body.div.div)
            if soup.body.div.div.div:
                print(soup.body.div.div.div)
                if soup.body.div.div.div.p:
                    print(soup.body.div.div.div.p)
                    if soup.body.div.div.div.p.a:
                        print(soup.body.div.div.div.p.a['href'])
                    else:
                        print("The 'a' element is missing.")
                else:
                    print("The 'p' element is missing.")
            else:
                print("The third 'div' element is missing.")
        else:
            print("The second 'div' element is missing.")
    else:
        print("The first 'div' element is missing.")
else:
    print("The 'body' element is missing.")


<html><head>
<title class="page-title">Hello World!</title>
</head>
<body>
<div id="content">
<h1>Title</h1>
<p>first paragraph</p>
<p>second paragraph</p>
<h2>Subtitle</h2>
<p>first paragraph of subtitle</p>
</div>
</body>
</html>
<body>
<div id="content">
<h1>Title</h1>
<p>first paragraph</p>
<p>second paragraph</p>
<h2>Subtitle</h2>
<p>first paragraph of subtitle</p>
</div>
</body>
<div id="content">
<h1>Title</h1>
<p>first paragraph</p>
<p>second paragraph</p>
<h2>Subtitle</h2>
<p>first paragraph of subtitle</p>
</div>
The second 'div' element is missing.


In [6]:
import re
from bs4 import BeautifulSoup

html = """
<head>
  <title class="page-title">Hello World!</title>
</head>
<body>
  <div id="content">
    <h1>Title</h1>
    <p>first paragraph</p>
    <p>second paragraph</p>
    <h2>Subtitle</h2>
    <p>first paragraph of subtitle</p>
  </div>
</body>
"""

soup = BeautifulSoup(html, 'lxml')

print(soup.find('title').text)
# Output: Hello World!

print(soup.find(class_='page-title').text)
# Output: Hello World!

print(soup.find('div', id='content').h2.text)
# Output: Subtitle

print(soup.find_all('p', text=re.compile('first')))
# Output: ["<p>first paragraph</p>", "<p>first paragraph of subtitle</p>"]


Hello World!
Hello World!
Subtitle
[<p>first paragraph</p>, <p>first paragraph of subtitle</p>]


  print(soup.find_all('p', text=re.compile('first')))


In [7]:
from bs4 import BeautifulSoup

html = """
<head>
  <title class="page-title">Hello World!</title>
</head>
<body>
  <div id="content">
    <h1>Title</h1>
    <p>first paragraph</p>
    <p>second paragraph</p>
    <h2>Subtitle</h2>
    <p>first paragraph of subtitle</p>
  </div>
</body>
"""
soup = BeautifulSoup(html, 'lxml')

soup.select_one('title').text
"Hello World"

# we can also perform searching by attribute values such as class names
soup.select_one('.page-title').text
"Hello World"

# We can also find _all_ amtching values:
for paragraph in soup.select('#content p'):
    print(paragraph.text)
"first paragraph"
"second paragraph"
"first paragraph of subtitile"

# We can also combine CSS selectors with find functions:
import re
# select node with id=content and then find all paragraphs with text "first" that are under it:
soup.select_one('#content').find_all('p', text=re.compile('first'))
["<p>first paragraph</p>", "<p>first paragraph of subtitle</p>"]

first paragraph
second paragraph
first paragraph of subtitle


  soup.select_one('#content').find_all('p', text=re.compile('first'))


['<p>first paragraph</p>', '<p>first paragraph of subtitle</p>']

In [8]:
from bs4 import BeautifulSoup

html = """
<div>
  <a>The Avangers: </a>
  <a>End Game</a>
  <p>is one of the most popular Marvel movies</p>
</div>
"""
soup = BeautifulSoup(html, 'lxml')
# join all text values with space, and strip leading/trailing whitespace:
soup.div.get_text(' ', strip=True)
'The Avangers: End Game is one of the most popular Marvel movies'

'The Avangers: End Game is one of the most popular Marvel movies'

In [9]:
from bs4 import BeautifulSoup

html = """
<div><h1>The Avangers: </h1><a>End Game</a><p>is one of the most popular Marvel movies</p></div>
"""
soup = BeautifulSoup(html)
soup.prettify()
"""
<html>
 <body>
  <div>
   <h1>
    The Avangers:
   </h1>
   <a>
    End Game
   </a>
   <p>
    is one of the most popular Marvel movies
   </p>
  </div>
 </body>
</html>
"""

'\n<html>\n <body>\n  <div>\n   <h1>\n    The Avangers:\n   </h1>\n   <a>\n    End Game\n   </a>\n   <p>\n    is one of the most popular Marvel movies\n   </p>\n  </div>\n </body>\n</html>\n'

In [10]:
from bs4 import BeautifulSoup, SoupStrainer
html = """
<head><title>hello world</title></head>
<body>
  <div>
      <a>Link 1</a>
      <a>Link 2</a>
      <div>
        <a>Link 3</a>
      /div>
  </div>
</body>
"""
link_strainer = SoupStrainer('a')
soup = BeautifulSoup(html, parse_only=link_strainer)
print(soup)
#<a>Link 1</a><a>Link 2</a><a>Link 3</a>

<a>Link 1</a><a>Link 2</a><a>Link 3</a>


In [11]:
from bs4 import BeautifulSoup
html = """
<div>
  <button class="flat-button red">Subscribe</button>
</div>
"""
soup = BeautifulSoup(html)
soup.div.button['class'] = "shiny-button blue"
soup.div.button.string = "Unsubscribe"
print(soup.prettify())
# <html>
#  <body>
#   <div>
#    <button class="shiny-button blue">
#     Unsubscribe
#    </button>
#   </div>
#  </body>
# </html>

<html>
 <body>
  <div>
   <button class="shiny-button blue">
    Unsubscribe
   </button>
  </div>
 </body>
</html>


In [12]:
!pip install scrapfly-sdk

Collecting scrapfly-sdk
  Downloading scrapfly_sdk-0.8.8-py3-none-any.whl (28 kB)
Collecting loguru>=0.5 (from scrapfly-sdk)
  Downloading loguru-0.7.0-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting backoff>=1.10.0 (from scrapfly-sdk)
  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: loguru, backoff, scrapfly-sdk
Successfully installed backoff-2.2.1 loguru-0.7.0 scrapfly-sdk-0.8.8


In [13]:
import re
import json
from urllib.parse import urljoin
from scrapfly import ScrapflyClient, ScrapeConfig, ScrapeApiResponse

url = "https://www.remotepython.com/jobs/"
client = ScrapflyClient(key="YOUR SCRAPFLY KEY")
result: ScrapeApiResponse = client.scrape(ScrapeConfig(
    url=url,
    # we can select specific country:
    country="US",
    # proxy type:
    proxy_pool="public_residential_pool",
    # we can also enable headless browser powered js rendering
    render_js=True,
))
# scrapfly SDK comes with beautifulsoup built-in:
job_listing_boxes = result.soup.find_all(class_="item")

results = []
for item in job_listing_boxes:
    parsed = {}
    if title := item.find("h3"):
        parsed["title"] = title.get_text(strip=True)
    if item_url := item.find("h3").a["href"]:
        parsed["url"] = urljoin(url, item_url)
    if company := item.find("h5").find("span", class_="color-black"):
        parsed["company"] = company.text
    if location := item.select_one("h5 .color-white-mute"):
        parsed["location"] = location.text
    if posted_on := item.find("span", class_="color-white-mute", text=re.compile("posted:", re.I)):
        parsed["posted_on"] = posted_on.text.split("Posted:")[-1].strip()
    results.append(parsed)

print(results)
[{
    "title": "Hiring Senior Python / DJANGO Developer",
    "url": "https://www.remotepython.com/jobs/3edf4109d642494d81719fc9fe8dd5d6/",
    "company": "Mathieu Holding sarl",
    "location": "Rennes, France",
    "posted_on": "Sept. 1, 2022"
  },
  ...
]

EncoderError: ignored