Beautiful Soup 4

In [1]:
!pip install beautifulsoup4



In [2]:
from bs4 import BeautifulSoup

In [3]:
html = """
        <!DOCTYPE html><html><head><title>Example HTML</title></head><body><h1>Hello, World!</h1><p>A simple HTML page for testing web scraping with BeautifulSoup.</p>
                <a class='link' href='www.miuul.com' target='blank' aria-label='Miuul (Opens Miuul Page)'>Click</a>
                <li>Outsider</li>
                <ul>
                    <li>Item 1</li>
                    <li>Item 2</li>
                </ul>
            </body>
            </html>

"""

In [4]:
soup = BeautifulSoup(html, 'html.parser')

In [5]:
soup


<!DOCTYPE html>
<html><head><title>Example HTML</title></head><body><h1>Hello, World!</h1><p>A simple HTML page for testing web scraping with BeautifulSoup.</p>
<a aria-label="Miuul (Opens Miuul Page)" class="link" href="www.miuul.com" target="blank">Click</a>
<li>Outsider</li>
<ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>
</body>
</html>

In [6]:
soup.title

<title>Example HTML</title>

In [7]:
title = soup.title

In [8]:
type(title)

In [9]:
title.text

'Example HTML'

In [10]:
title.string

'Example HTML'

In [11]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Example HTML
  </title>
 </head>
 <body>
  <h1>
   Hello, World!
  </h1>
  <p>
   A simple HTML page for testing web scraping with BeautifulSoup.
  </p>
  <a aria-label="Miuul (Opens Miuul Page)" class="link" href="www.miuul.com" target="blank">
   Click
  </a>
  <li>
   Outsider
  </li>
  <ul>
   <li>
    Item 1
   </li>
   <li>
    Item 2
   </li>
  </ul>
 </body>
</html>



In [14]:
soup.li

<li>Outsider</li>

In [13]:
soup.ul

<ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>

In [15]:
soup.a

<a aria-label="Miuul (Opens Miuul Page)" class="link" href="www.miuul.com" target="blank">Click</a>

In [16]:
soup.find('a')

<a aria-label="Miuul (Opens Miuul Page)" class="link" href="www.miuul.com" target="blank">Click</a>

In [17]:
soup.find("a",attrs={"class":"link","target":"blank"})

<a aria-label="Miuul (Opens Miuul Page)" class="link" href="www.miuul.com" target="blank">Click</a>

In [18]:
li_elements = soup.find_all('li',attrs={"class":"list-item"})

In [19]:
li_elements

[]

In [20]:
li_elements[-1]

IndexError: list index out of range

In [21]:
import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.example.com/")
soup = BeautifulSoup(response.text, "html.parser")
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Example Domain
  </title>
  <meta charset="utf-8"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <style type="text/css">
   body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
  </style>
 </head>
 <body>
  <div>
   <h1>
    Example Domain
   </h1>
   <p>
    This dom

In [22]:
response.status_code

200

In [23]:
response.content

b'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    

In [24]:
html = response.content

In [25]:
soup.find("h1").text


'Example Domain'

Selenium

In [26]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.2-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.0/476.0 kB[0m [31m27.

In [27]:
from selenium import webdriver
driver = webdriver.Chrome()
driver.get("https://www.example.com/")

SessionNotCreatedException: Message: session not created: Chrome failed to start: exited normally.
  (session not created: DevToolsActivePort file doesn't exist)
  (The process started from chrome location /root/.cache/selenium/chrome/linux64/129.0.6668.89/chrome is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
Stacktrace:
#0 0x5a57b569402a <unknown>
#1 0x5a57b537a5e0 <unknown>
#2 0x5a57b53b2921 <unknown>
#3 0x5a57b53ae2c5 <unknown>
#4 0x5a57b53fadf6 <unknown>
#5 0x5a57b53fa446 <unknown>
#6 0x5a57b53ee8c3 <unknown>
#7 0x5a57b53bc6b3 <unknown>
#8 0x5a57b53bd68e <unknown>
#9 0x5a57b565ea2b <unknown>
#10 0x5a57b56629b1 <unknown>
#11 0x5a57b564b225 <unknown>
#12 0x5a57b5663532 <unknown>
#13 0x5a57b563038f <unknown>
#14 0x5a57b5682f28 <unknown>
#15 0x5a57b56830f3 <unknown>
#16 0x5a57b5692e7c <unknown>
#17 0x110498e94ac3 <unknown>


In [29]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
options = webdriver.ChromeOptions()

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
from time import sleep
from datetime import datetime

options.add_argument('--no-sandbox')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--diable-dve-shm-uage')

In [41]:
browser = webdriver.Chrome(options=options)
browser.get("https://www.example.com/")

In [42]:
browser.title

'Example Domain'

In [43]:
browser.current_url

'https://www.example.com/'

In [33]:
#browser.quit()

In [44]:
element = browser.find_elements(By.XPATH,'(//a)')

In [45]:
element

[<selenium.webdriver.remote.webelement.WebElement (session="28c35f83a827b40efa4a02b207c4d941", element="f.F3B78632E8BB5AEE17CCBE6A0F15BFDD.d.CC50A4CAEF21A15458D67B87E2AA4700.e.5")>]

In [46]:
element.text

AttributeError: 'list' object has no attribute 'text'

In [40]:
element.get_attribute('innerText')

AttributeError: 'list' object has no attribute 'get_attribute'

In [47]:
element[0].text

'More information...'

In [48]:
browser.get("https://www.miuul.com")

btn_elements = browser.find_elements(By.XPATH, "//a[@id='login']")
btn = btn_elements[0]
btn.click()

ElementNotInteractableException: Message: element not interactable
  (Session info: chrome=129.0.6668.89)
Stacktrace:
#0 0x586c9a99502a <unknown>
#1 0x586c9a67b43d <unknown>
#2 0x586c9a6cbddb <unknown>
#3 0x586c9a6bfa98 <unknown>
#4 0x586c9a6efb22 <unknown>
#5 0x586c9a6bf478 <unknown>
#6 0x586c9a6efcee <unknown>
#7 0x586c9a70ed7d <unknown>
#8 0x586c9a6ef8c3 <unknown>
#9 0x586c9a6bd6b3 <unknown>
#10 0x586c9a6be68e <unknown>
#11 0x586c9a95fa2b <unknown>
#12 0x586c9a9639b1 <unknown>
#13 0x586c9a94c225 <unknown>
#14 0x586c9a964532 <unknown>
#15 0x586c9a93138f <unknown>
#16 0x586c9a983f28 <unknown>
#17 0x586c9a9840f3 <unknown>
#18 0x586c9a993e7c <unknown>
#19 0x107d1da87ac3 <unknown>
