# Selenium for web scraping - tutorial

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By

In [None]:
# Address of the demo form page used in the first part of the tutorial
form_url = "https://www.selenium.dev/selenium/web/web-form.html"

# Start a Chrome session and point it at the demo page
driver = webdriver.Chrome()
driver.get(form_url)

In [None]:
# Implicit wait: how long element lookups (find_element / find_elements) keep
# retrying before they raise an error. It is a setting of the driver session,
# not a pause for the page to load, so configure it before using the page.
driver.implicitly_wait(0.5)

# Get the title of the page that is currently loaded
title = driver.title

# Last expression of the cell, so the notebook displays it
title

In [None]:
# Fill in the demo form, submit it and read the confirmation message.
# The browser is closed in `finally`, so a failing lookup or click does not
# leave a Chrome process running in the background.
try:
    # Get the text input box using its name
    text_box = driver.find_element(by=By.NAME, value="my-text")

    # Get the submit button
    submit_button = driver.find_element(by=By.CSS_SELECTOR, value="button")

    # Type inside the text box
    text_box.send_keys("Selenium")

    # Submit
    submit_button.click()

    # Get the message that is returned
    message = driver.find_element(by=By.ID, value="message")

    text = message.text

    # A bare `text` expression would only be displayed if it were the last
    # statement of the cell, so print it explicitly.
    print(text)
finally:
    # End the browser session started earlier in the tutorial
    driver.quit()

# Now let's try to scrape something

In [None]:
# Options are the settings you can give to the browser
from selenium.webdriver.chrome.options import Options

In [None]:
# Browser settings for the scraping session
opts = Options()
opts.add_argument("--headless")  # open the browser without showing it on your screen

# Start a new Chrome session with those settings and load the news site
news_url = "https://ground.news/"
driver = webdriver.Chrome(options=opts)
driver.get(news_url)

# Last expression of the cell: the notebook displays the page title
driver.title

In [None]:
# Save all the links in the navigation bar in a list
links = driver.find_elements(By.CSS_SELECTOR, ".embla__slide a")

if links:
    print(f"Found {len(links)} nav links:")
    for link in links:
        # Print the visible label next to the address the link points to
        print(link.text, "→", link.get_attribute("href"))
else:
    # Say so explicitly instead of printing nothing when the selector matches no element
    print("No nav links found")

Those links are plain HTML, so we could have extracted them with BeautifulSoup as well.

Now, let's do something BeautifulSoup can't: interact with the live page in the browser.

In [None]:
# Locate the trending-topic element through its id attribute
topic = driver.find_element(by=By.ID, value="header-trending-Artificial Intelligence")

# Run JavaScript in the browser that scrolls the element into view,
# asking for it to be placed in the center of the viewport
scroll_script = "arguments[0].scrollIntoView({block: 'center'});"
driver.execute_script(scroll_script, topic)

# Alternative interaction: topic.click()
# When you are done with the browser, close it with driver.quit()

In [None]:
# Collect every element that carries the CSS class "group"
# (presumably the article cards - verify against the page markup)
articles = driver.find_elements(by=By.CLASS_NAME, value="group")

In [None]:
# Gather the target address of every anchor found inside the collected elements
all_links = []

for article in articles:
    anchors = article.find_elements(By.TAG_NAME, "a")
    hrefs = (anchor.get_attribute("href") for anchor in anchors)
    # Skip anchors whose href is missing or empty
    all_links.extend(href for href in hrefs if href)

print(all_links)

In [None]:
import re
from urllib.parse import urlsplit

# Matches a URL path that ends in "/article/<slug>"; one trailing slash is tolerated.
# Compiled once here instead of being looked up on every loop iteration.
article_slug = re.compile(r'/article/([^/]+)/?$')

# Turn the article links into readable titles by taking the last path segment
# and replacing its hyphens with spaces. Links that are not article links are skipped.
titles = []
for link in all_links:
    # Match on the path only, so a query string or fragment cannot hide the
    # slug or end up inside the title.
    match = article_slug.search(urlsplit(link).path)
    if match:
        # A literal hyphen needs no regex: str.replace does the job
        titles.append(match.group(1).replace('-', ' '))

print(titles)