# Selenium for web scraping - tutorial

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Open the browswer
driver = webdriver.Chrome()

# Open the website
driver.get("https://www.selenium.dev/selenium/web/web-form.html")

In [3]:
# Get the title
title = driver.title

# Waiting time needed for the website to load before raising an error
driver.implicitly_wait(0.5)

title

'Web form'

In [4]:
# Get the text input box using its name
text_box = driver.find_element(by=By.NAME, value="my-text")

# Get the submit button
submit_button = driver.find_element(by=By.CSS_SELECTOR, value="button")

# Type inside the text box
text_box.send_keys("Selenium")

# Submit
submit_button.click()

# Get the message that is returned
message = driver.find_element(by=By.ID, value="message")

text = message.text

text

driver.quit()

# Now let's try to scrape something

In [5]:
# Options are the settings you can give to the browser
from selenium.webdriver.chrome.options import Options

In [49]:
# Initialize Options
opts = Options()

# Opens the browers without showing it on your screen
opts.add_argument("--headless")

driver = webdriver.Chrome(options=opts)
driver.get("https://ground.news/")
driver.title

'Ground News'

In [53]:
# Save all the links in the navigation bar in a list
links = driver.find_elements(By.CSS_SELECTOR, ".embla__slide a")

if links:
    print(f"Found {len(links)} nav links:")
    for link in links:
         print(link.text, "→", link.get_attribute("href"))

Found 12 nav links:
Israel-Gaza → https://ground.news/interest/israeli-palestinian-conflict
Remembrance Day → https://ground.news/interest/remembrance-day
Artificial Intelligence → https://ground.news/interest/ai
Soccer → https://ground.news/interest/soccer
Basketball → https://ground.news/interest/basketball
Stock Markets → https://ground.news/interest/stock-markets
 → https://ground.news/interest/social-media
 → https://ground.news/interest/wrestling
 → https://ground.news/interest/government-shutdown
 → https://ground.news/interest/donald-trump
 → https://ground.news/interest/volleyball
 → https://ground.news/interest/united-states-economy


That is pure html, so we could have handled that using BeautifulSoup.

Now, let's do something BeautifulSoup can't.

In [60]:
topic = driver.find_element(By.ID, "header-trending-Artificial Intelligence")
driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", topic)
# or topic.click()
#driver.quit()

In [61]:
articles = driver.find_elements(By.CLASS_NAME, "group")

In [65]:
all_links = []

for article in articles:
    links = article.find_elements(By.TAG_NAME, "a")
    for link in links:
        href = link.get_attribute("href")
        if href:
            all_links.append(href)

print(all_links)

['https://ground.news/article/nvidias-jensen-huang-china-is-going-to-win-the-ai-race-ft-reports', 'https://ground.news/article/nvidias-jensen-huang-china-is-going-to-win-the-ai-race-ft-reports', 'https://ground.news/article/nvidias-jensen-huang-china-is-going-to-win-the-ai-race-ft-reports', 'https://ground.news/article/global-tech-tensions-overshadow-web-summits-ai-and-robots', 'https://ground.news/article/global-tech-tensions-overshadow-web-summits-ai-and-robots', 'https://ground.news/article/global-tech-tensions-overshadow-web-summits-ai-and-robots', 'https://ground.news/article/supreme-court-issues-notice-to-centre-on-air-india-pilots-fathers-plea-for-judicial-inquiry-none-of-142-cr-people', 'https://ground.news/article/supreme-court-issues-notice-to-centre-on-air-india-pilots-fathers-plea-for-judicial-inquiry-none-of-142-cr-people', 'https://ground.news/article/supreme-court-issues-notice-to-centre-on-air-india-pilots-fathers-plea-for-judicial-inquiry-none-of-142-cr-people', 'https

In [70]:
import re

titles = []
for link in all_links:
    match = re.search(r'/article/([^/]+)$', link)
    if match:
        titles.append(re.sub(r'-', ' ', match.group(1)))

print(titles)

['nvidias jensen huang china is going to win the ai race ft reports', 'nvidias jensen huang china is going to win the ai race ft reports', 'nvidias jensen huang china is going to win the ai race ft reports', 'global tech tensions overshadow web summits ai and robots', 'global tech tensions overshadow web summits ai and robots', 'global tech tensions overshadow web summits ai and robots', 'supreme court issues notice to centre on air india pilots fathers plea for judicial inquiry none of 142 cr people', 'supreme court issues notice to centre on air india pilots fathers plea for judicial inquiry none of 142 cr people', 'supreme court issues notice to centre on air india pilots fathers plea for judicial inquiry none of 142 cr people', 'google planning powerful ai data centre on tiny australian indian ocean outpost', 'hardin county republican leader issues apology after posting video depicting obamas as apes', 'uks rightmove stock tumbles over 28 as ai investments expected to weigh on 2026