<a href="https://colab.research.google.com/github/fleshgordo/scrapinghub/blob/main/005_crawl4ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Crawl4AI

Welcome! In this notebook, we'll explore [crawl4ai](https://github.com/unclecode/crawl4ai), a Python library designed to make web scraping easier and more efficient — especially when working with AI workflows.

## 🔍 What is crawl4ai?

crawl4ai is a tool that helps you:


*   Automatically fetch, extract, and clean web content

*   Work with structured and unstructured data from websites

*   Prepare data for machine learning and natural language processing

It simplifies the web crawling process by combining:

*   requests for HTTP access

*   BeautifulSoup for HTML parsing

*   Useful features like deduplication, rate limiting, and content extraction

In [None]:
!pip install -U crawl4ai

In [None]:
# Run the post-install setup command that ships with crawl4ai.
# NOTE(review): presumably installs the Playwright browsers used by the cells below — confirm in the crawl4ai docs.
!crawl4ai-setup

In [None]:
# Run crawl4ai's diagnostic command.
# NOTE(review): presumably verifies that the installation works — confirm in the crawl4ai docs.
!crawl4ai-doctor

In [None]:
import asyncio
from playwright.async_api import async_playwright

async def test_browser():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto('https://nzz.ch')
        print(f'Title: {await page.title()}')
        await browser.close()

await test_browser()

In [None]:
import asyncio
from playwright.async_api import async_playwright

async def test_browser():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto('https://aare.guru/')
        print(f'Title: {await page.title()}')
        print(page)
        await browser.close()

await test_browser()


In [None]:
# prompt: get the text from h2 where class is mainValue

import asyncio
from playwright.async_api import async_playwright

async def get_h2_text():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto('https://aare.guru/')
        h2_element = await page.query_selector('h2.mainValue')
        if h2_element:
            text = await h2_element.text_content()
            print(text)
        else:
            print("h2 element with class 'mainValue' not found.")
        await browser.close()

asyncio.run(get_h2_text())


In [None]:
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
async with AsyncWebCrawler() as crawler:
  # The arun() method performs the crawl for a single URL
  # It returns a CrawlResult object
  result = await crawler.arun(url="https://aare.guru/",wait_for="css:.mainValue")

  if result and result.success:
      print("Crawl Successful!")
      print(result.html)
  else:
      print(f"Crawl Failed: {result.error_message}")

# Aare temperature

In [29]:
# prompt: get h2 text from mainValue
from bs4 import BeautifulSoup

async with AsyncWebCrawler() as crawler:
  result = await crawler.arun(url="https://aare.guru/#bern",wait_for="css:.mainValue")
  if result and result.success:
    print("Scrapped with sucess")
    soup = BeautifulSoup(result.html, 'html.parser')
    h2_element = soup.select_one('h2.mainValue')
    #print(soup.find_all("h2"))
    if h2_element:
        print(f"aare temp is: {h2_element.text}")
    else:
        print("h2 element with class 'mainValue' not found.")

Scrapped with sucess
aare temp is: 13.9


# Water level at the Büsum lock (Schleuse)

In [26]:
from crawl4ai import AsyncWebCrawler
from bs4 import BeautifulSoup

async with AsyncWebCrawler() as crawler:
  result = await crawler.arun(url="https://wasserstand-nordsee.bsh.de/buesum_schleuse")
  if result.success:
    #print(result.html)
    soup = BeautifulSoup(result.html, 'html.parser')

    table = soup.find('table') #Find the first table in the HTML

    if table:
      for td_tag in table.find_all('td', {'rowspan': '2'}):
        print(td_tag)
        next_sibling = td_tag.next_sibling
        if next_sibling:
          print(next_sibling.text.strip()) # Extract and print text from next sibling
        else:
          print("No table found in the HTML.")

<td rowspan="2">13.05.2025</td>
14:17
<td rowspan="2">14.05.2025</td>
02:33
