In [4]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import nest_asyncio

# Patch for notebook event loop
nest_asyncio.apply()


In [5]:
async def fetch_html(session, url):
    headers = {
        "User-Agent": "MyAsyncWikipediaScraper/0.1 (+https://twitter.com/enarrodata; eric.narro@example.com)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.google.com/",  # pretend you came from somewhere
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    async with session.get(url, headers=headers) as response:
        if response.status == 200:
            return await response.text()
        else:
            print(f"Error fetching {url}: {response.status}")
            return None


async def parse_html(html):
    if html:
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("h1").text
        first_para = (
            soup.find("p").text.strip() if soup.find("p") else "No paragraph found"
        )
        return {"title": title, "first_para": first_para}
    return None


async def scrape_urls(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_html(session, url) for url in urls]
        htmls = await asyncio.gather(*tasks)
        parsed_results = await asyncio.gather(*(parse_html(html) for html in htmls))
        return parsed_results


In [None]:
urls = [
    "https://en.wikipedia.org/wiki/Python_(programming_language)",
    "https://en.wikipedia.org/wiki/Web_scraping",
]

# Run the async function in notebook
results = await scrape_urls(urls)

for i, result in enumerate(results):
    if result:
        print(f"URL: {urls[i]}")
        print(f"Title: {result['title']}")
        print(f"First Paragraph: {result['first_para'][:200]}...")  # Truncate
        print("\n")

Error fetching https://en.wikipedia.org/wiki/Asyncio: 404
URL: https://en.wikipedia.org/wiki/Python_(programming_language)
Title: Python (programming language)
First Paragraph: ...


URL: https://en.wikipedia.org/wiki/Web_scraping
Title: Web scraping
First Paragraph: Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites.[1] Web scraping software may directly access the World Wide Web using the Hypertext Transf...


