In [11]:
# Import aiohttp for making asynchronous HTTP requests
import aiohttp

# Import asyncio for writing asynchronous code using coroutines
import asyncio

# Import nest_asyncio to allow nested use of asyncio.run() in Jupyter notebooks
import nest_asyncio

# Import BeautifulSoup from the bs4 library for parsing HTML and XML documents
from bs4 import BeautifulSoup

# Import the csv module for reading and writing CSV files
import csv

# Import the re module for performing regular expression operations
import re

In [12]:
# Patch asyncio so that event loops may be nested.
# A Jupyter kernel already has an event loop running, and the asyncio.run()
# call in the last cell of this notebook starts another one on top of it;
# this patch is what makes that nested call possible.
nest_asyncio.apply()

In [13]:
# Define an asynchronous function to scrape links and save them to a CSV file
async def scrap_and_save_links(text):
    """Extract absolute links from an HTML document and append them to a CSV file.

    Every ``<a>`` tag whose ``href`` attribute starts with ``http``
    contributes one single-column row to ``csv_file.csv`` (relative to the
    current working directory). The file is opened in append mode, so rows
    accumulate across calls and across notebook re-runs.

    The body contains no ``await``: once the coroutine starts, parsing and
    writing run synchronously to completion, so rows from different pages
    are never interleaved within the same event loop.

    Args:
        text: HTML markup to parse.

    Raises:
        OSError: If ``csv_file.csv`` cannot be opened or written.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # Collect matching anchors before touching the file so a parsing failure
    # cannot leave a file handle open.
    anchors = soup.find_all('a', attrs={'href': re.compile("^http")})

    # The context manager closes the file even when a write raises.
    # newline='' hands line-ending control to the csv module (avoids blank
    # rows on Windows); encoding='utf-8' keeps non-ASCII URLs writable
    # regardless of the platform's locale default.
    with open('csv_file.csv', 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerows([anchor.get('href')] for anchor in anchors)

In [15]:
# Define an asynchronous function to fetch an URL and process its content
async def fetch(session, url):
    """Download one page and pass its HTML to ``scrap_and_save_links``.

    This is a best-effort operation: any ``Exception`` raised while
    requesting the page, decoding the body, or saving the links is reported
    on stdout and swallowed, so one failing URL does not abort the other
    fetches running alongside it.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``text()`` method (the
            ``aiohttp.ClientSession`` created by ``scrap`` in this notebook).
        url: Address of the page to download.

    Returns:
        None. Results are written to disk by ``scrap_and_save_links``.
    """
    try:
        async with session.get(url) as response:
            text = await response.text()

            # Await the coroutine directly: wrapping it in a task only to
            # await it on the next line adds scheduling overhead without
            # adding any concurrency.
            await scrap_and_save_links(text)
    except Exception as e:
        # Several URLs are processed concurrently, so name the one that
        # failed. repr() is used because some exceptions (for example
        # asyncio.TimeoutError) have an empty str() and would otherwise
        # print a blank line.
        print(f"Failed to scrape {url}: {e!r}")

In [16]:
# Define an asynchronous function to scrape multiple URLs concurrently
async def scrap(urls):
    """Scrape every address in ``urls`` concurrently over one HTTP session.

    A single ``aiohttp.ClientSession`` is opened, one ``fetch`` coroutine is
    created per URL, and all of them are awaited together with
    ``asyncio.gather``. The session is closed when the gather completes.

    Args:
        urls: Iterable of page addresses to download.
    """
    async with aiohttp.ClientSession() as session:
        # One coroutine per URL, all sharing the same session.
        pending = [fetch(session, url) for url in urls]

        # Run them side by side and wait until every one has finished.
        await asyncio.gather(*pending)

In [17]:
# Pages whose outbound links should be collected
urls = [
    'https://open.gsa.gov/',
    'https://www.python.org/',
    'https://www.google.com/',
]

# Drive the scraper to completion for all pages listed above.
# asyncio.run() is used from inside the notebook, which relies on the
# nest_asyncio.apply() call made in an earlier cell.
# Note: links are appended to csv_file.csv, so re-running this cell adds the
# same links to the file again.
asyncio.run(scrap(urls))