## Asynchronous scraping

In [3]:
!pip install aiohttp -q
!pip install asyncio -q
!pip install nest-asyncio -q

In [4]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import csv
import re

In [5]:
import nest_asyncio
# Patch asyncio so that asyncio.run() can be called while an event loop is
# already running, which is the case inside a Jupyter notebook.
nest_asyncio.apply()

`async` marks a function as a coroutine: it can be paused at each `await` point and resumed later. This is useful when we are waiting for a response from a server. While the coroutine is suspended waiting for the response, the event loop can do other things. When the response is ready, the coroutine resumes where it left off.

Coroutines run concurrently with one another on a single event loop (cooperative multitasking, not true multi-core parallelism). This means that several coroutines can be in progress at the same time. This is useful when we are scraping multiple pages: the requests are all in flight at once and their waiting times overlap, which makes the scraping process faster.

In [None]:
async def scrap_and_save_links(text):
    """
    Extract the absolute links from an HTML document and append them to a CSV file.

    Every ``<a>`` tag whose ``href`` starts with ``http`` contributes one
    single-column row to the file named ``csv_file`` in the current working
    directory. The file is opened in append mode, so rows accumulate across
    calls.

    The function is declared ``async`` so that callers can await it, but it
    contains no ``await`` itself: parsing and writing run synchronously.

    Args:
        text (str): The HTML content to process.
    """
    soup = BeautifulSoup(text, 'html.parser')  # parse the html
    # find_all is the current bs4 name; findAll is the deprecated BS3 alias.
    anchors = soup.find_all('a', attrs={'href': re.compile(r'^http')})
    # The context manager closes the file even if a write fails, and the
    # explicit encoding keeps non-ASCII URLs from failing on non-UTF-8 locales.
    with open('csv_file', 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerows([anchor.get('href')] for anchor in anchors)

In [None]:
async def fetch(session, url):
    """
    Fetch the content of the URL and process it.

    The response body is decoded to text and handed to
    ``scrap_and_save_links``. Any exception raised along the way is caught and
    reported on standard output instead of being propagated, so the coroutine
    always completes normally.

    Args:
        session (aiohttp.ClientSession): The aiohttp client session.
        url (str): The URL to fetch.
    """
    try:
        # Both the request and the body read are awaited, so the event loop is
        # free to run other coroutines while this one waits on the network.
        async with session.get(url) as response:
            text = await response.text()
            # Await the coroutine directly: wrapping it in a task only to await
            # it on the next line adds scheduling overhead and nothing else.
            await scrap_and_save_links(text)
    except Exception as e:
        # Include the URL and the exception's repr: str(e) is empty for
        # exceptions raised without a message (e.g. timeouts).
        print(f"{url}: {e!r}")

In [None]:
async def scrap(urls):
    """
    Scrape the provided URLs concurrently.

    A single HTTP client session is opened and shared by every request. One
    ``fetch`` coroutine is created per URL and all of them are awaited
    together before the session is closed.

    Args:
        urls (list): A list of URLs to scrape.
    """
    async with aiohttp.ClientSession() as session:
        # Build one fetch coroutine per URL; nothing runs until gather() is awaited.
        pending = [fetch(session, url) for url in urls]
        # gather() runs the coroutines concurrently and returns once all are done.
        await asyncio.gather(*pending)

In [9]:
# Pages to crawl; scrap() fetches all of them concurrently.
urls = [
    'https://analytics.usa.gov/',
    'https://www.python.org/',
    'https://www.linkedin.com/',
]
# Drive the top-level coroutine to completion.
asyncio.run(scrap(urls))