In [1]:
# Seed pages whose links are collected below. Judging by its path, the
# docs.python.org entry is presumably included to exercise the error path
# (it is expected to answer with a 404).
urls = [
    "https://regex101.com/",
    "https://docs.python.org/3/this-url-will-404.html",
    "https://www.nytimes.com/guides/",
    "https://www.mediamatters.org/",
    "https://1.1.1.1/",
    "https://www.politico.com/tipsheets/morning-money",
    "https://www.bloomberg.com/markets/economics",
    "https://www.ietf.org/rfc/rfc2616.txt",
]
import asyncio
import logging
import re
import sys
from typing import IO
import urllib.error
import urllib.parse

# import aiofiles
import aiohttp
from aiohttp import ClientSession
# The notebook logs exclusively at INFO level, but the root logger defaults to
# WARNING and has no handler, so those messages would be dropped silently.
# basicConfig() installs a stderr handler when none exists yet; it is a no-op
# if the host environment already attached one, hence the explicit setLevel().
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
# Single aiohttp client session shared by the parse() calls in later cells.
# NOTE(review): no visible cell closes this session; presumably it should be
# released with `await session.close()` once crawling is finished — confirm.
session = ClientSession()

In [3]:
# Captures the value of every double-quoted href="..." attribute. The
# non-greedy group stops at the first closing double quote; single-quoted or
# unquoted attributes are not matched.
HREF_RE = re.compile(
    r'href="(.*?)"',
    flags=0,
)

In [4]:
def log(*x):
    """Emit an INFO record on the module-level ``logger``.

    The positional arguments are forwarded unchanged to ``logger.info``: the
    first one is the message, any further ones are its %-format arguments.
    """
    emit_info = logger.info
    emit_info(*x)

async def fetch_html(url, session):
    """GET ``url`` through ``session`` and return the response body as text.

    Args:
        url: Address of the page to download.
        session: Client session used to issue the request (an
            ``aiohttp.ClientSession`` in this notebook).

    Returns:
        The decoded response body, as produced by ``resp.text()``.

    Raises:
        Whatever ``session.request`` or ``resp.raise_for_status()`` raises;
        nothing is caught here, so callers decide how to handle failures.
    """
    # `async with` releases the response (and its connection) on every exit
    # path, including an error status, a failed body read, or cancellation.
    async with session.request(method='GET', url=url) as resp:
        resp.raise_for_status()
        # Include the actual URL so concurrent fetches can be told apart.
        log(f'{url} response: {resp.status}')
        return await resp.text()
    
async def parse(url, session):
    """Download ``url`` and return the set of absolute links found in it.

    Every ``href="..."`` value matched by ``HREF_RE`` is resolved against
    ``url`` with ``urllib.parse.urljoin``. Any exception raised while fetching
    is logged and swallowed, in which case an empty set is returned.

    Args:
        url: Page to fetch; also the base for resolving relative links.
        session: Client session handed through to ``fetch_html``.

    Returns:
        A ``set`` of absolute URL strings (possibly empty).
    """
    links = set()

    try:
        page = await fetch_html(url, session)
    except (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError) as err:
        log(f'{url} exception: {err}')
        return links
    except Exception as err:
        log(f'{url} non aio exception: {err}')
        return links

    # Fetch succeeded: resolve each captured href against the page URL.
    for href in HREF_RE.findall(page):
        try:
            absolute = urllib.parse.urljoin(url, href)
        except (urllib.error.URLError, ValueError):
            # Skip hrefs that cannot be joined; keep processing the rest.
            log(f"Error parsing URL: {href}")
            continue
        links.add(absolute)

    log(f"Found {len(links)} links for {url}")
    return links

# from asyn import partial
# tasks = [asyncio.create_task(parse(url, session)) for url in urls]


In [10]:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
loop = asyncio.get_running_loop()

# parse() is a coroutine function, so handing it to run_in_executor() only
# builds an un-awaited coroutine object on a worker thread without doing any
# work there. Schedule each crawl as a task on the running loop instead: the
# fetches start right away, run concurrently, and the tasks can be awaited or
# gathered more than once (bare coroutine objects can only be awaited once).
results = [loop.create_task(parse(url, session)) for url in urls]
    

In [11]:
# Await every entry of `results` together; the resulting list holds one set of
# links per URL (what parse() returns) and is displayed as the cell's output.
await asyncio.gather(*results)

[{'http://browsehappy.com/',
  'http://whatismarkdown.com/',
  'https://discord.gg/wUA6F6YqSs',
  'https://fonts.googleapis.com',
  'https://fonts.googleapis.com/css2?family=Open+Sans:ital,wght@0,300;0,400;0,600;0,700;1,400&family=Source+Code+Pro:wght@400;500;700&display=swap',
  'https://fonts.gstatic.com',
  'https://github.com/firasdib/Regex101/issues',
  'https://github.com/firasdib/Regex101/wiki',
  'https://github.com/sponsors/firasdib',
  'https://regex101.com',
  'https://regex101.com/',
  'https://regex101.com/account/mine',
  'https://regex101.com/codegen?language=php',
  'https://regex101.com/debugger',
  'https://regex101.com/library',
  'https://regex101.com/quiz',
  'https://regex101.com/settings',
  'https://regex101.com/static/653.8eb7aa021aa1417c05e6.css',
  'https://regex101.com/static/assets/10.2__iPad_landscape.png',
  'https://regex101.com/static/assets/10.2__iPad_portrait.png',
  'https://regex101.com/static/assets/10.5__iPad_Air_landscape.png',
  'https://regex10

In [33]:


# Display the running event loop. A bare trailing attribute access (a dot with
# nothing after it) is a SyntaxError and would abort a Run All.
loop

In [28]:
z = await asyncio.gather(*tasks)

In [31]:
# Sanity check: the number of gathered results next to the number of input
# URLs; the two values should be equal.
len(z), len(urls)

(8, 8)