In [5]:
import pandas as pd

urls = pd.read_csv('data/urls.csv')
urls

Unnamed: 0,id,url
0,0,http://en.wikipedia.org/wiki/Bayer_filter
1,1,http://en.wikipedia.org/wiki/Starter_motor
2,2,http://en.wikipedia.org/wiki/Physical
3,3,http://en.wikipedia.org/wiki/Psychological
4,4,http://en.wikipedia.org/wiki/Withdrawal
...,...,...
5122971,5122971,http://en.wikipedia.org/wiki/Johnny_Depp
5122972,5122972,http://en.wikipedia.org/wiki/Peter_DeLuise
5122973,5122973,http://en.wikipedia.org/wiki/Holly_Robinson_Peete
5122974,5122974,https://zythophile.wordpress.com/tag/poems-abo...


In [None]:
from IPython.display import display, HTML
display(HTML(r.content.decode()))

In [6]:
import asyncio
from time import perf_counter
from typing import Any, Awaitable

In [14]:
import requests
from requests.exceptions import ConnectionError, TooManyRedirects

def get_html_sync(url: str) -> str:
    try:
        response = requests.get(url)
    except (ConnectionError, TooManyRedirects):
        return 'ConnectionError'
    
    content = response.content
    
    try:
        return content.decode()
    except UnicodeDecodeError:
        return 'UnicodeDecodeError'

get_html_sync('http://en.wikipedia.org/wiki/Bayer_filter')

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Bayer filter - Wikipedia</title>\n<script>document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-

In [9]:
time_before = perf_counter()

for url in urls['url'][:10]:
    get_html_sync(url)
    
print(f"Total time (synchronous): {perf_counter() - time_before}")

Total time (synchronous): 3.467513041003258


In [10]:
async def http_get(url: str) -> str:
    return await asyncio.to_thread(get_html_sync, url)

In [None]:
def batched(iterable, max_batch_size: int):
    """ Batches an iterable into lists of given maximum size, yielding them one by one. """
    batch = []
    for element in iterable:
        batch.append(element)
        if len(batch) >= max_batch_size:
            yield batch
            batch = []
    if len(batch) > 0:
        yield batch

time_before = perf_counter()

for batch in batched(urls['url'][:100000], 1000):
    await asyncio.gather(*[http_get(url) for url in batch])
    
print(f"Total time (asynchronous): {perf_counter() - time_before}")