In [1]:
from __future__ import annotations

import asyncio
import functools as ft
import os

import aiohttp
import pandas as pd

from tqdm import tqdm
from aioretry import retry

In [2]:
# Load the host names to crawl from the `url_host` column of urls.csv.
# The column is selected explicitly instead of calling `.squeeze()`: with no
# axis argument, squeeze() turns a one-row frame into a scalar, on which
# `.tolist()` would fail.
urls = pd.read_csv("urls.csv", usecols=["url_host"])["url_host"].tolist()

In [3]:
@retry(lambda info: (info.fails >= 5, info.fails * 0.5))
async def parse_main_page_content(
    url: str,
    client: aiohttp.ClientSession,
    dumpdir: str,
) -> None:
    """Fetch ``https://{url}/`` and save the body to ``{dumpdir}/{url}.html``.

    Nothing is written when the response status is not OK (``resp.ok`` is
    false). The ``retry`` policy above gives up once ``info.fails >= 5`` and
    otherwise waits ``info.fails * 0.5`` before the next attempt.

    Args:
        url: Host name to request; it is also used as the output file stem.
        client: Session used to issue the GET request.
        dumpdir: Existing directory that receives the ``.html`` file.
    """
    async with client.get(f"https://{url}/") as resp:
        if not resp.ok:
            return
        content = await resp.content.read()

    # `content` is bytes: write it verbatim in binary mode. Printing it into a
    # text-mode file would store the Python repr (b'...') instead of the page.
    with open(f"{dumpdir}/{url}.html", "wb") as f:
        f.write(content)

In [4]:
dumpdir = "parsed-only-200"
batch_size = 500
batch_sleep = 1.0

async with aiohttp.ClientSession() as client:
    parse_main_page_content_ = ft.partial(
        parse_main_page_content,
        client=client,
        dumpdir=dumpdir,
    )
    
    tasks = []
    for i, url in tqdm(enumerate(urls), total=len(urls)):
        tasks.append(asyncio.create_task(parse_main_page_content_(url)))
        
        if i > 0 and i % batch_size == 0:
            await asyncio.sleep(batch_sleep)

    res = await asyncio.gather(*tasks, return_exceptions=True)

100%|█████████████████████████████████| 199683/199683 [1:41:03<00:00, 32.93it/s]


In [9]:
# Count the entries listed in the `parsed` directory.
# NOTE(review): no cell visible here writes to `parsed` — presumably left over
# from an earlier run; confirm before relying on this number.
! ls parsed | wc -l

11198


In [10]:
# Count the entries listed in `parsed-only-200`, the crawl's dump directory.
! ls parsed-only-200 | wc -l 

10072


In [5]:
# Recursively archive the dump directory into parsed-only-200.zip.
! zip parsed-only-200.zip -r parsed-only-200/

  adding: parsed-only-200/ (stored 0%)
  adding: parsed-only-200/perevoz.bezformata.com.html (deflated 92%)
  adding: parsed-only-200/sverdlovskaya.flado.ru.html (deflated 86%)
  adding: parsed-only-200/bloknot-kamyshin.ru.html (deflated 87%)
  adding: parsed-only-200/xn----8sbjranccwkbcsfkf7e6ftbc.xn--p1ai.html (deflated 86%)
  adding: parsed-only-200/obuso-privolzhsk.ru.html (deflated 84%)
  adding: parsed-only-200/uluna-la.livejournal.com.html (deflated 79%)
  adding: parsed-only-200/prav-da-rub.livejournal.com.html (deflated 77%)
  adding: parsed-only-200/ukrs.tipsforeveryone.ru.html (deflated 83%)
  adding: parsed-only-200/spbfarfor.ru.html (deflated 88%)
  adding: parsed-only-200/prostroymat.ru.html (deflated 83%)
  adding: parsed-only-200/zametilprosto-livejournal-com.turbopages.org.html (deflated 54%)
  adding: parsed-only-200/thailand.move.ru.html (deflated 92%)
  adding: parsed-only-200/xn----8sbkbdcqc4a.xn--p1ai.html (deflated 86%)
  adding: parsed-only-200/t80k-livejournal-