In [None]:
import asyncio
from pathlib import Path

import aiofiles
import aiohttp
from aiohttp import ClientConnectorError
from aiohttp.client_exceptions import ClientPayloadError, ServerDisconnectedError
from tqdm.asyncio import tqdm_asyncio

In [2]:
# Module-level event loop shared by FaceResizer: __init__ runs
# create_session() on it and main() passes it to tqdm_asyncio.as_completed.
loop = asyncio.get_event_loop()

In [3]:
class FaceResizer:
    def __init__(self, url_file, needed_identitites=50):
        """Store the configuration and open the HTTP session.

        Args:
            url_file: Path of the tab-separated URL list that
                ``get_sorted_persons`` reads.
            needed_identitites: How many identities ``get_sorted_persons``
                keeps (default 50). The spelling is part of the public
                keyword name and is therefore left as is.
        """
        self.url_file, self.needed_identitites = url_file, needed_identitites
        # Created with asyncio's default initial value.
        # NOTE(review): not used by any method visible here - confirm usage.
        self.semaphore = asyncio.Semaphore()
        # Placeholder until create_session() assigns the real session below.
        self.session = None
        loop.run_until_complete(self.create_session())

    async def create_session(self):
        """Create an ``aiohttp.ClientSession`` and store it in ``self.session``."""
        new_session = aiohttp.ClientSession()
        self.session = new_session

    async def get_sorted_persons(self):
        """Group the URL file's records by person and return the largest groups.

        Every non-blank line of ``self.url_file`` must hold exactly five
        tab-separated fields: person, image number, URL, face coordinates and
        an MD5 sum. The checksum is parsed but not kept.

        Returns:
            A list with one entry per selected person, ordered by descending
            number of images (ties keep the order of first appearance in the
            file). Each entry is that person's list of
            ``(person, imagenum, url, face_coords)`` tuples. Collection stops
            once ``self.needed_identitites`` entries have been gathered.

        Raises:
            ValueError: If a non-blank line does not have exactly five fields.
        """
        person_dict = {}
        line_no = 0
        async with aiofiles.open(self.url_file, encoding="utf-8") as f:
            # The built-in enumerate() cannot wrap an async iterator, so the
            # line number for error messages is counted by hand.
            async for line in f:
                line_no += 1
                record = line.rstrip('\r\n')
                if not record:
                    # A blank line carries no record; skipping it avoids an
                    # opaque tuple-unpacking failure.
                    continue
                fields = record.split('\t')
                if len(fields) != 5:
                    raise ValueError(
                        f"{self.url_file}: line {line_no}: expected 5 "
                        f"tab-separated fields, got {len(fields)}"
                    )
                person, imagenum, url, face_coords, _md5sum = fields
                person_dict.setdefault(person, []).append(
                    (person, imagenum, url, face_coords)
                )

        # sorted() is stable, so persons with equally many images stay in the
        # order in which they first appeared in the file.
        ranked = sorted(person_dict.values(), key=len, reverse=True)
        sorted_persons = []
        for images in ranked:
            # Equality (not >=) is kept on purpose so that unusual limits
            # behave exactly as before.
            if len(sorted_persons) == self.needed_identitites:
                break
            sorted_persons.append(images)
        return sorted_persons

    async def main(self, download_imgs=False):
        """Load the selected identities and optionally download their images.

        Args:
            download_imgs: When true, every image of every selected person is
                downloaded to ``output/raw/<person>/<imagenum>.jpg`` relative
                to the current working directory, with a tqdm progress bar.

        Returns:
            The list produced by ``get_sorted_persons``.
        """
        persons = await self.get_sorted_persons()
        if download_imgs:
            download_tasks = []
            for person in persons:
                if not person:
                    # Nothing to download, and no name to build a folder from.
                    continue
                # Join with pathlib instead of hard-coded backslashes so the
                # nested output/raw/<person> layout is created on every OS,
                # not only on Windows.
                path = Path('output') / 'raw' / person[0][0]
                # One mkdir per person is enough; it does not depend on the image.
                path.mkdir(parents=True, exist_ok=True)
                for _name, number, url, _face_coords in person:
                    target = str(path / f'{number}.jpg')
                    download_tasks.append(self.download_image(url, target))

            # Await the downloads in completion order so the bar advances as
            # each one finishes.
            for coro in tqdm_asyncio.as_completed(download_tasks, loop=loop):
                await coro
        return persons

    async def download_image(self, url, path):
        """Download ``url`` and write the response body to ``path``.

        Best effort: a non-200 status, or a connector / payload /
        server-disconnect error, is reported through ``tqdm_asyncio.write``
        and the image is skipped without raising.

        Args:
            url: Address of the image to fetch with ``self.session``.
            path: Destination file, opened in binary write mode.
        """
        try:
            # ``async with`` releases the response (and its connection) on
            # every exit path, including the early return below.
            async with self.session.get(url) as image:
                if image.status != 200:
                    tqdm_asyncio.write(f"Download failed: {image.status}")
                    return
                # Read the whole body before touching the disk so a transfer
                # that breaks midway does not leave an empty file behind.
                data = await image.read()
        except (ClientConnectorError, ClientPayloadError, ServerDisconnectedError) as exc:
            # Still best effort, but no longer silent.
            tqdm_asyncio.write(f"Download failed: {url} ({exc!r})")
            return

        async with aiofiles.open(path, mode='wb') as f:
            await f.write(data)