In [None]:
import os
import random
import zipfile
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from PIL import Image

In [24]:
def download_images_to_zip(url, zip_name, min_resolution=(100, 100), max_resolution=(2000, 2000),
                           timeout=30):
    """Download the images referenced by a web page into a zip archive.

    The page at ``url`` is fetched and every ``<img>`` / ``<source>`` tag is
    inspected for an image location (``src``, then ``data-src``, then the first
    candidate of ``srcset``). Each image whose width and height both fall inside
    the given bounds is re-encoded (JPEG for RGB images, PNG otherwise) and
    written to the archive as ``image_<i>.<ext>``, where ``<i>`` is the position
    of its URL among the collected URLs.

    Args:
        url: Address of the page to scrape; also the base for relative links.
        zip_name: Path of the zip archive to create (overwritten if present).
        min_resolution: Inclusive ``(width, height)`` lower bound.
        max_resolution: Inclusive ``(width, height)`` upper bound.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.exceptions.RequestException: If the page itself cannot be
            fetched or answers with an HTTP error status. Failures of
            individual images are reported and skipped instead.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scraping an error page and producing an empty archive.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # A dict de-duplicates while keeping document order, so the image_<i>
    # names are stable between runs against the same page content.
    img_urls = {}
    for img_tag in soup.find_all(['img', 'source']):
        src = img_tag.get('src') or img_tag.get('data-src')
        if not src:
            # srcset looks like "url 1x, url 2x"; the first whitespace-delimited
            # token is the first candidate's URL (possibly with a trailing comma).
            candidates = (img_tag.get('srcset') or '').split()
            src = candidates[0].strip(',') if candidates else None
        if not src:
            continue

        # urljoin resolves relative, root-relative and protocol-relative links
        # against the page URL (which may carry a query string).
        absolute = urljoin(url, src.strip())
        # Inline data: URIs and other non-HTTP schemes cannot be downloaded.
        if absolute.startswith(('http://', 'https://')):
            img_urls[absolute] = None

    # Modes the PNG writer accepts; anything else is converted to RGB first.
    png_safe_modes = {'1', 'L', 'LA', 'I', 'I;16', 'P', 'RGB', 'RGBA'}

    total_downloaded = 0
    with zipfile.ZipFile(zip_name, 'w') as zipf:
        for i, img_url in enumerate(img_urls):
            try:
                img_response = requests.get(img_url, timeout=timeout)
                img_response.raise_for_status()

                img = Image.open(BytesIO(img_response.content))

                if not (min_resolution[0] <= img.size[0] <= max_resolution[0] and
                        min_resolution[1] <= img.size[1] <= max_resolution[1]):
                    continue

                if img.mode not in png_safe_modes:
                    img = img.convert('RGB')

                img_format = 'JPEG' if img.mode == 'RGB' else 'PNG'
                img_name = f"image_{i}.{img_format.lower()}"

                with BytesIO() as img_bytes:
                    img.save(img_bytes, format=img_format)
                    zipf.writestr(img_name, img_bytes.getvalue())
                total_downloaded += 1
            except Exception as e:
                # Best effort: one broken image must not abort the whole download.
                print(f"Ошибка при загрузке {img_url}: {e}")

    print(f"Всего скачано изображений: {total_downloaded}")

In [25]:
def create_yolo_dataset_from_zip(zip_path, dataset_path, split_ratio=(0.8, 0.2), seed=None):
    """Extract the files of a zip archive into train/val image folders.

    The archive's file entries are shuffled and the first
    ``int(n * split_ratio[0])`` of them are written to
    ``<dataset_path>/train/images``; all remaining ones go to
    ``<dataset_path>/val/images``. Only ``split_ratio[0]`` is used, so no file
    is ever dropped by rounding.

    Directory entries are ignored and every file is stored under its base name
    only, so an archive member can never be written outside the target folder.
    Members that share a base name therefore overwrite each other.

    Args:
        zip_path: Path of the zip archive to read.
        dataset_path: Root directory of the dataset (created if missing).
        split_ratio: ``(train, val)`` fractions; the first element decides the
            size of the training split.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.
    """
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        # Pair each archive member with the flat file name it is stored under.
        members = []
        for member in zipf.namelist():
            flat_name = os.path.basename(member)
            if member.endswith('/') or flat_name in ('', '.', '..'):
                continue  # directory entry or a name with no usable file part
            members.append((member, flat_name))

        if seed is None:
            random.shuffle(members)
        else:
            # A private generator keeps the global random state untouched.
            random.Random(seed).shuffle(members)

        train_path = os.path.join(dataset_path, 'train/images')
        val_path = os.path.join(dataset_path, 'val/images')
        os.makedirs(train_path, exist_ok=True)
        os.makedirs(val_path, exist_ok=True)

        total_images = len(members)
        train_count = int(total_images * split_ratio[0])
        val_count = total_images - train_count

        for i, (member, flat_name) in enumerate(members):
            target_dir = train_path if i < train_count else val_path
            with zipf.open(member) as img_src:
                with open(os.path.join(target_dir, flat_name), 'wb') as img_dst:
                    img_dst.write(img_src.read())

        print(f"Количество изображений в тренировочном датасете: {train_count}")
        print(f"Количество изображений в валидационном датасете: {val_count}")

In [27]:
# Scryfall search results page to scrape, plus the locations of the
# intermediate archive and of the final dataset directory.
url = 'https://scryfall.com/search?q=(e%3Altr+cn%3E%3D452)+or+(e%3Altc+cn%3E%3D411)&order=set&as=grid&unique=prints'
zip_name, dataset_path = 'images.zip', 'yolo_dataset'

# Step 1: collect the page's images between 200x200 and 1000x1000 pixels.
download_images_to_zip(
    url,
    zip_name,
    min_resolution=(200, 200),
    max_resolution=(1000, 1000),
)

# Step 2: unpack the archive into an 80/20 train/validation layout.
create_yolo_dataset_from_zip(zip_name, dataset_path, split_ratio=(0.8, 0.2))

Всего скачано изображений: 60
Количество изображений в тренировочном датасете: 48
Количество изображений в валидационном датасете: 12


In [None]:
import os
import shutil

def delete_dataset_and_zip(zip_path, dataset_path):
    """Remove the downloaded archive and the generated dataset directory.

    Each path is only touched when it currently exists, so calling this on an
    already-clean workspace does nothing.

    Args:
        zip_path: Path of the zip archive; removed with ``os.remove``.
        dataset_path: Root of the dataset tree; removed with ``shutil.rmtree``.
    """
    # (path, removal function) pairs, processed in order: archive first.
    cleanup_plan = (
        (zip_path, os.remove),
        (dataset_path, shutil.rmtree),
    )
    for target, remover in cleanup_plan:
        if not os.path.exists(target):
            continue
        remover(target)

# Clean up: drop the archive and the dataset directory created earlier.
delete_dataset_and_zip(zip_path=zip_name, dataset_path=dataset_path)