In [None]:
from init_notebook import *
import base64

In [None]:
# Local directory that holds everything downloaded/extracted from pixilart.
# It is created (including missing parents) when the cell runs.
PATH = Path("~/prog/data/pixilart").expanduser()
PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# Headers sent with every request: identify as a desktop Firefox browser.
BROWSER_HEADERS = {
    "user-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0"
}

session = requests.Session()
# plain assignment: the session's header mapping is replaced, not updated
session.headers = dict(BROWSER_HEADERS)

In [None]:
def scrape_page(index: int):
    """
    Download one page of the pixilart "highlighted" gallery API.

    For every art entry of the response, the entry itself is written to
    ``PATH/<unqid>.json`` and, unless a file with that name already exists,
    the image behind ``full_image_url`` is downloaded into ``PATH``.

    :param index: value inserted into the gallery API url
        (presumably the page number)
    :raises RuntimeError: if the gallery request does not return status 200
    """
    url = f"https://www.pixilart.com/api/w/gallery/{index}/0/highlighted?user=true&liked=true"
    response = session.get(url)
    if response.status_code != 200:
        raise RuntimeError(response.text)

    for art in response.json()["art"]:
        meta_filename = PATH / f'{art["unqid"]}.json'
        meta_filename.write_text(json.dumps(art, indent=2))

        image_url = art["full_image_url"]
        image_filename = PATH / image_url.rsplit("/", 1)[-1]
        if image_filename.exists():
            continue

        print(image_url)
        image_response = session.get(image_url)
        if image_response.status_code == 200:
            image_filename.write_bytes(image_response.content)
        else:
            # Never store an error page under the image's name: the exists()
            # check above would skip the download on every later run.
            print("FAILED", image_url, image_response.status_code)
        # wait a second after each image request
        time.sleep(1)
        
# NOTE: this does not work; the requests get blocked, presumably by Cloudflare.
#scrape_page(0)

# Instead, record a HAR file in the browser and extract the images from there.

In [None]:
def iter_images():
    """
    Yield the images recorded in the browser HAR files below ``PATH``.

    The entries of all listed HAR files are loaded first. An entry is used
    when its request url ends with ``.png``, its response mime type is
    ``image/png`` or ``image/webp`` and its body is base64 encoded.
    The last url path component is used as file name; each file name is
    processed only once, later occurrences are counted as duplicates and
    shown in the progress bar. Bodies that cannot be decoded or opened are
    reported with a ``FAILED`` line and skipped.

    :return: generator of ``(PIL image, file name)`` tuples
    """
    har_filenames = (
        PATH / "pixilart-com-2024-02-25.har",
        PATH / "pixilart-com-2025-01-25.har",
        PATH / "pixilart-com-2025-01-25-02.har",
        PATH / "pixilart-com-2025-01-25-03.har",
        PATH / "pixilart-com-2025-01-25-04.har",
    )
    entries = []
    for har_filename in har_filenames:
        # HAR files are JSON documents stored as UTF-8
        with har_filename.open(encoding="utf-8") as f:
            entries.extend(json.load(f)["log"]["entries"])

    filename_set = set()
    num_duplicates = 0

    # The bar runs over the merged entries of ALL files, so it gets a generic
    # label instead of the name of whichever file was loaded last.
    with tqdm(entries, desc="har entries") as progress:
        for e in progress:
            url = (e.get("request") or {}).get("url") or ""
            if not url.endswith(".png"):
                continue

            content = (e.get("response") or {}).get("content")
            if not content:
                continue
            if content.get("mimeType") not in ("image/png", "image/webp"):
                continue
            if content.get("encoding") != "base64":
                continue

            fn = url.rsplit("/", 1)[-1]
            if fn in filename_set:
                num_duplicates += 1
                progress.set_postfix({"duplicates": num_duplicates})
                continue
            # the name counts as seen even if decoding fails below
            filename_set.add(fn)

            try:
                data = base64.b64decode(content["text"].encode("ascii"))
                image = PIL.Image.open(io.BytesIO(data))
            except Exception as ex:
                print("FAILED", url, ex)
                continue

            yield image, fn
                        

# Preview: print the file name and show the image for the first ten results
for _, (preview_image, preview_name) in zip(range(10), iter_images()):
    print(preview_name)
    display(preview_image)
    

In [None]:
# Run the whole generator once to count the images it yields
count = sum(1 for _ in iter_images())
print("COUNT", count)

In [None]:
# display the number of images counted in the previous cell
count

## resize images to smallest scale

Wow, this turns out to be a harder problem than expected... It does not really work well, so I keep the 400px previews as they are.

In [None]:
import torchvision.transforms.functional as VF
import torch
import math

def resize(img, scale: float, mode: VF.InterpolationMode = VF.InterpolationMode.NEAREST):
    """
    Resize an image by a relative factor, without antialiasing.

    :param img: PIL image, or an object with a ``shape`` whose last two
        entries are taken as the size to scale (assumed height, width)
    :param scale: factor applied to each side; the result of each side is
        truncated to int and never smaller than 1
    :param mode: interpolation mode handed to ``VF.resize``
    :return: whatever ``VF.resize`` returns for the input
    """
    is_pil = isinstance(img, PIL.Image.Image)
    sides = (img.height, img.width) if is_pil else img.shape[-2:]

    target_size = []
    for side in sides:
        target_size.append(max(1, int(side * scale)))

    return VF.resize(img, target_size, mode, antialias=False)

def iter_small_images():
    """
    Experimental: try to find a downscaled version of each image.

    Every image from ``iter_images()`` is converted to an RGB tensor. For the
    divisors 2..8 it is shrunk with nearest-neighbour sampling, enlarged back
    to the original size with bilinear interpolation and compared element by
    element with the original. A candidate replaces the current choice when
    its error is below 0.5 and it is either the first accepted one, not worse
    than the last accepted one, or below 0.4.

    :return: generator of ``(image tensor, file name, error)``; when no
        candidate was accepted, the unscaled tensor and ``None`` are yielded
    """
    for pil_image, filename in iter_images():
        original = VF.to_tensor(pil_image.convert("RGB"))
        full_size = original.shape[-2:]

        best_image, best_error = original, None

        for divisor in range(2, 9):
            candidate = VF.resize(
                original,
                [side // divisor for side in full_size],
                VF.InterpolationMode.NEAREST,
                antialias=False,
            )
            restored = VF.resize(
                candidate, full_size, VF.InterpolationMode.BILINEAR, antialias=False
            )
            # fraction of tensor elements that differ after the round trip
            error = (original != restored).float().mean()

            if not error < 0.5:
                continue

            acceptable = best_error is None or error <= best_error or error < .4
            if acceptable:
                best_error = error
                best_image = candidate

        yield best_image, filename, best_error


# Preview the selected small version of the first ten images
for _, (small_image, _small_name, _small_error) in zip(range(10), iter_small_images()):
    display(VF.to_pil_image(small_image))

# store

In [None]:
# Store all extracted images as RGB files below PATH/raw.
# Every 10th image (by position in iter_images()) goes to "test", the rest
# to "train". Files that already exist are left untouched.
for split in ("train", "test"):
    (PATH / "raw" / split).mkdir(parents=True, exist_ok=True)

for index, (image, name) in enumerate(iter_images()):
    split = "train" if index % 10 else "test"
    target = PATH / "raw" / split / name
    if target.exists():
        continue
    try:
        image.convert("RGB").save(target)
    except Exception as e:
        print("EXC", e)

## patch dataset

In [None]:
from src.util.image import ImageFilter
from src.datasets import ImageFilterIterableDataset

class PixilartPatchDataset(BaseIterableDataset):
    """
    Iterable dataset of random patches taken from the stored pixilart images.

    The images are read from ``~/prog/data/pixilart/raw/train`` or
    ``.../raw/test``. Patches are taken from four versions of the image
    dataset (scaled so that a 400px side matches the smaller patch side,
    scaled by 0.25, scaled by 0.5, and unscaled), the four patch datasets are
    interleaved and the result is passed through ``ImageFilter(min_std=0.1)``.

    :param shape: patch shape, (channels, height, width)
    :param interpolation: interpolation mode used for the scaled versions
    :param interleave_images: forwarded to every
        ``RandomImagePatchIterableDataset``
    :param shuffle_images: forwarded as ``shuffle`` to
        ``ImageFolderIterableDataset``
    :param train: read the ``train`` sub folder if True, else ``test``
    """
    def __init__(
        self,
        shape: Tuple[int, int, int] = (3, 64, 64),
        interpolation: VT.InterpolationMode = VT.InterpolationMode.BILINEAR,
        interleave_images: Optional[int] = 20,
        shuffle_images: bool = True,
        train: bool = True,
    ):
        # side length of the stored preview images (the notebook keeps the
        # 400px previews) -- NOTE(review): confirm all stored files are 400px
        preview_size = 400

        self._ds_image = ImageFolderIterableDataset(
            Path("~/prog/data/pixilart/raw").expanduser() / ("train" if train else "test"),
            shuffle=shuffle_images,
        )

        self._ds = InterleaveIterableDataset((
            RandomImagePatchIterableDataset(
                # Use the smaller of patch height and width. The previous
                # `shape[2:]` only contained the width, which made the min()
                # a no-op and was wrong for non-square patches.
                self._ds_image.scale(min(shape[-2:]) / preview_size, interpolation=interpolation), shape,
                patches_per_image_factor=1.,
                interleave_images=interleave_images,
            ),
            RandomImagePatchIterableDataset(
                self._ds_image.scale(.25, interpolation=interpolation), shape,
                patches_per_image_factor=2.,
                interleave_images=interleave_images,
            ),
            RandomImagePatchIterableDataset(
                self._ds_image.scale(.5, interpolation=interpolation), shape,
                patches_per_image_factor=3.,
                interleave_images=interleave_images,
            ),
            RandomImagePatchIterableDataset(
                self._ds_image, shape,
                interleave_images=interleave_images,
            ),
        ))
        self._ds = ImageFilterIterableDataset(
            self._ds,
            ImageFilter(
                min_std=0.1,
            )
        )

    def __iter__(self):
        """Yield the items of the filtered, interleaved patch dataset."""
        yield from self._ds

# Build the training patch dataset and show 14*14 sampled patches as one grid
ds = PixilartPatchDataset(train=True, interleave_images=20, shape=(3, 64, 64))#.shuffle(10_000)

patch_grid = make_grid(ds.sample(14*14), nrow=14)
VF.to_pil_image(patch_grid)
#VF.to_pil_image(resize(make_grid(ds.sample(64), nrow=4), 3))


In [None]:
# Iterate once over the whole dataset and discard the items;
# the tqdm bar shows how many items it yields and how fast.
for _ in tqdm(ds):
    pass

In [None]:
# Patch shape used for the exported datasets; passed as `shape` to
# PixilartPatchDataset (presumably channels, height, width).
target_shape = (3, 70, 70)
# Output file of the training split. The last two entries of target_shape are
# part of the name; the test split reuses it with "-train" replaced by "-test".
FILENAME = f"../datasets/pixilart-uint-{target_shape[-2]}x{target_shape[-1]}-train.pt"

def store_dataset(
        images: Iterable,
        dtype=torch.float32,
        #image_folder="~/Pictures/__diverse/",
        output_filename=FILENAME,
        max_megabyte=4_096,
):
    """
    Collect image tensors, convert them to uint8 and save them as one tensor.

    Collecting stops when `images` is exhausted, when the collected data
    reaches `max_megabyte`, or on KeyboardInterrupt (e.g. a kernel
    interrupt). In all three cases the images collected so far are
    concatenated along the first dimension and written with ``torch.save``.

    :param images: iterable of image tensors; values are clamped to [0, 1]
        before scaling to 0..255. Tensors with fewer than 4 dimensions get a
        leading dimension of size 1.
    :param dtype: currently unused and only kept for backward compatibility;
        the stored tensor is always ``torch.uint8``
    :param output_filename: target passed to ``torch.save``
    :param max_megabyte: size limit in MiB, counted as one byte per element
    :raises RuntimeError: if not a single image was collected
    """
    tensor_batch = []
    tensor_size = 0
    last_print_size = 0
    max_size = max_megabyte * 1024 * 1024
    try:
        for image in tqdm(images):
            if len(image.shape) < 4:
                image = image.unsqueeze(0)
            # Round to the nearest integer: a plain cast truncates, which
            # biases interpolated values downward.
            tensor_batch.append((image.clamp(0, 1) * 255).round().to(torch.uint8))
            # uint8 storage: number of elements == number of bytes
            tensor_size += image.numel()

            if tensor_size - last_print_size > 1024 * 1024 * 200:
                last_print_size = tensor_size

                print(f"size: {tensor_size:,}")

            if tensor_size >= max_size:
                break
    except KeyboardInterrupt:
        # deliberate: an interrupt ends collecting, the partial result is saved
        pass

    if not tensor_batch:
        # torch.cat() fails on an empty list with a much less helpful message
        raise RuntimeError(f"No images collected, nothing stored to {output_filename}")

    tensor_batch = torch.cat(tensor_batch)
    torch.save(tensor_batch, output_filename)

# Export the patches of the training split to FILENAME
train_patches = PixilartPatchDataset(shape=target_shape, interleave_images=20, train=True)
store_dataset(train_patches)

In [None]:
# Export the patches of the test split next to the training file
test_patches = PixilartPatchDataset(shape=target_shape, interleave_images=20, train=False)
test_filename = FILENAME.replace("-train", "-test")
store_dataset(test_patches, output_filename=test_filename)