In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from pathlib import Path

In [None]:
# Dataset root; a relative path, so it is resolved against the working directory
# the notebook is run from.
DATA_DIR = Path("data")

In [None]:
# Every immediate sub-directory of DATA_DIR is treated as one category.
# Sorted because os.listdir returns entries in arbitrary order, which would make
# the report order differ between runs and machines.
categories = sorted(cat for cat in os.listdir(DATA_DIR) if (DATA_DIR/cat).is_dir())

# List each category directory exactly once and keep the counts, so the total and
# the per-category figures are guaranteed to come from the same snapshot.
photo_counts = {cat: len(os.listdir(DATA_DIR/cat)) for cat in categories}
n_total = sum(photo_counts.values())

for cat, n_photos in photo_counts.items():
    # An empty dataset would otherwise raise ZeroDivisionError; report 0 instead.
    perc = n_photos / n_total if n_total else 0.0
    print(f"{cat}: {n_photos}, perc: {perc:.3f}")

### Observations
* Imbalanced dataset
* Might be able to get good results with grayscale images, while also reducing train and inference time

In [None]:
# Check whether the alpha channel is used: every pixel of every image must be
# fully opaque (alpha == 255), i.e. the alpha band carries no information.
for cat in categories:
    for img_path in os.listdir(DATA_DIR/cat):
        # The context manager closes the underlying file deterministically.
        with Image.open(DATA_DIR/cat/img_path) as img:
            # Converting to RGBA guarantees the last axis really is the alpha band,
            # whatever mode the file was stored in; indexing [:, :, -1] on the raw
            # array would read the blue channel of an RGB image instead.
            alpha = np.asarray(img.convert("RGBA"))[:, :, -1]
        # Name the offending file so a failure is actionable.
        assert (alpha == 255).all(), f"{cat}/{img_path}: alpha channel is not fully opaque"

In [None]:
# Settings shared by the sample-grid cells below:
# how many photos to draw, how many per row, and the figure size.
sample_size = 20
n_cols = 5
figsize = (15, 10)

def plot_sample_photos(cat: str, sample_size: int, n_cols: int, figsize: tuple[int, int]) -> None:
    """Show a grid of randomly chosen images from one category directory.

    Args:
        cat: Name of the sub-directory of ``DATA_DIR`` to sample from.
        sample_size: Maximum number of images to show. Fewer are shown when the
            directory holds fewer entries.
        n_cols: Number of columns in the grid.
        figsize: Figure size forwarded to ``plt.subplots``.

    Raises:
        ValueError: If ``sample_size`` or ``n_cols`` is smaller than 1, or the
            category directory has no entries.
    """
    if sample_size < 1 or n_cols < 1:
        raise ValueError("sample_size and n_cols must both be at least 1")

    cat_dir = DATA_DIR/cat
    file_names = os.listdir(cat_dir)
    if not file_names:
        raise ValueError(f"no images found in {cat_dir}")

    # np.random.choice samples WITH replacement by default, which can show the
    # same photo several times; draw distinct files and cap at what is available.
    n_shown = min(sample_size, len(file_names))
    cat_images = np.random.choice(file_names, size=n_shown, replace=False)

    # Ceil division so a sample size that is not a multiple of n_cols still fits.
    n_rows = -(-n_shown // n_cols)
    # squeeze=False keeps `axes` two-dimensional even for a single row or column.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    fig.suptitle(cat)

    for idx, ax in enumerate(axes.flat):
        if idx >= n_shown:
            # Blank out the unused slots in the last row.
            ax.axis("off")
            continue

        # Close the file handle as soon as the pixels are copied into the array.
        with Image.open(cat_dir/cat_images[idx]) as img:
            image = np.array(img.convert('RGB'))

        ax.set_xticks([])
        ax.set_yticks([])
        ax.imshow(image)

In [None]:
# Random sample of the 'checked' category.
plot_sample_photos('checked', sample_size=sample_size, n_cols=n_cols, figsize=figsize)

### Observations
* many different styles, including colors, shapes and text
* toggles are also included as checkboxes (can have a slightly different UI function)
    * data augmentations such as horizontal flips should not be used, because a flip would turn a checked toggle into an unchecked one
* some of them have no actual "box", only a tick mark or a cross for example

In [None]:
# Random sample of the 'unchecked' category.
plot_sample_photos('unchecked', sample_size=sample_size, n_cols=n_cols, figsize=figsize)

### Observations
* usually an empty box or circle
* some misclassified samples
* some samples contain a part of a checked check box
* for toggles, the relative position of the movable part matters most (left: unchecked, right: checked)

In [None]:
# Random sample of the 'other' category.
plot_sample_photos('other', sample_size=sample_size, n_cols=n_cols, figsize=figsize)

### Observations
* sliders can be confused with toggles
* positive or negative icons can be confused with ticked check boxes