In [None]:
import json
import numpy as np
from pathlib import Path
from tqdm import tqdm
from PIL import Image
np.random.seed(0)


In [None]:
# Locations used throughout the notebook, relative to the working directory.
# The paths are assembled from individual components instead of
# backslash-separated raw strings: a backslash is only a separator on Windows,
# so elsewhere the whole string would be treated as one file name (and
# `input_path.name`, used later to name the output file, would be wrong).
json_path = Path('dataset', 'training.json')                # train/valid file listing
input_path = Path('dataset', 'jpg', 'training_cropped')     # directory holding the images
output_path = Path('dataset', 'npz')                        # directory for the packed arrays

In [None]:
# Read the train/valid listing from disk.
with json_path.open('r') as json_file:
    dataset = json.load(json_file)

# One summary line per entry of dataset['train'], showing how many items the
# matching 'train' and 'valid' entries contain.
summary_fmt = '{:>2d}: train: {}, valid: {}'
for class_idx, train_files in enumerate(dataset['train']):
    valid_files = dataset['valid'][class_idx]
    print(summary_fmt.format(class_idx, len(train_files), len(valid_files)))

In [None]:
# Flatten the grouped listing into parallel lists: one entry per file, with the
# index of the group it came from stored as its label.
train_images_lst, train_labels_lst = [], []
valid_images_lst, valid_labels_lst = [], []

for class_idx, train_files in enumerate(dataset['train']):
    train_images_lst.extend(train_files)
    train_labels_lst.extend([class_idx] * len(train_files))

    valid_files = dataset['valid'][class_idx]
    valid_images_lst.extend(valid_files)
    valid_labels_lst.extend([class_idx] * len(valid_files))

# Sanity check: image and label counts must agree within each split.
counts = (len(train_images_lst), len(train_labels_lst),
          len(valid_images_lst), len(valid_labels_lst))
print('train_images: {}, train_labels: {}\nvalid_images: {}, valid_labels: {}'
      .format(*counts))

In [None]:
# Build a random visiting order for each split.
#
# A dedicated generator seeded right here makes this cell idempotent: running
# it again always yields the same permutations, instead of depending on how
# much of the global NumPy random stream has already been consumed.
# RandomState(0) produces the same stream as the global generator right after
# np.random.seed(0), so a fresh top-to-bottom run gives the same order as the
# previous global-state version.
shuffle_rng = np.random.RandomState(0)

train_data_idx = np.arange(len(train_images_lst))
valid_data_idx = np.arange(len(valid_images_lst))

# Order matters: train is shuffled first, then valid, from the same stream.
shuffle_rng.shuffle(train_data_idx)
shuffle_rng.shuffle(valid_data_idx)

In [None]:
# File names of each split in shuffled order (labels are not written here).
# This is stored under its own name rather than rebinding `dataset`: the
# earlier cells index `dataset['train'][i]` as per-group lists, and would break
# on re-run if `dataset` were replaced by this flat structure.
shuffled_dataset = {'train': [train_images_lst[idx] for idx in train_data_idx],
                    'valid': [valid_images_lst[idx] for idx in valid_data_idx]}

# Built from components so the path resolves on every platform, not only where
# a backslash is a path separator.
shuffled_json_path = Path('dataset', 'train_valid.json')
with open(shuffled_json_path, 'w') as f:
    json.dump(shuffled_dataset, f)

In [None]:
def load_split(image_names, labels, order, root, size):
    """Load one split into dense arrays, following a given visiting order.

    Parameters
    ----------
    image_names : sequence of str
        File names relative to ``root``.
    labels : sequence of int
        Label of each entry in ``image_names`` (same indexing).
    order : sequence of int
        Indices into ``image_names``/``labels``; row ``i`` of the result
        holds item ``order[i]``.
    root : path-like
        Directory containing the image files.
    size : tuple of (width, height)
        Target size passed to ``Image.resize``.

    Returns
    -------
    (images, labels) : tuple of np.ndarray
        ``images`` is uint8 with shape ``(len(order), height, width, 3)``;
        ``labels`` is an integer vector of length ``len(order)``.
    """
    images_arr = np.zeros((len(order), size[1], size[0], 3), 'uint8')
    labels_arr = np.zeros(len(order), 'int')

    for row, idx in enumerate(tqdm(order)):
        # The context manager closes the underlying file as soon as the pixels
        # have been copied, so handles do not pile up over a large dataset.
        with Image.open(Path(root, image_names[idx])) as img:
            # Force three channels: a grayscale, palette or RGBA file would
            # otherwise not fit the (height, width, 3) slot.
            rgb = img.convert('RGB')
            images_arr[row] = np.asarray(rgb.resize(size, Image.Resampling.BICUBIC))
        labels_arr[row] = labels[idx]

    return images_arr, labels_arr


print(input_path.name)

image_size = (480, 480)  # (width, height) handed to PIL

train_images_arr, train_labels_arr = load_split(
    train_images_lst, train_labels_lst, train_data_idx, input_path, image_size)
valid_images_arr, valid_labels_arr = load_split(
    valid_images_lst, valid_labels_lst, valid_data_idx, input_path, image_size)

In [None]:
# Make sure the destination directory exists; otherwise np.savez raises
# FileNotFoundError only after all the images have already been loaded.
output_path.mkdir(parents=True, exist_ok=True)

# Output name encodes the source directory name and the image width x height.
npz_file = output_path.joinpath('{}_{}x{}.npz'.format(input_path.name, image_size[0], image_size[1]))
np.savez(npz_file, train_images=train_images_arr, train_labels=train_labels_arr,
         valid_images=valid_images_arr, valid_labels=valid_labels_arr)