In [3]:
import zipfile
import os, io
import numpy as np
from glob import glob
import pickle
import matplotlib.pyplot as plt

def unpickle(file):
    """
        Read one pickled batch file of the downsampled 64*64 ImageNet data.
        Refer to: https://patrykchrabaszcz.github.io/Imagenet32/

        Args:
            file: path of the pickled batch file to open.

        Returns:
            The unpickled object. For this dataset it is expected to be a dict with
            keys ['data', 'labels', 'mean']; the single validation file ("val_data")
            holds only 'data' and 'labels' (there is no 'mean' field).
    """
    # NOTE: pickle.load can execute arbitrary code while loading; only point
    # this at trusted dataset files.
    with open(file, 'rb') as fo:
        # Local name deliberately avoids shadowing the builtin `dict`.
        batch = pickle.load(fo)
    return batch

In [16]:
# Load the 'mean' entry stored in the first training batch file (sorted order)
# and scale it by 1/255. The mean is read from the file, not recomputed here.
train_batch_files = sorted(glob('/data/ljc/datasets/imagenet64/train' + '/*'))
if not train_batch_files:
    # Fail loudly: otherwise mean_val would be undefined, or a stale value left
    # in the kernel would silently be reused.
    raise FileNotFoundError('No training batch files found in /data/ljc/datasets/imagenet64/train')
f = train_batch_files[0]
print(f)
dic = unpickle(f)
# Out-of-place division: re-running the cell never rescales an already-scaled
# array, and dic['mean'] keeps its original values.
mean_val = dic['mean'] / 255.0

/data/ljc/datasets/imagenet64/train/train_data_batch_1


In [15]:
# Display the mean vector for a quick sanity check.
# NOTE(review): the saved output shows values on a 0-255 scale and this cell's
# execution count (15) precedes the loading cell's (16), so the output appears
# to predate the /255 scaling -- re-run to refresh.
print(mean_val)

[118.45995643 119.56469453 120.35390702 ... 100.14106046  99.72706603
  99.14217975]


In [17]:
"""
    Read batch data and save each image to .npy files
"""

# read a line from the file and transform it into RGB image
def transform_to_rgb(data, image_size=64):
    """
        Convert one flat, channel-planar row into an (H, W, 3) RGB array.

        The row is interpreted as three consecutive planes -- red, then green,
        then blue -- each holding image_size * image_size values in row order.

        Args:
            data: numpy array holding 3 * image_size * image_size values.
            image_size: side length of the square image. Defaults to 64, the
                value previously hard-coded here.

        Returns:
            A new array of shape (image_size, image_size, 3) with the channel
            as the last axis.

        Raises:
            ValueError: (from reshape) if a plane does not contain exactly
                image_size * image_size values.
    """
    plane = image_size * image_size
    red_channel = data[:plane].reshape(image_size, image_size)
    green_channel = data[plane:2 * plane].reshape(image_size, image_size)
    blue_channel = data[2 * plane:].reshape(image_size, image_size)

    # merge channels along a new trailing axis
    rgb_image = np.dstack((red_channel, green_channel, blue_channel))
    return rgb_image
    
split = ['train', 'val']
sp = split[1]  # index 1 selects 'val'; switch to split[0] to process the training batches
folder = '/data/ljc/datasets/imagenet64/' + sp
out_folder = '/data/ljc/datasets/imagenet64/processed/' + sp

# Make sure the destination exists; otherwise opening labels.txt (and every
# np.save) fails with FileNotFoundError on a fresh machine.
os.makedirs(out_folder, exist_ok=True)

cnt = 0  # running image index across all batch files; also the .npy file name
# The context manager closes labels.txt even if a batch fails midway.
with open(out_folder + '/labels.txt', 'w') as label_file:
    for f in sorted(glob(folder + '/*')):
        print('Processing', f)

        dic = unpickle(f)

        for i in range(len(dic['data'])):
            # scale to [0,1]. remove mean value, as suggested by https://patrykchrabaszcz.github.io/Imagenet32/
            # (mean_val is already divided by 255 in the cell that loads it)
            data = dic['data'][i] / 255.0 - mean_val
            rgb_array = transform_to_rgb(data)
            np.save(f'{out_folder}/{cnt}.npy', rgb_array)

            # one label per line; line number matches the image index
            label_file.write(str(dic['labels'][i]) + '\n')

            cnt += 1

Processing /data/ljc/datasets/imagenet64/val/val_data


In [20]:
# NOTE(review): wildcard import -- ImageNet64 and config are not defined in any
# visible cell, so they presumably come from datahelpers; consider importing
# them explicitly.
from datahelpers import *

# Build the train and validation datasets from the 'train' and 'val'
# sub-directories of config.imagenet_root.
# NOTE(review): presumably imagenet_root is the folder holding the processed
# .npy files and labels.txt -- confirm against datahelpers.
train_dataset = ImageNet64(data_dir=config.imagenet_root + '/train')
val_dataset = ImageNet64(data_dir=config.imagenet_root + '/val')

In [30]:
# Inspect 100 consecutive entries (indices 87-186) of the training dataset's
# file list to check its contents and ordering.
train_dataset.image_files[87:187]

['87.npy',
 '88.npy',
 '89.npy',
 '90.npy',
 '91.npy',
 '92.npy',
 '93.npy',
 '94.npy',
 '95.npy',
 '96.npy',
 '97.npy',
 '98.npy',
 '99.npy',
 '100.npy',
 '101.npy',
 '102.npy',
 '103.npy',
 '104.npy',
 '105.npy',
 '106.npy',
 '107.npy',
 '108.npy',
 '109.npy',
 '110.npy',
 '111.npy',
 '112.npy',
 '113.npy',
 '114.npy',
 '115.npy',
 '116.npy',
 '117.npy',
 '118.npy',
 '119.npy',
 '120.npy',
 '121.npy',
 '122.npy',
 '123.npy',
 '124.npy',
 '125.npy',
 '126.npy',
 '127.npy',
 '128.npy',
 '129.npy',
 '130.npy',
 '131.npy',
 '132.npy',
 '133.npy',
 '134.npy',
 '135.npy',
 '136.npy',
 '137.npy',
 '138.npy',
 '139.npy',
 '140.npy',
 '141.npy',
 '142.npy',
 '143.npy',
 '144.npy',
 '145.npy',
 '146.npy',
 '147.npy',
 '148.npy',
 '149.npy',
 '150.npy',
 '151.npy',
 '152.npy',
 '153.npy',
 '154.npy',
 '155.npy',
 '156.npy',
 '157.npy',
 '158.npy',
 '159.npy',
 '160.npy',
 '161.npy',
 '162.npy',
 '163.npy',
 '164.npy',
 '165.npy',
 '166.npy',
 '167.npy',
 '168.npy',
 '169.npy',
 '170.npy',
 '171