In [8]:
import os
import sys

import git.repo
import numpy as np
from array import array
import struct
from mlxtend.data import loadlocal_mnist
from ffcv.fields import IntField, NDArrayField
from ffcv.writer import DatasetWriter

GIT_ROOT = str(
    git.repo.Repo(".", search_parent_directories=True).working_tree_dir
)

In [7]:
class NumpyDataset:
    def __init__(self, xs: np.ndarray, ys: np.ndarray):
        assert xs.shape[0] == ys.shape[0]

        self.xs = xs
        self.ys = ys

    def __len__(self):
        return self.ys.shape[0]

    def __getitem__(self, idx: int):
        return (self.xs[idx], self.ys[idx])

In [7]:
path = os.path.join(GIT_ROOT, f"data/mnist8m/test10k-labels")
# for the labels
with open(path, "rb") as binary_file:
    y_train = np.array(array("B", binary_file.read()))

In [4]:
X_test, y_test = loadlocal_mnist(
    images_path='../../data/mnist8m/test10k-patterns', 
    labels_path='../../data/mnist8m/test10k-labels')

X_train, y_train = loadlocal_mnist(
    images_path='../../data/mnist8m/mnist8m-patterns-idx3-ubyte', 
    labels_path='../../data/mnist8m/mnist8m-labels-idx1-ubyte')

In [33]:
print('Dimensions: %s x %s' % (X_test.shape[0], X_test.shape[1]))
print(y_test.shape)
print('Dimensions: %s x %s' % (X_train.shape[0], X_train.shape[1]))
print(y_train.shape)
print(X_train[0].shape)
dt = X_train.dtype
print(np.dtype(np.uint8).itemsize)
print(dt)
print(dt.itemsize)

Dimensions: 10000 x 784
(10000,)
Dimensions: 8100000 x 784
(8100000,)
(784,)
1
uint8
1


In [9]:
datasets = {
    "train": NumpyDataset(X_train, y_train),
    "test" : NumpyDataset(X_test, y_test),
}

In [34]:
for (name, ds) in datasets.items():
    writer = DatasetWriter(
        os.path.join(GIT_ROOT, f"data/mnist8m/{name}.beton"),
        {
            "image": NDArrayField(np.dtype(np.uint8), (784,)),
            "label": IntField(),
        },
        num_workers=100,
    )
    writer.from_indexed_dataset(ds, chunksize=10000)

100%|██████████| 8100000/8100000 [00:13<00:00, 604937.05it/s]  
100%|██████████| 10000/10000 [00:00<00:00, 99720.97it/s]


In [24]:
print(np.uint8.itemsize)
print(np.prod((784,)))
# np.uint8.itemsize * np.prod((784,))

<attribute 'itemsize' of 'numpy.generic' objects>
784


In [17]:
path = os.path.join(GIT_ROOT, f"data/mnist8m/test10k-patterns")
# for the images
with open(path, "rb") as binary_file:
    images = []
    emnistRotate = True
    magic, size, rows, cols = struct.unpack(">IIII", binary_file.read(16))
    if magic != 2051:
        raise ValueError('Magic number mismatch, expected 2051,''got {}'.format(magic))
    for i in range(size):
        images.append([0] * rows * cols)
    image_data = array("B", binary_file.read())
    for i in range(size):
        images[i][:] = image_data[i * rows * cols:(i + 1) * rows * cols]

        # for some reason EMNIST is mirrored and rotated
        if emnistRotate:
            x = image_data[i * rows * cols:(i + 1) * rows * cols]

            subs = []
            for r in range(rows):
                subs.append(x[(rows - r) * cols - cols:(rows - r)*cols])

            l = list(zip(*reversed(subs)))
            fixed = [item for sublist in l for item in sublist]
            images[i][:] = fixed
x = []
for image in images:
    x.append(np.rot90(np.flip(np.array(image).reshape((28,28)), 1), 1))
x_train = np.array(x)

In [18]:
x_train.shape

(10000, 28, 28)