# Understanding the MNIST data using CDataLoaderMNIST from secml

In [None]:
# Use secml's CDataLoaderMNIST to load the MNIST dataset.
# For more details see: https://secml.readthedocs.io/en/stable/secml.data.loader.html#module-secml.data.loader.c_dataloader_mnist
import secml
from secml.data.loader import CDataLoaderMNIST

### The MNIST dataset consists of a training set of 60,000 samples and a test set of 10,000 samples.
### The images are 28x28 black-and-white (grayscale) handwritten digits from 0 to 9.

In [None]:
# CDataLoaderMNIST is used because it can return only the digits we ask for
# (the following cells load just the digits 0 and 4).
loader = CDataLoaderMNIST()
# Display the loader so the notebook shows the dataset file paths it uses.
loader

Downloading from `https://gitlab.com/api/v4/projects/secml%2Fsecml-zoo/repository/files/datasets%2FMNIST%2Ftrain-images-idx3-ubyte.gz/raw?ref=master` (9912422 bytes)

File stored in `/root/secml-data/datasets/mnist/train-images-idx3-ubyte.gz`
Downloading from `https://gitlab.com/api/v4/projects/secml%2Fsecml-zoo/repository/files/datasets%2FMNIST%2Ftrain-labels-idx1-ubyte.gz/raw?ref=master` (28881 bytes)

File stored in `/root/secml-data/datasets/mnist/train-labels-idx1-ubyte.gz`
Downloading from `https://gitlab.com/api/v4/projects/secml%2Fsecml-zoo/repository/files/datasets%2FMNIST%2Ft10k-images-idx3-ubyte.gz/raw?ref=master` (1648877 bytes)

File stored in `/root/secml-data/datasets/mnist/t10k-images-idx3-ubyte.gz`
Downloading from `https://gitlab.com/api/v4/projects/secml%2Fsecml-zoo/repository/files/datasets%2FMNIST%2Ft10k-labels-idx1-ubyte.gz/raw?ref=master` (4542 bytes)

File stored in `/root/secml-data/datasets/mnist/t10k-labels-idx1-ubyte.gz`


CDataLoaderMNIST{'train_data_path': '/root/secml-data/datasets/mnist/train-images-idx3-ubyte', 'train_labels_path': '/root/secml-data/datasets/mnist/train-labels-idx1-ubyte', 'test_data_path': '/root/secml-data/datasets/mnist/t10k-images-idx3-ubyte', 'test_labels_path': '/root/secml-data/datasets/mnist/t10k-labels-idx1-ubyte'}

In [None]:
# Sizes of the three data splits used in this notebook.
num_training = 600    # samples in the training set
num_validation = 200  # samples in the validation set
num_testing = 200     # samples in the testing set

# Only the digits 0 and 4 are loaded, because the case of interest here is
# a 4 being misclassified as a 0.
digits_classifying = (0, 4)

In [None]:
# The loader only offers two splits ('training' and 'testing'), so the
# training and validation samples are fetched together in a single call.
training_validation = loader.load(
    'training',
    digits=digits_classifying,
    num_samples=num_training + num_validation,
)
# The first num_training rows become the training set; the remaining rows
# become the validation set.
training_dataset = training_validation[:num_training, :]
validation_dataset = training_validation[num_training:, :]
# The testing set is loaded from the separate 'testing' split.
testing_dataset = loader.load(
    'testing',
    digits=digits_classifying,
    num_samples=num_testing,
)

In [None]:
training_dataset
# Each image is 28x28 pixels; flattened into a 1-D array it has 28 * 28 = 784
# features. With 600 training samples, X therefore has shape (600, 784).
# In the output below, Y holds the labels as 0/1, while the original digits
# (0, 4) are recorded in the header's 'y_original' field.

CDataset{'X': CArray(600, 784)(dense: [[0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] ... [0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.]]), 'Y': CArray(600,)(dense: [0 1 1 ... 1 0 1]), 'header': CDatasetHeader{'num_samples': None, 'img_w': 28, 'img_h': 28, 'y_original': (0, 4)}}

### Normalise the data to the range [0, 1]
#### Normalisation keeps the feature values on a consistent scale.
#### We divide every pixel value by 255 because MNIST pixel intensities are grayscale values (not RGB) in the range [0, 255], so dividing by 255 maps them into [0, 1].

In [None]:
# Scale the features of every split from [0, 255] down to [0, 1].
# The division is done in place on each split's X, in the order
# training -> validation -> testing.
for _split in (training_dataset, validation_dataset, testing_dataset):
    _split.X /= 255