In [None]:
"""
In this notebook we will convert the MNIST dataset into Pixano Format

Note: Activate the pixano environment before running this notebook.
"""

### NOTE: Before running this notebook, set the values of the following variables

In [13]:
# `os` is imported here because this configuration cell runs before the notebook's import cell.
import os

# The root dir name of the current repo (i.e. pixano or pixano-main etc.)
ROOTDIR='pixano_v1.0'
# name of the dataset
DATASET_NAME="MNIST_pixano_v1"
# directory where the raw mnist dataset will be saved to be transformed later (images), and also to be used by the active learning auto-annotator (labels)
# Paths are expressed relative to the current user's home directory ("~") instead of a
# hardcoded /home/<user> prefix, so the defaults resolve on any machine; edit them as needed.
datasets_dir=os.path.expanduser("~/Desktop/LAGO/3.githubs/integration/datasets")
# directory in which the transformed mnist dataset will be saved to be used by Pixano
library_dir=os.path.expanduser("~/_pixano_datasets_")


## ... the rest of the notebook should run without any code adjustments/modifications.

In [14]:
import os
import sys
from pathlib import Path
import shutil
from PIL import Image
import numpy as np
import pandas as pd
from torchvision import datasets

In [15]:
def insertRootDir(ROOTDIR='pixano'):
    """Find the closest ancestor directory named ``ROOTDIR`` and put it on ``sys.path``.

    The search starts at the current working directory (resolved through
    ``os.path.realpath``) and walks upwards. The starting directory itself is
    checked too, so running the notebook from the repository root works.

    Args:
        ROOTDIR: Directory *name* (not a path) to look for.

    Returns:
        The absolute path of the matching directory, which is also inserted at
        position 0 of ``sys.path``; or the string ``"_NOT_FOUND_"`` when no
        ancestor carries that name.
    """
    # '__file__' is deliberately a string literal: realpath() resolves it against the
    # current working directory, so dirname() yields the (real) working directory.
    start_dir = os.path.dirname(os.path.realpath('__file__'))

    # Name of the directory three levels above the working directory; only used as a
    # suggestion in the "not found" message.
    potential_root_dir = os.path.basename(
        os.path.dirname(os.path.dirname(os.path.dirname(start_dir)))
    )

    pardir = start_dir
    found = os.path.basename(pardir) == ROOTDIR
    while not found:
        parent = os.path.dirname(pardir)
        # dirname() of a filesystem root returns the root itself; stopping on that
        # fixed point terminates the walk on every platform, not only where root is "/".
        if parent == pardir:
            break
        pardir = parent
        found = os.path.basename(pardir) == ROOTDIR

    if found:
        print("Inserting parent dir : ",pardir)
        sys.path.insert(0,pardir)
        return pardir

    print(f"ROOTDIR NOT FOUND. You may have to change ROOTDIR variable from : '{ROOTDIR}' to '{potential_root_dir}'")
    return "_NOT_FOUND_"

# Resolve the repository root and add it to sys.path. After this line ROOTDIR holds a
# full path (or "_NOT_FOUND_"), so only its last component is passed in: that keeps the
# cell re-runnable, since insertRootDir compares against directory names, not paths.
ROOTDIR = insertRootDir(os.path.basename(os.path.normpath(ROOTDIR)))

Inserting parent dir :  /home/melissap/Desktop/LAGO_43integrationDemo/pixano_v1.0


In [16]:
from pixano.apps import Explorer
from pixano.data import ImageImporter, MnistImporter

/home/melissap/miniconda3/envs/pixano_env/lib/python3.10/site-packages/pixano/apps/explorer/dist/assets


In [19]:
def _save_split_images(images, out_dir, first_index):
    """Save every image of one split as '<index>.jpeg' in out_dir and return the next unused index."""
    index = first_index
    for i in range(len(images)):
        im = Image.fromarray(images[i].numpy())
        im.save(os.path.join(out_dir, str(index) + ".jpeg"))
        index += 1
    return index


def get_MNIST(data_dir):
    """Download MNIST with torchvision and store it as image files plus label CSVs.

    Layout created under ``data_dir``:
        images/train/<n>.jpeg, images/test/<n>.jpeg (and an empty images/val/),
        annotations/train.csv, annotations/test.csv.

    Image numbering is continuous across splits: test images continue where the
    train images stopped.

    The function does nothing when ``images/`` and both annotation CSVs already
    exist. The CSVs are written after all images, so their presence marks a
    finished conversion; an interrupted earlier run is redone instead of being
    mistaken for a complete dataset.

    Args:
        data_dir: Target directory (str or path-like).

    Returns:
        None.
    """
    image_dir = os.path.join(data_dir,"images")
    annotation_dir = os.path.join(data_dir,"annotations")
    train_imdir = os.path.join(image_dir,"train")
    val_imdir = os.path.join(image_dir,"val")
    test_imdir = os.path.join(image_dir,"test")

    raw_downloaDir = os.path.join(data_dir,"raw_dataset")

    train_anfile = os.path.join(annotation_dir,"train.csv")
    test_anfile = os.path.join(annotation_dir,"test.csv")

    already_converted = (
        os.path.isdir(image_dir)
        and os.path.isfile(train_anfile)
        and os.path.isfile(test_anfile)
    )
    if already_converted:
        return

    # exist_ok=True tolerates leftovers from a previous partial run, while real failures
    # (e.g. permission errors) are raised instead of being silently swallowed.
    for directory in (annotation_dir, train_imdir, val_imdir, test_imdir):
        os.makedirs(directory, exist_ok=True)

    raw_tr = datasets.MNIST(raw_downloaDir, train=True, download=True)
    raw_te = datasets.MNIST(raw_downloaDir, train=False, download=True)

    framecounter = _save_split_images(raw_tr.data, train_imdir, 0)
    framecounter = _save_split_images(raw_te.data, test_imdir, framecounter)

    # Labels are written last, in the same order as the images of their split.
    pd.DataFrame(raw_tr.targets.numpy()).to_csv(train_anfile,index=False)
    pd.DataFrame(raw_te.targets.numpy()).to_csv(test_anfile,index=False)

    # remove raw data
    shutil.rmtree(raw_downloaDir)
    print(f'Dataset succesfull downoladed within {data_dir}, framecounter = {framecounter}')

In [20]:
# Folder that receives the extracted MNIST images and label files
mnist_dir = Path(datasets_dir) / "MNIST"

# Metadata stored with the Pixano dataset
name = "Mnist dataset"
description = "http://yann.lecun.com/exdb/mnist/"
splits = ["train", "test"]  # the "val" split is not imported

# Source folders handed to the importer (only images are imported here)
input_dirs = {"image": mnist_dir / "images"}

# Pixano library folder and the dataset folder created inside it
library_dir = Path(library_dir)
import_dir = library_dir / DATASET_NAME

# Download and convert MNIST unless it is already present in mnist_dir
get_MNIST(mnist_dir)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 71900068.39it/s]

Extracting /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw/train-images-idx3-ubyte.gz to /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 125659433.43it/s]


Extracting /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw/train-labels-idx1-ubyte.gz to /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 34503893.46it/s]

Extracting /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw/t10k-images-idx3-ubyte.gz to /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 15968590.75it/s]


Extracting /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw/t10k-labels-idx1-ubyte.gz to /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST/raw_dataset/MNIST/raw

Dataset succesfull downoladed within /home/melissap/Desktop/LAGO/3.githubs/integration/datasets/MNIST, framecounter = 70000


In [21]:
# Show the signature and docstring of ImageImporter.import_dataset as a reference for the import call.
help(ImageImporter.import_dataset)

Help on function import_dataset in module pixano.data.importers.importer:

import_dataset(self, input_dirs: dict[str, pathlib.Path], import_dir: pathlib.Path, portable: bool = False) -> pixano.data.dataset.Dataset
    Import dataset to Pixano format
    
    Args:
        input_dirs (dict[str, Path]): Input directories
        import_dir (Path): Import directory
        portable (bool, optional): True to copy or download files to import directory and use relative paths. Defaults to False.
    
    Returns:
        Dataset: Imported dataset



In [22]:
# Build the importer from the dataset name, description and splits set in the notebook.
importer = ImageImporter(name, description, splits)
# portable=True: per the import_dataset docstring, files are copied or downloaded into
# import_dir and referenced through relative paths. The returned Dataset is displayed.
importer.import_dataset(input_dirs, import_dir, portable=True)

[2023-11-28T11:37:32Z WARN  lance::dataset] No existing dataset at /home/melissap/_pixano_datasets_/MNIST_pixano_v1/db.lance, it will be created
[2023-11-28T11:37:32Z WARN  lance::dataset] No existing dataset at /home/melissap/_pixano_datasets_/MNIST_pixano_v1/image.lance, it will be created


Importing dataset: 0it [00:00, ?it/s]

Copying media directories:   0%|          | 0/1 [00:00<?, ?it/s]

Creating dataset info file:   0%|          | 0/1 [00:00<?, ?it/s]

Creating dataset thumbnail:   0%|          | 0/1 [00:00<?, ?it/s]

<pixano.data.dataset.Dataset at 0x7f8ce3c4ab00>

### !ERROR: We found an issue here: the explorer does not return a localhost port for opening the Pixano GUI. A fix is required.

In [None]:
# Create a Pixano Explorer on the library directory (the parent folder of the imported dataset).
explorer = Explorer(library_dir)
# NOTE(review): presumably this starts/embeds the Pixano GUI for the library — Explorer's
# implementation is not visible in this notebook; confirm against the pixano.apps documentation.
explorer.display()