# Active Learning with Pixano - MNIST Dataset

In [1]:
# Name of the dataset directory; joined to library_dir further down to build import_dir
DATASET_NAME="MNIST_pixano_v4"

In [2]:
# import ROOT dir to import pixano root module
import os
import sys

def insertRootDir(ROOTDIR='pixano'):
    """Insert the enclosing directory named ``ROOTDIR`` at the front of ``sys.path``.

    The search starts in the current working directory and moves up one parent
    at a time, printing each directory visited, until a directory whose
    basename equals ``ROOTDIR`` is found.

    Args:
        ROOTDIR: basename of the ancestor directory to look for.

    Raises:
        FileNotFoundError: if the filesystem root is reached without finding a
            directory named ``ROOTDIR`` (previously the loop never terminated
            in that situation).
    """
    # '__file__' is a string literal here, so realpath() resolves it relative
    # to the current working directory and dirname() yields that directory.
    pardir=os.path.dirname(os.path.realpath('__file__'))

    while os.path.basename(pardir)!=ROOTDIR:
        print(pardir)
        parent=os.path.dirname(pardir)
        if parent==pardir:
            # dirname() returns its argument unchanged only at the filesystem
            # root: nothing is left to search.
            raise FileNotFoundError(
                f"No parent directory named '{ROOTDIR}' found above the current working directory"
            )
        pardir=parent
    print("Inserting parent dir : ",pardir)
    sys.path.insert(0,pardir)

# Put the 'pixano' root directory on sys.path before the pixano imports below
insertRootDir()

/home/melissap/Desktop/LAGO/3.githubs/integration/2.integrateAL/pixano/dev/INTEGRATION/231031_mnist
/home/melissap/Desktop/LAGO/3.githubs/integration/2.integrateAL/pixano/dev/INTEGRATION
/home/melissap/Desktop/LAGO/3.githubs/integration/2.integrateAL/pixano/dev
Inserting parent dir :  /home/melissap/Desktop/LAGO/3.githubs/integration/2.integrateAL/pixano


In [3]:
from pathlib import Path
from pixano.data import ImageImporter

In [4]:
# Root directory holding the Pixano datasets. Set the PIXANO_LIBRARY_DIR
# environment variable to use another location; the default is the path
# originally hard-coded in this notebook.
library_dir=Path(os.environ.get('PIXANO_LIBRARY_DIR', '/home/melissap/_pixano_datasets_'))
# Directory of the dataset used throughout this notebook
import_dir = library_dir / DATASET_NAME


In [5]:
# Adapted from the MNIST.ipynb notebook
# Output path for the lance database
DB_PATH = library_dir / "_launce_datasets_/MNIST"
# Input image path
IMG_PATH = import_dir / "media"
# Note: the images were generated by the MNIST (v1) notebook and moved here
# TODO: add a cell that generates the images from mnist (from keras.datasets import mnist)
# NOTE(review): the dataset is imported into DB_PATH, whereas the notebook later
# connects to import_dir — confirm both locations are the intended ones.

# Importer named "MNIST"; the list is presumably the split names — confirm in pixano.data
mnist_importer = ImageImporter("MNIST", "MNIST dataset for AL", ["train", "test"])
# Last expression of the cell: its return value is displayed as the cell output
mnist_importer.import_dataset(
    input_dirs={ "image": IMG_PATH },
    import_dir=DB_PATH,
    portable=True
)

Importing dataset: 0it [00:00, ?it/s]

Copying media directories:   0%|          | 0/1 [00:00<?, ?it/s]

Creating dataset info file:   0%|          | 0/1 [00:00<?, ?it/s]

<pixano.data.dataset.Dataset at 0x7f6c05f50280>

In [6]:
import random
import lancedb
import pyarrow as pa
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.datasets import mnist
from ALearner import (
    Learner,
    BaseAnnotator,
    BaseSampler,
    BaseTrainer,
    getLabels,
    getLabelledIds,
    getUnlabelledIds,
    getTaggedIds,
    getLastRound,
    ddb_str,
    custom_update
)
from pixano.utils import natural_key

2023-11-17 11:23:38.435042: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# utility function to convert id (format "<index>.png") to index
def id_to_idx(id: str) -> int:
    """Return the integer found before the first "." of an item id.

    Example: ``"42.png"`` gives ``42``; an id without any "." is converted whole.
    """
    stem, _dot, _rest = id.partition(".")
    return int(stem)

### Connect to Pixano DB
The MNIST dataset must have been imported beforehand (see lance_importers/MNIST.ipynb).

In [8]:
# Open a LanceDB connection on the dataset directory (import_dir)
mnist_db = lancedb.connect(import_dir)

## Model Trainer Object

We get the raw training and test data (images and labels) directly from MNIST.

Two Model Trainer objects are proposed, both using the same model: SimpleTrainer and IncrementalTrainer.

In [9]:
# Raw MNIST arrays loaded through keras.datasets.mnist
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
# Size of one flattened image: product of the two trailing dimensions of X_train
num_pixels = X_train.shape[1] * X_train.shape[2]

def reshape_Xdata(x):
    """Flatten a batch of images and rescale it to float32 values in [0; 1].

    Args:
        x: array whose first dimension indexes the images and whose remaining
           dimensions hold ``num_pixels`` values per image.

    Returns:
        Array of shape ``(x.shape[0], num_pixels)`` and dtype float32.
    """
    n_items = x.shape[0]
    # one row per image, converted to float before scaling
    flat = x.reshape(n_items, num_pixels).astype('float32')
    # inputs range over [0; 255]; dividing by 255 maps them to [0; 1]
    return flat / 255

def reshape_Ydata(y, num_classes=10):
    """Convert integer class labels to a binary class matrix (one-hot encoding).

    Doc: https://keras.io/utils/#to_categorical

    Args:
        y: integer class labels.
        num_classes: width of the one-hot matrix. It is passed explicitly
            (default 10) because a sampled subset may not contain every class,
            in which case it could not be inferred from ``y``.

    Returns:
        The value returned by ``keras.utils.to_categorical``.
    """
    return keras.utils.to_categorical(y, num_classes=num_classes)


# The full training images are not reshaped here: each trainer reshapes only the
# items it trains on.
#x_train = reshape_Xdata(X_train)
# Test images and one-hot test labels, later passed to the trainer as validation data
x_test = reshape_Xdata(X_test)
y_train = reshape_Ydata(Y_train)
y_test = reshape_Ydata(Y_test)
# Number of classes = width of the one-hot label matrix
num_classes = y_train.shape[1]

def neural_network():
    """Build and compile the two-layer dense classifier used in this notebook.

    The hidden layer has ``num_pixels`` ReLU units and the output layer has
    ``num_classes`` softmax units. The model is compiled with categorical
    cross-entropy, the Adam optimizer and the accuracy metric.
    """
    layers = [
        Dense(num_pixels, input_dim=num_pixels, kernel_initializer='normal', activation='relu'),
        # a Dropout(.5) layer could be placed here
        Dense(num_classes, kernel_initializer='normal', activation='softmax'),
    ]
    net = Sequential(layers)
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

# Model instance handed to the trainer created further down
model = neural_network()


class SimpleTrainer(BaseTrainer):
    """Trainer that fits the model on all labelled items at every round.

    When ``avoid_overfit`` is true, the weights captured at construction time
    are restored before each call to ``train``.
    """

    def __init__(self, db, model, validation_data, avoid_overfit=False):
        # snapshot of the initial weights, restored before training if requested
        self.init_weights = model.get_weights()
        self.avoid_overfit = avoid_overfit
        super().__init__(db, model, validation_data)

    def train(self, epochs, batch_size):
        """Fit on every labelled item and evaluate on the validation data.

        Labels and ids come from the database; images are taken from the raw
        MNIST ``X_train`` array using the index encoded in each id.

        Args:
            epochs: number of epochs passed to ``fit``.
            batch_size: batch size passed to ``fit``.

        Returns:
            ``{"score": <validation accuracy in percent>}``.
        """
        ids = getLabelledIds(self.db)
        labels = getLabels(self.db)
        if self.avoid_overfit:
            print("Reset weights to avoid overfit")
            self.model.set_weights(self.init_weights)
        print(f"Train on {len(ids)} labelled items")
        x_train = reshape_Xdata(np.array([X_train[id_to_idx(item_id)] for item_id in ids]))
        y_train = reshape_Ydata(np.array(labels))
        self.model.fit(x_train, y_train, validation_data=self.validation_data, epochs=epochs, batch_size=batch_size)
        # evaluate the trainer's own model, not the notebook-level `model` global
        scores = self.model.evaluate(self.validation_data[0], self.validation_data[1])
        print("Neural network accuracy: %.2f%%" % (scores[1]*100))
        return {
            "score": scores[1]*100
        }

class IncrementalTrainer(BaseTrainer):
    """Trainer that fits the model only on the items labelled in the last round.

    ``initial_epoch`` accumulates across calls so that the epoch numbering
    continues from one round to the next.
    """

    def __init__(self, db, model, validation_data):
        self.initial_epoch = 0
        super().__init__(db, model, validation_data)

    def train(self, epochs, batch_size):
        """Fit on the last round's labelled items and evaluate on the validation data.

        Labels and ids come from the database; images are taken from the raw
        MNIST ``X_train`` array using the index encoded in each id.

        Args:
            epochs: number of additional epochs to run in this call.
            batch_size: batch size passed to ``fit``.

        Returns:
            ``{"score": <validation accuracy in percent>}``.
        """
        # named last_round so the built-in round() is not shadowed
        last_round = getLastRound(self.db)
        ids = getLabelledIds(self.db, last_round)
        labels = getLabels(self.db, last_round)
        print(f"Train on {len(ids)} labelled items. initial epoch = {self.initial_epoch}")
        x_train = reshape_Xdata(np.array([X_train[id_to_idx(item_id)] for item_id in ids]))
        y_train = reshape_Ydata(np.array(labels))
        # `epochs` in fit() is the final epoch index, hence initial_epoch + epochs
        self.model.fit(x_train, y_train, validation_data=self.validation_data, epochs=self.initial_epoch+epochs, batch_size=batch_size, initial_epoch=self.initial_epoch)
        # evaluate the trainer's own model, not the notebook-level `model` global
        scores = self.model.evaluate(self.validation_data[0], self.validation_data[1])
        print("Neural network accuracy: %.2f%%" % (scores[1]*100))
        # update initial_epoch for next round
        self.initial_epoch += epochs
        return {
            "score": scores[1]*100
        }

2023-11-17 11:23:39.711406: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


## Query Sampler Object
<!-- RandomSampler or SequentialSampler -->

In [10]:
# input: ids (whole dataset, or filtered (here: train only))
# output: candidates

class RandomSampler(BaseSampler):
    """Sampler that draws candidates at random among the unlabelled "train" items."""

    def query(self, n_candidates=10):
        """Return up to ``n_candidates`` randomly chosen unlabelled ids.

        When fewer unlabelled items remain than requested, all of them are
        returned (in random order) instead of raising ``ValueError``.
        """
        ids = getUnlabelledIds(self.db, split="train")
        # random.sample() raises ValueError if asked for more items than available
        return random.sample(ids, min(n_candidates, len(ids)))

class SequentialSampler(BaseSampler):
    """Sampler that returns the unlabelled "train" items with the lowest indices."""

    def query(self, n_candidates=10):
        """Return the first ``n_candidates`` unlabelled ids in index order.

        Ids are ordered by the integer encoded before their first "." (see
        ``id_to_idx``). A plain ``int`` key fails on ids that carry a file
        extension such as "8127.jpeg".
        """
        ids = getUnlabelledIds(self.db, split="train")
        return sorted(ids, key=id_to_idx)[0:n_candidates]

## Labeling Interface Objects

Human labeling with the Pixano Annotator is built in; here we define an Auto Annotator.

In [11]:
class AutoAnnotator(BaseAnnotator):
    """Annotator that fills in labels from the MNIST ground truth (``Y_train``)."""

    def annotate(self, round):
        """Write the ground-truth label of every item tagged for ``round``.

        Args:
            round: round number whose tagged items are annotated.
        """
        candidates = getTaggedIds(self.db, round)
        # open the table on the annotator's own connection, not the notebook-level global
        db_tbl = self.db.open_table("db")
        # labels are listed in natural-key order of the ids
        # NOTE(review): this presumably has to match the order in which
        # custom_update visits the selected rows — confirm in ALearner.
        ordered = sorted(candidates, key=natural_key)
        labels = [str(Y_train[id_to_idx(candidate)]) for candidate in ordered]
        custom_update(db_tbl, f"id in ({ddb_str(candidates)})", 'label', labels)
        print(f"AutoAnnotator: round {round} annotated.")

## Orchestrator

### Initial Learning

In [12]:
# Option 1: train on all labelled data without resetting the weights before each
# training round (it may overfit on the data of the first rounds)
# myTrainer = SimpleTrainer(mnist_db, model, (x_test, y_test))

# Option 2: train on all labelled data, resetting the weights before each training round
#myTrainer = SimpleTrainer(mnist_db, model, (x_test, y_test), avoid_overfit=True) 

# Option 3 (active): train only on the last round's data, without resetting the weights
myTrainer = IncrementalTrainer(mnist_db, model, (x_test, y_test))

# Sampler and automatic annotator working on the same database connection
randomSampler = RandomSampler(mnist_db)
autofillAnnotator = AutoAnnotator(mnist_db)

# Number of epochs per training round
epochs = 1

In [13]:
# Number of candidates requested for the initial round
init_round_size = 50
# Current round number. Note: this name shadows the built-in round() for the
# rest of the notebook.
round = 0

# Learner wired with the trainer, the sampler and the automatic annotator
# (new_al=True presumably starts a fresh active-learning session — confirm in ALearner)
init_learner = Learner(
    db=mnist_db,
    trainer=myTrainer,
    sampler=randomSampler,
    custom_annotator=autofillAnnotator,
    new_al=True,
    verbose=0
)
# One full round: query candidates, annotate them, then train
candidates = init_learner.query(round, init_round_size)
init_learner.annotate(round)
init_learner.train(round, epochs=epochs)

# Move on to the next round number
round += 1

Round 0 tagged
57 candidates on round 0
AutoAnnotator: round 0 annotated.
Train on 71 labelled items. initial epoch = 0
Neural network accuracy: 9.66%


In [14]:
# NOTE(review): this queries candidates for the current round without annotating
# or training, and the next cell queries the same round number again — confirm
# that this extra query is intended.
candidates = init_learner.query(round, init_round_size)


Round 1 tagged


We add some auto-annotation rounds

In [15]:
# Number of automatic rounds and number of candidates requested per round
auto_rounds = 3
round_size = 20
# `round` is both the start of the range and the loop variable: after the loop
# it holds the number of the last executed round.
for round in range(round, round+auto_rounds):
    candidates = init_learner.query(round, round_size)
    init_learner.annotate(round)
    init_learner.train(round, epochs=epochs)

Round 1 tagged
85 candidates on round 1
AutoAnnotator: round 1 annotated.
Train on 115 labelled items. initial epoch = 1
Epoch 2/2
Neural network accuracy: 5.91%
Round 2 tagged
21 candidates on round 2
AutoAnnotator: round 2 annotated.
Train on 23 labelled items. initial epoch = 2
Epoch 3/3
Neural network accuracy: 7.10%
Round 3 tagged
23 candidates on round 3
AutoAnnotator: round 3 annotated.
Train on 29 labelled items. initial epoch = 3
Epoch 4/4
Neural network accuracy: 8.40%


### Active Learning - Human annotation with Pixano Annotator

Here we use a different Learner for human annotation. The Trainer object uses the same model, so we keep training it.

In [16]:
# Number of human-annotated rounds and number of candidates requested per round
pix_rounds = 2
pix_round_size = 3
# Learner without a custom annotator, reusing the same trainer and sampler
learner_pix = Learner(
    db=mnist_db,
    trainer=myTrainer,
    sampler=randomSampler
)
# Continue numbering right after the last executed round
for round in range(round + 1, round + 1 + pix_rounds):
    candidates = learner_pix.query(round, pix_round_size)
    try:
        learner_pix.annotate(round)
    except KeyboardInterrupt:
        # annotation aborted: untag the current round and step the counter back
        learner_pix.untagRound(round)
        round -= 1
        print("Interrupted, current round has been canceled, and round labels erased")
        break
    else:
        # annotation completed: train on it and report the result
        result = learner_pix.train(round, epochs=epochs)
        print("result", result)


Round 4 tagged
4 candidates on round 4
4 items to annotate on round 4: ['8127.jpeg', '17773.jpeg', '36560.jpeg', '8127.jpeg']
Interrupted, current round has been canceled, and round labels erased
