# Active Learning with Pixano - MNIST Dataset

> Initialize the key configuration variables used throughout the notebook.

In [17]:
# Configuration variables
# Dataset folder name: joined onto library_dir to build import_dir, and forwarded
# to the custom learner script as --data_name.
DATASET_NAME="MNIST_pixano_v8"
# NOTE(review): customTrainer/customSampler read their own `customLearnerCondaEnv`
# class attribute; this global is not referenced by the cells visible here.
customLearnerCondaEnv="customLearner3"

# variables that could be defined 
labels_per_round=100  # forwarded to the custom learner as --n_query
round = 0 # current round (NOTE: this name shadows the builtin round() from here on)
learning_rate=0.001  # forwarded as --learning_rate by customTrainer.train
max_epochs_per_round=100  # forwarded as --n_epoch by customTrainer.train
model_name="mlp"  # forwarded as --model
strategy="AlphaMixSampling" #EntropySampling #RandomSampling  (forwarded as --strategy)
# NOTE(review): the --alpha_opt flag is hard-coded in the command strings below,
# so this value does not appear to be read by the visible cells.
alpha_opt=True

> Locate the repository root (the `pixano` directory) and add it to `sys.path` so that the `pixano` package can be imported.

In [18]:
import os
import sys

def insertRootDir(ROOTDIR='pixano'):
    """Walk up from the working directory to the folder named ROOTDIR and put it on sys.path.

    Args:
        ROOTDIR: base name of the directory to look for among the ancestors of
            the current working directory.

    Returns:
        The absolute path of the directory that was inserted at sys.path[0].

    Raises:
        FileNotFoundError: if the filesystem root is reached without finding a
            directory named ROOTDIR.
    """
    # '__file__' is a string literal here (notebooks do not define __file__), so
    # realpath resolves it against the current working directory and dirname
    # yields that working directory.
    pardir=os.path.dirname(os.path.realpath('__file__'))

    while os.path.basename(pardir) != ROOTDIR:
        print(pardir)
        parent = os.path.dirname(pardir)
        # dirname() of the filesystem root returns the root itself; without this
        # guard the loop would spin forever when ROOTDIR is not an ancestor.
        if parent == pardir:
            raise FileNotFoundError(
                f"No parent directory named '{ROOTDIR}' found above the current working directory"
            )
        pardir = parent
    print("Inserting parent dir : ",pardir)
    sys.path.insert(0,pardir)
    return pardir

# Absolute path of the `pixano` directory; also used below as the parent of the temp_data exchange folder.
ROOTDIR = insertRootDir()

/home/melissap/Desktop/LAGO_43integrationDemo/pixano/ActiveLearning/certh_integration/231031_mnist
/home/melissap/Desktop/LAGO_43integrationDemo/pixano/ActiveLearning/certh_integration
/home/melissap/Desktop/LAGO_43integrationDemo/pixano/ActiveLearning
Inserting parent dir :  /home/melissap/Desktop/LAGO_43integrationDemo/pixano


In [14]:
from pathlib import Path
from pixano.data import ImageImporter

In [15]:
# NOTE(review): machine-specific absolute path; adjust to the local Pixano library location.
library_dir=Path('/home/melissap/_pixano_datasets_') # directory where the Pixano-formatted datasets are installed
# Folder of the dataset selected by DATASET_NAME; used for the lancedb connection and passed as --data_dir.
import_dir = library_dir / DATASET_NAME

In [16]:
# TAKEN FROM THE MNIST.ipynb notebook
# output path for lance database
DB_PATH = library_dir / "_launce_datasets_/MNIST"
# input image path
IMG_PATH = import_dir / "media"
# Note: images have been generated by MNIST (v1) notebook, and moved here
# TODO add a cell to generate image from mnist (from keras.datasets import mnist)

# Importer declared with the two splits "train" and "test".
# NOTE: in the recorded run this cell raised "FileNotFoundError: Generated dataset is empty";
# per that message, IMG_PATH must exist and contain one subfolder per split.
mnist_importer = ImageImporter("MNIST", "MNIST dataset for AL", ["train", "test"])
mnist_importer.import_dataset(
    input_dirs={ "image": IMG_PATH },
    import_dir=DB_PATH,
    portable=True
)

Importing dataset: 0it [00:00, ?it/s]

FileNotFoundError: Generated dataset is empty. Please make sure that the paths to your media files are correct, and that they each contain subfolders for your splits.

In [19]:
import random
import lancedb
import pyarrow as pa
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.datasets import mnist
from ALearner import (
    Learner,
    BaseAnnotator,
    BaseSampler,
    BaseTrainer,
    getLabels,
    getLabelledIds,
    getUnlabelledIds,
    getTaggedIds,
    getLastRound,
    ddb_str,
    custom_update
)
from pixano.utils import natural_key

2023-11-22 14:03:31.673441: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
# utility function to convert id (format "<index>.png") to index
def id_to_idx(id: str) -> int:
    """Return the integer index encoded before the first dot of an id such as "<index>.png"."""
    # Keep only the text that precedes the first "." (the whole id when there is no dot).
    stem, _, _ = id.partition(".")
    return int(stem)

### Connect to Pixano DB
MNIST dataset should have been imported previously (see lance_importers/MNIST.ipynb)

In [21]:
# Connection to the lance database stored in the dataset folder; handed to the trainers, samplers and annotator below.
mnist_db = lancedb.connect(import_dir)

## Model Trainer Object

We will get raw x_train, x_test, y_test data directly from MNIST.

Two Model Trainer objects are proposed, both using the same model: `SimpleTrainer` and `IncrementalTrainer`.

In [22]:
# Raw MNIST arrays from keras; X_train/Y_train are indexed later by the integer parsed from each item id.
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
# Number of values per flattened image (product of the two trailing dimensions of X_train).
num_pixels = X_train.shape[1] * X_train.shape[2]

def reshape_Xdata(x):
    """Flatten a batch of images to rows of `num_pixels` float32 values scaled into [0, 1].

    Relies on the module-level `num_pixels` for the row width.
    """
    # One row per image, then cast to float32 and rescale the [0, 255] range to [0, 1].
    flat = x.reshape(x.shape[0], num_pixels)
    return flat.astype('float32') / 255

def reshape_Ydata(y):
    """One-hot encode class labels into a 10-column binary class matrix.

    Doc: https://keras.io/utils/#to_categorical
    """
    # num_classes is fixed to 10 because a sampled subset can miss some classes.
    one_hot = keras.utils.to_categorical(y, num_classes=10)
    return one_hot


# Training images are not reshaped up front: the trainers below reshape only the labelled subset.
#x_train = reshape_Xdata(X_train)
x_test = reshape_Xdata(X_test)
# One-hot labels; y_train is used here to derive num_classes (the trainers build their own y_train).
y_train = reshape_Ydata(Y_train)
y_test = reshape_Ydata(Y_test)
# Width of the one-hot encoding, used as the size of the network's output layer.
num_classes = y_train.shape[1]

def neural_network():
    """Build and compile a two-layer dense classifier.

    The hidden layer has `num_pixels` ReLU units and the output layer has
    `num_classes` softmax units; both sizes come from module-level variables.

    Returns:
        The compiled keras Sequential model (categorical cross-entropy loss,
        adam optimizer, accuracy metric).
    """
    hidden_layer = Dense(num_pixels, input_dim=num_pixels, kernel_initializer='normal', activation='relu')
    output_layer = Dense(num_classes, kernel_initializer='normal', activation='softmax')

    net = Sequential()
    for layer in (hidden_layer, output_layer):
        net.add(layer)
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

# Single model instance passed to whichever trainer is instantiated in the orchestrator cell.
model = neural_network()


class SimpleTrainer(BaseTrainer):
    """Trainer that fits the model on every labelled item at each call.

    With `avoid_overfit=True` the weights captured at construction time are
    restored before each fit, so each round starts from the same initial state.
    """

    def __init__(self, db, model, validation_data, avoid_overfit=False):
        # Snapshot of the weights at construction time, restored in train() on request.
        self.init_weights = model.get_weights()
        self.avoid_overfit = avoid_overfit
        super().__init__(db, model, validation_data)

    def train(self, epochs, batch_size):
        """Fit on all labelled items and evaluate on the validation data.

        Args:
            epochs: number of epochs passed to `fit`.
            batch_size: batch size passed to `fit`.

        Returns:
            dict with key "score": the second value returned by `evaluate`
            (accuracy, per the model's compiled metrics) multiplied by 100.
        """
        # Labels and ids come from the db; images come from raw MNIST, looked up by id.
        ids = getLabelledIds(self.db)
        labels = getLabels(self.db)
        if self.avoid_overfit:
            print("Reset weights to avoid overfit")
            self.model.set_weights(self.init_weights)
        print(f"Train on {len(ids)} labelled items")
        x_train = reshape_Xdata(np.array([X_train[id_to_idx(item_id)] for item_id in ids]))
        y_train = reshape_Ydata(np.array(labels))
        self.model.fit(x_train, y_train, validation_data=self.validation_data, epochs=epochs, batch_size=batch_size)
        # Evaluate the trainer's own model rather than the notebook-level `model`
        # global, so the score always belongs to the model that was just fitted.
        scores = self.model.evaluate(self.validation_data[0], self.validation_data[1])
        print("Neural network accuracy: %.2f%%" % (scores[1]*100))
        return {
            "score": scores[1]*100
        }

class IncrementalTrainer(BaseTrainer):
    """Trainer that fits only on the items labelled during the last round.

    Weights are never reset; `initial_epoch` accumulates across calls so the
    epoch numbering passed to `fit` continues from one round to the next.
    """

    def __init__(self, db, model, validation_data):
        self.initial_epoch = 0
        super().__init__(db, model, validation_data)

    def train(self, epochs, batch_size):
        """Fit on the last round's labelled items and evaluate on the validation data.

        Args:
            epochs: number of additional epochs to run in this call.
            batch_size: batch size passed to `fit`.

        Returns:
            dict with key "score": the second value returned by `evaluate`
            (accuracy, per the model's compiled metrics) multiplied by 100.
        """
        # Labels and ids come from the db (restricted to the last round); images come from raw MNIST.
        last_round = getLastRound(self.db)
        ids = getLabelledIds(self.db, last_round)
        labels = getLabels(self.db, last_round)
        print(f"Train on {len(ids)} labelled items. initial epoch = {self.initial_epoch}")
        x_train = reshape_Xdata(np.array([X_train[id_to_idx(item_id)] for item_id in ids]))
        y_train = reshape_Ydata(np.array(labels))
        self.model.fit(x_train, y_train, validation_data=self.validation_data, epochs=self.initial_epoch+epochs, batch_size=batch_size, initial_epoch=self.initial_epoch)
        # Evaluate the trainer's own model rather than the notebook-level `model`
        # global, so the score always belongs to the model that was just fitted.
        scores = self.model.evaluate(self.validation_data[0], self.validation_data[1])
        print("Neural network accuracy: %.2f%%" % (scores[1]*100))
        # update initial_epoch for next round
        self.initial_epoch += epochs
        return {
            "score": scores[1]*100
        }

2023-11-22 14:03:55.360936: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


## Query Sampler Object
<!-- RandomSampler or SequentialSampler -->

In [23]:
# input: ids (whole dataset, or filtered (here: train only))
# output: candidates

class RandomSampler(BaseSampler):
    """Sampler that draws candidates uniformly at random from the unlabelled train ids."""

    def query(self, n_candidates=10):
        """Return up to `n_candidates` randomly chosen unlabelled ids of the "train" split.

        When fewer unlabelled ids remain than requested, all remaining ids are
        returned (in random order) instead of raising ValueError; this matches
        SequentialSampler, whose slice also shrinks when the pool runs low.
        """
        ids = getUnlabelledIds(self.db, split="train")
        return random.sample(ids, min(n_candidates, len(ids)))

class SequentialSampler(BaseSampler):
    """Sampler that returns the unlabelled train ids with the lowest numeric index first."""

    def query(self, n_candidates=10):
        """Return the first `n_candidates` unlabelled ids of the "train" split, ordered by index.

        Ids are ordered with `id_to_idx`, which parses the digits before the
        first dot. A plain `int` key raises ValueError on ids that carry a file
        extension (e.g. "123.png"), which is the id format used in this notebook.
        """
        ids = getUnlabelledIds(self.db, split="train")
        return sorted(ids, key=id_to_idx)[0:n_candidates]

#### Custom Trainer

> Prepare the directories used to exchange data between Pixano and the external custom learner.

In [24]:
import subprocess

# TEMPORARY SOLUTION
def create_dir(path):
    """Ensure that directory `path` exists and return `path`.

    Missing parent directories are created as needed. If the directory already
    exists, a message is printed and nothing is changed.

    Raises:
        OSError: if the directory cannot be created (e.g. permission denied, or
            `path` exists but is not a directory). These failures used to be
            swallowed and misreported as "exists already".
    """
    if os.path.isdir(path):
        print(f'Dir {path} exists already')
    else:
        # exist_ok=True tolerates the directory appearing between the check and the call.
        os.makedirs(path, exist_ok=True)
    return path
# Paths used to exchange data between Pixano and the customLearner; all live under <ROOTDIR>/temp_data.
temp_data_exchange_dir = create_dir(os.path.join(ROOTDIR,"temp_data"))                # define a directory for exchanging data
output_queDir=create_dir(os.path.join(temp_data_exchange_dir,"output_queries"))       # [out] query strategy results (queries_<round>.csv, read by customSampler)
output_accDir=create_dir(os.path.join(temp_data_exchange_dir,"output_accuracy"))      # [out] accuracy results (accuracy.csv, read by customTrainer)

> define the trainer

In [25]:
class customTrainer():
    """Trainer that delegates training to the external custom learner script.

    `train` launches `alpha_mix_active_learning/_main.py` inside a conda
    environment and reads the resulting accuracy from a CSV file.

    NOTE(review): the command line is built from the notebook-level globals
    (`labels_per_round`, `learning_rate`, `max_epochs_per_round`, `model_name`,
    `strategy`), not from the class attributes `learning_rate`, `n_epoch` and
    `model` declared below; confirm which source is intended.
    """

    weights_dir="_weights"
    # batch_size = 16
    learning_rate=0.001
    n_epoch=100
    model="mlp"

    mode='train'
    customLearnerCondaEnv = "customLearner3"

    def __init__(self, db, model, validation_data, **kwargs):
        """Store the db handle and apply keyword overrides of existing attributes.

        Args:
            db: database handle, later passed to `getLastRound`.
            model: accepted for signature compatibility with the other trainers; not stored.
            validation_data: stored as-is; not used by `train`.
            **kwargs: attribute overrides; keys that are not existing attributes are ignored silently.
        """
        self.db = db
        self.validation_data = validation_data # ---------------------------> remove later
        self.initial_epoch = 0

        # sets new values to any default arguments passed during construction
        for key, value in kwargs.items():
            if hasattr(self, key):
                self.set_parameter(key,value)

    def set_parameter(self,key,value):
        """Set attribute `key` to `value` if it already exists; otherwise print a notice."""
        if hasattr(self, key):
            setattr(self, key, value)
        else:
            print(f'Argument {key} does not exist. Value of {value} does not set any of the member values of the customTrainer class')

    def train(self, epochs, batch_size):
        """Run the external training script and return its accuracy for the last round.

        Args:
            epochs: unused; the epoch budget comes from `max_epochs_per_round`.
            batch_size: unused.

        Returns:
            dict with key "score": 100 times the "accuracy" value of row
            "round_<last round>" in the CSV written by the script.

        Raises:
            subprocess.CalledProcessError: if the shell command exits with a non-zero status.
        """
        import shlex

        def quote(value):
            # Shell-quote every interpolated value so paths containing spaces or
            # shell metacharacters reach the script as a single argument.
            return shlex.quote(str(value))

        curRound = getLastRound(self.db)
        # csvAcc=os.path.join(output_accDir,"accuracy"+str(curRound)+".csv")
        csvAcc=os.path.join(output_accDir,"accuracy.csv")

        # --mode is passed exactly once (it used to be given twice, with a trailing
        # hard-coded "train" that overrode self.mode).
        arguments = " ".join([
            f"--data_name {quote(DATASET_NAME)}",
            f"--mode {quote(self.mode)}",
            f"--train_out {quote(csvAcc)}",
            f"--data_dir {quote(import_dir)}",
            f"--n_query {quote(labels_per_round)}",
            f"--learning_rate {quote(learning_rate)}",
            f"--n_epoch {quote(max_epochs_per_round)}",
            f"--model {quote(model_name)}",
            f"--strategy {quote(strategy)}",
            "--alpha_opt",
        ])
        command = "\n".join([
            "source ~/miniconda3/etc/profile.d/conda.sh",
            f"conda activate {quote(self.customLearnerCondaEnv)}",
            f"python alpha_mix_active_learning/_main.py {arguments}", #{customLearner_ROOTDIR}/customLearner_main_3
        ])
        subprocess.run(command, shell=True, executable='/bin/bash', check=True)

        trainOut = pd.read_csv(csvAcc,index_col=0)
        return {
            "score": 100 * trainOut.loc["round_"+str(curRound),"accuracy"]
        }

#### CERTH - Custom Learner

In [26]:
# here define the implementation for the new sampler
class customSampler(BaseSampler):
    """Sampler that delegates candidate selection to the external custom learner script.

    `query` launches `alpha_mix_active_learning/_main.py` inside a conda
    environment and reads the selected ids from a per-round CSV file.

    NOTE(review): the command line is built from the notebook-level globals
    (`labels_per_round`, `model_name`, `strategy`); the class attributes
    `n_query`, `model` and `stategy` (sic) are not read by `query`, so
    overriding them through kwargs has no effect on the subprocess.
    """

    #add all other dependencies define in https://docs.google.com/document/d/1NlArhWYjePzB43sR4HCUc_4xBU73Up9OI24hIyPx0zY/edit

    # for now only the vital ones
    output_dir="_output"
    log_directory="_logs"
    n_init_lb=100
    n_query=100
    alpha_opt=True
    mode = "query"
    stategy = "AlphaMixSampling" #EntropySampling #RandomSampling
    model = "mlp"
    customLearnerCondaEnv = "customLearner3"

    def __init__(self, dataset, **kwargs):
        """Initialise the base sampler and apply keyword overrides of existing attributes.

        Keys in `kwargs` that are not existing attributes are ignored silently.
        """
        super().__init__(dataset)

        # sets new values to any default arguments passed during construction
        for key, value in kwargs.items():
            if hasattr(self, key):
                self.set_parameter(key,value)

    def set_parameter(self,key,value):
        """Set attribute `key` to `value` if it already exists; otherwise print a notice."""
        if hasattr(self, key):
            setattr(self, key, value)
        else:
            print(f'Argument {key} does not exist. Value of {value} does not set any of the member values of the customSampler class')

    def query(self, discard_n_candidates=10):
        """Run the external query script and return the ids it selected.

        Args:
            discard_n_candidates: ignored; the number of candidates requested
                from the script comes from the global `labels_per_round`.

        Returns:
            list built from the "query_results" column of the CSV written by the script.

        Raises:
            subprocess.CalledProcessError: if the shell command exits with a non-zero status.
        """
        # under active development
        import shlex

        def quote(value):
            # Shell-quote every interpolated value so paths containing spaces or
            # shell metacharacters reach the script as a single argument.
            return shlex.quote(str(value))

        # if (round == -1):                                                   # random sampling when labels are absent
        #     ids = getLabelledIds(self.db, round)
        #     return random.sample(ids, labels_per_round)
        # elif (round >= 0):
        # The last round is looked up once (it used to be queried twice, the first result unused).
        curRound = getLastRound(self.db)

        csvQue=os.path.join(output_queDir,"queries_"+str(curRound)+".csv")

        arguments = " ".join([
            f"--data_name {quote(DATASET_NAME)}",
            f"--data_dir {quote(import_dir)}",
            f"--mode {quote(self.mode)}",
            f"--query_out {quote(csvQue)}",
            f"--n_query {quote(labels_per_round)}",
            f"--model {quote(model_name)}",
            f"--strategy {quote(strategy)}",
            "--alpha_opt",
        ])
        command = "\n".join([
            "source ~/miniconda3/etc/profile.d/conda.sh",
            f"conda activate {quote(self.customLearnerCondaEnv)}",
            f"python alpha_mix_active_learning/_main.py {arguments}",
        ])
        subprocess.run(command, shell=True, executable='/bin/bash', check=True)

        queryOut = pd.read_csv(csvQue,index_col=0)

        return queryOut["query_results"].tolist()

## Labeling Interface Objects

Human labeling with the Pixano Annotator is built in; here we additionally define an automatic annotator that fills in labels from the MNIST ground truth.

In [27]:
class AutoAnnotator(BaseAnnotator):
    """Annotator that fills labels automatically from the MNIST ground truth (`Y_train`)."""

    def annotate(self, round):
        """Write the ground-truth label of every item tagged for `round` into the "db" table.

        Args:
            round: round number whose tagged ids are annotated.
        """
        candidates = getTaggedIds(self.db, round)
        # Open the table through the annotator's own db handle rather than the
        # notebook-level `mnist_db` global, so the ids are read from and the
        # labels are written to the same database.
        db_tbl = self.db.open_table("db")
        # Labels are listed in natural-key order of the ids.
        # NOTE(review): presumably custom_update applies the values in that row order; confirm.
        ordered_ids = sorted(candidates, key=natural_key)
        labels = [str(Y_train[id_to_idx(candidate)]) for candidate in ordered_ids]
        custom_update(db_tbl, f"id in ({ddb_str(candidates)})", 'label', labels)
        print(f"AutoAnnotator: round {round} annotated.")

In [28]:
# Check that the "db" table can be opened; the cell displays the table object.
mnist_db.open_table("db")

LanceTable(db)

## Orchestrator

### Initial Learning

In [29]:
# Pick ONE trainer below; the commented-out lines are the alternatives.

# train on all data, don't reset weights before each training round (it may overfit on first rounds data)
# myTrainer = SimpleTrainer(mnist_db, model, (x_test, y_test))

# train on all data, reset weights before each training round
#myTrainer = SimpleTrainer(mnist_db, model, (x_test, y_test), avoid_overfit=True) 

# train on candidates data (without resetting weights obviously)
# myTrainer = IncrementalTrainer(mnist_db, model, (x_test, y_test))
# Active choice: training is delegated to the external custom learner script.
myTrainer = customTrainer(mnist_db, model, (x_test, y_test))

# randomSampler = RandomSampler(mnist_db)
# NOTE: despite its name, `randomSampler` holds a customSampler here.
# `irrelevant=5` is dropped silently (customSampler.__init__ only forwards keys that already
# exist as attributes), and `n_query=200` sets the attribute but customSampler.query builds
# --n_query from the global labels_per_round.
randomSampler = customSampler(mnist_db,n_query=200, irrelevant=5)

autofillAnnotator = AutoAnnotator(mnist_db)

# Passed to Learner.train below; customTrainer.train does not use its `epochs` argument.
epochs = 1

In [31]:
# Learner wired with the trainer, sampler and automatic annotator chosen above.
init_learner = Learner(
    db=mnist_db,
    trainer=myTrainer,
    sampler=randomSampler,
    custom_annotator=autofillAnnotator,
    new_al=True,
    verbose=0
)
# One full active-learning round: query candidates, auto-annotate them, then train.
candidates = init_learner.query(round, labels_per_round)
init_learner.annotate(round)
init_learner.train(round, epochs=epochs)

# round += 1



###################################################### INTERNAL GPU CHECK ######################################################


is_available  True
device_count  1
current device  0
cuda.device  <torch.cuda.device object at 0x7fa81024ff70>
device name  NVIDIA GeForce RTX 3090


################################################################################################################################


Namespace(mode='query', data_name='MNIST_pixano_v8', n_label=10, data_dir='/home/melissap/_pixano_datasets_/MNIST_pixano_v7', train_out=None, query_out='/home/melissap/Desktop/LAGO_43integrationDemo/pixano/temp_data/output_queries/queries_-1.csv', log_dir='_logs', save_checkpoints=False, save_images=False, print_to_file=False, seeds=[1, 10, 100, 1000, 10000], init_lb_method='general_random', n_query=100, query_growth_ratio=1, strategy='AlphaMixSampling', n_drop=5, eps=0.05, max_iter=50, alpha_cap=0.03125, alpha_opt=True, alpha_closed_form_approx=True, alpha_learning_rate=0.1, alp

 26%|██▌       | 26/100 [00:28<01:21,  1.10s/it]

Reached max accuracy at epoch 26 


 26%|██▌       | 26/100 [00:29<01:25,  1.15s/it]


Round 0
testing accuracy 0.07792207792207792


{'score': 7.79220779220779}

In [32]:
candidates = init_learner.query(round, labels_per_round)




###################################################### INTERNAL GPU CHECK ######################################################


is_available  True
device_count  1
current device  0
cuda.device  <torch.cuda.device object at 0x7fb73eb67e50>
device name  NVIDIA GeForce RTX 3090


################################################################################################################################


Namespace(mode='query', data_name='MNIST_pixano_v8', n_label=10, data_dir='/home/melissap/_pixano_datasets_/MNIST_pixano_v7', train_out=None, query_out='/home/melissap/Desktop/LAGO_43integrationDemo/pixano/temp_data/output_queries/queries_0.csv', log_dir='_logs', save_checkpoints=False, save_images=False, print_to_file=False, seeds=[1, 10, 100, 1000, 10000], init_lb_method='general_random', n_query=100, query_growth_ratio=1, strategy='AlphaMixSampling', n_drop=5, eps=0.05, max_iter=50, alpha_cap=0.03125, alpha_opt=True, alpha_closed_form_approx=True, alpha_learning_rate=0.1, alph

  super()._check_params_vs_input(X, default_n_init=10)


number of samples that are misclassified and selected: 100 (100.00%)
Log Determinant of the Gram Matrix: 409.295441
Signed Log Determinant of the Gram Matrix: 409.295441
Confidence: 0.324890
Margin: 0.007934
Predicted Entropy: 2.264852
GT Entropy: nan
Border Entropy: 3.553058


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


Round 0 tagged


We add some auto-annotation rounds

In [19]:
# Number of additional automatic rounds to run.
auto_rounds = 3
# NOTE(review): customSampler.query ignores its size argument (`discard_n_candidates`), so with the
# sampler configured above this value presumably has no effect on the number of candidates; confirm.
round_size = 20
# The loop variable rebinds the notebook-level `round`; after the loop it holds the last round run.
for round in range(round, round+auto_rounds):
    candidates = init_learner.query(round, round_size)
    init_learner.annotate(round)
    init_learner.train(round, epochs=epochs)



###################################################### INTERNAL GPU CHECK ######################################################


is_available  True
device_count  1
current device  0
cuda.device  <torch.cuda.device object at 0x7fbe3ffb4550>
device name  NVIDIA GeForce RTX 3090


################################################################################################################################


Namespace(mode='query', data_name='MNIST_pixano_v7', n_label=10, data_dir='/home/melissap/_pixano_datasets_/MNIST_pixano_v7', train_out=None, query_out='/home/melissap/Desktop/LAGO_43integrationDemo/pixano/temp_data/output_queries/queries_0.csv', log_dir='_logs', save_checkpoints=False, save_images=False, print_to_file=False, seeds=[1, 10, 100, 1000, 10000], init_lb_method='general_random', n_query=100, query_growth_ratio=1, strategy='AlphaMixSampling', n_drop=5, eps=0.05, max_iter=50, alpha_cap=0.03125, alpha_opt=True, alpha_closed_form_approx=True, alpha_learning_rate=0.1, alph

  super()._check_params_vs_input(X, default_n_init=10)


number of samples that are misclassified and selected: 100 (100.00%)
Log Determinant of the Gram Matrix: 310.990723
Signed Log Determinant of the Gram Matrix: 310.990723
Confidence: 0.302082
Margin: 0.004412
Predicted Entropy: 2.174998
GT Entropy: nan
Border Entropy: 3.317727


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


Round 0 tagged
223 candidates on round 0
AutoAnnotator: round 0 annotated.


###################################################### INTERNAL GPU CHECK ######################################################


is_available  True
device_count  1
current device  0
cuda.device  <torch.cuda.device object at 0x7f8fa8594520>
device name  NVIDIA GeForce RTX 3090


################################################################################################################################


Namespace(mode='train', data_name='MNIST_pixano_v7', n_label=10, data_dir='/home/melissap/_pixano_datasets_/MNIST_pixano_v7', train_out='/home/melissap/Desktop/LAGO_43integrationDemo/pixano/temp_data/output_accuracy/accuracy.csv', query_out='path to the file', log_dir='_logs', save_checkpoints=False, save_images=False, print_to_file=False, seeds=[1, 10, 100, 1000, 10000], init_lb_method='general_random', n_query=100, query_growth_ratio=1, strategy='AlphaMixSampling', n_drop=5, eps=0.05, max_iter=50, alpha_

 34%|███▍      | 34/100 [00:33<01:02,  1.06it/s]

Reached max accuracy at epoch 34 


 34%|███▍      | 34/100 [00:34<01:07,  1.02s/it]


Round 0
testing accuracy 0.0


###################################################### INTERNAL GPU CHECK ######################################################


is_available  True
device_count  1
current device  0
cuda.device  <torch.cuda.device object at 0x7fa02817c5e0>
device name  NVIDIA GeForce RTX 3090


################################################################################################################################


Namespace(mode='query', data_name='MNIST_pixano_v7', n_label=10, data_dir='/home/melissap/_pixano_datasets_/MNIST_pixano_v7', train_out=None, query_out='/home/melissap/Desktop/LAGO_43integrationDemo/pixano/temp_data/output_queries/queries_0.csv', log_dir='_logs', save_checkpoints=False, save_images=False, print_to_file=False, seeds=[1, 10, 100, 1000, 10000], init_lb_method='general_random', n_query=100, query_growth_ratio=1, strategy='AlphaMixSampling', n_drop=5, eps=0.05, max_iter=50, alpha_cap=0.03125, alpha_opt=True, alpha_closed_form_approx=True, 

  super()._check_params_vs_input(X, default_n_init=10)


number of samples that are misclassified and selected: 100 (100.00%)
Log Determinant of the Gram Matrix: 364.347290
Signed Log Determinant of the Gram Matrix: 364.347290
Confidence: 0.327119
Margin: 0.006281
Predicted Entropy: 2.259030
GT Entropy: nan
Border Entropy: 3.518425


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


Round 1 tagged
122 candidates on round 1
AutoAnnotator: round 1 annotated.


###################################################### INTERNAL GPU CHECK ######################################################


is_available  True
device_count  1
current device  0
cuda.device  <torch.cuda.device object at 0x7fa9baba0610>
device name  NVIDIA GeForce RTX 3090


################################################################################################################################


Namespace(mode='train', data_name='MNIST_pixano_v7', n_label=10, data_dir='/home/melissap/_pixano_datasets_/MNIST_pixano_v7', train_out='/home/melissap/Desktop/LAGO_43integrationDemo/pixano/temp_data/output_accuracy/accuracy.csv', query_out='path to the file', log_dir='_logs', save_checkpoints=False, save_images=False, print_to_file=False, seeds=[1, 10, 100, 1000, 10000], init_lb_method='general_random', n_query=100, query_growth_ratio=1, strategy='AlphaMixSampling', n_drop=5, eps=0.05, max_iter=50, alpha_

 12%|█▏        | 12/100 [00:13<01:28,  1.01s/it]

Reached max accuracy at epoch 12 


 12%|█▏        | 12/100 [00:13<01:42,  1.17s/it]


Round 0
testing accuracy 0.06666666666666667


###################################################### INTERNAL GPU CHECK ######################################################


is_available  True
device_count  1
current device  0
cuda.device  <torch.cuda.device object at 0x7f8a501cc610>
device name  NVIDIA GeForce RTX 3090


################################################################################################################################


Namespace(mode='query', data_name='MNIST_pixano_v7', n_label=10, data_dir='/home/melissap/_pixano_datasets_/MNIST_pixano_v7', train_out=None, query_out='/home/melissap/Desktop/LAGO_43integrationDemo/pixano/temp_data/output_queries/queries_1.csv', log_dir='_logs', save_checkpoints=False, save_images=False, print_to_file=False, seeds=[1, 10, 100, 1000, 10000], init_lb_method='general_random', n_query=100, query_growth_ratio=1, strategy='AlphaMixSampling', n_drop=5, eps=0.05, max_iter=50, alpha_cap=0.03125, alpha_opt=True, alpha_closed_fo

  super()._check_params_vs_input(X, default_n_init=10)


number of samples that are misclassified and selected: 100 (100.00%)
Log Determinant of the Gram Matrix: 386.348907
Signed Log Determinant of the Gram Matrix: 386.348907
Confidence: 0.314974
Margin: 0.006477
Predicted Entropy: 2.227732
GT Entropy: nan


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


Border Entropy: 3.510777
Round 2 tagged
117 candidates on round 2
AutoAnnotator: round 2 annotated.


###################################################### INTERNAL GPU CHECK ######################################################


is_available  True
device_count  1
current device  0
cuda.device  <torch.cuda.device object at 0x7fc3def74580>
device name  NVIDIA GeForce RTX 3090


################################################################################################################################


Namespace(mode='train', data_name='MNIST_pixano_v7', n_label=10, data_dir='/home/melissap/_pixano_datasets_/MNIST_pixano_v7', train_out='/home/melissap/Desktop/LAGO_43integrationDemo/pixano/temp_data/output_accuracy/accuracy.csv', query_out='path to the file', log_dir='_logs', save_checkpoints=False, save_images=False, print_to_file=False, seeds=[1, 10, 100, 1000, 10000], init_lb_method='general_random', n_query=100, query_growth_ratio=1, strategy='AlphaMixSampling', n_drop=5, eps=

 16%|█▌        | 16/100 [00:17<01:34,  1.12s/it]

Reached max accuracy at epoch 16 


 16%|█▌        | 16/100 [00:18<01:37,  1.16s/it]


Round 0
testing accuracy 0.08064516129032258


### Active Learning - Human annotation with Pixano Annotator

Here we use a different Learner for human annotation. The Trainer object is the same one as before, so the same model keeps being trained.

In [35]:
# Number of human-annotation rounds to run.
pix_rounds = 3

# No custom_annotator is given here, unlike init_learner above.
learner_pix = Learner(
    db=mnist_db,
    trainer=myTrainer,
    sampler=randomSampler
)
# Continue numbering from the round after the current notebook-level `round`.
for round in range(round + 1, pix_rounds + round + 1):
    # The second argument is a placeholder string: customSampler.query ignores its size argument.
    candidates = learner_pix.query(round , "pix_round_size - defined_within_class")
    # if aborted, we must untag the current round 
    try:
        learner_pix.annotate(round)
    except KeyboardInterrupt:
        learner_pix.untagRound(round)
        # Step back so `round` points at the last completed round once the loop exits.
        round = round - 1
        print("Interrupted, current round has been canceled, and round labels erased")
        break
    result = learner_pix.train(round, epochs=epochs)
    print("result", result)




###################################################### INTERNAL GPU CHECK ######################################################


is_available  True
device_count  1
current device  0
cuda.device  <torch.cuda.device object at 0x7f5cfd37fe20>
device name  NVIDIA GeForce RTX 3090


################################################################################################################################


Namespace(mode='query', data_name='MNIST_pixano_v8', n_label=10, data_dir='/home/melissap/_pixano_datasets_/MNIST_pixano_v7', train_out=None, query_out='/home/melissap/Desktop/LAGO_43integrationDemo/pixano/temp_data/output_queries/queries_0.csv', log_dir='_logs', save_checkpoints=False, save_images=False, print_to_file=False, seeds=[1, 10, 100, 1000, 10000], init_lb_method='general_random', n_query=100, query_growth_ratio=1, strategy='AlphaMixSampling', n_drop=5, eps=0.05, max_iter=50, alpha_cap=0.03125, alpha_opt=True, alpha_closed_form_approx=True, alpha_learning_rate=0.1, alph

  super()._check_params_vs_input(X, default_n_init=10)


number of samples that are misclassified and selected: 100 (100.00%)
Log Determinant of the Gram Matrix: 409.295441
Signed Log Determinant of the Gram Matrix: 409.295441
Confidence: 0.324890
Margin: 0.007934
Predicted Entropy: 2.264852
GT Entropy: nan
Border Entropy: 3.553058


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


Round 2 tagged
115 candidates on round 2
115 items to annotate on round 2: ['23776.jpeg', '24055.jpeg', '24297.jpeg', '24548.jpeg', '41605.jpeg', '41922.jpeg', '58321.jpeg', '58366.jpeg', '43076.jpeg', '59850.jpeg', '9962.jpeg', '26238.jpeg', '42208.jpeg', '42603.jpeg', '42867.jpeg', '44517.jpeg', '44641.jpeg', '44821.jpeg', '426.jpeg', '649.jpeg', '7691.jpeg', '7822.jpeg', '7863.jpeg', '7894.jpeg', '8007.jpeg', '10283.jpeg', '10482.jpeg', '10621.jpeg', '10908.jpeg', '11090.jpeg', '12398.jpeg', '12997.jpeg', '13126.jpeg', '28441.jpeg', '11526.jpeg', '11868.jpeg', '12233.jpeg', '45293.jpeg', '45300.jpeg', '45501.jpeg', '1884.jpeg', '13405.jpeg', '30042.jpeg', '46149.jpeg', '47099.jpeg', '3037.jpeg', '30807.jpeg', '31451.jpeg', '47107.jpeg', '47837.jpeg', '3945.jpeg', '15708.jpeg', '15713.jpeg', '15977.jpeg', '16297.jpeg', '18838.jpeg', '19371.jpeg', '35011.jpeg', '35279.jpeg', '1884.jpeg', '18285.jpeg', '18286.jpeg', '34065.jpeg', '34447.jpeg', '34548.jpeg', '7090.jpeg', '426.jpeg', '64