In [1]:
from datasets import load_cifar10
import numpy as np
import pandas as pd

from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [2]:
# Fetch the six arrays returned by `load_cifar10` (imported from `datasets`):
# features and integer labels for the train, validation and test splits.
X_train, y_train, X_val, y_val, X_test, y_test = load_cifar10()

In [6]:
# Report the feature / label array shapes of every split, one blank line between splits.
splits = (('train', X_train, y_train), ('val', X_val, y_val), ('test', X_test, y_test))
for position, (split, features, targets) in enumerate(splits):
    if position:
        print()
    print('X_' + split + ' shape: ', features.shape)
    print('y_' + split + ' shape: ', targets.shape)

X_train shape:  (40000, 32, 32, 3)
y_train shape:  (40000,)

X_val shape:  (10000, 32, 32, 3)
y_val shape:  (10000,)

X_test shape:  (10000, 32, 32, 3)
y_test shape:  (10000,)


In [8]:
# Human-readable class name for each integer label id.
labels_dict = {0: 'airplane', 1: 'automobile', 2: 'bird', 3: 'cat', 4: 'deer',
               5: 'dog', 6: 'frog', 7: 'horse', 8: 'ship', 9: 'truck'}

# Bookkeeping table with one row per training image: its true label, slots for
# predictions, and the annotation status that later cells fill in.
# `np.nan` is used instead of the `np.NaN` alias (removed in NumPy 2.0), and the
# two text columns are created with object dtype so that assigning strings to
# them later does not rely on pandas silently upcasting a float64 column.
df = pd.DataFrame({
    'true_label': y_train,
    'true_label_text': pd.Series(y_train).map(labels_dict),
    'pred_label': np.nan,
    'pred_label_text': np.nan,
    'is_annotated': False,
    'annotation_batch': np.nan,
    'sampling_method': np.nan
}).astype({'pred_label_text': object, 'sampling_method': object})
df.sample(10)

Unnamed: 0,true_label,true_label_text,pred_label,pred_label_text,is_annotated,annotation_batch,sampling_method
25569,6,frog,,,False,,
28140,9,truck,,,False,,
24292,1,automobile,,,False,,
23352,6,frog,,,False,,
38675,4,deer,,,False,,
16209,4,deer,,,False,,
30610,0,airplane,,,False,,
3277,8,ship,,,False,,
28181,9,truck,,,False,,
13610,8,ship,,,False,,


In [75]:
# Small CNN: two 3x3 convolutions, one 2x2 max-pool, then a dense head that
# outputs a softmax over the 10 classes.
model = Sequential([
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu', input_shape=(32, 32, 3)),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax'),
])

# One-hot targets are used downstream, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [51]:
# One-hot encode the integer labels of each split (capital Y = one-hot version).
Y_train, Y_val, Y_test = (
    to_categorical(split_labels) for split_labels in (y_train, y_val, y_test)
)

In [137]:
# Draw 1000 random training rows as the first batch to annotate; only their
# index labels are kept.
sample_ix = df.sample(1000).index

In [157]:
# Record the sampled rows as annotated, tagging how and in which batch they were picked.
# `.loc[rows, column]` is used so the assignment writes into `df` itself.
df.loc[sample_ix, 'is_annotated'] = True
df.loc[sample_ix, 'sampling_method'] = 'random'
df.loc[sample_ix, 'annotation_batch'] = 1

In [159]:
# Train for a single epoch on the sampled rows only, validating against the
# whole validation split; the returned History object is kept in `h`.
h = model.fit(
    X_train[sample_ix],
    Y_train[sample_ix],
    validation_data=(X_val, Y_val),
    batch_size=32,
    epochs=1,
    verbose=1,
)



In [34]:
def margin_of_confidence_score(prob_dist):
    """Return the margin-of-confidence uncertainty of one probability vector.

    The score is ``1 - (p_top1 - p_top2)``: it approaches 1 when the two most
    likely classes are nearly tied and 0 when one class dominates.

    Parameters
    ----------
    prob_dist : array-like of float
        Class probabilities for a single sample (at least two entries).

    Returns
    -------
    float
        The uncertainty score.

    Raises
    ------
    IndexError
        If fewer than two probabilities are supplied.
    """
    # np.sort returns a sorted *copy*, so the caller's array keeps its class
    # order (the previous in-place sort through a reversed view scrambled the
    # prediction rows, and silently did nothing for plain lists).
    ordered = np.sort(prob_dist)[::-1]
    difference = ordered[0] - ordered[1]
    return 1 - difference

In [35]:
def uncertainty_scores(predictions):
    """Compute the margin-of-confidence uncertainty for every prediction row.

    For each row the score is ``1 - (p_top1 - p_top2)``; higher means the
    model is less sure which of its two favourite classes is correct.

    Parameters
    ----------
    predictions : array-like, shape (n_samples, n_classes)
        Per-sample class probabilities. The input is not modified.

    Returns
    -------
    pandas.Series
        One score per row, indexed 0..n_samples-1.
    """
    probs = np.asarray(predictions)
    if probs.size == 0:
        return pd.Series(dtype=float)
    # np.partition works on a copy and puts the two largest values of each row
    # in the last two columns (second-largest, then largest) without a full sort.
    top_two = np.partition(probs, -2, axis=1)[:, -2:]
    return pd.Series(1 - (top_two[:, 1] - top_two[:, 0]))

In [162]:
# Class probabilities for every training image (annotated or not).
preds = model.predict(X_train)

In [168]:
# One margin-of-confidence uncertainty score per training image.
unc = uncertainty_scores(preds)

In [176]:
# Index labels of the 100 highest-scoring (most uncertain) training images.
unc_idx = unc.sort_values(ascending=False).iloc[:100].index

False    98
True      2
Name: is_annotated, dtype: int64

In [147]:
# Inspect the one-hot encoded test labels as a table (one column per class id).
pd.DataFrame(Y_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9996,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
def run_experiment(model, data, annotation_batch_size=100, uncertainty_sample_size=0.45, diversity_sample_size=0.45):
    """Unimplemented placeholder: accepts the experiment settings, does nothing, returns None.

    The parameters have the same names and defaults as those of
    ``ActiveLearningExperiment.__init__`` (except ``label_encoding``).
    TODO: implement or remove.
    """
    pass
    

In [71]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping

class ActiveLearningExperiment():
    """Simulated active-learning loop over an already-labelled training set.

    Training rows start out as "not annotated". Each round selects
    ``annotation_batch_size`` new rows (the first round purely at random, later
    rounds a mix of most-uncertain and random rows), marks them as annotated in
    ``df_status``, fits the model on that newly selected batch and records the
    test accuracy in ``test_acc``.

    Parameters
    ----------
    model : compiled Keras-style model
        Must provide ``fit``, ``predict`` and ``evaluate``.
    data : sequence
        ``[(X_train, y_train), (X_val, y_val), (X_test, y_test)]`` with integer labels.
    label_encoding : dict, optional
        Maps label ids to display names for the ``true_label_text`` column.
    annotation_batch_size : int
        Number of rows selected per round.
    uncertainty_sample_size : float
        Fraction of each round (after the first) picked by uncertainty.
    diversity_sample_size : float
        Stored, but no diversity sampling is implemented yet; the part of a
        round not taken by uncertainty sampling is drawn at random.
    """

    def __init__(self, model, data, label_encoding=None, annotation_batch_size=100, uncertainty_sample_size=0.45, diversity_sample_size=0.45):
        self.model = model
        self.data = data
        self.label_encoding = label_encoding
        self.annotation_batch_size = annotation_batch_size
        self.uncertainty_sample_size = uncertainty_sample_size
        self.diversity_sample_size = diversity_sample_size
        # Kept for reference only; sampling derives the random share from the batch size.
        self.random_sample_size = 1 - uncertainty_sample_size - diversity_sample_size

        self.X_train = data[0][0]
        self.y_train = data[0][1]
        self.X_val = data[1][0]
        self.y_val = data[1][1]
        self.X_test = data[2][0]
        self.y_test = data[2][1]

        # One-hot targets for the categorical cross-entropy loss.
        self.y_train_oh = to_categorical(self.y_train)
        self.y_val_oh = to_categorical(self.y_val)
        self.y_test_oh = to_categorical(self.y_test)

    def run(self):
        """Run all annotation rounds and collect the test accuracy after each one.

        Resets ``df_status`` and ``test_acc``, then performs
        ``len(X_train) // annotation_batch_size`` rounds.
        """
        self.__initialize_status_df()
        self.test_acc = []

        early_stop_callback = EarlyStopping(patience=3)
        num_batches = self.X_train.shape[0] // self.annotation_batch_size

        # The first batch is drawn inside __get_samples(1); drawing it here as
        # well would flag rows as annotated that the model never trains on.
        for i in range(1, 1 + num_batches):
            print('Batch ', i)
            samples_ix = self.__get_samples(i)
            # Fit the experiment's own model (not whatever global `model` exists)
            # on the rows selected for this round.
            h = self.model.fit(
                self.X_train[samples_ix],
                self.y_train_oh[samples_ix],
                validation_data=(self.X_val, self.y_val_oh),
                batch_size=32,
                epochs=100,
                verbose=1,
                callbacks=[early_stop_callback],
            )
            print(len(h.history['val_loss']), ' epochs were run.')
            self.test_acc.append(self.model.evaluate(self.X_test, self.y_test_oh)[1])

    def _mark_annotated(self, rows, batch_number, method):
        """Flag `rows` in ``df_status`` as annotated in `batch_number` via `method`."""
        if len(rows) == 0:
            return
        self.df_status.loc[rows, 'is_annotated'] = True
        self.df_status.loc[rows, 'annotation_batch'] = batch_number
        self.df_status.loc[rows, 'sampling_method'] = method

    def __get_samples(self, batch_number):
        """Select, flag and return the training-row labels for one round.

        Round 1 is entirely random. Later rounds take the most uncertain
        not-yet-annotated rows first and fill the rest of the batch at random.
        Rows are only ever drawn from the not-yet-annotated pool.

        Returns
        -------
        list
            Index labels of the selected rows (uncertainty picks first).
        """
        unc_ix = []
        num_unc = 0
        if batch_number != 1:
            preds = self.model.predict(self.X_train)
            num_unc = int(self.annotation_batch_size * self.uncertainty_sample_size)
            unc_scores = self.uncertainty_scores(preds)
            # Align scores with the status table so the boolean mask lines up.
            unc_scores.index = self.df_status.index
            pool_mask = ~self.df_status['is_annotated']
            unc_ix = unc_scores[pool_mask].sort_values(ascending=False).iloc[:num_unc].index.tolist()
            self._mark_annotated(unc_ix, batch_number, 'uncertainty')

        # Recompute the pool so rows just picked by uncertainty cannot be drawn again.
        pool = self.df_status[~self.df_status['is_annotated']]
        num_rand = min(self.annotation_batch_size - num_unc, len(pool))
        rand_ix = pool.sample(num_rand).index.tolist()
        self._mark_annotated(rand_ix, batch_number, 'random')

        return unc_ix + rand_ix

    def __initialize_status_df(self):
        """Create ``df_status``: one row per training sample, nothing annotated yet."""
        labels = pd.Series(self.y_train)
        self.df_status = pd.DataFrame({
            'true_label': labels,
            'true_label_text': labels.map(self.label_encoding) if self.label_encoding else labels,
            'pred_label': np.nan,
            'pred_label_text': np.nan,
            'is_annotated': False,
            'annotation_batch': np.nan,
            'sampling_method': np.nan
        # Text columns get object dtype so string assignments need no upcasting.
        }).astype({'pred_label_text': object, 'sampling_method': object})

    def uncertainty_scores(self, predictions):
        """Return ``1 - (p_top1 - p_top2)`` for every row of `predictions`.

        Parameters
        ----------
        predictions : array-like, shape (n_samples, n_classes)
            Per-sample class probabilities. The input is not modified.

        Returns
        -------
        pandas.Series
            One score per row, indexed 0..n_samples-1; higher = more uncertain.
        """
        probs = np.asarray(predictions)
        if probs.size == 0:
            return pd.Series(dtype=float)
        # Last two columns after partitioning: second-largest, then largest.
        top_two = np.partition(probs, -2, axis=1)[:, -2:]
        return pd.Series(1 - (top_two[:, 1] - top_two[:, 0]))
    

In [73]:
# Experiment with 100 rows per annotation batch and the default sampling shares.
exp = ActiveLearningExperiment(model, [(X_train, y_train), (X_val, y_val), (X_test, y_test)], labels_dict, 100)

In [59]:
# Test-set [loss, accuracy] of the model in its current state.
model.evaluate(X_test, Y_test)



[77.48824310302734, 0.10050000250339508]

In [74]:
# Run the select -> train -> evaluate rounds; per-round test accuracy accumulates in exp.test_acc.
exp.run()

Batch  1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
8  epochs were run.
Batch  2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
5  epochs were run.
Batch  3
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
5  epochs were run.
Batch  4
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
6  epochs were run.
Batch  5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
5  epochs were run.
Batch  6
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
4  epochs were run.
Batch  7
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
6  epochs were run.
Batch  8
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
5  epochs were run.
Batch  9
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
5  epochs were run.
Batch  10
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
6  epochs were run.
Batch  11
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epo

KeyboardInterrupt: 

In [76]:
# Second experiment: same batch size, but uncertainty_sample_size=0.001, so
# int(100 * 0.001) == 0 rows per batch come from uncertainty sampling.
# NOTE(review): this passes the same `model` object that `exp` was given, so it
# looks like exp2 would not start from fresh weights if exp.run() ran first -
# confirm, and consider building a new model per experiment.
exp2 = ActiveLearningExperiment(model, [(X_train, y_train), (X_val, y_val), (X_test, y_test)], labels_dict, 100, 0.001)

In [None]:
# Run the second experiment; per-round test accuracy accumulates in exp2.test_acc.
exp2.run()

Batch  1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
8  epochs were run.
Batch  2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
4  epochs were run.
Batch  3
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
5  epochs were run.
Batch  4
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
5  epochs were run.
Batch  5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
4  epochs were run.
Batch  6
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
5  epochs were run.
Batch  7
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
6  epochs were run.
Batch  8
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
5  epochs were run.
Batch  9
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
5  epochs were run.
Batch  10
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
6  epochs were run.
Batch  11
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
5  

In [45]:
# Scratch check: two independent random draws of 10 row labels, as plain lists.
ix_1 = df.sample(10).index.tolist()
ix_2 = df.sample(10).index.tolist()

In [48]:
# `+` on two lists concatenates them into one list of 20 labels.
ix_1 + ix_2

[38344,
 32854,
 11157,
 36640,
 23091,
 16775,
 15179,
 31162,
 9522,
 5823,
 14001,
 38836,
 20826,
 20119,
 29341,
 38280,
 7162,
 35771,
 34312,
 36828]