<a href="https://colab.research.google.com/github/consequencesunintended/Pseudo-Labelling/blob/master/Pseudo_Labelling_MNIST_3rd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Pseudo Labelling on MNIST dataset**

References:

*1 - Pseudo-Label : The Simple and Efficient Semi-Supervised Learning
Method for Deep Neural Networks, Dong-Hyun Lee
http://deeplearning.net/wp-content/uploads/2013/03/pseudo_label_final.pdf*

*2 - Naive semi-supervised deep learning using pseudo-label, Zhun Li, ByungSoo Ko & Ho-Jin Choi
https://link.springer.com/article/10.1007/s12083-018-0702-9*

*3 - The Illustrated FixMatch for Semi-Supervised Learning
https://amitness.com/2020/03/fixmatch-semi-supervised/*

In [0]:
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Softmax, Dropout, Conv2D, MaxPooling2D
from tensorflow.keras.models import Model
from tensorflow.keras import losses
from tensorflow.keras import Sequential
import tensorflow as tf
import tensorflow.keras as keras
import matplotlib.pyplot as plt

In [0]:
# Load MNIST through the Keras that ships with TensorFlow so that the whole
# notebook uses a single Keras implementation (every other cell imports from
# `tensorflow.keras`, not the standalone `keras` package).
from tensorflow.keras.datasets import mnist
import numpy as np

# Downloads the dataset on first use, then reads it from the local cache.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [0]:
# Small convolutional network for 28x28x1 digit images: two convolutions,
# max-pooling, dropout, then a dense head ending in a 10-way softmax.
classifier = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='softmax'),
])

classifier.build()

In [0]:
# Convert both image sets to float32, add a trailing channel axis and scale
# the pixel values by 1/255.
# NOTE: this cell overwrites x_train / x_test, so running it twice rescales twice.
x_train, x_test = (
    images.astype('float32').reshape((len(images), 28, 28, 1)) / 255.
    for images in (x_train, x_test)
)

# **Split the data**
The idea here is to have a small labelled dataset and a large unlabelled dataset, and to use the unlabelled data to improve the accuracy of the model in scenarios where we don't have access to a large labelled set. We also set aside some data for evaluation. We make sure that each dataset contains an equal number of samples for every digit.

In [0]:
# Group the training images by label: data_<d> is the list of every training
# image whose label equals d, in their original order.
(data_0, data_1, data_2, data_3, data_4,
 data_5, data_6, data_7, data_8, data_9) = (
    [image for image, label in zip(x_train, y_train) if label == digit]
    for digit in range(10)
)

In [0]:
# Labelled training set: the slice [source:target) of every digit group,
# concatenated in digit order, with matching float labels 0.0 .. 9.0.
source = 0
target = 10

labelled_groups = (data_0, data_1, data_2, data_3, data_4,
                   data_5, data_6, data_7, data_8, data_9)

x_train_shorten = np.array(
    [image for group in labelled_groups for image in group[source:target]]
)
# Each label is repeated `target` times (source is 0 for this split).
y_train_shorten = np.repeat(np.arange(10, dtype=float), target)

In [0]:
# Unlabelled pool: the slice [source:target) of every digit group, concatenated
# in digit order. The true labels are built alongside it as float values.
source = 10
target = 250

unlabelled_groups = (data_0, data_1, data_2, data_3, data_4,
                     data_5, data_6, data_7, data_8, data_9)

x_train_unlabelled = np.array(
    [image for group in unlabelled_groups for image in group[source:target]]
)
y_train_unlabelled = np.repeat(np.arange(10, dtype=float), target - source)

In [0]:
# Evaluation set: the slice [source:target) of every digit group, concatenated
# in digit order, with matching float labels 0.0 .. 9.0.
source = 250
target = 450

eval_groups = (data_0, data_1, data_2, data_3, data_4,
               data_5, data_6, data_7, data_8, data_9)

x_eval_shorten = np.array(
    [image for group in eval_groups for image in group[source:target]]
)
y_eval_shorten = np.repeat(np.arange(10, dtype=float), target - source)

In [0]:
# The classifier's last layer is Dense(10, activation='softmax'), so the model
# emits class probabilities rather than logits. The loss must be configured
# with from_logits=False; declaring logits here would mis-scale the loss for a
# model whose outputs are already normalised.
classifier.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

# **Define the datasets**

In [0]:
batch_size = 100

# Labelled (image, label) pairs, shuffled and batched.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train_shorten, y_train_shorten))
    .shuffle(1000)
    .batch(batch_size)
)

# Unlabelled images only (no targets), shuffled and batched.
pseudo_dataset = (
    tf.data.Dataset.from_tensor_slices(x_train_unlabelled)
    .shuffle(1000)
    .batch(batch_size)
)

# Held-out (image, label) pairs used for validation / evaluation.
eval_dataset = (
    tf.data.Dataset.from_tensor_slices((x_eval_shorten, y_eval_shorten))
    .shuffle(1000)
    .batch(batch_size)
)

# **Pre-train the Model with labelled data**
First we pre-train the model for 100 epochs on the labelled data only, reaching a maximum accuracy of around 70 percent.

In [0]:
# train_dataset and eval_dataset are already batched tf.data.Datasets, so
# batch_size must not be passed to fit(): the dataset itself defines the batches.
classifier.fit(train_dataset, validation_data=eval_dataset, epochs=100)

# **Augmentation of Unlabelled Images**
Two sets of augmentations are applied to the unlabelled images: a weak one and a strong one. The weakly augmented images are passed through the classifier pre-trained on the labelled data to predict the pseudo labels, and the strongly augmented images are the ones actually used to train the classifier.

In [0]:
from imgaug import augmenters as iaa

In [0]:
# Weak augmentation: always a light Gaussian blur, plus a small rotation
# (up to +/-5 degrees) on 80% of the images, applied in random order.
weak_aug = iaa.Sequential(
    [
        iaa.Sometimes(1.0, iaa.GaussianBlur(sigma=(0.5, 0.6))),
        iaa.Sometimes(0.8, iaa.Affine(rotate=(-5, 5))),
    ],
    random_order=True,
)

# Strong augmentation: pixel dropout, a larger rotation on 80% of the images,
# and a combined scale / translate / rotate / shear on 50% of the images,
# applied in random order.
strong_aug = iaa.Sequential(
    [
        iaa.Dropout(p=(0.01, 0.1), per_channel=0.5),
        iaa.Sometimes(0.8, iaa.Affine(rotate=(-25, 25))),  # rotate 80% of the images
        iaa.Sometimes(
            0.5,
            iaa.Affine(
                scale={"x": (0.8, 1.2), "y": (0.8, 1.2)},
                translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)},
                rotate=(-25, 25),
                shear=(12, 15),
            ),
        ),
    ],
    random_order=True,
)

x_train_unlabelled_weak = []
x_train_unlabelled_strong = []

# Augment one image at a time, producing the weak and the strong version of
# each image back to back so both lists stay aligned index by index.
for image in x_train_unlabelled:
  x_train_unlabelled_weak.append(weak_aug.augment_image(image))
  x_train_unlabelled_strong.append(strong_aug.augment_image(image))

In [0]:
# Turn the two lists of augmented images into NumPy arrays.
x_train_unlabelled_weak, x_train_unlabelled_strong = (
    np.asarray(x_train_unlabelled_weak),
    np.asarray(x_train_unlabelled_strong),
)

In [0]:
# Show a 5x4 grid of randomly chosen weakly augmented images.
columns = 4
rows = 5
fig = plt.figure(figsize=(8, 8))
# 21 random indices are drawn; subplot k uses indicies[k] for k = 1..20.
indicies = np.random.randint(0, len(x_train_unlabelled_strong), 21)
for slot in range(1, columns * rows + 1):
    axis = fig.add_subplot(rows, columns, slot)
    axis.imshow(x_train_unlabelled_weak[indicies[slot]].reshape(28, 28))
plt.show()

In [0]:
# Show the strongly augmented versions of the same randomly chosen images,
# laid out on the same 5x4 grid.
columns = 4
rows = 5
fig = plt.figure(figsize=(8, 8))
for slot in range(1, columns * rows + 1):
    axis = fig.add_subplot(rows, columns, slot)
    axis.imshow(x_train_unlabelled_strong[indicies[slot]].reshape(28, 28))
plt.show()

## **Train the model with unlabelled dataset**
Train the model on the pseudo-labelled (originally unlabelled) dataset, then fine-tune it by training for one epoch on the labelled dataset.

In [0]:
# Objects kept from the original cell; the training loop below does not use them.
avg_main_loss = tf.keras.metrics.Mean(name='avg_main_loss', dtype=tf.float32)
pseudo_steps = int(x_train_unlabelled.shape[0] / batch_size )
eval_steps = int(x_eval_shorten.shape[0] / batch_size )

epoch = 30

classifier_optimizer = tf.keras.optimizers.Adam()
# The classifier ends in a softmax layer, so its outputs are probabilities,
# not logits; the loss object is configured accordingly.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

x_values = np.array([])
# Minimum predicted probability required to accept a pseudo label.
threshold = 0.90
# Number of pseudo-label / fine-tune rounds performed in every outer epoch.
rounds_per_epoch = 20

for epoch_idx in range( epoch ):

  print( "Epoch {}/{}".format( epoch_idx + 1, epoch ) )
  classifier.evaluate(eval_dataset)

  for _ in range(rounds_per_epoch):

    # Predict class probabilities on the weakly augmented images.
    probabilities = classifier.predict( x_train_unlabelled_weak )

    # Only accept pseudo labels whose top probability exceeds the threshold;
    # rejected samples are marked with -1.
    confident = np.max(probabilities, axis=1) > threshold
    pseudo_labels = np.where(confident, np.argmax(probabilities, axis=1), -1)

    # Select the strongly augmented images with a boolean mask. Packing
    # (image, label) pairs into one array would create a ragged array, which
    # recent NumPy versions refuse to build.
    x_values = np.asarray(x_train_unlabelled_strong)[confident]
    y_values = pseudo_labels[confident]

    # Skip the pseudo-label step when nothing passed the threshold: fitting on
    # an empty dataset is an error.
    if len(x_values) > 0:
      pseudo_dataset = tf.data.Dataset.from_tensor_slices((x_values, y_values ))
      pseudo_dataset = pseudo_dataset.shuffle(1000).batch(batch_size)

      # The dataset is already batched, so batch_size is not passed to fit().
      classifier.fit(pseudo_dataset, epochs=1, verbose=0)

    # Fine-tune for one epoch on the labelled data.
    classifier.fit(train_dataset, epochs=1, verbose=0)
