In [1]:
# Shell escape: print the NVIDIA driver/GPU status table for the current runtime.
!nvidia-smi

Wed Dec 14 01:53:24 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P0    30W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Knowledge Distillation

In [2]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers as L
import time
from random import seed
from random import randint

# Data Preparation

In [3]:
# Load CIFAR-10. The dataset's second split is used as the validation set
# for every model trained in this notebook.
(x_train, y_train), (x_valid, y_valid) = keras.datasets.cifar10.load_data()

# Scale pixels to [0, 1] in float32. Dividing the uint8 arrays directly would
# promote them to float64, doubling host memory for the images without any
# gain in precision that the float32 Keras layers could use.
x_train = x_train.astype("float32") / 255.0
x_valid = x_valid.astype("float32") / 255.0

# One-hot encode the integer labels; the number of classes is inferred from
# the largest label present in each array.
y_train = keras.utils.to_categorical(y_train)
y_valid = keras.utils.to_categorical(y_valid)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [4]:
# Epoch budgets: T_EPOCHS for the teacher, S_EPOCHS for each distilled student.
T_EPOCHS, S_EPOCHS = 25, 20
BATCH_SIZE = 512

# Derived from the loaded arrays rather than hard-coded.
N_CLASSES = y_train.shape[-1]    # width of the one-hot label vectors
IMAGE_SIZE = x_train.shape[1:]   # per-sample shape (everything after the batch axis)

# Echo the derived values as the cell output.
(IMAGE_SIZE, N_CLASSES)

((32, 32, 3), 10)

In [5]:
def nn_callbacks():
    """Create a fresh pair of training callbacks for a single `fit` call.

    Returns:
        list: `[EarlyStopping, ReduceLROnPlateau]`, in that order.
            EarlyStopping uses patience 5, min_delta 1e-4 and restores the
            best weights; ReduceLROnPlateau uses patience 2. Both are verbose
            and keep Keras' default monitored quantity and other settings.
    """
    return [
        keras.callbacks.EarlyStopping(
            min_delta=1e-4,
            patience=5,
            restore_best_weights=True,
            verbose=1,
        ),
        keras.callbacks.ReduceLROnPlateau(patience=2, verbose=1),
    ]

In [6]:
# Wrap each (images, labels) split as an unbatched tf.data.Dataset; shuffling
# and batching are applied at the point of use.
d_train, d_valid = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((x_train, y_train), (x_valid, y_valid))
)

# Drop the NumPy names now that the datasets exist. Note that cells reading
# x_train / y_train cannot be re-run after this without reloading the data.
del x_train, y_train, x_valid, y_valid

# Building the Models

**Teacher Model**

In [7]:
def build_teacher_model(name='teacher'):
    """Assemble the teacher: a VGG19 backbone with a small classification head.

    The backbone is created without its top classifier, sized for IMAGE_SIZE
    inputs, and left fully trainable. The head is global average pooling
    followed by a Dense layer with N_CLASSES units and no activation, so the
    model outputs logits.

    Args:
        name: Name given to the returned Sequential model.

    Returns:
        An uncompiled `keras.models.Sequential` model.
    """
    backbone = keras.applications.VGG19(include_top=False, input_shape=IMAGE_SIZE)
    backbone.trainable = True  # explicit: every backbone weight is fine-tuned
    head = [L.GlobalAvgPool2D(), L.Dense(N_CLASSES)]
    return keras.models.Sequential([backbone, *head], name=name)


teacher_model = build_teacher_model()
teacher_model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "teacher"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vgg19 (Functional)          (None, 1, 1, 512)         20024384  
                                                                 
 global_average_pooling2d (G  (None, 512)              0         
 lobalAveragePooling2D)                                          
                                                                 
 dense (Dense)               (None, 10)                5130      
                                                                 
Total params: 20,029,514
Trainable params: 20,029,514
Non-trainable params: 0
_________________________________________________________________


**Student Model**

In [8]:
def build_student_model(name='student'):
    """Assemble the student: five conv stages followed by a logits head.

    Each stage is three 3x3, 64-filter, same-padded ReLU convolutions and a
    2x2 max-pool. The stages are followed by global average pooling and a
    Dense layer with N_CLASSES units and no activation (logits).

    Args:
        name: Name given to the returned Sequential model.

    Returns:
        An uncompiled `keras.models.Sequential` model.
    """
    # Only the very first layer declares the input shape.
    stack = [L.Conv2D(64, 3, input_shape=IMAGE_SIZE, padding='same', activation='relu')]
    for stage in range(5):
        # The first stage already holds its opening convolution.
        remaining_convs = 2 if stage == 0 else 3
        stack.extend(
            L.Conv2D(64, 3, padding='same', activation='relu')
            for _ in range(remaining_convs)
        )
        stack.append(L.MaxPool2D(pool_size=2))
    stack.append(L.GlobalAvgPool2D())
    stack.append(L.Dense(N_CLASSES))
    return keras.models.Sequential(stack, name=name)


# One student per experiment; only the first one is summarised here.
student_model = build_student_model()
student2_model = build_student_model()
student4_model = build_student_model()
student_model.summary()

Model: "student"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 32, 32, 64)        1792      
                                                                 
 conv2d_1 (Conv2D)           (None, 32, 32, 64)        36928     
                                                                 
 conv2d_2 (Conv2D)           (None, 32, 32, 64)        36928     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 16, 64)       0         
 )                                                               
                                                                 
 conv2d_3 (Conv2D)           (None, 16, 16, 64)        36928     
                                                                 
 conv2d_4 (Conv2D)           (None, 16, 16, 64)        36928     
                                                           

# Training Teacher

In [9]:
# The teacher ends in a linear Dense layer, so the loss consumes logits.
teacher_model.compile(
    optimizer=keras.optimizers.Adam(1e-5),
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# `batch_size` is deliberately not passed to fit(): both inputs are already
# batched tf.data pipelines, and Keras documents that `batch_size` must not be
# specified for dataset inputs because the dataset defines the batches.
history = teacher_model.fit(
    d_train.shuffle(1024, 19).batch(BATCH_SIZE),
    validation_data=d_valid.shuffle(1024, 19).batch(BATCH_SIZE),
    epochs=T_EPOCHS,
    callbacks=nn_callbacks(),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 13: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-07.
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 16: ReduceLROnPlateau reducing learning rate to 9.999999974752428e-08.
Epoch 17/25
Epoch 18/25
Epoch 18: ReduceLROnPlateau reducing learning rate to 1.0000000116860975e-08.
Epoch 19/25
Epoch 19: early stopping


# Before Distillation

In [10]:
# Completion flags, one per distillation experiment. Each training cell sets
# its own flag to True after fit() returns, and the comparison cell only
# reports on experiments whose flag is set.
distiller_Comp = distiller2_Comp = distiller4_Comp = distiller4R_Comp = False

# Distillation in Action

In [11]:
class Distiller(keras.Model):
    """Train a student on ground-truth labels plus one teacher's soft targets.

    Only the student's trainable variables are updated; the teacher is always
    called with `training=False`.

    NOTE(review): the distillation term is not multiplied by temperature**2,
    which Hinton et al. recommend when mixing hard and soft targets; confirm
    whether that is intentional.
    NOTE(review): the three loss entries returned by the step functions are
    per-batch tensors rather than running means, so the value reported at the
    end of an epoch presumably reflects only the final batch; verify.
    """

    def __init__(self, student, teacher, activation):
        """Store the two networks and the softening activation.

        Args:
            student: Model whose weights are trained.
            teacher: Model that provides the soft targets.
            activation: Callable invoked as `activation(logits, axis=1)`,
                e.g. `tf.nn.softmax`.
        """
        super().__init__()
        self.teacher = teacher
        self.student = student
        self.activation = activation

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=10,
    ):
        """Configure the distiller and compile the wrapped student.

        Args:
            optimizer: Keras optimizer applied to the student weights.
            metrics: Keras metrics, updated with (labels, student logits).
            student_loss_fn: Loss between ground-truth labels and student
                predictions.
            distillation_loss_fn: Loss between softened teacher predictions
                and softened student predictions.
            alpha: Weight of `student_loss_fn`; the distillation loss gets
                `1 - alpha`.
            temperature: Divisor applied to the logits before the activation.
                Larger values give softer distributions.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        # The student is compiled too, so it can be evaluated on its own later.
        self.student.compile(optimizer=optimizer, metrics=metrics, loss=student_loss_fn)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def _soften(self, logits):
        """Apply the activation to temperature-scaled logits along axis 1."""
        return self.activation(logits / self.temperature, axis=1)

    def _combined_losses(self, y, teacher_logits, student_logits):
        """Return (student loss, distillation loss, alpha-weighted total)."""
        hard = self.student_loss_fn(y, student_logits)
        soft = self.distillation_loss_fn(
            self._soften(teacher_logits),
            self._soften(student_logits),
        )
        total = self.alpha * hard + (1 - self.alpha) * soft
        return hard, soft, total

    def _step_logs(self, hard, soft, total):
        """Merge the current metric results with this batch's loss values."""
        logs = {metric.name: metric.result() for metric in self.metrics}
        logs["student_loss"] = hard
        logs["distillation_loss"] = soft
        logs["loss"] = total
        return logs

    def train_step(self, data):
        """Run one optimisation step on the student for the batch `data`."""
        x, y = data
        # Teacher forward pass stays outside the tape: it is never trained.
        teacher_logits = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            student_logits = self.student(x, training=True)
            hard, soft, total = self._combined_losses(y, teacher_logits, student_logits)

        weights = self.student.trainable_variables
        grads = tape.gradient(total, weights)
        self.optimizer.apply_gradients(zip(grads, weights))

        self.compiled_metrics.update_state(y, student_logits)
        return self._step_logs(hard, soft, total)

    def test_step(self, data):
        """Compute the same losses and metrics as `train_step`, without updates."""
        x, y = data
        teacher_logits = self.teacher(x, training=False)
        student_logits = self.student(x, training=False)
        hard, soft, total = self._combined_losses(y, teacher_logits, student_logits)

        self.compiled_metrics.update_state(y, student_logits)
        return self._step_logs(hard, soft, total)

    def call(self, x):
        """Forward `x` through the student only."""
        return self.student(x)

In [12]:

# Single-teacher distillation. Both networks output logits, hence
# from_logits=True for the hard-label loss and softmax as the softening
# activation for the KL term.
distiller = Distiller(student_model, teacher_model, tf.nn.softmax)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy'],
    student_loss_fn=keras.losses.CategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.7,
    temperature=100,
)

# `batch_size` is not passed to fit(): the datasets are already batched, and
# Keras documents that it must not be specified for dataset inputs.
history_distillation = distiller.fit(
    d_train.shuffle(1024, 19).batch(BATCH_SIZE),
    validation_data=d_valid.shuffle(1024, 19).batch(BATCH_SIZE),
    epochs=S_EPOCHS,
    callbacks=nn_callbacks(),
)
distiller_Comp = True  # lets the comparison cell report on this student

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 16: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 17/20
Epoch 18/20
Epoch 18: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 19/20
Epoch 19: early stopping


# Distillation with 2 teachers


In [13]:
class Distiller2(keras.Model):
    """Train a student on ground-truth labels plus the mean of two teachers.

    The two teachers' raw outputs are averaged with equal weight (0.5 each)
    before temperature scaling. Only the student's trainable variables are
    updated; both teachers are always called with `training=False`.

    NOTE(review): the distillation term is not multiplied by temperature**2,
    which Hinton et al. recommend when mixing hard and soft targets; confirm
    whether that is intentional.
    """

    def __init__(self, student, teacher, teacher2, activation):
        """Store the networks and the softening activation.

        Args:
            student: Model whose weights are trained.
            teacher: First model providing soft targets.
            teacher2: Second model providing soft targets.
            activation: Callable invoked as `activation(logits, axis=1)`,
                e.g. `tf.nn.softmax`.
        """
        super().__init__()
        self.teacher = teacher
        self.teacher2 = teacher2
        self.student = student
        self.activation = activation

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=10,
    ):
        """Configure the distiller and compile the wrapped student.

        Args:
            optimizer: Keras optimizer applied to the student weights.
            metrics: Keras metrics, updated with (labels, student logits).
            student_loss_fn: Loss between ground-truth labels and student
                predictions.
            distillation_loss_fn: Loss between softened averaged-teacher
                predictions and softened student predictions.
            alpha: Weight of `student_loss_fn`; the distillation loss gets
                `1 - alpha`.
            temperature: Divisor applied to the logits before the activation.
                Larger values give softer distributions.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        # The student is compiled too, so it can be evaluated on its own later.
        self.student.compile(optimizer=optimizer, metrics=metrics, loss=student_loss_fn)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def _mean_teacher_logits(self, x):
        """Return the equal-weight average of both teachers' outputs for `x`."""
        first = self.teacher(x, training=False)
        second = self.teacher2(x, training=False)
        return first * 0.5 + second * 0.5

    def _soften(self, logits):
        """Apply the activation to temperature-scaled logits along axis 1."""
        return self.activation(logits / self.temperature, axis=1)

    def _combined_losses(self, y, teacher_logits, student_logits):
        """Return (student loss, distillation loss, alpha-weighted total)."""
        hard = self.student_loss_fn(y, student_logits)
        soft = self.distillation_loss_fn(
            self._soften(teacher_logits),
            self._soften(student_logits),
        )
        total = self.alpha * hard + (1 - self.alpha) * soft
        return hard, soft, total

    def _step_logs(self, hard, soft, total):
        """Merge the current metric results with this batch's loss values."""
        logs = {metric.name: metric.result() for metric in self.metrics}
        logs["student_loss"] = hard
        logs["distillation_loss"] = soft
        logs["loss"] = total
        return logs

    def train_step(self, data):
        """Run one optimisation step on the student for the batch `data`."""
        x, y = data
        # Teacher forward passes stay outside the tape: they are never trained.
        teacher_mean = self._mean_teacher_logits(x)

        with tf.GradientTape() as tape:
            student_logits = self.student(x, training=True)
            hard, soft, total = self._combined_losses(y, teacher_mean, student_logits)

        weights = self.student.trainable_variables
        grads = tape.gradient(total, weights)
        self.optimizer.apply_gradients(zip(grads, weights))

        self.compiled_metrics.update_state(y, student_logits)
        return self._step_logs(hard, soft, total)

    def test_step(self, data):
        """Compute the same losses and metrics as `train_step`, without updates."""
        x, y = data
        teacher_mean = self._mean_teacher_logits(x)
        student_logits = self.student(x, training=False)
        hard, soft, total = self._combined_losses(y, teacher_mean, student_logits)

        self.compiled_metrics.update_state(y, student_logits)
        return self._step_logs(hard, soft, total)

    def call(self, x):
        """Forward `x` through the student only."""
        return self.student(x)

In [14]:
# Rebuild the student so re-running this cell always starts from newly
# initialised weights.
student2_model = build_student_model()

# Both teacher slots receive the same `teacher_model` instance here.
distiller2 = Distiller2(student2_model, teacher_model, teacher_model, tf.nn.softmax)
distiller2.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy'],
    student_loss_fn=keras.losses.CategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.7,
    temperature=100,
)

# `batch_size` is not passed to fit(): the datasets are already batched, and
# Keras documents that it must not be specified for dataset inputs.
history_distillation = distiller2.fit(
    d_train.shuffle(1024, 19).batch(BATCH_SIZE),
    validation_data=d_valid.shuffle(1024, 19).batch(BATCH_SIZE),
    epochs=S_EPOCHS,
    callbacks=nn_callbacks(),
)
distiller2_Comp = True  # lets the comparison cell report on this student

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 12: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 16: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 17/20
Epoch 18/20
Epoch 18: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 19/20
Epoch 19: early stopping


# Distiller with 4 teachers

In [15]:
class Distiller4(keras.Model):
    """Train a student on ground-truth labels plus the mean of four teachers.

    The four teachers' raw outputs are averaged with equal weight (0.25 each)
    before temperature scaling. Only the student's trainable variables are
    updated; every teacher is always called with `training=False`.

    NOTE(review): the distillation term is not multiplied by temperature**2,
    which Hinton et al. recommend when mixing hard and soft targets; confirm
    whether that is intentional.
    """

    def __init__(self, student, teacher, teacher2, teacher3, teacher4, activation):
        """Store the networks and the softening activation.

        Args:
            student: Model whose weights are trained.
            teacher: First model providing soft targets.
            teacher2: Second model providing soft targets.
            teacher3: Third model providing soft targets.
            teacher4: Fourth model providing soft targets.
            activation: Callable invoked as `activation(logits, axis=1)`,
                e.g. `tf.nn.softmax`.
        """
        super().__init__()
        self.teacher = teacher
        self.teacher2 = teacher2
        self.teacher3 = teacher3
        self.teacher4 = teacher4
        self.student = student
        self.activation = activation

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=10,
    ):
        """Configure the distiller and compile the wrapped student.

        Args:
            optimizer: Keras optimizer applied to the student weights.
            metrics: Keras metrics, updated with (labels, student logits).
            student_loss_fn: Loss between ground-truth labels and student
                predictions.
            distillation_loss_fn: Loss between softened averaged-teacher
                predictions and softened student predictions.
            alpha: Weight of `student_loss_fn`; the distillation loss gets
                `1 - alpha`.
            temperature: Divisor applied to the logits before the activation.
                Larger values give softer distributions.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        # The student is compiled too, so it can be evaluated on its own later.
        self.student.compile(optimizer=optimizer, metrics=metrics, loss=student_loss_fn)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def _mean_teacher_logits(self, x):
        """Return the equal-weight average of all four teachers' outputs.

        Each term calls its own teacher. Previously the third and fourth
        terms both re-used `self.teacher2`, so `teacher3` and `teacher4`
        never contributed to the average.
        """
        return (
            self.teacher(x, training=False) * 0.25
            + self.teacher2(x, training=False) * 0.25
            + self.teacher3(x, training=False) * 0.25
            + self.teacher4(x, training=False) * 0.25
        )

    def _soften(self, logits):
        """Apply the activation to temperature-scaled logits along axis 1."""
        return self.activation(logits / self.temperature, axis=1)

    def _combined_losses(self, y, teacher_logits, student_logits):
        """Return (student loss, distillation loss, alpha-weighted total)."""
        student_loss = self.student_loss_fn(y, student_logits)
        distillation_loss = self.distillation_loss_fn(
            self._soften(teacher_logits),
            self._soften(student_logits),
        )
        loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss
        return student_loss, distillation_loss, loss

    def _step_logs(self, student_loss, distillation_loss, loss):
        """Merge the current metric results with this batch's loss values."""
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss, "loss": loss}
        )
        return results

    def train_step(self, data):
        """Run one optimisation step on the student for the batch `data`."""
        x, y = data
        # Teacher forward passes stay outside the tape: they are never trained.
        teacher_mean = self._mean_teacher_logits(x)

        with tf.GradientTape() as tape:
            student_predictions = self.student(x, training=True)
            student_loss, distillation_loss, loss = self._combined_losses(
                y, teacher_mean, student_predictions
            )

        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        self.compiled_metrics.update_state(y, student_predictions)
        return self._step_logs(student_loss, distillation_loss, loss)

    def test_step(self, data):
        """Compute the same losses and metrics as `train_step`, without updates."""
        x, y = data
        student_predictions = self.student(x, training=False)
        teacher_mean = self._mean_teacher_logits(x)
        student_loss, distillation_loss, loss = self._combined_losses(
            y, teacher_mean, student_predictions
        )

        self.compiled_metrics.update_state(y, student_predictions)
        return self._step_logs(student_loss, distillation_loss, loss)

    def call(self, x):
        """Forward `x` through the student only."""
        return self.student(x)

In [16]:
# Rebuild the student so re-running this cell always starts from newly
# initialised weights.
student4_model = build_student_model()

# All four teacher slots receive the same `teacher_model` instance here.
distiller4 = Distiller4(
    student4_model,
    teacher_model,
    teacher_model,
    teacher_model,
    teacher_model,
    tf.nn.softmax,
)
distiller4.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy'],
    student_loss_fn=keras.losses.CategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.7,
    temperature=100,
)

# `batch_size` is not passed to fit(): the datasets are already batched, and
# Keras documents that it must not be specified for dataset inputs.
history_distillation = distiller4.fit(
    d_train.shuffle(1024, 19).batch(BATCH_SIZE),
    validation_data=d_valid.shuffle(1024, 19).batch(BATCH_SIZE),
    epochs=S_EPOCHS,
    callbacks=nn_callbacks(),
)
distiller4_Comp = True  # lets the comparison cell report on this student

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 15: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 16/20
Epoch 17/20
Epoch 17: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 18/20
Epoch 18: early stopping


# Distiller with 4 teachers (random selection)

In [17]:
class Distiller4R(keras.Model):
    """Train a student against one of four teachers, chosen at random per batch.

    The teacher index is drawn with TensorFlow ops inside the step itself.
    Drawing it with Python's `random` module (as this class used to do in
    `__init__` and `test_step`) only executes while Keras traces the step
    function, so the "random" teacher stayed fixed until the model was
    recompiled. The in-graph draw is re-evaluated on every batch, for both
    training and evaluation steps.

    Only the student's trainable variables are updated; teachers are always
    called with `training=False`.

    NOTE(review): the distillation term is not multiplied by temperature**2,
    which Hinton et al. recommend when mixing hard and soft targets; confirm
    whether that is intentional.
    """

    def __init__(self, student, teacher, teacher2, teacher3, teacher4, activation):
        """Store the networks and the softening activation.

        Args:
            student: Model whose weights are trained.
            teacher: First candidate teacher.
            teacher2: Second candidate teacher.
            teacher3: Third candidate teacher.
            teacher4: Fourth candidate teacher.
            activation: Callable invoked as `activation(logits, axis=1)`,
                e.g. `tf.nn.softmax`.
        """
        super().__init__()
        self.teacher = teacher
        self.teacher2 = teacher2
        self.teacher3 = teacher3
        self.teacher4 = teacher4
        self.student = student
        self.activation = activation

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=10,
    ):
        """Configure the distiller and compile the wrapped student.

        Args:
            optimizer: Keras optimizer applied to the student weights.
            metrics: Keras metrics, updated with (labels, student logits).
            student_loss_fn: Loss between ground-truth labels and student
                predictions.
            distillation_loss_fn: Loss between softened teacher predictions
                and softened student predictions.
            alpha: Weight of `student_loss_fn`; the distillation loss gets
                `1 - alpha`.
            temperature: Divisor applied to the logits before the activation.
                Larger values give softer distributions.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        # The student is compiled too, so it can be evaluated on its own later.
        self.student.compile(optimizer=optimizer, metrics=metrics, loss=student_loss_fn)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def _random_teacher_logits(self, x):
        """Return the outputs of one uniformly chosen teacher for batch `x`."""
        teachers = (self.teacher, self.teacher2, self.teacher3, self.teacher4)
        # maxval is exclusive for integer dtypes, so this draws from {0, 1, 2, 3}.
        choice = tf.random.uniform([], minval=0, maxval=len(teachers), dtype=tf.int32)
        # Bind each teacher through a default argument so every branch keeps
        # its own model instead of the loop's last one.
        branches = [lambda t=t: t(x, training=False) for t in teachers]
        return tf.switch_case(choice, branch_fns=branches)

    def _soften(self, logits):
        """Apply the activation to temperature-scaled logits along axis 1."""
        return self.activation(logits / self.temperature, axis=1)

    def _combined_losses(self, y, teacher_logits, student_logits):
        """Return (student loss, distillation loss, alpha-weighted total)."""
        student_loss = self.student_loss_fn(y, student_logits)
        distillation_loss = self.distillation_loss_fn(
            self._soften(teacher_logits),
            self._soften(student_logits),
        )
        loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss
        return student_loss, distillation_loss, loss

    def _step_logs(self, student_loss, distillation_loss, loss):
        """Merge the current metric results with this batch's loss values."""
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss, "loss": loss}
        )
        return results

    def train_step(self, data):
        """Run one optimisation step on the student for the batch `data`."""
        x, y = data
        # Teacher forward pass stays outside the tape: teachers are never trained.
        teacher_predictions = self._random_teacher_logits(x)

        with tf.GradientTape() as tape:
            student_predictions = self.student(x, training=True)
            student_loss, distillation_loss, loss = self._combined_losses(
                y, teacher_predictions, student_predictions
            )

        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        self.compiled_metrics.update_state(y, student_predictions)
        return self._step_logs(student_loss, distillation_loss, loss)

    def test_step(self, data):
        """Compute the same losses and metrics as `train_step`, without updates."""
        x, y = data
        teacher_predictions = self._random_teacher_logits(x)
        student_predictions = self.student(x, training=False)
        student_loss, distillation_loss, loss = self._combined_losses(
            y, teacher_predictions, student_predictions
        )

        self.compiled_metrics.update_state(y, student_predictions)
        return self._step_logs(student_loss, distillation_loss, loss)

    def call(self, x):
        """Forward `x` through the student only."""
        return self.student(x)

In [18]:
# Student and distiller for the random-teacher experiment. All four teacher
# slots receive the same `teacher_model` instance here.
student4R_model = build_student_model()
distiller4R = Distiller4R(
    student=student4R_model,
    teacher=teacher_model,
    teacher2=teacher_model,
    teacher3=teacher_model,
    teacher4=teacher_model,
    activation=tf.nn.softmax,
)


In [19]:
random_epoch = 20

# Create the optimizer once. Building a new Adam inside the loop threw away
# its moment estimates and step counter at every epoch, so each epoch
# effectively started optimisation from scratch.
optimizer_4R = keras.optimizers.Adam()

for i in range(random_epoch):
    print("Epoch " + str(i + 1) + "/" + str(random_epoch))
    # compile() stays inside the loop, as in the original run, so the step
    # functions are rebuilt every epoch; only the optimizer is shared.
    distiller4R.compile(
        optimizer=optimizer_4R,
        metrics=['accuracy'],
        student_loss_fn=keras.losses.CategoricalCrossentropy(from_logits=True),
        distillation_loss_fn=keras.losses.KLDivergence(),
        alpha=0.7,
        temperature=100,
    )
    # Changes relative to the previous version of this call:
    #  * no callbacks: the patience counters of EarlyStopping and
    #    ReduceLROnPlateau restart with every fit() call, so with epochs=1
    #    they could never trigger (and they were passed as a nested list);
    #  * no batch_size: the datasets are already batched;
    #  * no get_weights()/set_weights() round trip: the student keeps its
    #    weights between fit() calls, so restoring them changed nothing.
    history_distillation = distiller4R.fit(
        d_train.shuffle(1024, 19).batch(BATCH_SIZE),
        validation_data=d_valid.shuffle(1024, 19).batch(BATCH_SIZE),
        epochs=1,
    )
distiller4R_Comp = True  # lets the comparison cell report on this student


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Comparison

In [20]:
import os


def _save_and_report(title, model, path):
    """Print `title`, save `model` to `path`, evaluate it, and print the file size.

    Args:
        title: Heading line printed before the evaluation output.
        model: Compiled Keras model to save and evaluate on the validation set.
        path: Destination file for `model.save`; its size is reported in MB.
    """
    print(title)
    model.save(path)
    model.evaluate(d_valid.shuffle(1024, 19).batch(BATCH_SIZE))
    size_mb = round(os.path.getsize(path)/1024**2, 2)
    print("File Size is :", size_mb, "MB")


# The teacher is always reported; each student only if its training cell
# finished and set the corresponding completion flag.
_save_and_report('Teacher Model:', teacher_model, 'teacher.h5')

if distiller_Comp:
    _save_and_report('Distilled Model:', student_model, 'student.h5')

if distiller2_Comp:
    _save_and_report('Distilled Model 2:', student2_model, 'student2.h5')

if distiller4_Comp:
    _save_and_report('Distilled Model 4:', student4_model, 'student4.h5')

if distiller4R_Comp:
    _save_and_report('Distilled Model 4R:', student4R_model, 'student4R.h5')

Teacher Model:
File Size is : 229.35 MB
Distilled Model:
File Size is : 6.09 MB
Distilled Model 2:
File Size is : 6.09 MB
Distilled Model 4:
File Size is : 6.09 MB
Distilled Model 4R:
File Size is : 6.09 MB


**Reference**

* [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531)
* [Implementation of classical Knowledge Distillation](https://keras.io/examples/vision/knowledge_distillation/)