**Following** the Medium article https://medium.com/analytics-vidhya/knowledge-distillation-in-a-deep-neural-network-c9dd59aff89b

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from machine_learning.distillers.student_teacher import init_student, init_teacher, StudentTeacher


### **data**

In [2]:
def load_data():
    """Load CIFAR-10 and return scaled images together with their labels.

    Both image arrays are cast to float32, divided by 255.0 and reshaped to
    (-1, 32, 32, 3). The label arrays are returned exactly as the Keras
    loader provides them.

    Returns:
        A tuple ``(X_train, Y_train, X_test, Y_test)``.
    """

    def _prepare(images):
        # Cast + scale first, then enforce the (N, 32, 32, 3) layout.
        scaled = images.astype("float32") / 255.0
        return np.reshape(scaled, (-1, 32, 32, 3))

    train_split, test_split = tf.keras.datasets.cifar10.load_data()
    train_images, Y_train = train_split
    test_images, Y_test = test_split

    return _prepare(train_images), Y_train, _prepare(test_images), Y_test

In [11]:
# Load CIFAR-10 once; these arrays are reused by the training and
# evaluation cells further down.
X_train, Y_train, X_test, Y_test = load_data()

### **first train a larger teacher**

In [5]:
# Seed NumPy and TensorFlow before the first stochastic step (building the
# teacher's weights) so that a Restart & Run All is repeatable as far as the
# global seeds control it. Some GPU kernels may still be non-deterministic.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Build the teacher network via the project helper and print its layers.
teacher = init_teacher()
teacher.summary()

Model: "teacher"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_2 (Conv2D)           (None, 16, 16, 256)       7168      
                                                                 
 leaky_re_lu_2 (LeakyReLU)   (None, 16, 16, 256)       0         
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 16, 16, 256)      0         
 2D)                                                             
                                                                 
 conv2d_3 (Conv2D)           (None, 8, 8, 512)         1180160   
                                                                 
 leaky_re_lu_3 (LeakyReLU)   (None, 8, 8, 512)         0         
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 8, 8, 512)        0         
 2D)                                                       

In [6]:
# Configure teacher training: Adam optimizer, sparse categorical
# cross-entropy with from_logits=True (predictions are treated as raw
# logits), and sparse categorical accuracy as the tracked metric.
teacher.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

In [12]:
# Number of passes over the training set for the teacher. The same name is
# re-assigned later in the student training cell.
epochs = 5
# Train the teacher on the training images and labels.
teacher.fit(X_train, Y_train, epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[1.061924934387207, 0.6858999729156494]

In [13]:
# Score the trained teacher on the test split. Assuming init_teacher returns
# a standard Keras model, the result is [loss, accuracy], matching the single
# loss and single metric passed to teacher.compile.
teacher.evaluate(X_test, Y_test)



[1.061924934387207, 0.6858999729156494]

### **then train a smaller student that will learn from the teacher and perform better**

In [19]:
# Build the student network via the project helper and print its layers.
student = init_student()
student.summary()

Model: "student"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_4 (Conv2D)           (None, 16, 16, 64)        1792      
                                                                 
 leaky_re_lu_4 (LeakyReLU)   (None, 16, 16, 64)        0         
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 16, 16, 64)       0         
 2D)                                                             
                                                                 
 conv2d_5 (Conv2D)           (None, 8, 8, 256)         147712    
                                                                 
 leaky_re_lu_5 (LeakyReLU)   (None, 8, 8, 256)         0         
                                                                 
 max_pooling2d_5 (MaxPooling  (None, 8, 8, 256)        0         
 2D)                                                       

In [15]:
# Distillation hyperparameters.
# NOTE(review): presumably alpha balances the student loss against the
# distillation loss and temp is the softmax temperature. Confirm against
# StudentTeacher, which is not shown here.
alpha = 0.3
temp = 7

# Loss on the true labels (predictions treated as logits) and the loss
# comparing student and teacher outputs.
student_loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
distill_loss_fn = keras.losses.KLDivergence()

# Wrap both networks in the distillation model; arguments stay positional
# because the constructor's parameter names are not visible here.
model = StudentTeacher(
    student,
    teacher,
    student_loss_fn,
    distill_loss_fn,
    alpha,
    temp,
)

In [16]:
# Compile the distillation wrapper with an Adam optimizer and an accuracy
# metric. No loss is passed here: both loss functions were already handed to
# the StudentTeacher constructor.
model.compile(
    keras.optimizers.Adam(),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

In [17]:
# Same epoch budget as the teacher run.
epochs=5
# Train the distillation wrapper on the training split. What happens in each
# training step is defined by StudentTeacher, which is not shown here.
model.fit(X_train, Y_train, epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb534ec3950>

In [18]:
# Evaluate the distillation wrapper on the test split.
# NOTE(review): the meaning and order of the returned values depend on
# StudentTeacher's evaluation logic, which is not visible here. Confirm
# before comparing these numbers with the teacher's [loss, accuracy].
model.evaluate(X_test, Y_test)



[0.7192000150680542, 0.7057998180389404]