# Introduction
This code tests our work on Breast Cancer Wisconsin (Diagnostic) Data Set (WDBCD) (https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data)

In [None]:
# NOTE(review): later cells call Model.export() and save in the ".keras" format;
# those look like APIs from a newer TensorFlow/Keras than 2.6.4 — confirm this pin.
!pip install tensorflow==2.6.4

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

#import os
#print(os.listdir("../input"))
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

%matplotlib inline 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv1D, MaxPool1D,Flatten,Dense,Dropout,BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import utils



#Import models from scikit learn module:
from sklearn import datasets,metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

#Measuring Time
import time


In [None]:
print(tf.__version__)

# Parameters

In [None]:
# Number of epochs used for every training run below (teacher, scratch student, distiller).
epoch=50

# Loading Dataset

In [None]:
# Load the WDBC CSV from the Kaggle input directory; row 0 of the file is the header.
data = pd.read_csv("/kaggle/input/wdbc-kd-data/data.csv",header = 0)
print(type(data))
data.head(5)

# Data Preprocessing
## Get classes

In [None]:
Y=data.diagnosis

print(Y.value_counts())
Y.head(5)

In [None]:
plt.title('Count of cancer type')
sns.countplot(x="diagnosis",data=data)
plt.ylabel('Count')
plt.show()

## Prepare and clean the dataset

In [None]:
data.isnull().any().describe()

In [None]:
data.info()

## Remove unwanted info

In [None]:
# Drop the `id` and `Unnamed: 32` columns from the features.
# `diagnosis` is kept for the per-class statistics below and is dropped later.
# Reassigning with errors='ignore' (instead of inplace=True) lets this cell be
# re-run without a KeyError once the columns are already gone.
data = data.drop(columns=['id','Unnamed: 32'],errors='ignore')

# No trailing comma here: `data.info(),` would build the tuple (None,) and
# display it as the cell output.
data.info()

In [None]:
data.describe()

## Analyse usable data

In [None]:
# For each of the five columns at positions 1..5 (position 0 is `diagnosis`),
# draw a per-class histogram on the left and a per-class box plot on the right.
for i in (data.columns[1:6]):
    plt.subplot(1,2,1)
    # Overlay the benign (green) and malignant (red) distributions of column `i`.
    data[i][data['diagnosis']=='B'].plot.hist(alpha=0.5,title=i,color='green')
    data[i][data['diagnosis']=='M'].plot.hist(alpha=0.5,color='red')
    plt.legend(['B','M'],loc='upper right')
    #plt.grid(visible=True)
    
    
    plt.subplot(1,2,2)
    sns.boxplot(x="diagnosis", y=i, data=data)
    plt.show()

## Observations
1. Mean values of features such as radius, perimeter, area, compactness, concavity, concave points, etc. can be used to classify the cancer. Larger values of these features tend to correlate with malignant tumors.
2. Mean values of features such as texture, smoothness, symmetry or fractal dimension do not show a clear preference for one diagnosis over the other. None of the histograms shows noticeably large outliers that would warrant further cleanup.

## Remove the label from the features

In [None]:
# `diagnosis` is the label (already captured in Y above), so remove it here
# and keep only the feature columns in `data`.
data.drop('diagnosis',axis=1,inplace=True)
data.head(5)

## Correlation matrix

In [None]:
# Create correlation matrix
data.corr()

In [None]:
correlation=data.corr()

# Build the mask from the upper triangle (diagonal included) of the correlation
# matrix, so the heatmap shows each feature pair only once.
matrix = np.triu(correlation)
plt.figure(figsize=(40,16))
sns.heatmap(correlation, vmax=1, square=True,annot=True,cmap='copper',mask=matrix)
plt.title('Correlation between different features')
#plt.savefig("cor.svg")
# Render explicitly so the cell does not end on the Text object returned by plt.title.
plt.show()

In [None]:
type(data)

## Get the input (X)

In [None]:
# Bind the feature frame to the name X for brevity (same object, no copy is made).
X=data

# Keep the first row as a one-row DataFrame with the same columns
# (intended to emulate a single user-supplied sample).
temp=pd.DataFrame(data=[data.values.tolist()[0]],columns=data.columns.values.tolist())
# Remove the `data` name; the frame stays reachable through X.
del data

# Sanity check
print(type(X))
X.head(5)

## PCA

In [None]:
from sklearn.decomposition import PCA
# Fit a PCA with the default number of components; the following cells inspect
# its covariance and explained-variance ratios.
# NOTE(review): the PCA is fitted on all rows, before the train/test split and
# before StandardScaler is applied, so held-out rows influence the projection.
# Consider fitting it on the (scaled) training split only.
pca = PCA()
pca.fit_transform(X)

In [None]:
pca.get_covariance()

In [None]:
explained_variance=pca.explained_variance_ratio_
explained_variance

In [None]:
# The y-axis is logarithmic because the first ratio is far larger than the rest.

with plt.style.context('dark_background'):
    fig=plt.figure(figsize=(10, 6))
    ax = fig.add_subplot()

    # Derive the x positions from the data rather than a hard-coded 30, so the
    # bar count always matches the number of fitted components.
    ax.bar(range(len(explained_variance)), explained_variance, alpha=0.5, align='center',
            label='individual explained variance')
    ax.set_yscale('log')
    
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.grid(visible=True)
    plt.tight_layout()

Thus we can see from the above plot that the first three components account for almost 73% of the variance. The fourth to twenty-sixth components hold about 25% of the data spread. The remaining components hold less than 0.001% of the variance. Hence we can drop components 27 to 30 (a total of 4 components).



In [None]:
# Refit keeping 26 principal components and project the features onto them.
pca=PCA(n_components=26)
X_new=pca.fit_transform(X)
X_new

In [None]:
pca.get_covariance()

In [None]:
explained_variance=pca.explained_variance_ratio_
explained_variance

In [None]:
# The y-axis is logarithmic because the first ratio is far larger than the rest.

with plt.style.context('dark_background'):
    fig=plt.figure(figsize=(20, 12))
    ax = fig.add_subplot()

    # Derive the x positions from the data rather than a hard-coded 26, so the
    # bar count follows whatever n_components the PCA above was fitted with.
    ax.bar(range(len(explained_variance)), explained_variance, alpha=0.5, align='center',
            label='individual explained variance')
    ax.set_yscale('log')
    
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.grid(visible=True)
    plt.tight_layout()

In [None]:
# From here on X is the PCA projection (X_new) rather than the original feature frame.
X=X_new
(X.shape, Y.shape)

## Convert to one-hot

In [None]:
# Report the label container before conversion
print("Converting to categorical")
print("Before Conversion:")
print(Y.shape)
print(type(Y))


# Map the class letters to integers (B -> 0, M -> 1), then expand to
# two-column one-hot rows.
Y = Y.map({'B':0,'M':1})
Y = utils.to_categorical(Y, num_classes=2)


# Verify the shape and type after conversion
print("After Conversion:")
print(Y.shape)
type(Y)

## Splitting data into training and testing

In [None]:
(X.shape, Y.shape)

In [None]:
# Hold out 15% of the samples for testing, stratified on the labels.
# A fixed random_state makes the split identical on every Restart & Run All;
# without it the reported metrics are computed on a different test set each run.
X_train,X_test,y_train,y_test= train_test_split(X,Y,test_size=0.15,stratify=Y,random_state=42)

In [None]:
y_test.shape

## Scaling

In [None]:
scaler=StandardScaler()

# Fit the scaler on the training split only, then apply the same transform
# to the test split.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Show both shapes; the original displayed the whole X_test array here.
(X_train.shape, X_test.shape)

## Reshaping

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Add a trailing axis of size 1: (samples, features) -> (samples, features, 1),
# matching the keras.Input(shape=(X_train.shape[1], 1)) used by the models below.
X_train = X_train.reshape(X_train.shape[0],X_train.shape[1],1)
X_test = X_test.reshape(X_test.shape[0],X_test.shape[1],1)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
X_train.shape[1]

# Create teacher and student model

In [None]:
# Teacher: two Conv1D blocks (64 and 448 filters, kernel size 2, ReLU), each
# followed by batch normalisation and 20% dropout, then a 64-unit ReLU dense
# layer with dropout and a 2-unit sigmoid output.
teacher =keras.Sequential()
teacher.add(keras.Input(shape=(X_train.shape[1],1)))
teacher.add(keras.layers.Conv1D(filters=64,kernel_size= 2,activation='relu'))
teacher.add(keras.layers.BatchNormalization())
teacher.add(keras.layers.Dropout(0.2))
        
teacher.add(keras.layers.Conv1D(448, 2,activation='relu'))
teacher.add(keras.layers.BatchNormalization())
teacher.add(keras.layers.Dropout(0.2))
        
teacher.add(keras.layers.Flatten())
teacher.add(keras.layers.Dense(64,activation='relu'))
teacher.add(keras.layers.Dropout(0.2))

teacher.add(keras.layers.Dense(2,activation='sigmoid'))

teacher.compile(optimizer=Adam(learning_rate=0.0001),loss='binary_crossentropy',
              metrics=['accuracy'])


# Student: a much smaller network, with two Conv1D blocks (4 and 8 filters,
# kernel size 2) and the same 2-unit sigmoid output. The Conv1D layers are
# given no activation argument here. `student` itself is not compiled in this
# cell; it is handed to the Distiller later.
student = keras.Sequential(
    [
        keras.Input(shape=(X_train.shape[1], 1)),
        layers.Conv1D(4, 2),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
       # layers.LeakyReLU(alpha=0.2),
        
        layers.Conv1D(8, 2),
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        
        layers.Flatten(),
        layers.Dense(2,activation='sigmoid'),
    ],
    name="student",)
    

# Clone the student architecture so an identical model can be trained
# without distillation for comparison.
student_scratch = keras.models.clone_model(student)

# Compile the scratch student with the same optimizer settings and loss as the teacher
student_scratch.compile(optimizer=Adam(learning_rate=0.0001),loss='binary_crossentropy',
              metrics=['accuracy'])

# print() with only `sep` given emits a single empty line
print( sep='\n')
print('STUDENT')

In [None]:
teacher.summary()

In [None]:
student.summary()

## Teacher Model training

In [None]:
# Train the teacher and time the run. The elapsed time is stored under its own
# name because `start_time`/`end_time` are reused by the later training cells,
# and it is printed so the measurement is actually reported.
start_time = time.time()
teacher_history=teacher.fit(X_train,y_train,epochs=epoch,validation_data=(X_test,y_test),verbose=1)
end_time = time.time()
teacher_train_time = end_time - start_time
print(f"Teacher training time: {teacher_train_time:.2f} s")

## Teacher Evaluation

In [None]:
teacher_loss, teacher_acc = teacher.evaluate(X_test, y_test)

## Save Teacher File as Keras

In [None]:
teacher.save("teacher_model.keras") 

## Save Teacher File as TFLite

In [None]:
# Export the teacher to the directory "teacher" (read back below as a SavedModel)
teacher.export("teacher") 

# Convert the exported SavedModel to a TFLite model
converter = tf.lite.TFLiteConverter.from_saved_model("teacher")
my_teacher_model = converter.convert()

# Write the converted model bytes to disk
with open("teacher.tflite", "wb") as f:
    f.write(my_teacher_model)

## Student Model without KD Training

In [None]:
# Train the student without distillation and time the run. The elapsed time is
# stored under its own name because `start_time`/`end_time` are reused by the
# other training cells, and it is printed so the measurement is reported.
start_time = time.time()
student_scratch_history=student_scratch.fit(X_train,y_train,epochs=epoch,validation_data=(X_test,y_test),verbose=1)
end_time = time.time()
student_scratch_train_time = end_time - start_time
print(f"Student (scratch) training time: {student_scratch_train_time:.2f} s")

## Evaluate Student Scratch

In [None]:
student_scratch_loss, student_scratch_acc = student_scratch.evaluate(X_test, y_test)

## Save Student Scratch File as Keras

In [None]:
student_scratch.save("student_scratch_model.keras") 

## Save Student Scratch File as TFLite

In [None]:
# Export the scratch student to the directory "student_scratch" (read back below as a SavedModel)
student_scratch.export("student_scratch") 

# Convert the exported SavedModel to a TFLite model
converter = tf.lite.TFLiteConverter.from_saved_model("student_scratch")
my_student_scratch_model = converter.convert()

# Write the converted model bytes to disk
with open("student_scratch.tflite", "wb") as f:
    f.write(my_student_scratch_model)

# Knowledge Distillation
## Construct distiller class

In [None]:
class Distiller(keras.Model):
    """Model wrapper that trains a student network against a teacher network.

    During training only the student's trainable variables are updated; the
    teacher is called with ``training=False`` and contributes soft targets.
    The training loss is
    ``alpha * student_loss + (1 - alpha) * distillation_loss``.

    NOTE(review): ``train_step`` applies ``softmax(output / temperature)`` to the
    raw outputs of both models. The teacher and student built in this notebook
    end in a sigmoid layer, so those outputs are probabilities rather than
    logits — confirm that this is the intended softening.
    """

    def __init__(self, student, teacher):
        """Store the student and teacher models on the wrapper."""
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
        """
        # Only the optimizer and metrics go to the base class; the two loss
        # functions are kept on the instance and combined by hand in train_step.
        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        """Run one optimisation step of the student on a batch ``(x, y)``.

        Returns:
            Dict with the current value of every metric in ``self.metrics`` plus
            ``student_loss`` and ``distillation_loss`` for this batch.
        """
        # Unpack data
        x, y = data

        # Forward pass of teacher (outside the tape: no gradients are taken
        # with respect to the teacher)
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses: hard-label loss on the student's predictions, and
            # distillation loss between the temperature-softened teacher
            # (first argument) and student (second argument) outputs.
            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = self.distillation_loss_fn(
                tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                tf.nn.softmax(student_predictions / self.temperature, axis=1),
            )
            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients with respect to the student's variables only
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        """Evaluate the student alone on a batch ``(x, y)``.

        The teacher is not called here and no distillation loss is computed.

        Returns:
            Dict with the current value of every metric in ``self.metrics`` plus
            ``student_loss`` for this batch.
        """
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [None]:
 # Initialize and compile distiller
distiller = Distiller(student=student, teacher=teacher)
# alpha=0.1 weights the hard-label loss at 0.1 and the distillation loss at 0.9;
# temperature=10 replaces the compile() default of 3.
# NOTE(review): Adam() is created without a learning rate here, whereas
# student_scratch was compiled with Adam(learning_rate=0.0001) — confirm that
# the distilled-vs-scratch comparison is meant to use different learning rates.
distiller.compile(optimizer=keras.optimizers.Adam(),
metrics=['accuracy'],
student_loss_fn= keras.losses.BinaryCrossentropy() ,
distillation_loss_fn=keras.losses.KLDivergence(),
alpha=0.1,
temperature=10,
)

## Distill Teacher to Student

In [None]:
# Distil the teacher into the student and time the run. The elapsed time is
# stored under its own name because `start_time`/`end_time` are shared with the
# other training cells, and it is printed so the measurement is reported.
print()
print('DISTILL TEACHER TO STUDENT')
start_time = time.time()
kd_history=distiller.fit(X_train, y_train, epochs=epoch, verbose=1, validation_data=(X_test,y_test))
end_time = time.time()
kd_train_time = end_time - start_time
print(f"Distillation training time: {kd_train_time:.2f} s")

## Evaluate Distilled Student

In [None]:
distiller_evaluate_output = distiller.evaluate(X_test, y_test)

# NOTE(review): the indexing below relies on evaluate() returning exactly the
# three-element structure shown in the sample output; that layout presumably
# depends on the installed Keras version — verify after any upgrade.

#Sample output:
#[<tf.Tensor: shape=(), dtype=float32, numpy=0.49883049726486206>,
# {'accuracy': <tf.Tensor: shape=(), dtype=float32, numpy=0.9069767594337463>},
# <tf.Tensor: shape=(), dtype=float32, numpy=0.15347284078598022>]

# The first element in the list is treated as the student loss.
# We use .numpy() to get the numerical value.
distiller_student_loss = distiller_evaluate_output[0].numpy()

# The second element is a dictionary. We access the 'accuracy' key
# and then get its numpy value.
distiller_accuracy = distiller_evaluate_output[1]['accuracy'].numpy()

# The third element is treated as the total combined loss.
distiller_total_loss = distiller_evaluate_output[2].numpy()

## Save Distilled Student File as Keras

In [None]:
distiller.student.save("distilled_model.keras") 

## Save Distilled Student File as TFLite

In [None]:
# Export the distilled student to the directory "distiller" (read back below as a SavedModel)
distiller.student.export("distiller") 

# Convert the exported SavedModel to a TFLite model
converter = tf.lite.TFLiteConverter.from_saved_model("distiller")
my_distiller_model = converter.convert()

# Write the converted model bytes to disk
with open("distiller.tflite", "wb") as f:
    f.write(my_distiller_model)

 # Plots of accuracy and loss 

## File Sizes

In [None]:
import os


def _size_kb(path):
    """Return the on-disk size of `path` in kilobytes (bytes / 1024)."""
    return os.path.getsize(path) / 1024


# Sizes of the saved Keras files and their TFLite conversions; the variable
# names are reused by the summary table at the end of the notebook.
original_size = _size_kb("teacher_model.keras")
original_tflite_size = _size_kb("teacher.tflite")

student_scratch_size = _size_kb("student_scratch_model.keras")
student_scratch_tflite_size = _size_kb("student_scratch.tflite")

distiller_size = _size_kb("distilled_model.keras")
distiller_tflite_size = _size_kb("distiller.tflite")

# One report line per artefact: the tab-padded label followed by the size.
for label, size_kb in (
    ("Keras Original size: \t\t", original_size),
    ("TFLite Original size: \t\t", original_tflite_size),
    ("Keras Student Scratch size: \t", student_scratch_size),
    ("TFLite Student Scratch size: \t", student_scratch_tflite_size),
    ("Keras Distilled size: \t\t", distiller_size),
    ("TFLite Distilled size: \t\t", distiller_tflite_size),
):
    print(f"{label}{size_kb:.2f} KB")

## Evaluating TFLite

In [None]:
def evaluate_tflite_model(tflite_path, X_test, y_test):
    """Run a TFLite model over a test set one sample at a time and report accuracy.

    Uses the notebook-level ``np`` and ``tf`` imports.

    Args:
        tflite_path: Path of the ``.tflite`` file to load.
        X_test: Indexable collection of samples. Each sample gets a leading
            batch axis of size 1 and is cast to float32 before being fed in.
        y_test: Rows aligned with ``X_test``; the true class of sample ``i`` is
            ``argmax(y_test[i])``.

    Returns:
        ``(accuracy, y_true, y_pred)``: the fraction of correct predictions and
        the per-sample true / predicted class indices as lists.

    Raises:
        ValueError: If ``X_test`` is empty (accuracy would be a division by zero).
    """
    n_samples = len(X_test)
    if n_samples == 0:
        raise ValueError("X_test is empty; accuracy is undefined")

    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()

    # Only the first input and the first output tensor of the model are used.
    input_index = interpreter.get_input_details()[0]['index']
    output_index = interpreter.get_output_details()[0]['index']

    y_pred = []
    y_true = []
    for i in range(n_samples):
        input_data = np.expand_dims(X_test[i], axis=0).astype(np.float32)

        interpreter.set_tensor(input_index, input_data)
        interpreter.invoke()
        output = interpreter.get_tensor(output_index)

        y_pred.append(np.argmax(output[0]))   # class with the highest output score
        y_true.append(np.argmax(y_test[i]))   # one-hot row -> class index

    correct = sum(1 for predicted, actual in zip(y_pred, y_true) if predicted == actual)
    accuracy = correct / n_samples
    print(f"TFLite model accuracy: {accuracy * 100:.2f}%")
    return accuracy, y_true, y_pred

In [None]:
teacher_tflite_acc, teacher_tflite_y_true, teacher_tflite_y_pred= evaluate_tflite_model("teacher.tflite", X_test, y_test)

In [None]:
student_scratch_tflite_acc, student_scratch_y_true, student_scratch_y_pred= evaluate_tflite_model("student_scratch.tflite", X_test, y_test)

In [None]:
distiller_tflite_acc, distiller_tflite_y_true, distiller_tflite_y_pred= evaluate_tflite_model("distiller.tflite", X_test, y_test)

## Accuracy and loss of teacher model

In [None]:
def plotLearningCurve(history,epochs):
  """Draw two figures for a Keras training history: accuracy, then loss.

  Each figure shows the training curve and its validation counterpart over
  epochs 1..`epochs`.

  Args:
    history: object whose `.history` dict holds the 'accuracy', 'val_accuracy',
      'loss' and 'val_loss' series.
    epochs: number of epochs; used to build the x-axis range.
  """
  x_axis = range(1,epochs+1)

  # (train key, validation key, figure title, y-axis label, legend location)
  figures = (
      ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy', 'lower right'),
      ('loss', 'val_loss', 'Model Loss', 'Loss', 'upper right'),
  )

  for train_key, val_key, heading, y_label, legend_pos in figures:
    plt.plot(x_axis,history.history[train_key])
    plt.plot(x_axis,history.history[val_key])
    plt.title(heading)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend(['Train','Validation'],loc=legend_pos)
    plt.grid(visible=True)
    plt.show()

In [None]:
plotLearningCurve(teacher_history,epoch)

## Students functions

In [None]:
def plotKDCurveD(history,epochs):
  """Draw the distiller's training accuracy, then all of its loss curves.

  The history produced by the distiller has the keys 'accuracy',
  'distillation_loss', 'loss', 'student_loss', 'val_loss' and
  'val_student_loss'; there is no validation accuracy series to plot.

  Args:
    history: object whose `.history` dict holds the series listed above.
    epochs: number of epochs; used to build the x-axis range.
  """
  x_axis = range(1,epochs+1)

  # Figure 1: training accuracy only
  plt.plot(x_axis,history.history['accuracy'])
  plt.title('Model Accuracy')
  plt.xlabel('Epoch')
  plt.ylabel('Accuracy')
  plt.legend(['Train'],loc='lower right')
  plt.grid(visible=True)
  plt.show()

  # Figure 2: every loss series, drawn in the same order as the legend labels
  loss_keys = ['distillation_loss','loss','val_loss','student_loss','val_student_loss']
  for key in loss_keys:
    plt.plot(x_axis,history.history[key])
  plt.title('Model Loss')
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.legend(['distillation_loss','loss','val_loss','student_loss','val_student_loss'],loc='upper right')
  plt.grid(visible=True)
  plt.show()

## Plotting Student scratch Model( without KD)

In [None]:
plotLearningCurve(student_scratch_history,epoch)

## Student knowledge distilled accuracy and loss

In [None]:
plotKDCurveD(kd_history,epoch)

## Function plotting two models

In [None]:
def plotKDCurveProf(his,tHis,epochs,ref_label='T'):
  """Compare a distillation history against a reference model's history.

  Draws an accuracy figure and a loss figure. The accuracy legend is built
  from the curves actually drawn, so it no longer lists a 'KD_val' entry
  when the distillation history has no 'val_accuracy' series.

  Args:
    his: distillation history; `.history` must hold 'accuracy', 'student_loss'
      and 'val_student_loss', and may hold 'val_accuracy'.
    tHis: reference history; `.history` must hold 'accuracy', 'val_accuracy',
      'loss' and 'val_loss'.
    epochs: number of epochs; used to build the x-axis range.
    ref_label: prefix for the reference curves in the legends. The default 'T'
      (teacher) keeps the original labels; pass another prefix when the
      reference is not the teacher.
  """
  epochRange = range(1,epochs+1)
  ref_train = f'{ref_label}_train'
  ref_val = f'{ref_label}_val'

  # Accuracy figure
  plt.plot(epochRange,tHis.history['accuracy'])
  plt.plot(epochRange,tHis.history['val_accuracy'])
  plt.plot(epochRange,his.history['accuracy'])
  acc_labels = [ref_train,ref_val,'KD_train']
  # The distiller history may lack validation accuracy; draw and label it
  # only when it exists so labels and lines always match one-to-one.
  if 'val_accuracy' in his.history:
    plt.plot(epochRange,his.history['val_accuracy'])
    acc_labels.append('KD_val')

  plt.title('Model Accuracy')
  plt.xlabel('Epoch')
  plt.ylabel('Accuracy')
  plt.legend(acc_labels,loc='lower right')
  plt.grid(visible=True)
  plt.show()

  # Loss figure: the KD curves use the student's hard-label loss
  plt.plot(epochRange,tHis.history['loss'])
  plt.plot(epochRange,tHis.history['val_loss'])
  plt.plot(epochRange,his.history['student_loss'])
  plt.plot(epochRange,his.history['val_student_loss'])
  plt.title('Model Loss')
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.legend([ref_train,ref_val,'KD_train','KD_val'],loc='upper right')
  plt.grid(visible=True)
  plt.show()

## Student with KD Vs Teacher Accuracy and loss

In [None]:
plotKDCurveProf(kd_history,teacher_history,epoch)

## Student with KD Vs Student without Kd Accuracy and loss

In [None]:
# NOTE(review): the reference history passed here is the scratch student's, but
# plotKDCurveProf labels the reference curves 'T_train'/'T_val' in its legends,
# so the scratch-student curves are shown under teacher labels.
plotKDCurveProf(kd_history,student_scratch_history,epoch)

## Weights and Parameters

In [None]:
#==========================================================================
#Function for counting weights that were zeroed
#==========================================================================
def count_zeros(model):
    """Count the weights of `model` and how many of them are exactly zero.

    Walks `model.layers`, reads every array returned by `layer.get_weights()`
    and prints the totals together with the sparsity percentage.

    Args:
        model: object exposing `.layers`, each layer providing `get_weights()`
            that returns a list of NumPy arrays.

    Returns:
        `(total, zeros)` as plain Python ints.
    """
    total = 0
    zeros = 0
    for layer in model.layers:
        for w in layer.get_weights():
            total += int(w.size)
            # int() keeps the running count a Python int rather than a NumPy scalar.
            zeros += int(np.sum(w == 0))

    # A model without any weights has no meaningful ratio; report 0% instead
    # of raising ZeroDivisionError.
    sparsity = 100 * zeros / total if total else 0.0

    print(f"Total weights: {total}")
    print(f"Zero weights: {zeros}")
    print(f"Sparsity: {sparsity:.2f}%")
    return total, zeros
    
#Print the performance
# Weight statistics of the teacher
print("Teacher model:")
teacher_param_total, teacher_param_zeros = count_zeros(teacher)

# Weight statistics of the student trained without distillation
print("\n\nUndistilled Student:")
student_scratch_param_total, student_scratch_param_zeros = count_zeros(student_scratch)

# Weight statistics of the distilled student on its own, then of the
# Distiller wrapper object as a whole
print("\n\nDistilled Student & stripped model:")
distiller_std_param_total, distiller_std_param_zeros = count_zeros(distiller.student)
distiller_param_total, distiller_param_zeros = count_zeros(distiller)

# Result Summary

In [None]:
# Summary table. Columns: TM = teacher, SM_scratch = student trained without KD,
# SM = distilled student. History values are read at index epoch-1 (the last epoch).
# NOTE(review): the loss rows are multiplied by 100 and labelled "(%)" like the
# accuracy rows — confirm that presenting losses as percentages is intended.
print("Vals. \t\t\t TM \t\t SM_scratch \t SM")
print("="*65)
print("Acc. (%%) \t\t %.2f \t %.2f \t\t %.2f"%(teacher_history.history['accuracy'][epoch-1]*100,student_scratch_history.history['accuracy'][epoch-1]*100,kd_history.history['accuracy'][epoch-1]*100))
print("Val. Acc. (%%) \t\t %.2f \t\t %.2f \t\t -"%(teacher_history.history['val_accuracy'][epoch-1]*100,student_scratch_history.history['val_accuracy'][epoch-1]*100))
print("Loss. (%%) \t\t %.2f \t\t %.2f \t\t %.2f"%(teacher_history.history['loss'][epoch-1]*100,student_scratch_history.history['loss'][epoch-1]*100,kd_history.history['loss'][epoch-1]*100))
print("Val. Loss. (%%) \t\t %.2f \t\t %.2f \t\t %.2f"%(teacher_history.history['val_loss'][epoch-1]*100,student_scratch_history.history['val_loss'][epoch-1]*100,kd_history.history['val_loss'][epoch-1]*100))
print("="*65)
# File sizes, TFLite accuracy and weight counts computed in the earlier cells
print("Keras size (kb)\t\t%.2f\t\t%.2f\t\t%.2f"%(original_size,student_scratch_size,distiller_size))
print("tflite size (kb)\t%.2f\t\t%.2f\t\t%.2f"%(original_tflite_size,student_scratch_tflite_size,distiller_tflite_size))
print("Acc. (tflite)(%%)\t%.2f\t\t%.2f\t\t%.2f"%(teacher_tflite_acc*100,student_scratch_tflite_acc*100,distiller_tflite_acc*100))
print("Total Weight\t%d\t\t%d\t\t%d"%(teacher_param_total, student_scratch_param_total, distiller_std_param_total))
print("Zero Weight\t\t%d\t\t%d\t\t%d"%(teacher_param_zeros, student_scratch_param_zeros, distiller_std_param_zeros))
print("Sparsity\t\t%.2f\t\t%.2f\t\t%.2f"%(teacher_param_zeros/teacher_param_total*100, student_scratch_param_zeros/student_scratch_param_total*100, distiller_std_param_zeros/distiller_std_param_total*100))
print("="*65)
# Footnote rows: counts taken from the Distiller wrapper rather than the bare student
print("*No. of Weights in SM with Distiller Wrapper %d"%(distiller_param_total))
print("*No. of Weight with Zero Weight in SM with Distiller Wrapper %d"%(distiller_param_zeros))