# Introduction
This notebook tests our work on the Breast Cancer Wisconsin (Diagnostic) Data Set (WDBCD), available at https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

In [None]:
# Uninstall the current TF version and install a compatible version with tfmot
!pip uninstall -y tensorflow
!pip install tensorflow==2.13.0
!pip install tensorflow-model-optimization==0.7.5

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

#import os
#print(os.listdir("../input"))
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

%matplotlib inline 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Conv1D, MaxPool1D,Flatten,Dense,Dropout,BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import utils



#Import models from scikit learn module:
from sklearn import datasets,metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

#Measuring Time
import time


# Parameters

In [None]:
# Number of training epochs; passed to teacher.fit and to plotLearningCurve further down.
epoch=50

# Loading Dataset

In [None]:
# Load the dataset CSV into a DataFrame (row 0 of the file holds the column names),
# then show its type and the first five rows.
data = pd.read_csv("/kaggle/input/wdbc-qt-data/data.csv",header = 0) 
print(type(data))
data.head(5)

# Data Preprocessing
## Get classes

In [None]:
# Keep the diagnosis column as the label series; it is removed from `data` later on.
Y=data.diagnosis

# Class balance and a preview of the raw labels.
print(Y.value_counts())
Y.head(5)

In [None]:
# Bar chart of the number of samples per diagnosis class.
ax = sns.countplot(x="diagnosis", data=data)
ax.set_title('Count of cancer type')
ax.set_ylabel('Count')
plt.show()

## Prepare and clean the dataset

In [None]:
# Summarise, across all columns, whether any column contains a missing value.
data.isnull().any().describe()

In [None]:
# Column names, non-null counts and dtypes of the loaded table.
data.info()

## Remove unwanted info

In [None]:
# Drop the id and 'Unnamed: 32' columns from the features.
# diagnosis is still needed for the statistics below; it is dropped later.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.

data.drop(columns=['id','Unnamed: 32'],inplace=True,errors='ignore')
data.info()

In [None]:
# Summary statistics of the remaining columns.
data.describe()

## Analyse usable data

In [None]:
# For columns 1-5 of `data`, draw the per-class histograms (left) next to a
# per-class box plot (right).
is_benign = data['diagnosis'] == 'B'
is_malignant = data['diagnosis'] == 'M'

for feature in data.columns[1:6]:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2)

    # Overlaid histograms: benign in green, malignant in red.
    data.loc[is_benign, feature].plot.hist(ax=hist_ax, alpha=0.5, title=feature, color='green')
    data.loc[is_malignant, feature].plot.hist(ax=hist_ax, alpha=0.5, color='red')
    hist_ax.legend(['B', 'M'], loc='upper right')

    sns.boxplot(x="diagnosis", y=feature, data=data, ax=box_ax)
    plt.show()

## Observations
1. The mean values of cell features such as radius, perimeter, area, compactness, concavity and concave points can be used to classify the cancer: larger values of these parameters tend to correlate with malignant tumors.
2. The mean values of features such as texture, smoothness, symmetry or fractal dimension do not show a particular preference for one diagnosis over the other. None of the histograms shows noticeably large outliers that would warrant further cleanup.

## Remove the target (diagnosis) from the features

In [None]:
# The labels were already captured in Y, so the diagnosis column can now be
# removed, leaving only feature columns in `data`.
# Note: the drop happens in place, so re-running this cell raises a KeyError.

data.drop('diagnosis',axis=1,inplace=True)
data.head(5)

## Correlation matrix

In [None]:
# Pairwise correlation matrix of the feature columns, displayed as a table.
data.corr()

In [None]:
correlation=data.corr()

# np.triu keeps the upper triangle (diagonal included) and zeroes the rest; used as
# the heatmap mask it hides those cells so each feature pair is drawn only once.
matrix = np.triu(correlation)
plt.figure(figsize=(40,16))
sns.heatmap(correlation, vmax=1, square=True,annot=True,cmap='copper',mask=matrix)
plt.title('Correlation between different features')
#plt.savefig("cor.svg")
# Render the figure explicitly so the cell does not echo the Text object of the title.
plt.show()

In [None]:
# Sanity check: type of the feature table.
type(data)

## Get the input (X)

In [None]:
# X is a second name for the same feature DataFrame (no copy is made).
X=data

# Keep the first sample as a one-row DataFrame (the author's "emulating user" input).
temp=pd.DataFrame(data=[data.values.tolist()[0]],columns=data.columns.values.tolist())
# Remove the name `data`; the DataFrame stays reachable through X.
del data

# Verify the type and preview the features.
print(type(X))
X.head(5)

## PCA

In [None]:
# Fit PCA with all components to inspect how the variance is distributed.
# NOTE(review): PCA is fit here on the full, unscaled feature table, i.e. before the
# train/test split and before the StandardScaler step further down. Confirm this
# ordering is intended: test samples influence the projection, and features with
# large numeric ranges presumably dominate the leading components.
from sklearn.decomposition import PCA
pca = PCA()
pca.fit_transform(X)

In [None]:
# Covariance matrix as estimated by the fitted PCA model.
pca.get_covariance()

In [None]:
# Fraction of the total variance explained by each principal component.
explained_variance=pca.explained_variance_ratio_
explained_variance

In [None]:
# A logarithmic y-axis is used because the first component's ratio dwarfs the rest.
# The bar positions are derived from explained_variance itself, so the cell keeps
# working whatever number of components the most recent PCA fit produced.

with plt.style.context('dark_background'):
    fig, ax = plt.subplots(figsize=(10, 6))

    ax.bar(range(len(explained_variance)), explained_variance, alpha=0.5, align='center',
           label='individual explained variance')
    ax.set_yscale('log')

    ax.set_ylabel('Explained variance ratio')
    ax.set_xlabel('Principal components')
    ax.legend(loc='best')
    ax.grid(visible=True)
    fig.tight_layout()

Thus we can see from the above plot that the first three components account for almost 73% of the variance. The fourth to twenty-sixth components hold about 25% of the data spread. The remaining components account for less than 0.001% of the variance. Hence we can drop the 27th to 30th components (a total of 4 components).



In [None]:
# Refit PCA keeping only the first 26 components and project X onto them.
pca=PCA(n_components=26)
X_new=pca.fit_transform(X)
X_new

In [None]:
# Covariance matrix as estimated by the 26-component PCA model.
pca.get_covariance()

In [None]:
# Explained-variance ratios of the retained components (overwrites the earlier 30-entry array).
explained_variance=pca.explained_variance_ratio_
explained_variance

In [None]:
# A logarithmic y-axis is used because the first component's ratio dwarfs the rest.
# The bar positions follow the length of explained_variance instead of a hard-coded
# component count, so the plot stays valid if n_components is changed.

with plt.style.context('dark_background'):
    fig, ax = plt.subplots(figsize=(20, 12))

    ax.bar(range(len(explained_variance)), explained_variance, alpha=0.5, align='center',
           label='individual explained variance')
    ax.set_yscale('log')

    ax.set_ylabel('Explained variance ratio')
    ax.set_xlabel('Principal components')
    ax.legend(loc='best')
    ax.grid(visible=True)
    fig.tight_layout()

In [None]:
# From here on X refers to the PCA-projected array rather than the feature DataFrame.
X=X_new
(X.shape, Y.shape)

## Convert to one-hot

In [None]:
# Report the label container before it is converted.
print("Converting to categorical")
print("Before Conversion:", Y.shape, type(Y), sep="\n")

# Map the diagnosis letters to class indices, then expand them to one-hot rows.
label_to_index = {'B': 0, 'M': 1}
Y = utils.to_categorical(Y.map(label_to_index), num_classes=2)

# Report the result: one row per sample, one column per class.
print("After Conversion:", Y.shape, sep="\n")
type(Y)

## Splitting data into training and testing

In [None]:
# Shapes of the features and one-hot labels before splitting.
(X.shape, Y.shape)

In [None]:
# Stratified 85/15 split. random_state is fixed so that the split, and every metric
# computed from it, is the same on each Restart & Run All.
X_train,X_test,y_train,y_test= train_test_split(X,Y,test_size=0.15,stratify=Y,random_state=42)

In [None]:
# Shape of the one-hot test labels.
y_test.shape

## Scaling

In [None]:
# Standardise the features with statistics learned from the training split only,
# then apply the same transform to the test split.
scaler=StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Show the shapes only; displaying X_test itself would dump the whole array.
(X_train.shape, X_test.shape)

## Reshaping

In [None]:
# Shapes before the channel axis is added.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Give every sample a trailing channel axis of size 1: (samples, features, 1),
# matching the input shape declared for the Conv1D model below.
X_train = X_train.reshape(*X_train.shape[:2], 1)
X_test = X_test.reshape(*X_test.shape[:2], 1)

In [None]:
# Shapes after the channel axis has been added.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Number of features per sample; used as the sequence length of the model input.
X_train.shape[1]

# Create teacher and student model

In [None]:
# Teacher network: two Conv1D stages (each followed by batch normalisation and
# dropout), then a small dense head ending in a two-unit sigmoid output.
teacher = keras.Sequential([
    keras.Input(shape=(X_train.shape[1], 1)),

    # Convolutional stage 1
    keras.layers.Conv1D(filters=64, kernel_size=2, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),

    # Convolutional stage 2
    keras.layers.Conv1D(448, 2, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),

    # Dense classification head
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(2, activation='sigmoid'),
])

teacher.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Layer-by-layer overview of the teacher network.
teacher.summary()

## Teacher Model training

In [None]:
# Train the teacher and time the run; the test split is also used as validation data.
start_time = time.time()
teacher_history=teacher.fit(X_train,y_train,epochs=epoch,validation_data=(X_test,y_test),verbose=1)
end_time = time.time()
# Report the measured wall-clock training time.
print(f"Teacher training time: {end_time - start_time:.2f} s")

In [None]:
# Evaluate the trained teacher on the test split; the first returned value (the loss)
# is discarded and the accuracy is kept as the float baseline.
_, baseline_model_accuracy = teacher.evaluate(
    X_test, y_test, verbose=0)

print('Baseline test accuracy:', baseline_model_accuracy)
# Persist the model in .keras format; its file size is reported in the size comparison later.
teacher.save("teacher_model.keras") 

In [None]:
# Sigmoid scores of the teacher for every test sample, one column per class.
y_pred_probs = teacher.predict(X_test)

# Predicted and true class index: the column holding the largest value in each row
# of the score matrix and of the one-hot label matrix respectively.
y_pred, y_true = (scores.argmax(axis=1) for scores in (y_pred_probs, y_test))

In [None]:
def plot_confusion_matrix(y_true, y_pred, class_names, title="Confusion Matrix"):
    """Draw the confusion matrix of `y_pred` against `y_true` as an annotated heatmap.

    Args:
        y_true: true class labels, one per sample.
        y_pred: predicted class labels, one per sample.
        class_names: tick labels used on both axes.
        title: figure title.
    """
    counts = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(5, 4))
    heatmap_options = dict(annot=True, fmt='d', cmap='Blues',
                           xticklabels=class_names, yticklabels=class_names)
    sns.heatmap(counts, **heatmap_options)

    # Columns are predictions, rows are the ground truth.
    for set_text, text in ((plt.xlabel, "Predicted"), (plt.ylabel, "Actual"), (plt.title, title)):
        set_text(text)

    plt.tight_layout()
    plt.show()

# Tick labels for class indices 0 and 1 (the one-hot cell maps 'B' to 0 and 'M' to 1).
class_names = ['B', 'M']

# Confusion matrix of the float teacher model on the test split.
plot_confusion_matrix(y_true, y_pred, class_names, "Base Model Confusion Matrix")

## Post-Training Quantization

### Saving TFLite without Quantization

In [None]:
# Export the trained teacher to the "teacher_model" directory, which the TFLite
# converter cells below load with from_saved_model.
# NOTE(review): the original comment called this a "Keras 3.x" API, but the install
# cell pins tensorflow==2.13.0 - confirm Model.export is available in that version.
teacher.export("teacher_model")

In [None]:
# Convert the exported model to TFLite WITHOUT quantization (no optimizations are set
# on the converter); this file is the float reference for the size/accuracy comparison.
converter = tf.lite.TFLiteConverter.from_saved_model("teacher_model")
no_quantized_model = converter.convert()

# convert() returns the serialized model as bytes, so it is written in binary mode.
with open("teacher_model.tflite", "wb") as f:
    f.write(no_quantized_model)


### Saving TFLite with Quantization

In [None]:
# Convert to a post-training-quantized TFLite model using the converter's DEFAULT
# optimizations. No representative dataset is supplied here.
# NOTE(review): per the TFLite docs this configuration presumably yields dynamic-range
# quantization (quantized weights, float inputs/outputs) - confirm.
converter = tf.lite.TFLiteConverter.from_saved_model("teacher_model")
converter.optimizations = [tf.lite.Optimize.DEFAULT]
quantized_model = converter.convert()

with open("teacher_model_ptq.tflite", "wb") as f:
    f.write(quantized_model)


### Saving TFLite with Integer-only Quantization

In [None]:
def representative_data_gen():
  """Yield up to 100 single-sample float32 batches taken from X_train.

  Each yielded item is a one-element list holding the batch, which is the form
  assigned to `converter.representative_dataset` for calibration.
  """
  calibration_batches = tf.data.Dataset.from_tensor_slices(X_train).batch(1).take(100)
  # X_train is float64, so every batch is cast to float32 before being yielded.
  yield from ([np.array(batch, dtype=np.float32)] for batch in calibration_batches)


# Convert to an integer-only quantized TFLite model; representative_data_gen supplies
# the calibration samples.
converter = tf.lite.TFLiteConverter.from_saved_model("teacher_model")
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen

# Ensure that if any ops can't be quantized, the converter throws an error
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]

# Set the input and output tensors to uint8 (APIs added in r2.3); callers must
# therefore quantize inputs themselves before feeding this model.
converter.inference_input_type = tf.uint8
converter.inference_output_type = tf.uint8

quantized_int_model = converter.convert()


# Write the serialized integer-only model to disk.
with open("teacher_model_ptq_int.tflite", "wb") as f:
    f.write(quantized_int_model)


# Evaluate the Quantized Model

In [None]:
def evaluate_tflite_model(tflite_path, X_test, y_test):
    """Run a float-input TFLite model over a test set and report its accuracy.

    Uses the notebook-level `np` and `tf` imports instead of re-importing them
    inside the function.

    Note: a later cell of this notebook defines another function with the same
    name that returns only the accuracy; once that cell has run, it replaces
    this one.

    Args:
        tflite_path: path of the .tflite file to load.
        X_test: array of samples; each X_test[i] is fed as a batch of one.
        y_test: one-hot labels; y_test[i] corresponds to X_test[i].

    Returns:
        (accuracy, y_true, y_pred): accuracy as a fraction in [0, 1], plus the
        lists of true and predicted class indices in sample order.

    Raises:
        ValueError: if X_test is empty (accuracy would be undefined).
    """
    if len(X_test) == 0:
        raise ValueError("X_test is empty; cannot compute accuracy")

    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()

    input_index = interpreter.get_input_details()[0]['index']
    output_index = interpreter.get_output_details()[0]['index']

    y_pred = []
    y_true = []
    for i in range(len(X_test)):
        # The interpreter expects a leading batch axis and float32 data.
        input_data = np.expand_dims(X_test[i], axis=0).astype(np.float32)

        interpreter.set_tensor(input_index, input_data)
        interpreter.invoke()
        output = interpreter.get_tensor(output_index)  # shape (1, 2)

        y_pred.append(np.argmax(output[0]))   # class with the highest sigmoid score
        y_true.append(np.argmax(y_test[i]))   # one-hot row -> class index

    correct = sum(1 for predicted, actual in zip(y_pred, y_true) if predicted == actual)
    accuracy = correct / len(X_test)
    print(f"TFLite model accuracy: {accuracy * 100:.2f}%")
    return accuracy, y_true, y_pred


In [None]:
# Print the float baseline next to the accuracy of the post-training-quantized model.
print(f'Baseline test accuracy: {baseline_model_accuracy * 100:.2f} %')
# Note: y_true / y_pred are overwritten here with the quantized model's results.
acc, y_true, y_pred= evaluate_tflite_model("teacher_model_ptq.tflite", X_test, y_test)

In [None]:
# Raw confusion matrix of the quantized model (confusion_matrix is already imported
# in the notebook's import cell, so it is not re-imported here).
print("\n📊 Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

#print("\n🧾 Classification Report:")
#print(classification_report(y_true, y_pred))

In [None]:
# Confusion-matrix heatmap for the post-training-quantized model.
plot_confusion_matrix(y_true, y_pred, class_names, "Quantization Model Confusion Matrix")

## Model File Sizes
### Confirming the parameter datatype
#### PTQ 

In [None]:
# Load the PTQ model and report the dtypes of its first input and output tensors.
interpreter = tf.lite.Interpreter(model_path="teacher_model_ptq.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

for label, details in (("Input", input_details), ("Output", output_details)):
    print(f"{label} dtype:", details[0]['dtype'])

#### PTQ Int-Only

In [None]:
# Load the integer-only model and report the dtypes of its first input and output tensors.
interpreter = tf.lite.Interpreter(model_path="teacher_model_ptq_int.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

for label, details in (("Input", input_details), ("Output", output_details)):
    print(f"{label} dtype:", details[0]['dtype'])

### Measuring the Sizes

In [None]:
import os

# On-disk size of every saved model, in KB (bytes / 1024).
original_size, no_quant_size, quant_size, quant_int_size = (
    os.path.getsize(model_file) / 1024
    for model_file in (
        "teacher_model.keras",
        "teacher_model.tflite",
        "teacher_model_ptq.tflite",
        "teacher_model_ptq_int.tflite",
    )
)

for label, size_kb in (
    ("Original size", original_size),
    ("TFLite Original size", no_quant_size),
    ("Quantized size", quant_size),
    ("Quantized Int-Only size", quant_int_size),
):
    print(f"{label}: {size_kb:.2f} KB")

## Accuracy and loss of the base model

In [None]:
def plotLearningCurve(history,epochs):
  """Plot training vs. validation accuracy, then training vs. validation loss.

  Args:
    history: object whose `.history` dict holds the 'accuracy', 'val_accuracy',
      'loss' and 'val_loss' series.
    epochs: number of epochs; the x-axis runs from 1 to `epochs` inclusive.
  """
  epoch_axis = range(1, epochs + 1)

  # (metric key, figure title, y-axis label, legend position) for each figure.
  panels = (
      ('accuracy', 'Model Accuracy', 'Accuracy', 'lower right'),
      ('loss', 'Model Loss', 'Loss', 'upper right'),
  )
  for metric, title, y_label, legend_loc in panels:
    plt.plot(epoch_axis, history.history[metric])
    plt.plot(epoch_axis, history.history['val_' + metric])
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend(['Train','Validation'], loc=legend_loc)
    plt.grid(visible=True)
    plt.show()

In [None]:
# Learning curves of the teacher over the configured number of epochs.
plotLearningCurve(teacher_history,epoch)

## PTQ Accuracies
### Function for PTQ Accuracy

In [None]:
def evaluate_tflite_model(tflite_path, X_test, y_test):
    """Return the accuracy (fraction in [0, 1]) of a float-input TFLite model.

    Note: this redefines the function of the same name declared earlier in the
    notebook, which returned (accuracy, y_true, y_pred); after this cell runs,
    only the accuracy is returned.

    Args:
        tflite_path: path of the .tflite file to load.
        X_test: array of samples; each X_test[i] is fed as a float32 batch of one.
        y_test: one-hot labels; y_test[i] corresponds to X_test[i].
    """
    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()
    input_index = interpreter.get_input_details()[0]['index']
    output_index = interpreter.get_output_details()[0]['index']

    def _predict(sample):
        # Feed one sample with a leading batch axis and return the winning class index.
        batch = np.expand_dims(sample, axis=0).astype(np.float32)
        interpreter.set_tensor(input_index, batch)
        interpreter.invoke()
        return np.argmax(interpreter.get_tensor(output_index))

    hits = sum(
        1
        for i in range(len(X_test))
        if _predict(X_test[i]) == np.argmax(y_test[i])
    )
    return hits / len(X_test)


### Function for Integer-Only PTQ

In [None]:
def evaluate_tflite_int_model(tflite_path, X_test, y_test):
    """Return the accuracy (fraction in [0, 1]) of a TFLite model with quantized I/O.

    Float samples are mapped to the model's integer input domain with the
    scale / zero-point reported by the interpreter. Unlike a bare
    `x / scale + zero_point` followed by a cast, the values are rounded to the
    nearest integer and clipped to the dtype's range, so samples outside the
    calibrated range saturate instead of wrapping around. Models whose input is
    not quantized (float dtype or a zero scale) receive the samples unchanged.

    Args:
        tflite_path: path of the .tflite file to load.
        X_test: array of float samples; each X_test[i] is fed as a batch of one.
        y_test: one-hot labels; y_test[i] corresponds to X_test[i].

    Raises:
        ValueError: if X_test is empty (accuracy would be undefined).
    """
    if len(X_test) == 0:
        raise ValueError("X_test is empty; cannot compute accuracy")

    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()[0]
    output_details = interpreter.get_output_details()[0]

    input_dtype = input_details["dtype"]
    input_scale, input_zero_point = input_details["quantization"]
    # A zero scale means the tensor carries no quantization parameters.
    is_quantized = np.issubdtype(input_dtype, np.integer) and input_scale != 0
    if is_quantized:
        dtype_limits = np.iinfo(input_dtype)

    correct = 0
    for i in range(len(X_test)):
        feat = X_test[i]
        if is_quantized:
            # real value -> quantized value, saturating at the dtype bounds
            feat = np.clip(np.round(feat / input_scale + input_zero_point),
                           dtype_limits.min, dtype_limits.max)

        input_data = np.expand_dims(feat, axis=0).astype(input_dtype)
        interpreter.set_tensor(input_details['index'], input_data)
        interpreter.invoke()
        output = interpreter.get_tensor(output_details['index'])
        # argmax is unaffected by the (monotonic) output quantization,
        # so the raw integer scores can be compared directly.
        predicted = np.argmax(output)
        actual = np.argmax(y_test[i])
        if predicted == actual:
            correct += 1

    accuracy = correct / len(X_test)
    return accuracy


In [None]:
# Accuracy of the three exported TFLite models on the test split.
# The paths are the same relative file names the export cells wrote, so this cell
# does not depend on the working directory being /kaggle/working.
teacher_model_acc = evaluate_tflite_model("teacher_model.tflite", X_test, y_test)
ptq_model_acc = evaluate_tflite_model("teacher_model_ptq.tflite", X_test, y_test)
ptq_int_model_acc = evaluate_tflite_int_model("teacher_model_ptq_int.tflite", X_test, y_test)

print("Teacher:",teacher_model_acc)
print("Quantization:",ptq_model_acc)
print("Quantization Int-Only:",ptq_int_model_acc)

## Total Parameters

In [None]:
# Total number of scalar values across all weight tensors of the teacher:
# the product of each tensor's dimensions, summed over the tensors.
tot_param = sum(
    int(np.prod(tuple(weight.shape), dtype=np.int64))
    for weight in teacher.weights
)

print(tot_param)

# Result Summary

In [None]:
# Tab-aligned summary table: one column per model variant, one row per metric.
print("Metric\t\t\tTeacher\t\tPTQ\t\tPTQ Int-Only")
print("="*70)

# Sizes are the KB values computed in the file-size cell; accuracies are the
# fractions from the TFLite evaluation cell, shown here as percentages.
print("tflite File size (kb)\t%.2f\t\t%.2f\t\t%.2f"%(no_quant_size,quant_size,quant_int_size))
print("Accuracy (%%)\t\t%.2f\t\t%.2f\t\t%.2f"%(teacher_model_acc*100,ptq_model_acc*100,ptq_int_model_acc*100))
# The parameter count is only computed for the Keras teacher model.
print("Total Parameters\t%d\t\t-\t\t-"%(tot_param))
print("="*70)
print("*File size for Keras format %.2f"%(original_size))