# BlueSeer - Training, Quantization, and Export

In [None]:
import os, io, time, copy

import sklearn
import tensorflow as tf
import tensorflow_model_optimization as tfmot
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import absl
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pickle
import seaborn as sn

# Remove Tensorflow C-level logging and warnings
tf.get_logger().setLevel('ERROR')
absl.logging.set_verbosity(absl.logging.ERROR)
# NOTE(review): this variable is assigned after `import tensorflow` has
# already run; it is presumably read when TensorFlow's native library loads,
# so it may have no effect here -- confirm and consider setting it before the
# import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Force font embedding when creating figure as PDF
# (42 presumably selects TrueType fonts -- see matplotlib's rcParams docs).
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# 1. Dataset

In [None]:
# Locations of the raw dataset folders and of the generated C++ source file.
train_set_path = "../dataset/train"
test_set_path = "../dataset/test"
output_path = "./blueseer_model.cc"

base_path = "./"

# Directory that receives the exported model artifacts; created on demand.
MODELS_DIR = os.path.join(base_path, 'models')
os.makedirs(MODELS_DIR, exist_ok=True)
# Build the artifact paths with os.path.join: plain string concatenation
# dropped the separator and produced names such as "./modelsmodel", i.e.
# files next to MODELS_DIR instead of inside it.
MODEL_TF = os.path.join(MODELS_DIR, 'model')
MODEL_TFLITE = os.path.join(MODELS_DIR, 'model.tflite')
MODEL_TFLITE_CPP = os.path.join(MODELS_DIR, 'blueseer_model.cc')

### Environment Category Labelling

In [None]:
# Collapse the fine-grained location labels gathered during the two data
# collection periods into one shared set of environment categories.
CLASS_MAPPING = {
    "street": "street",
    "park": "nature",
    "apartment": "home",
    "supermarket": "shopping",
    "clothing_store": "shopping",
    "train": "transport",
    "bus": "transport",
    "gym": "entertainment",
    "car": "transport",
    "house": "home",
    "nature": "nature",
    "restaurant": "restaurant",
    "cinema": "entertainment",
    "concert": "entertainment",
    "plane": "transport",
    "bar": "restaurant",
    "shopping": "shopping",
    "transport": "transport",
    "home": "home",
    "office": "office",
    "mensa": "restaurant",
    "lecture": "university",
}


def original_classes_to_blueseer_classes(Y_labels):
    """Translate raw sample labels into BlueSeer environment categories.

    Each label is looked up in ``CLASS_MAPPING``; a label without an entry is
    passed through unchanged. Returns a new list in the input order.
    """
    translated = []
    for raw_label in Y_labels:
        translated.append(CLASS_MAPPING.get(raw_label, raw_label))
    return translated

### Parsing collected samples

In [None]:
# Collect the paths of the .CSV files stored below a dataset folder
def find_csv_filenames(path_to_dir, suffix=".CSV"):
    """Return the sorted paths of all entries ending in ``suffix``.

    Every directory below ``path_to_dir`` is visited; entries located directly
    in ``path_to_dir`` itself are skipped. Each path is built as
    ``<directory>/<entry>``.
    """
    matches = []
    for directory, _subdirs, _files in os.walk(path_to_dir):
        if directory == path_to_dir:
            # The top-level folder itself is not searched.
            continue
        for entry in os.listdir(directory):
            if entry.endswith(suffix):
                matches.append(directory + "/" + entry)
    return sorted(matches)


# Load each .CSV file into its own pandas DataFrame
def parse_files_to_df(files):
    """Read every path in ``files`` with ``pd.read_csv``.

    Progress is printed whenever the integer percentage of loaded files
    changes. Returns the DataFrames in the same order as ``files``.
    """
    total = len(files)
    frames = []
    shown_percent = -1
    for count, path in enumerate(files, start=1):
        frames.append(pd.read_csv(path))
        percent = int(count / total * 100)
        if percent != shown_percent:
            print(f"{percent}% loaded ({count}/{total})")
            shown_percent = percent
    return frames



# The 23 service identifiers seen most often in the selected environments.
SERVICES = [
    "0af0", "1802", "180f", "1812", "1826", "2222", "ec88", "fd5a",
    "fd6f", "fdd2", "fddf", "fe03", "fe07", "fe0f", "fe61", "fe9f",
    "fea0", "feb9", "febe", "fee0", "ff0d", "ffc0", "ffe0",
]

# Collected features that are left out when building the dataset.
# The leading space is part of each column name.
MUST_REMOVE_COLUMNS = [
    " services",
    " manufacturer_data_lengths",
    " time_point_1",
    " time_point_2",
    " time_point_3",
]

def process_files(dataframes,
                  without_services = False,
                  only_labels = None,
                  remove_columns = None,
                  verbose=False,
                 ):
    """Turn per-sample DataFrames into a feature matrix and a label list.

    Args:
        dataframes: non-empty sequence of DataFrames, one per sample. The
            feature columns are taken from the first DataFrame; each one must
            contain a ``label`` column.
        without_services: when True, drop the columns named after ``SERVICES``
            (each prefixed with a space). A missing service column raises
            ``ValueError``.
        only_labels: optional collection of BlueSeer categories. When given,
            only samples whose mapped category is in it are kept. (Previously
            this argument was accepted but ignored.)
        remove_columns: optional extra column names to drop; names that are
            not present are ignored.
        verbose: print the categories found and their sample counts.

    Returns:
        ``(X_datapoints, Y_labels, unique_labels, available_columns)`` where
        ``X_datapoints`` has one flattened row per kept sample, ``Y_labels``
        is the list of mapped categories, ``unique_labels`` is the sorted list
        of distinct categories and ``available_columns`` lists the feature
        columns that were used.
    """
    available_columns = dataframes[0].columns.tolist()
    # The label is the target, never a feature.
    available_columns.remove("label")

    # Deprecated columns plus caller-requested ones; absent names are ignored
    # (the names carry a leading space, as in the CSV header).
    unwanted = set(MUST_REMOVE_COLUMNS)
    if remove_columns is not None:
        unwanted.update(remove_columns)
    available_columns = [col for col in available_columns if col not in unwanted]

    # Optionally drop the 23 most-common service columns.
    if without_services:
        for serv in SERVICES:
            available_columns.remove(f" {serv}") # there is always a leading space!

    # One flattened feature row per DataFrame; the label comes from its first row.
    X_datapoints = np.array([df[available_columns].to_numpy().flatten()
                             for df in dataframes])
    raw_labels = [df.iloc[0]["label"] for df in dataframes]
    mapped_labels = original_classes_to_blueseer_classes(raw_labels)

    # The entertainment and university classes are removed from the dataset;
    # `only_labels`, when provided, restricts the remaining samples further.
    excluded = {'entertainment', 'university'}
    allowed = None if only_labels is None else set(only_labels)
    data_to_keep = np.array(
        [lbl not in excluded and (allowed is None or lbl in allowed)
         for lbl in mapped_labels],
        dtype=bool,
    )
    X_datapoints = X_datapoints[data_to_keep]
    Y_labels = list(np.array(mapped_labels)[data_to_keep])

    # Sorted so that the label -> class index assignment is reproducible
    # between runs (plain set iteration order is not).
    unique_labels = sorted(set(Y_labels))
    if verbose:
        print(unique_labels)
        for lbl in unique_labels:
            print(f"{lbl}: {Y_labels.count(lbl)}")

    return X_datapoints, Y_labels, unique_labels, available_columns




### Data Normalization

In [None]:
# Z-score normalization: derive the mean and standard deviation of each feature
def get_normalization_params(X):
    """Return ``(mean, std)`` computed along axis 0 of ``X``, ignoring NaNs."""
    samples = np.array(X)
    return np.nanmean(samples, axis=0), np.nanstd(samples, axis=0)

# X_normalized = (X - mean(X)) / STD(X)
def normalize_data(X, Y_str, feature_mean, feature_std, labels):
    """Z-score the features and encode the string labels as integers.

    The divisor is ``feature_std`` plus the float machine epsilon. Each entry
    of ``Y_str`` is replaced by its position in ``labels`` (``ValueError`` if
    it is missing). Returns ``(X, Y)`` with ``Y`` as an ``int8`` array.
    """
    scaled = (np.array(X) - feature_mean) / (feature_std + np.finfo(float).eps)
    # Positions are collected as floats first and then cast to int8.
    class_ids = np.fromiter(
        (labels.index(name) for name in Y_str),
        dtype=float,
        count=len(Y_str),
    )
    return scaled, np.array(class_ids, dtype=np.int8)

In [None]:
# Render the normalization parameters as C++ array definitions.
# This function is called when generating the C++ code of BlueSeer.
def generate_normalization_parameters_CPP(mean_list, std_list, labels):
    """Return three C++ source lines: the mean, std and label arrays.

    Numeric entries are rendered with ``str``; labels are wrapped in double
    quotes. Returns ``(mean_str, std_str, labels_str)``.
    """
    def as_c_array(declaration, items):
        # "<declaration> = {a, b, c};"
        return declaration + " = {" + ", ".join(items) + "};"

    mean_items = [str(value) for value in mean_list]
    std_items = [str(value) for value in std_list]
    label_items = ["\"" + name + "\"" for name in labels]

    return (
        as_c_array("const float mean_list[]", mean_items),
        as_c_array("const float std_list[]", std_items),
        as_c_array("const char available_env[][16]", label_items),
    )

### Class Equalization

In [None]:
def equalize_class_distribution(X, Y, remove_class=None):
    """Down-sample every class to the size of the smallest one.

    Args:
        X: array of samples, indexed along axis 0.
        Y: array of class identifiers, one per row of ``X``.
        remove_class: a single class or a collection of classes to leave out
            of the result entirely. ``None`` (or an empty collection) keeps
            every class.

    Returns:
        ``(X_new, Y_new)``: the kept rows grouped by class (in the order given
        by ``np.unique``), with the same number of randomly chosen rows per
        class. Both are empty when no class remains. ``X`` itself is not
        modified.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)

    # Accept a scalar as well as a collection. The previous implementation
    # tried to remove the collection object itself from the class list and
    # silently ignored the failure.
    if remove_class is None:
        excluded = []
    else:
        excluded = np.atleast_1d(remove_class).tolist()
    classes = [cls for cls in np.unique(Y) if cls not in excluded]
    if not classes:
        return X[:0], Y[:0]

    # Boolean-mask indexing copies the rows, so shuffling below is local.
    per_class_X = {cls: X[Y == cls] for cls in classes}
    smallest_class = min(rows.shape[0] for rows in per_class_X.values())

    # Shuffle each bin, then keep an equal number of instances per class.
    for cls in classes:
        np.random.shuffle(per_class_X[cls])
    X_new = np.concatenate(
        [per_class_X[cls][:smallest_class] for cls in classes], axis=0)
    Y_new = np.repeat(np.array(classes), smallest_class)
    return X_new, Y_new

### Dataset Preparation

In [None]:
def prepare_datasets(training_path,
                     test_path,
                     without_services=True,
                     verbose=False,
                     cache_filename='prepared_dataset.pickle',
                    ):
    """Load, normalize and cache the training and test datasets.

    The normalization parameters are computed on the training set only and
    then applied to both sets. The test set is additionally equalized so that
    every class has the same number of samples.

    Args:
        training_path: folder searched for the training CSV files.
        test_path: folder searched for the test CSV files.
        without_services: forwarded to ``process_files``.
        verbose: print progress information.
        cache_filename: file the prepared dictionary is pickled to. Failing
            to write it is reported but does not abort the preparation.

    Returns:
        A dict with the keys ``training`` and ``test`` (each an ``(X, Y)``
        tuple), ``feature_mean``, ``feature_std``, ``labels`` and
        ``features``.
    """
    if verbose:
        print("Loading the training dataset. This can take a few minutes.")
    train_dataframes = parse_files_to_df(find_csv_filenames(training_path))
    if verbose:
        print("Loading test dataset.")
    test_dataframes = parse_files_to_df(find_csv_filenames(test_path))
    if verbose:
        print(f"Training set: {len(train_dataframes)} samples")
        print(f"Test set: {len(test_dataframes)} samples")

    if verbose:
        print("Processing the training CSV files.")
    X_train, Y_train, labels, features = process_files(train_dataframes,
                                                         without_services=without_services,
                                                         verbose=verbose,
                                                        )
    # Per-feature normalization parameters come from the training data only.
    feature_mean, feature_std = get_normalization_params(X=X_train)
    X_train, Y_train = normalize_data(X_train,
                                      Y_train,
                                      feature_mean,
                                      feature_std,
                                      labels)
    if verbose:
        print("Processing the test CSV files.")
    X_test, Y_test, _, _ = process_files(test_dataframes,
                                         only_labels = labels,
                                         without_services=without_services,
                                         verbose=verbose,
                                        )
    # The test set reuses the training mean/std and label order.
    X_test, Y_test = normalize_data(X_test,
                                    Y_test,
                                    feature_mean,
                                    feature_std,
                                    labels
                                   )

    # Equalize the evaluation set so every class weighs the same.
    if verbose:
        print("equalizing test set.")
    X_test, Y_test = equalize_class_distribution(X_test, Y_test)

    prepared = {
        'training': (X_train, Y_train),
        'test': (X_test, Y_test),
        'feature_mean': feature_mean,
        'feature_std': feature_std,
        'labels': labels,
        'features': features,
    }

    # Best-effort cache; the context manager guarantees the file is flushed
    # and closed before anyone tries to load it again.
    try:
        with open(cache_filename, 'wb') as f:
            pickle.dump(prepared, f)
    except (OSError, pickle.PicklingError) as e:
        print(e)

    return prepared

def load_prepared_datasets(filename='prepared_dataset.pickle'):
    """Load and return the object pickled in ``filename``.

    Raises whatever ``open`` or ``pickle.load`` raise (for example
    ``FileNotFoundError`` when the file does not exist).
    """
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(filename, 'rb') as f:
        return pickle.load(f)

# Placeholder for a train/test split of the training dataset:
# currently no data is held out.
def split_training_set(X_train, Y_train):
    """Return the inputs unchanged, followed by ``None`` for both held-out parts."""
    held_out_X = held_out_Y = None
    return X_train, Y_train, held_out_X, held_out_Y

# 2. BlueSeer Neural Network Model

In [None]:
def get_BlueSeer_model(num_classes):
    """Build the (uncompiled) BlueSeer classifier.

    The network is a 500-unit ReLU dense layer, a 0.5 dropout layer and a
    softmax output layer with ``num_classes`` units; both dense layers use an
    L2 kernel regularizer of 1e-5.
    """
    layers = tf.keras.layers
    l2 = tf.keras.regularizers.l2
    return tf.keras.Sequential([
        layers.Dense(units=500,
                     activation="relu",
                     kernel_regularizer=l2(1e-5),
                     name="layer0_dense"),
        layers.Dropout(0.5, name="layer0_dropout"),
        layers.Dense(units=num_classes,
                     activation="softmax",
                     kernel_regularizer=l2(1e-5),
                     name="layer2_class_output"),
    ])


# Learning-rate decay schedule
def lr_scheduler(epoch, lr):
    """Return the learning rate to use for ``epoch``.

    The first four epochs (0-3) use a fixed rate of 0.01; from then on the
    current rate ``lr`` is multiplied by ``exp(-0.1)`` every epoch.

    The result is always a plain Python float: returning a TensorFlow tensor
    (as the previous ``tf.math.exp`` version did) is rejected by Keras
    versions whose ``LearningRateScheduler`` requires a float.
    """
    if epoch < 4:
        return 0.01
    return float(lr) * float(np.exp(-0.1))


def get_smallest_model(num_classes, input_shape):
    """Build the minimal (uncompiled) classifier: a single softmax layer.

    The layer has ``num_classes`` units, takes inputs of ``input_shape`` and
    uses an L2 kernel regularizer of 1e-5.
    """
    output_layer = tf.keras.layers.Dense(
        units=num_classes,
        input_shape=input_shape,
        activation="softmax",
        kernel_regularizer=tf.keras.regularizers.l2(1e-5),
        name="layer2_class_output",
    )
    return tf.keras.Sequential([output_layer])


# 3. Training

In [None]:
def plot_results(history):
    """Plot the training and validation curves stored in ``history.history``.

    Two stacked panels are drawn, loss on top and sparse categorical accuracy
    below; the validation curve of each metric is labelled 'test'.
    """
    curves = history.history
    panels = (
        (211, 'Loss', 'loss'),
        (212, 'Accuracy', 'sparse_categorical_accuracy'),
    )
    for position, title, metric in panels:
        plt.subplot(position)
        plt.title(title)
        plt.plot(curves[metric], label='train')
        # Validation metrics are stored under the same name with a 'val_' prefix.
        plt.plot(curves['val_' + metric], label='test')
        plt.legend()
    plt.show()

In [None]:
def train_model(X_train,
                Y_train,
                X_val,
                Y_val,
                labels,
                epochs=20,
                batch_size=32,
                verbose=0,
               ):
    """Train the BlueSeer model, then fine-tune it with quantization awareness.

    Args:
        X_train, Y_train: training features (2-D) and integer class ids.
        X_val, Y_val: validation features and integer class ids.
        labels: list of class names; its length sets the output size.
        epochs: epochs of regular training; the quantization-aware fine-tuning
            runs for half as many (at least one).
        batch_size: mini-batch size for both phases.
        verbose: Keras verbosity; when truthy the training curves are plotted.

    Returns:
        ``(model, val_accuracy)``: the quantization-aware model and its
        validation accuracy after the last fine-tuning epoch.
    """
    # Exponentially decreasing learning rate, shared by both phases.
    lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)

    def compile_for_training(net):
        # A fresh optimizer per phase, so no momentum state is carried over.
        optimizer = tf.keras.optimizers.SGD(learning_rate=0.01,
                                            decay=1e-6,
                                            momentum=0.9)
        # `metrics` is passed as a list: a bare string is not accepted by
        # every Keras version.
        net.compile(loss='sparse_categorical_crossentropy',
                    optimizer=optimizer,
                    metrics=['sparse_categorical_accuracy'],
                   )

    def fit(net, num_epochs):
        return net.fit(X_train,
                       Y_train,
                       epochs=num_epochs,
                       validation_data=(X_val, Y_val),
                       batch_size=batch_size,
                       callbacks=[lr_callback],
                       verbose=verbose,
                      )

    model = get_BlueSeer_model(len(labels))
    compile_for_training(model)
    # Derive the input width from the data instead of hard-coding 115 features.
    model.build(input_shape=(None, np.shape(X_train)[1]))
    model.summary()

    # Step 1: Training
    history = fit(model, epochs)
    if verbose:
        plot_results(history)

    # Step 2: Quantization-Aware Training (fine-tuning)
    model = tfmot.quantization.keras.quantize_model(model)
    compile_for_training(model)
    # At least one epoch, otherwise no validation accuracy would be recorded.
    history = fit(model, max(1, epochs // 2))

    return model, history.history['val_sparse_categorical_accuracy'][-1]
        

In [None]:
# Reuse the cached dataset when it can be loaded; otherwise rebuild it from
# the CSV files.
prepared_dict = None
try:
    prepared_dict = load_prepared_datasets()
except Exception:
    prepared_dict = prepare_datasets(
        train_set_path,
        test_set_path,
        without_services=True,
        verbose=True,
    )

X_train_original, Y_train_original = prepared_dict['training']
X_test, Y_test = prepared_dict['test']
labels = prepared_dict['labels']
features = prepared_dict['features']
feature_mean = prepared_dict['feature_mean']
# The standard deviation kept from here on is clamped to at least 1.0.
# NOTE(review): the datasets themselves were normalized with the unclamped
# values (see normalize_data) -- confirm the clamped ones are what should be
# exported.
feature_std = np.maximum(prepared_dict['feature_std'], 1.0)

In [None]:
# Train ten models on different random train/validation splits and keep the
# one that scores best on the test set.
# NOTE(review): selecting on test accuracy makes the later "unseen test data"
# figure optimistic; consider selecting on `val_acc` instead.
best_model = None
best_acc = float("-inf")
for i in range(10):
    # Create the validation split
    X_train, X_val, Y_train, Y_val = train_test_split(X_train_original, Y_train_original, test_size=0.1)
    # Create and eval model
    model, val_acc = train_model(X_train,Y_train,
                                  X_val,Y_val,
                                  labels,
                                  verbose=1,
                                 )
    loss, test_acc = model.evaluate(X_test, Y_test)
    if test_acc > best_acc:
        # Remember the score as well: without this update every model with a
        # non-zero accuracy replaced the previous one, so the last model won.
        best_acc = test_acc
        best_model = model

# 4. Evaluation

In [None]:
# Evaluate the selected model on the test set and report its accuracy as a
# percentage.
loss, test_acc = best_model.evaluate(X_test, Y_test)
print(f"Model Accuracy on the unseen test data: {test_acc*100:.2f}%")

### Confusion Matrix

In [None]:
# Predicted class = index of the highest softmax output.
predictions = np.argmax(best_model.predict(X_test), axis=1)
# Pass the full list of class ids so the matrix always has one row/column per
# label, even when a class is absent from the test set and the predictions.
counts = confusion_matrix(Y_test, predictions, labels=np.arange(len(labels)))
print(counts)

class_names = [lbl.capitalize() for lbl in labels]
cm = pd.DataFrame(counts, index=class_names, columns=class_names)

# Normalize every row by its own total (guarding against empty rows) instead
# of by the first row's total, which is only correct for a perfectly
# equalized test set.
row_totals = np.maximum(counts.sum(axis=1, keepdims=True), 1)
cm_percent = pd.DataFrame(counts / row_totals * 100,
                          index=class_names,
                          columns=class_names)

plt.figure(figsize = (4,4))
ax = sn.heatmap(cm_percent,
           annot=True,
           fmt='.1f',
           cmap="Blues",
           cbar=False,
              )
ax.set_ylabel("True Class", fontdict= {'fontweight':'bold'})
ax.set_xlabel("Predicted Class", fontdict= {'fontweight':'bold'})

# plt.tight_layout()
# matplotlib.rcParams.update({'font.size': 10})
plt.show()

# 5. Model Conversion & Export to C++

In [None]:
def convert_model(original_model, X_train):
    """Convert a Keras model into a quantized TensorFlow Lite flatbuffer.

    Args:
        original_model: the Keras model to convert.
        X_train: 2-D array of samples used as the representative dataset for
            the quantization.

    Returns:
        ``(embedded_model, size)``: the serialized TFLite model and its size
        in bytes.
    """
    converter = tf.lite.TFLiteConverter.from_keras_model(original_model)
    # Set the optimization flag
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    # Restrict the conversion to the built-in TFLite operator set.
    # NOTE(review): an earlier comment described this as enforcing
    # integer-only quantization; that is presumably the job of
    # TFLITE_BUILTINS_INT8 -- confirm against the TFLite documentation.
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]
    # We keep float for input and output
    converter.inference_input_type = tf.float32
    converter.inference_output_type = tf.float32

    # Provide a representative dataset to ensure we quantize correctly:
    # one float32 sample of shape (1, num_features) at a time.
    calibration_samples = np.asarray(X_train, dtype=np.float32)

    def representative_dataset():
        for sample in calibration_samples:
            yield [sample.reshape(1, -1)]
    converter.representative_dataset = representative_dataset

    # Convert model
    embedded_model = converter.convert()
    # The serialized model is a bytes object, so its length is its size on
    # disk; no temporary file is needed to measure it.
    size = len(embedded_model)

    return embedded_model, size

def export_model(model_tflite,
                 target_filename,
                 feature_mean,
                 feature_std,
                 labels,
                 X_train,
                 verbose = True):
    """Write the model and its normalization constants as a C++ source file.

    Three files are produced: ``<target_filename>_tflite`` (the raw
    flatbuffer), ``<target_filename>_tflm`` (the flatbuffer as a C array in
    the layout of ``xxd -i``, named ``g_model``) and ``target_filename``
    itself, which holds the mean/std/label arrays followed by the model
    array. A stale ``./constants.cc`` is deleted if present.

    The C array is generated in Python, so the function no longer depends on
    notebook shell magics or on the ``xxd``/``sed`` tools.

    Args:
        model_tflite: serialized TFLite model (bytes).
        target_filename: path of the C++ file to write.
        feature_mean, feature_std: per-feature normalization parameters.
        labels: class names, in class-index order.
        X_train: unused; kept for interface compatibility.
        verbose: unused; kept for interface compatibility.
    """
    def xxd_style_source(payload, symbol):
        # Same layout as `xxd -i`: 12 bytes per line, two-space indent,
        # followed by a `<symbol>_len` definition.
        rows = []
        for start in range(0, len(payload), 12):
            chunk = payload[start:start + 12]
            rows.append("  " + ", ".join(f"0x{byte:02x}" for byte in chunk))
        body = ",\n".join(rows)
        return (f"unsigned char {symbol}[] = {{\n{body}\n}};\n"
                f"unsigned int {symbol}_len = {len(payload)};\n")

    # Get normalization as string, to transform into C-compliant file
    mean_str, std_str, labels_str = generate_normalization_parameters_CPP(
        mean_list=feature_mean,
        std_list=feature_std,
        labels=labels)

    tflite_filename = f"{target_filename}_tflite"
    tflm_filename = f"{target_filename}_tflm"
    with open(tflite_filename, "wb") as file:
        file.write(model_tflite)

    # C source form of the model, i.e. a TensorFlow Lite for Microcontrollers
    # model.
    data = xxd_style_source(model_tflite, "g_model")
    with open(tflm_filename, "w") as file:
        file.write(data)

    # Remove a stale constants file, if any.
    if os.path.isfile("./constants.cc"):
        os.remove("./constants.cc")

    # Keep everything from the opening brace onwards; the trailing
    # "unsigned int g_model_len" definition becomes "const int g_model_len".
    # NOTE(review): the array name is spelled "bluesser_model"; it is left
    # untouched because other sources may reference it -- confirm before
    # renaming.
    model_str = "alignas(16) const unsigned char bluesser_model[] = "
    model_str += data[data.index("{"):].replace("unsigned", "const")

    output_str = ""
    output_str += "#include \"constants.h\"\n"
    output_str += mean_str +"\n"
    output_str += std_str + "\n"
    output_str += labels_str + "\n"
    output_str += "const int available_env_len = "+str(len(labels)) +";\n"
    output_str += model_str

    with open(target_filename, "w") as file:
        file.write(output_str)

In [None]:
# Convert the selected model to TFLite, export it as C++ source and report
# its size. `X_train` is the training split left over from the last iteration
# of the model-selection loop.
model_tflite, size = convert_model(best_model, X_train)
export_model(model_tflite,"models/blueseer_model.cc",feature_mean,feature_std,labels,X_train)
# `size` is in bytes, so dividing by 1000 reports decimal kilobytes.
print(f"Model converted! Final Model Size: {size/1000} KB")