In [1]:
import warnings
warnings.filterwarnings("ignore")

import json
import pprint
import numpy as np
import tempfile
from qkeras.autoqkeras import *
from qkeras import *
from qkeras.utils import model_quantize
from qkeras.qtools import run_qtools
from qkeras.qtools import settings as qtools_settings
from tensorflow.keras.initializers import *
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import *

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import tensorflow.nn as nn

2022-10-29 13:02:32.135460: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
"""
Let's read the training csvs

"""
import pandas as pd
import numpy as np

df = pd.read_csv("./higgs-boson/training/training.csv")
print(df.head()) # just printing what it looks like
df = df.drop(columns=['EventId', 'Weight']) # drop columns we don't use

   EventId  DER_mass_MMC  DER_mass_transverse_met_lep  DER_mass_vis  DER_pt_h  \
0   100000       138.470                       51.655        97.827    27.980   
1   100001       160.937                       68.768       103.235    48.146   
2   100002      -999.000                      162.172       125.953    35.635   
3   100003       143.905                       81.417        80.943     0.414   
4   100004       175.864                       16.915       134.805    16.405   

   DER_deltaeta_jet_jet  DER_mass_jet_jet  DER_prodeta_jet_jet  \
0                  0.91           124.711                2.666   
1               -999.00          -999.000             -999.000   
2               -999.00          -999.000             -999.000   
3               -999.00          -999.000             -999.000   
4               -999.00          -999.000             -999.000   

   DER_deltar_tau_lep  DER_pt_tot  ...  PRI_jet_num  PRI_jet_leading_pt  \
0               3.064      41.928  ...   

In [3]:
"""
We filter out bad values
"""
df = df.replace(-999.000, np.nan)
df = df.dropna()
print(df.head())

    DER_mass_MMC  DER_mass_transverse_met_lep  DER_mass_vis  DER_pt_h  \
0        138.470                       51.655        97.827    27.980   
5         89.744                       13.550        59.149   116.344   
6        148.754                       28.862       107.782   106.130   
11       114.744                       10.286        75.712    30.816   
23       141.481                        0.736       111.581   174.075   

    DER_deltaeta_jet_jet  DER_mass_jet_jet  DER_prodeta_jet_jet  \
0                  0.910           124.711                2.666   
5                  2.636           284.584               -0.540   
6                  0.733           158.359                0.113   
11                 2.563           252.599               -1.401   
23                 1.955           364.344               -0.923   

    DER_deltar_tau_lep  DER_pt_tot  DER_sum_pt  ...  PRI_met_sumet  \
0                3.064      41.928     197.760  ...        258.733   
5                1

In [4]:
"""
We get rid of EventId column. Label column is our y (output). Rest is X (input)
"""
X = df.iloc[:, :-1].to_numpy()
Y = df.iloc[:, -1].to_numpy() # just get Label
print(f"X shape: {X.shape}")
print(f"Y shape: {Y.shape}")
print(f"Y values b4: {Y}")
# turn the signal and background to 1 and 0 respectively
Y[Y == "s"] = 1
Y[Y == "b"] = 0
print(np.any(Y == "s") or np.any(Y == "b")) # check if there's any non number left
print(f"Y values after: {Y}")
Y = Y.astype('int32').reshape((-1,1))

X shape: (68114, 30)
Y shape: (68114,)
Y values b4: ['s' 'b' 's' ... 's' 's' 'b']
False
Y values after: [1 0 1 ... 1 1 0]


In [5]:
"""
We do the train test split. We don't shuffle the data just yet, we can do that 
pytorch data loader
"""
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Y_train shape: {Y_train.shape}")
print(f"Y_test shape: {Y_test.shape}")

X_train shape: (54491, 30)
X_test shape: (13623, 30)
Y_train shape: (54491, 1)
Y_test shape: (13623, 1)


In [6]:
"""
Defining the DNN
We are going to be using qkeras for quantization, so we will be importing a model
made in the DNN demo, but in keras/tensforflow, instead of pytorch
"""

input_dim = X_train.shape[-1]
hidden_dim = 100
n_hidden_layers = 3




model = Sequential()
model.add(BatchNormalization(input_shape=(input_dim,), name='norm'))
model.add(Dense(hidden_dim, name='input_layer', kernel_regularizer=l2(0.0001)))

for idx in range(n_hidden_layers): # hidden layers
    model.add(Activation(activation="relu", name=f'activation_{idx}'))
    model.add(Dense(hidden_dim, name=f'fc{idx}', kernel_regularizer=l2(0.0001)))

model.add(Activation(activation="relu", name=f'activation_output'))
model.add(Dense(1, name='output', kernel_regularizer=l2(0.0001)))
model.add(Activation(activation='sigmoid', name='sigmoid'))

<tensorflow.python.distribute.distribute_lib._DefaultDistributionStrategy object at 0x7fc8da288640>


2022-10-29 13:02:33.793853: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-10-29 13:02:33.794480: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-10-29 13:02:33.825450: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-29 13:02:33.825929: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:27:00.0 name: NVIDIA GeForce RTX 3090 computeCapability: 8.6
coreClock: 1.725GHz coreCount: 82 deviceMemorySize: 23.67GiB deviceMemoryBandwidth: 871.81GiB/s
2022-10-29 13:02:33.825942: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2022-10-29 13:02:33.827852: I tensorflow/stream_executor/platform/d

In [7]:

# with cur_strategy.scope():
  # optimizer = Adam(lr=0.001)
  # model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["acc"])
  # model.fit(X_train, Y_train, epochs=10, batch_size=128, steps_per_epoch=29, validation_data=(X_test, Y_test))

adam = Adam(lr=0.001)
model.compile(optimizer=adam, loss=['binary_crossentropy'])
model.fit(X_train, Y_train, batch_size=64,
            epochs=30, validation_split=0.20, shuffle=True,
            # callbacks = callbacks.callbacks
) # fit makes batchnorm to be in training mode


Epoch 1/10


2022-10-29 13:02:33.916045: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-10-29 13:02:33.940419: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3800060000 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
reference_internal = "fp32"
reference_accumulator = "fp32"

q = run_qtools.QTools(
    model,
    # energy calculation using a given process
    # "horowitz" refers to 45nm process published at
    # M. Horowitz, "1.1 Computing's energy problem (and what we can do about
    # it), "2014 IEEE International Solid-State Circuits Conference Digest of
    # Technical Papers (ISSCC), San Francisco, CA, 2014, pp. 10-14, 
    # doi: 10.1109/ISSCC.2014.6757323.
    process="horowitz",
    # quantizers for model input
    source_quantizers=[quantized_bits(8, 0, 1)],
    is_inference=False,
    # absolute path (including filename) of the model weights
    # in the future, we will attempt to optimize the power model
    # by using weight information, although it can be used to further
    # optimize QBatchNormalization.
    weights_path=None,
    # keras_quantizer to quantize weight/bias in un-quantized keras layers
    keras_quantizer=reference_internal,
    # keras_quantizer to quantize MAC in un-quantized keras layers
    keras_accumulator=reference_accumulator,
    # whether calculate baseline energy
    for_reference=True)
  
# caculate energy of the derived data type map.
energy_dict = q.pe(
    # whether to store parameters in dram, sram, or fixed
    weights_on_memory="sram",
    # store activations in dram or sram
    activations_on_memory="sram",
    # minimum sram size in number of bits. Let's assume a 16MB SRAM.
    min_sram_size=8*16*1024*1024,
    # whether load data from dram to sram (consider sram as a cache
    # for dram. If false, we will assume data will be already in SRAM
    rd_wr_on_io=False)

# get stats of energy distribution in each layer
energy_profile = q.extract_energy_profile(
    qtools_settings.cfg.include_energy, energy_dict)
# extract sum of energy of each layer according to the rule specified in
# qtools_settings.cfg.include_energy
total_energy = q.extract_energy_sum(
    qtools_settings.cfg.include_energy, energy_dict)

pprint.pprint(energy_profile)
print()
print("Total energy: {:.2f} uJ".format(total_energy / 1000000.0))


Instructions for updating:
Use ref() instead.
{'activation_0': {'energy': {'inputs': 190.19,
                             'op_cost': 0.0,
                             'outputs': 190.19,
                             'parameters': 0.0},
                  'total': 190.19},
 'activation_1': {'energy': {'inputs': 190.19,
                             'op_cost': 0.0,
                             'outputs': 190.19,
                             'parameters': 0.0},
                  'total': 190.19},
 'activation_2': {'energy': {'inputs': 190.19,
                             'op_cost': 0.0,
                             'outputs': 190.19,
                             'parameters': 0.0},
                  'total': 190.19},
 'activation_output': {'energy': {'inputs': 190.19,
                                  'op_cost': 0.0,
                                  'outputs': 190.19,
                                  'parameters': 0.0},
                       'total': 190.19},
 'fc0': {'energy': {'inputs':

In [9]:
quantization_config = {
        "kernel": {
                "binary": 1,
                "stochastic_binary": 1,
                "ternary": 2,
                "stochastic_ternary": 2,
                "quantized_bits(2,1,1,alpha=1.0)": 2,
                "quantized_bits(4,0,1,alpha=1.0)": 4,
                "quantized_bits(8,0,1,alpha=1.0)": 8,
                "quantized_po2(4,1)": 4
        },
        "bias": {
                "quantized_bits(4,0,1)": 4,
                "quantized_bits(8,3,1)": 8,
                "quantized_po2(4,8)": 4
        },
        "activation": {
                "binary": 1,
                "ternary": 2,
                "quantized_relu_po2(4,4)": 4,
                "quantized_relu(3,1)": 3,
                "quantized_relu(4,2)": 4,
                "quantized_relu(8,2)": 8,
                "quantized_relu(8,4)": 8,
                "quantized_relu(16,8)": 16
        },
        "linear": {
                "binary": 1,
                "ternary": 2,
                "quantized_bits(4,1)": 4,
                "quantized_bits(8,2)": 8,
                "quantized_bits(16,10)": 16
        }
}



In [10]:
limit = {
    "Dense": [8, 8, 4],
    "Conv2D": [4, 8, 4],
    "DepthwiseConv2D": [4, 8, 4],
    "Activation": [4],
    "BatchNormalization": []
}

In [11]:
goal = {
    "type": "energy",
    "params": {
        "delta_p": 8.0,
        "delta_n": 8.0,
        "rate": 2.0,
        "stress": 1.0,
        "process": "horowitz",
        "parameters_on_memory": ["sram", "sram"],
        "activations_on_memory": ["sram", "sram"],
        "rd_wr_on_io": [False, False],
        "min_sram_size": [0, 0],
        "source_quantizers": ["int8"],
        "reference_internal": "int8",
        "reference_accumulator": "int32"
        }
}

In [12]:


run_config = {
  "output_dir": tempfile.mkdtemp(),
  "goal": goal,
  "quantization_config": quantization_config,
  "learning_rate_optimizer": False,
  "transfer_weights": False,
  "mode": "random",
  "seed": 42,
  "limit": limit,
  "tune_filters": "layer",
  "tune_filters_exceptions": "^dense",
  # "distribution_strategy": cur_strategy,
  # first layer is input, layer two layers are softmax and flatten
  "layer_indexes": range(1, len(model.layers) - 1),
  "max_trials": 20
}

print("quantizing layers:", [model.layers[i].name for i in run_config["layer_indexes"]])

quantizing layers: ['input_layer', 'activation_0', 'fc0', 'activation_1', 'fc1', 'activation_2', 'fc2', 'activation_output', 'output']


In [13]:
autoqk = AutoQKeras(model, metrics=["acc"], custom_objects=custom_objects, **run_config)
# autoqk.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=128, epochs=20)
autoqk.fit(X_train, Y_train, validation_split=0.20, batch_size=128, epochs=20)


Trial 1 Complete [00h 00m 09s]
val_score: 0.6197229027748108

Best val_score So Far: 0.6197229027748108
Total elapsed time: 00h 00m 09s

Search: Running Trial #2

Value             |Best Value So Far |Hyperparameter
quantized_po2(4,1)|stochastic_binary |input_layer_kernel_quantizer
ternary           |binary            |fc0_kernel_quantizer
quantized_po2(4,1)|quantized_bits(...|fc1_kernel_quantizer
quantized_po2(4,1)|quantized_bits(...|fc2_kernel_quantizer
quantized_bits(...|stochastic_binary |output_kernel_quantizer
0.5               |1                 |network_filters_input_layer
quantized_bits(...|quantized_po2(4,8)|input_layer_bias_quantizer
quantized_relu(...|binary            |activation_0_activation_quantizer
0.5               |1.5               |network_filters_fc0
quantized_bits(...|quantized_bits(...|fc0_bias_quantizer
quantized_relu_...|quantized_relu_...|activation_1_activation_quantizer
0.5               |2                 |network_filters_fc1
quantized_bits(...|quantized_b

KeyboardInterrupt: 

In [None]:
qmodel = autoqk.get_best_model()
qmodel.save_weights("qmodel.h5")