In [None]:
##### Copyright 2020 Google LLC
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
import os
import tempfile
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()

from tensorflow.keras.layers import Input, Dense, Embedding, SimpleRNN, GRU, LSTM, Bidirectional
from tensorflow.keras.optimizers import *
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

from qkeras.autoqkeras import *
from qkeras import *

print("using tensorflow", tf.__version__)

In [None]:
# List every physical device TensorFlow can see, one per line.
physical_devices = tf.config.list_physical_devices()
for device in physical_devices:
  print(device)

In [None]:
# Pick the distribution strategy used by every `with strategy.scope()` cell:
# the Colab TPU when its address is advertised through the COLAB_TPU_ADDR
# environment variable, otherwise whatever tf.distribute.get_strategy() returns.
try:
  # Only the environment lookup is guarded. A KeyError raised while connecting
  # to or initializing the TPU is a real failure and must not be reported as
  # "TPU not found" followed by a silent fallback.
  device_name = os.environ['COLAB_TPU_ADDR']
except KeyError:
  print('TPU not found')
  strategy = tf.distribute.get_strategy()
else:
  TPU_ADDRESS = 'grpc://' + device_name
  print('Found TPU at: {}'.format(TPU_ADDRESS))
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
  tf.config.experimental_connect_to_cluster(resolver)
  # This is the TPU initialization code that has to be at the beginning.
  tf.tpu.experimental.initialize_tpu_system(resolver)
  print("All devices: ", tf.config.list_logical_devices('TPU'))
  strategy = tf.distribute.experimental.TPUStrategy(resolver)

In [None]:
# Seed TensorFlow's and NumPy's global random generators with the same fixed
# value so repeated runs start from the same random state.
tf.random.set_seed(12)
np.random.seed(12)

In [None]:
# Vocabulary size: only the `max_features` most frequent words are kept.
max_features = 10000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 100
BATCH_SIZE = 1000
# Shuffle buffer length, counted in individual samples.
# NOTE(review): presumably chosen to cover the whole training set -- compare
# with the 'train sequences' count printed below.
SHUFFLE_BUFFER_SIZE = 25000

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)

train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))

# Shuffle individual samples *before* batching. Batching first would only
# permute whole batches, so every epoch would see the same samples grouped
# together in each batch.
train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

# Display both dataset objects as the cell output.
train_dataset, test_dataset

## Using QKeras

__QKeras__ works by tagging all variables and weights/biases created by Keras, as well as the outputs of arithmetic layers, with quantized functions. Quantized functions can be instantiated directly in __`QSimpleRNN`__/__`QLSTM`__/__`QGRU`__/__`QBidirectional`__/__`QDense`__/__`QConv2D`__/__`QSeparableConv2D`__ functions, and they can be passed to __`QActivation`__, which acts as a merged quantization and activation function.

In order to successfully quantize a model, users need to replace layers that create variables (trainable or not) (`LSTM`, `Conv2D`, etc) by their equivalent ones in __QKeras__ (__`QLSTM`__/__`QDense`__, etc), and any layers that perform math operations need to be quantized afterwards.

Quantized values are clipped between their maximum and minimum quantized representation (which may be different than $[-1.0, 1.0]$), although for `po2` type of quantizers, we still recommend the users to specify the parameter for `max_value`.

An example of a very simple recurrent network is given below in Keras.

In [None]:
# Hyperparameters shared by the model builders and compile calls below:
# LSTM width, embedding width, and the training loss.
units = embedding_dim = 64
loss = 'binary_crossentropy'

def create_model(batch_size=None):
  """Builds the unquantized reference network.

  Layer stack: int32 token ids of length `maxlen` -> Embedding ->
  linear Activation named 'embedding_act' -> Bidirectional(LSTM(units)) ->
  Dense(1) -> sigmoid Activation.

  Args:
    batch_size: optional fixed batch size forwarded to the Input layer.

  Returns:
    An uncompiled tf.keras.Model with one input and one output.
  """
  token_ids = Input(shape=(maxlen,), batch_size=batch_size, dtype=tf.int32)
  embedded = Embedding(
      input_dim=max_features, output_dim=embedding_dim)(token_ids)
  # A named pass-through layer, so later cells can refer to 'embedding_act'.
  embedded = Activation('linear', name='embedding_act')(embedded)
  encoded = Bidirectional(LSTM(units))(embedded)
  logit = Dense(1)(encoded)
  probability = Activation('sigmoid')(logit)
  return tf.keras.Model(inputs=[token_ids], outputs=[probability])

In [None]:
# Train the unquantized reference model.
tf.keras.backend.clear_session()
with strategy.scope():
  # Model creation and compilation happen inside the strategy scope.
  model = create_model(BATCH_SIZE)
  custom_objects = {}
  model.compile(
      optimizer=Adam(learning_rate=0.01),
      loss=loss,
      metrics=['acc'])

model.summary()
print('Train...')
# train_dataset / test_dataset are already batched tf.data.Dataset objects, so
# `batch_size` must not be passed to fit() as well.
model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    verbose=2)

Replacing with quantized layers

In [None]:
def create_qmodel(batch_size=None):
  """Builds the hand-quantized counterpart of `create_model`.

  Layer stack: int32 token ids of length `maxlen` -> Embedding ->
  QActivation('binary') named 'embedding_act' -> QLSTM(units) -> QDense(1) ->
  QActivation('sigmoid').

  NOTE(review): this uses a single QLSTM, whereas `create_model` wraps its
  LSTM in Bidirectional -- confirm the difference is intentional.

  Args:
    batch_size: optional fixed batch size forwarded to the Input layer.

  Returns:
    An uncompiled tf.keras.Model with one input and one output.
  """
  token_ids = Input(shape=(maxlen,), batch_size=batch_size, dtype=tf.int32)
  embedded = Embedding(
      input_dim=max_features, output_dim=embedding_dim)(token_ids)
  embedded = QActivation('binary', name='embedding_act')(embedded)

  # Quantizers are given as strings and resolved by QKeras.
  recurrent_layer = QLSTM(
      units,
      activation='quantized_tanh(4)',
      recurrent_activation='quantized_relu(4,0,1)',
      kernel_quantizer='stochastic_ternary("auto")',
      recurrent_quantizer='quantized_bits(2,1,1,alpha=1.0)',
      bias_quantizer='quantized_bits(4,0,1)')
  encoded = recurrent_layer(embedded)

  output_layer = QDense(
      1,
      kernel_quantizer="quantized_bits(4,0,1)",
      bias_quantizer='quantized_bits(4,0,1)')
  logit = output_layer(encoded)

  probability = QActivation('sigmoid')(logit)
  return tf.keras.Model(inputs=[token_ids], outputs=[probability])

In [None]:
# Train the hand-quantized model built by create_qmodel.
tf.keras.backend.clear_session()
with strategy.scope():
  # Model creation and compilation happen inside the strategy scope.
  qmodel = create_qmodel(BATCH_SIZE)
  custom_objects = {}
  qmodel.compile(
      optimizer=Adam(learning_rate=0.01),
      loss=loss,
      metrics=['acc'])

qmodel.summary()
print('Train...')
# train_dataset / test_dataset are already batched tf.data.Dataset objects, so
# `batch_size` must not be passed to fit() as well.
qmodel.fit(train_dataset,
          epochs=10,
          verbose=2,
          validation_data=test_dataset)

# Converting a Model Automatically

In addition to the drop-in replacement of Keras functions, we have written the following function to assist anyone who wants to quantize a network.

__`model_quantize(model, quantizer_config, activation_bits, custom_objects=None, transfer_weights=False)`__

This function converts a non-quantized model (such as the one from `model` in the previous example) into a quantized version, by applying a configuration specified by the dictionary `quantizer_config`, and `activation_bits` specified for unnamed activation functions, with this parameter probably being removed in future versions.

The parameter `custom_objects` specifies object dictionary unknown to Keras, required when you copy a model with lambda layers, or customized layer functions, for example, and if `transfer_weights` is `True`, the returned model will have as initial weights the weights from the original model, instead of using random initial weights.

The dictionary specified in `quantizer_config` can be indexed by a layer name or layer class name. In the example below, conv2d_1 corresponds to the first convolutional layer of the example, while  QConv2D corresponds to the default behavior of two dimensional convolutional layers. The reader should note that right now we recommend using __`QActivation`__ with a dictionary to avoid the conversion of activations such as `softmax` and `linear`.  In addition, although we could use `activation` field in the layers, we do not recommend that. 

`{
  "conv2d_1": {
      "kernel_quantizer": "stochastic_ternary",
      "bias_quantizer": "quantized_po2(4)"
  },
  "QConv2D": {
      "kernel_quantizer": "stochastic_ternary",
      "bias_quantizer": "quantized_po2(4)"
  },
  "QDense": {
      "kernel_quantizer": "quantized_bits(3,0,1)",
      "bias_quantizer": "quantized_bits(3)"
  },
  "act_1": "quantized_relu(2)",
  "QActivation": { "relu": "quantized_relu(2)" }
}`

In the following example, we will quantize the model using a different strategy.


In [None]:
# Bit width interpolated into the quantizer strings below.
bits = 4
# Configuration passed to model_quantize, keyed by layer name. "bidirectional"
# and "dense" map quantizer arguments to quantizer strings; "embedding_act"
# maps straight to the quantizer string for that activation layer.
quantizer_config = {
  "bidirectional": {
      'activation' : f"quantized_tanh({bits})",
      'recurrent_activation' : "quantized_relu(4,0,1)",
      'kernel_quantizer' : f"quantized_bits({bits}, alpha='auto')",
      'recurrent_quantizer' : f"quantized_bits({bits}, alpha='auto')",
      'bias_quantizer' : f"quantized_bits({bits}, alpha='auto')",
  },
  # `alpha='auto'` belongs inside the call parentheses, as in the entries
  # above; placing it after the closing parenthesis yields a malformed spec.
  "dense": {
      'kernel_quantizer' : f"quantized_bits({bits}, alpha='auto')",
      'bias_quantizer' : f"quantized_bits({bits}, alpha='auto')"
  },
  "embedding_act": f"quantized_bits({bits}, alpha='auto')",
}


In [None]:
# Rebuild the float model, convert it with model_quantize using
# quantizer_config, then train the converted model.
tf.keras.backend.clear_session()
with strategy.scope():
  # Model creation, conversion and compilation happen inside the strategy scope.
  model = create_model(BATCH_SIZE)
  custom_objects = {}

  qmodel = model_quantize(model, quantizer_config, bits, custom_objects)
  qmodel.compile(
      optimizer=Adam(learning_rate=0.01),
      loss=loss,
      metrics=['acc'])

qmodel.summary()
print('Train...')
# train_dataset / test_dataset are already batched tf.data.Dataset objects, so
# `batch_size` must not be passed to fit() as well.
qmodel.fit(train_dataset,
          epochs=10,
          verbose=2,
          validation_data=test_dataset)

## Quantizing a Model With `AutoQKeras`

To quantize this model with `AutoQKeras`, we need to define the quantization for kernels, biases and activations; forgiving factors and quantization strategy.

Below we define which quantizers are allowed for kernel, bias, activations and linear. Linear is a proxy that we use to capture `Activation("linear")` to apply quantization without applying a non-linear operation.  In some networks, we found that this trick may be necessary to better represent the quantization space.


In [None]:
# Fresh, compiled float model that the AutoQKeras cells below start from.
tf.keras.backend.clear_session()
with strategy.scope():
  model = create_model(BATCH_SIZE)
  custom_objects = dict()
  model.compile(
      loss=loss,
      metrics=['acc'],
      optimizer=Adam(learning_rate=0.01),
  )

In [None]:
# Candidate quantizers for AutoQKeras, grouped by what they apply to. Each
# candidate string maps to an integer.
# NOTE(review): the integers look like the bit width attributed to each
# quantizer -- confirm against the AutoQKeras documentation.
quantization_config = {
    # "kernel" and "recurrent_kernel" offer the same candidates; the
    # comprehension builds a separate dict for each key.
    **{
        weight_kind: {
            "stochastic_binary": 1,
            "stochastic_ternary": 2,
            "quantized_bits(4,0,1,alpha=1.0)": 4,
            "quantized_po2(4,1)": 4,
        }
        for weight_kind in ("kernel", "recurrent_kernel")
    },
    "recurrent_activation": {"quantized_relu(4,0,1)": 4},
    "bias": {
        "quantized_bits(4,0,1)": 4,
        "quantized_po2(4,1)": 4,
    },
    "activation": {
        "stochastic_ternary('auto')": 2,
        "quantized_tanh(4)": 4,
        "quantized_relu_po2(4,1)": 4,
        "quantized_relu(4,2)": 4,
    },
    # Same candidates as "activation" plus a 3-bit quantized_relu.
    "linear": {
        "stochastic_ternary('auto')": 2,
        "quantized_tanh(4)": 4,
        "quantized_relu_po2(4,1)": 4,
        "quantized_relu(3,1)": 3,
        "quantized_relu(4,2)": 4,
    },
}

# Limits handed to AutoQKeras, keyed by layer class name, plus a four-entry
# "default" list.
# NOTE(review): presumably the maximum bit widths allowed per layer type --
# confirm against the AutoQKeras documentation.
limit = dict(
    Dense=[4],
    Bidirectional=[4],
    Activation=[4],
    default=[4, 4, 4, 4],
)

# Optimization target handed to AutoQKeras: a "bits"-type goal and its
# parameters.
# NOTE(review): the meaning of delta_p / delta_n / rate / stress is defined by
# AutoQKeras and is not visible in this notebook -- see its documentation.
goal = dict(
    type="bits",
    params=dict(
        delta_p=8.0,
        delta_n=8.0,
        rate=2.0,
        stress=1.0,
        input_bits=4,
        output_bits=4,
        ref_bits=4,
        config=dict(default=["parameters", "activations"]),
    ),
)

# Keyword arguments later expanded into AutoQKeras(...) via **run_config.
run_config = dict(
    # Results go to a freshly created temporary directory.
    output_dir=tempfile.mkdtemp(),
    goal=goal,
    quantization_config=quantization_config,
    learning_rate_optimizer=False,
    transfer_weights=False,
    mode="random",
    seed=42,
    limit=limit,
    tune_filters="layer",
    tune_filters_exceptions="^dense",
    distribution_strategy=strategy,

    # Layer positions 2 .. len(model.layers) - 2: skips the first two layers
    # and the last one.
    layer_indexes=range(2, len(model.layers) - 1),
    max_trials=1000,
)

# Show the names of the layers selected by "layer_indexes".
print(
    "quantizing layers:",
    list(map(lambda index: model.layers[index].name,
             run_config["layer_indexes"])))

In [None]:
# Create the AutoQKeras object from the compiled float model and the run_config
# settings, then run it on the batched train / test datasets.
autoqk = AutoQKeras(
    model,
    metrics=["acc"],
    custom_objects={},
    **run_config)
autoqk.fit(
    train_dataset,
    validation_data=test_dataset,
    batch_size=BATCH_SIZE,
    epochs=10,
    verbose=2)

In [None]:
# Take the model returned by AutoQKeras' get_best_model() and save its weights
# to "qmodel.h5". The path is relative, so the file is written to the current
# working directory.
qmodel = autoqk.get_best_model()
qmodel.save_weights("qmodel.h5")

In [None]:
# Print the QKeras summary of the selected model, followed by the dictionary
# that get_quantization_dictionary returns for it.
print_qmodel_summary(qmodel)
print(get_quantization_dictionary(qmodel))