# Implement Resizing and Normalization Layers

In [19]:
import pathlib
import numpy as np
import math

import tensorflow as tf 
from tensorflow.keras import layers

In [20]:
data_dir = pathlib.Path('data')

In [21]:
def get_waveform(wavfile):
    x = tf.io.read_file(str(wavfile))
    x, sample_rate = tf.audio.decode_wav(x, desired_channels=1, desired_samples=16000)
    return tf.squeeze(x, axis=-1)


def get_spectrogram(waveform):
    # Convert the waveform to a spectrogram via a STFT
    spectrogram = tf.signal.stft(waveform, frame_length=255, frame_step=128)

    # Obtain the magnitude of the STFT
    spectrogram = tf.abs(spectrogram)

    # Add a 'channels' dimension, so that the spectrogram can be used as an
    # image-like input data w/ convolution layers, which expect shape
    # (batch_size, height, width, channels)
    spectrogram = spectrogram[..., tf.newaxis]
    return spectrogram

## Get the input data

In [22]:
waveform = get_waveform(data_dir/'test/h_yes.wav')
spec = get_spectrogram(waveform)
input_data = spec[tf.newaxis,...]
input_data.shape, input_data[0,0,:10,:]

(TensorShape([1, 124, 129, 1]),
 <tf.Tensor: shape=(10, 1), dtype=float32, numpy=
 array([[0.00021577],
        [0.00242205],
        [0.00827248],
        [0.01238501],
        [0.01051275],
        [0.01144113],
        [0.01872324],
        [0.02427843],
        [0.04116756],
        [0.09470174]], dtype=float32)>)

## Load the model

In [23]:
h5_model = tf.keras.models.load_model('simple_audio.h5')
h5_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resizing (Resizing)         (None, 32, 32, 1)         0         
                                                                 
 normalization (Normalizatio  (None, 32, 32, 1)        3         
 n)                                                              
                                                                 
 conv2d (Conv2D)             (None, 30, 30, 32)        320       
                                                                 
 conv2d_1 (Conv2D)           (None, 28, 28, 64)        18496     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 14, 14, 64)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 14, 14, 64)        0

## Explore the layers

In [24]:
h5_model.layers

[<keras.layers.preprocessing.image_preprocessing.Resizing at 0x10b38bc90>,
 <keras.layers.preprocessing.normalization.Normalization at 0x12d0d7690>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x12d5d7090>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x12d524250>,
 <keras.layers.pooling.max_pooling2d.MaxPooling2D at 0x12d591250>,
 <keras.layers.regularization.dropout.Dropout at 0x12d6bf790>,
 <keras.layers.reshaping.flatten.Flatten at 0x12d6bec50>,
 <keras.layers.core.dense.Dense at 0x12d6bd2d0>,
 <keras.layers.regularization.dropout.Dropout at 0x12d6c3ed0>,
 <keras.layers.core.dense.Dense at 0x12d6c38d0>]

In [25]:
layers.Input(shape=input_data.shape)

<KerasTensor: shape=(None, 1, 124, 129, 1) dtype=float32 (created by layer 'input_2')>

In [26]:
# Downsample the input
l1 = layers.Resizing(32, 32)(input_data)
l1.shape

TensorShape([1, 32, 32, 1])

In [27]:
# Normalize
norm_layer = h5_model.layers[1]
l2 = norm_layer(l1.numpy())  # tf.convert_to_tensor(l1 / np.linalg.norm(l1.numpy()), dtype=tf.float32)
l2.shape

TensorShape([1, 32, 32, 1])

## Implement Resizing (bilinear)

In [42]:
# ported from tensorflow/lite/kernels/internal/reference/resize_bilinear.h ComputeInterpolationValues
def compute_interpolation_values(value, scale, input_size, half_pixel_centers=True):
    if half_pixel_centers:
        scaled_value = (value + 0.5) * scale - 0.5
    else:
        scaled_value = value * scale

    scaled_value_floor = float(math.floor(scaled_value))
    lower_bound = int(max(scaled_value_floor, 0))
    upper_bound =  int(min(math.ceil(scaled_value), input_size - 1))

    return scaled_value, lower_bound, upper_bound


# ported from tensorflow/lite/kernels/internal/reference/resize_bilinear.h ResizeBilinear
def resize_bilinear(input_data, output_width, output_height, align_corners=False):
    output_data = np.zeros(input_data.shape, dtype=input_data.dtype)

    batches, input_height, input_width, depth = input_data.shape

    if align_corners and output_height > 1:
        height_scale = (input_height - 1) / (output_height - 1)
    else:
        height_scale = input_height / output_height

    if align_corners and output_width > 1:
        width_scale = (input_width - 1) / (output_width - 1)
    else:
        width_scale = input_width / output_width

    if 'int' in input_data.dtype.name:
        rounding_offset = 0.5
    else:
        rounding_offset = 0.0

    for b in range(batches):
        for y in range(output_height):
            input_y, y0, y1 = compute_interpolation_values(y, height_scale, input_height)

            for x in range(output_width):
                input_x, x0, x1 = compute_interpolation_values(x, width_scale, input_width)

                for c in range(depth):
                    interpolation = input_data[b, y0, x0, c] * (1 - (input_y - y0)) * (1 - (input_x - x0)) + \
                                    input_data[b, y1, x0, c] * (input_y - y0) * (1 - (input_x - x0)) + \
                                    input_data[b, y0, x1, c] * (1 - (input_y - y0)) * (input_x - x0) + \
                                    input_data[b, y1, x1, c] * (input_y - y0) * (input_x - x0) + \
                                    rounding_offset
                    output_data[b, y, x, c] = interpolation
                    #if y == 0 and x < 10: print('Interp %f %d %d, %f %d %d' % (input_y, y0, y1, input_x, x0, x1))

    return np.array(output_data)

In [43]:
np_data.shape

(1, 124, 129, 1)

In [44]:
np_data = input_data.numpy()
resized = resize_bilinear(np_data, 32, 32)
resized[0,0,:10,:]

Interp 1.437500 1 2, 1.515625 1 2
Interp 1.437500 1 2, 5.546875 5 6
Interp 1.437500 1 2, 9.578125 9 10
Interp 1.437500 1 2, 13.609375 13 14
Interp 1.437500 1 2, 17.640625 17 18
Interp 1.437500 1 2, 21.671875 21 22
Interp 1.437500 1 2, 25.703125 25 26
Interp 1.437500 1 2, 29.734375 29 30
Interp 1.437500 1 2, 33.765625 33 34
Interp 1.437500 1 2, 37.796875 37 38


array([[0.00331012],
       [0.0239757 ],
       [0.09051986],
       [0.05873445],
       [0.07851263],
       [0.05929786],
       [0.05793667],
       [0.02979692],
       [0.03618773],
       [0.02978715]], dtype=float32)

In [30]:
# verify with the source of truth
l1[0,0,:10,:]

<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[0.00331012],
       [0.0239757 ],
       [0.09051986],
       [0.05873445],
       [0.07851262],
       [0.05929786],
       [0.05793667],
       [0.02979692],
       [0.03618773],
       [0.02978715]], dtype=float32)>

## Explore the normalization layer

In [31]:
norm_layer.mean.numpy(), norm_layer.variance.numpy(), norm_layer.count.numpy()

(array([[[[0.12540944]]]], dtype=float32),
 array([[[[0.58403146]]]], dtype=float32),
 102374400)

In [32]:
norm_output = norm_layer(l1.numpy())
norm_output.shape

TensorShape([1, 32, 32, 1])

In [33]:
norm_output[0,0,:10,:]

<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[-0.15976997],
       [-0.13272853],
       [-0.04565387],
       [-0.08724585],
       [-0.06136563],
       [-0.08650862],
       [-0.08828977],
       [-0.12511134],
       [-0.11674879],
       [-0.12512411]], dtype=float32)>

## Verify creating a new normalization layer with provided mean and variance

In [34]:
# Instantiate the normalization layer
mean = 0.12540944
variance = 0.58403146
nl = layers.Normalization(mean=mean, variance=variance)
nl_output = norm_layer(l1.numpy())
nl_output.shape

TensorShape([1, 32, 32, 1])

In [35]:
nl_output[0,0,:10,:]

<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[-0.15976997],
       [-0.13272853],
       [-0.04565387],
       [-0.08724585],
       [-0.06136563],
       [-0.08650862],
       [-0.08828977],
       [-0.12511134],
       [-0.11674879],
       [-0.12512411]], dtype=float32)>

## Do the normalization calculation with numpy

In [37]:
npnl_output = (l1.numpy() - mean) / math.sqrt(variance)
npnl_output[0,0,:10,:]

array([[-0.15976997],
       [-0.13272853],
       [-0.04565387],
       [-0.08724585],
       [-0.06136563],
       [-0.08650862],
       [-0.08828977],
       [-0.12511134],
       [-0.11674879],
       [-0.12512411]], dtype=float32)