# Implement Resizing and Normalization Layers

In [1]:
import pathlib
import numpy as np
import math

import tensorflow as tf 
from tensorflow.keras import layers

2024-04-28 10:56:00.571210: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Directory (relative to the notebook's working directory) that the sample WAV file is read from.
data_dir = pathlib.Path('data')

In [3]:
def get_waveform(wavfile, desired_samples=16000):
    """Read a WAV file and return its single-channel samples without the channel axis.

    Args:
        wavfile: Path of the WAV file; converted with ``str()`` before reading,
            so ``pathlib.Path`` objects are accepted.
        desired_samples: Forwarded to ``tf.audio.decode_wav`` as
            ``desired_samples``. Defaults to 16000, the value that was
            previously hard-coded.

    Returns:
        The decoded audio tensor with its last (channel) axis squeezed out.
    """
    wav_bytes = tf.io.read_file(str(wavfile))
    # decode_wav returns (audio, sample_rate); the sample rate is not needed here.
    audio, _ = tf.audio.decode_wav(
        wav_bytes, desired_channels=1, desired_samples=desired_samples
    )
    # desired_channels=1 leaves a trailing axis of size 1; drop it.
    return tf.squeeze(audio, axis=-1)


def get_spectrogram(waveform):
    """Turn a waveform into a magnitude spectrogram with a trailing channel axis.

    The waveform is passed through a short-time Fourier transform
    (frame_length=255, frame_step=128), the complex result is reduced to its
    magnitude, and a final axis of size 1 is appended so the result can be fed
    to convolution layers, which expect (batch_size, height, width, channels).
    """
    stft = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    magnitude = tf.abs(stft)
    # Append the 'channels' axis for image-like consumers.
    return magnitude[..., tf.newaxis]

## Get the input data

In [4]:
# Decode the sample clip and convert it to a magnitude spectrogram.
waveform = get_waveform(data_dir/'yes.wav')
spec = get_spectrogram(waveform)
# Prepend a batch axis so the spectrogram can be passed to the Keras layers below.
input_data = spec[tf.newaxis,...]
# Display the batched shape and the first 10 entries of the first row.
input_data.shape, input_data[0,0,:10,:]

(TensorShape([1, 124, 129, 1]),
 <tf.Tensor: shape=(10, 1), dtype=float32, numpy=
 array([[0.00087561],
        [0.00134371],
        [0.00557508],
        [0.01203688],
        [0.01582851],
        [0.01979508],
        [0.03313684],
        [0.05369601],
        [0.05009932],
        [0.03737277]], dtype=float32)>)

## Load the model

In [6]:
# Load the saved Keras model from its HDF5 file and print its layer-by-layer summary.
h5_model = tf.keras.models.load_model('simple-sr.h5')
h5_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resizing (Resizing)         (None, 32, 32, 1)         0         
                                                                 
 normalization (Normalizatio  (None, 32, 32, 1)        3         
 n)                                                              
                                                                 
 conv2d (Conv2D)             (None, 30, 30, 32)        320       
                                                                 
 conv2d_1 (Conv2D)           (None, 28, 28, 64)        18496     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 14, 14, 64)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 14, 14, 64)        0

## Explore the layers

In [7]:
# List the layer objects; individual layers are picked out of this list by index further down.
h5_model.layers

[<keras.layers.preprocessing.image_preprocessing.Resizing at 0x12c45ded0>,
 <keras.layers.preprocessing.normalization.Normalization at 0x12b4efdd0>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x12b4c5190>,
 <keras.layers.convolutional.conv2d.Conv2D at 0x12b4e17d0>,
 <keras.layers.pooling.max_pooling2d.MaxPooling2D at 0x12a7ca7d0>,
 <keras.layers.regularization.dropout.Dropout at 0x12c481e50>,
 <keras.layers.reshaping.flatten.Flatten at 0x12ba6a3d0>,
 <keras.layers.core.dense.Dense at 0x12c465050>,
 <keras.layers.regularization.dropout.Dropout at 0x12c4a4050>,
 <keras.layers.core.dense.Dense at 0x12c4bc190>]

In [8]:
# NOTE(review): input_data.shape already contains the batch axis, and Input() adds
# its own leading None batch axis, so the symbolic tensor shown below is rank 5.
# For the per-sample shape, input_data.shape[1:] would be passed instead.
layers.Input(shape=input_data.shape)

<KerasTensor: shape=(None, 1, 124, 129, 1) dtype=float32 (created by layer 'input_1')>

In [9]:
# Downsample the input to 32x32 with a stand-alone Resizing layer
# (the same output size as the model's own `resizing` layer in the summary above).
l1 = layers.Resizing(32, 32)(input_data)
l1.shape

TensorShape([1, 32, 32, 1])

In [10]:
# Normalize with the Normalization layer taken from the loaded model
# (index 1 in h5_model.layers), applied to the resized data as a NumPy array.
norm_layer = h5_model.layers[1]
l2 = norm_layer(l1.numpy())
l2.shape

TensorShape([1, 32, 32, 1])

## Implement Resizing (bilinear)

In [11]:
# ported from tensorflow/lite/kernels/internal/reference/resize_bilinear.h ComputeInterpolationValues
def compute_interpolation_values(value, scale, input_size, half_pixel_centers=True):
    """Map one output coordinate onto the input axis.

    Args:
        value: Output row or column index.
        scale: Ratio used to project output coordinates onto the input axis.
        input_size: Number of input samples along this axis.
        half_pixel_centers: If True, coordinates refer to pixel centers
            (``(value + 0.5) * scale - 0.5``); otherwise ``value * scale``.

    Returns:
        Tuple ``(scaled_value, lower_bound, upper_bound)``: the (possibly
        fractional, possibly negative) source coordinate, and the two
        neighbouring input indices clamped to ``[0, input_size - 1]``.
    """
    if half_pixel_centers:
        scaled_value = (value + 0.5) * scale - 0.5
    else:
        scaled_value = value * scale

    # Clamp the neighbours so that samples near the border reuse the edge pixel.
    lower_bound = max(int(math.floor(scaled_value)), 0)
    upper_bound = min(int(math.ceil(scaled_value)), input_size - 1)

    return scaled_value, lower_bound, upper_bound


# ported from tensorflow/lite/kernels/internal/reference/resize_bilinear.h ResizeBilinear
def resize_bilinear(input_data, output_width, output_height, align_corners=False,
                    half_pixel_centers=True):
    """Bilinearly resize a batch of images stored as a NumPy array.

    Args:
        input_data: Array of shape (batches, input_height, input_width, depth).
        output_width: Width of the result. Note that width comes BEFORE height
            in this signature.
        output_height: Height of the result.
        align_corners: If True (and the output extent is > 1), the scale is
            ``(in - 1) / (out - 1)`` so the corner samples line up.
        half_pixel_centers: Forwarded to ``compute_interpolation_values``.
            Defaults to True, which is what this function always used.
            NOTE(review): the TFLite op is normally not run with both
            ``align_corners`` and ``half_pixel_centers`` enabled - confirm
            before combining them.

    Returns:
        Array of shape (batches, output_height, output_width, depth) with the
        same dtype as ``input_data``.
    """
    batches, input_height, input_width, depth = input_data.shape

    # The result must be sized by the requested output extent, not by the input.
    output_data = np.zeros((batches, output_height, output_width, depth),
                           dtype=input_data.dtype)

    if align_corners and output_height > 1:
        height_scale = (input_height - 1) / (output_height - 1)
    else:
        height_scale = input_height / output_height

    if align_corners and output_width > 1:
        width_scale = (input_width - 1) / (output_width - 1)
    else:
        width_scale = input_width / output_width

    # Integer outputs are truncated on assignment; adding 0.5 first rounds to nearest.
    rounding_offset = 0.5 if np.issubdtype(input_data.dtype, np.integer) else 0.0

    # Column interpolation values do not depend on the batch or the row: compute once.
    columns = [
        compute_interpolation_values(x, width_scale, input_width, half_pixel_centers)
        for x in range(output_width)
    ]

    for b in range(batches):
        for y in range(output_height):
            input_y, y0, y1 = compute_interpolation_values(
                y, height_scale, input_height, half_pixel_centers)
            dy = input_y - y0

            for x, (input_x, x0, x1) in enumerate(columns):
                dx = input_x - x0

                for c in range(depth):
                    # Weighted sum of the four neighbours, in the same operand
                    # order as the reference kernel.
                    interpolation = input_data[b, y0, x0, c] * (1 - dy) * (1 - dx) + \
                                    input_data[b, y1, x0, c] * dy * (1 - dx) + \
                                    input_data[b, y0, x1, c] * (1 - dy) * dx + \
                                    input_data[b, y1, x1, c] * dy * dx + \
                                    rounding_offset
                    output_data[b, y, x, c] = interpolation

    return output_data

In [14]:
# Run the pure-NumPy port on the same batched spectrogram (as a NumPy array)
# and show the first 10 entries of the first row for comparison.
np_data = input_data.numpy()
resized = resize_bilinear(np_data, 32, 32)
resized[0,0,:10,:]

array([[0.00606233],
       [0.02537882],
       [0.09502278],
       [0.08039056],
       [0.0536858 ],
       [0.06792126],
       [0.03798125],
       [0.06584655],
       [0.08855249],
       [0.11348754]], dtype=float32)

In [15]:
# Compare with the output of the Keras Resizing layer (l1) computed earlier:
# the same slice should match the NumPy port's values shown above.
l1[0,0,:10,:]

<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[0.00606233],
       [0.02537882],
       [0.09502278],
       [0.08039056],
       [0.0536858 ],
       [0.06792126],
       [0.03798125],
       [0.06584655],
       [0.08855248],
       [0.11348754]], dtype=float32)>

## Explore the normalization layer

In [16]:
# Inspect the statistics stored in the saved Normalization layer: mean, variance and count.
norm_layer.mean.numpy(), norm_layer.variance.numpy(), norm_layer.count.numpy()

(array([[[[0.12540944]]]], dtype=float32),
 array([[[[0.58403146]]]], dtype=float32),
 102374400)

In [17]:
# Reference result: apply the saved Normalization layer to the resized data.
norm_output = norm_layer(l1.numpy())
norm_output.shape

TensorShape([1, 32, 32, 1])

In [18]:
# First 10 normalized entries of the first row, used as the reference values below.
norm_output[0,0,:10,:]

<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[-0.15616861],
       [-0.13089252],
       [-0.0397617 ],
       [-0.05890831],
       [-0.09385214],
       [-0.0752247 ],
       [-0.11440194],
       [-0.0779395 ],
       [-0.04822823],
       [-0.0156001 ]], dtype=float32)>

## Verify creating a new normalization layer with provided mean and variance

In [19]:
# Instantiate a fresh normalization layer from the statistics read off the saved layer
mean = 0.12540944
variance = 0.58403146
nl = layers.Normalization(mean=mean, variance=variance)
# Call the NEW layer (`nl`), not the saved `norm_layer`; otherwise this cell only
# re-runs the saved layer and verifies nothing about the freshly built one.
nl_output = nl(l1.numpy())
nl_output.shape

TensorShape([1, 32, 32, 1])

In [20]:
# Same slice as the reference output above; the values should match it.
nl_output[0,0,:10,:]

<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[-0.15616861],
       [-0.13089252],
       [-0.0397617 ],
       [-0.05890831],
       [-0.09385214],
       [-0.0752247 ],
       [-0.11440194],
       [-0.0779395 ],
       [-0.04822823],
       [-0.0156001 ]], dtype=float32)>

## Do the normalization calculation with numpy

In [21]:
# Reproduce the normalization by hand: subtract the mean and divide by the
# standard deviation (square root of the variance), then show the same slice.
npnl_output = (l1.numpy() - mean) / math.sqrt(variance)
npnl_output[0,0,:10,:]

array([[-0.15616861],
       [-0.13089252],
       [-0.0397617 ],
       [-0.05890831],
       [-0.09385214],
       [-0.0752247 ],
       [-0.11440194],
       [-0.0779395 ],
       [-0.04822823],
       [-0.0156001 ]], dtype=float32)