# Implement Conv2D Layers

In [197]:
import pathlib
import numpy as np
import math
import h5py
from datetime import datetime

import tensorflow as tf 
from tensorflow.keras import layers

In [3]:
# Directory holding the audio samples (relative path); later cells build
# file paths from it.
data_dir = pathlib.Path('data')

In [72]:
def get_waveform(wavfile, desired_samples=16000):
    """Read a WAV file and return its single-channel waveform.

    Args:
        wavfile: Path of the WAV file (anything ``str()`` turns into a path).
        desired_samples: Number of samples requested from
            ``tf.audio.decode_wav``. Defaults to 16000, the value that was
            previously hard-coded.

    Returns:
        The decoded audio tensor with the trailing channel axis removed.
    """
    contents = tf.io.read_file(str(wavfile))
    # decode_wav also returns the sample rate, which is not needed here.
    audio, _ = tf.audio.decode_wav(
        contents, desired_channels=1, desired_samples=desired_samples)
    # Only one channel was requested, so drop the channel axis.
    return tf.squeeze(audio, axis=-1)


def get_spectrogram(waveform):
    """Turn a waveform into a magnitude spectrogram with a channel axis.

    The waveform is passed through a short-time Fourier transform
    (frame length 255, frame step 128), the magnitude is taken, and a
    trailing 'channels' axis is appended so the result can be fed to
    convolution layers as image-like data, which expect shape
    (batch_size, height, width, channels).
    """
    stft = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    magnitude = tf.abs(stft)
    # Append the 'channels' dimension as the last axis.
    return tf.expand_dims(magnitude, axis=-1)

def relu(x):
    """Rectified linear unit: element-wise ``max(x, 0)``.

    Args:
        x: A Python number, NumPy scalar, or NumPy array.

    Returns:
        ``x`` with every negative value replaced by zero (NumPy scalar or
        array, matching the input's shape).
    """
    # np.maximum works on plain Python numbers as well as NumPy values,
    # whereas the ``.clip`` method only exists on NumPy objects.
    return np.maximum(x, 0.0)

## Get the input data

In [6]:
# Load one recording, convert it to a spectrogram, and prepend a batch axis
# so the data has the (batch, height, width, channels) layout.
waveform = get_waveform(data_dir/'test/h_yes.wav')
spec = get_spectrogram(waveform)
input_data = spec[tf.newaxis,...]
# Show the batched shape and the first 10 values of the first spectrogram row
input_data.shape, input_data[0,0,:10,:]

(TensorShape([1, 124, 129, 1]),
 <tf.Tensor: shape=(10, 1), dtype=float32, numpy=
 array([[0.00021577],
        [0.00242205],
        [0.00827248],
        [0.01238501],
        [0.01051275],
        [0.01144113],
        [0.01872324],
        [0.02427843],
        [0.04116756],
        [0.09470174]], dtype=float32)>)

## Load the model

In [7]:
# Load the saved Keras model from its .h5 file and list its layers;
# later cells pick individual layers out of h5_model.layers by index.
h5_model = tf.keras.models.load_model('simple_audio.h5')
h5_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resizing (Resizing)         (None, 32, 32, 1)         0         
                                                                 
 normalization (Normalizatio  (None, 32, 32, 1)        3         
 n)                                                              
                                                                 
 conv2d (Conv2D)             (None, 30, 30, 32)        320       
                                                                 
 conv2d_1 (Conv2D)           (None, 28, 28, 64)        18496     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 14, 14, 64)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 14, 14, 64)        0

## Downsample and Normalize

In [41]:
# Downsample the input: layer 0 is the Resizing layer, whose output is
# 32x32x1 according to the model summary
l1 = h5_model.layers[0]
l1_out = l1(input_data)
# Peek at the top-left 3x3 patch of the resized input
l1_out[0,:3,:3,:]

<tf.Tensor: shape=(3, 3, 1), dtype=float32, numpy=
array([[[0.00331012],
        [0.0239757 ],
        [0.09051986]],

       [[0.0023394 ],
        [0.01505184],
        [0.04525724]],

       [[0.00353799],
        [0.04326304],
        [0.53528124]]], dtype=float32)>

In [82]:
# Normalize: layer 1 is the Normalization layer
l2 = h5_model.layers[1]
l2_out = l2(l1_out.numpy())
# Keep the top-left 3x3 patch as a NumPy array; it is reused below as the
# input window for the hand-computed convolution at output position (0, 0)
l2_out_1 = l2_out[0,0:3,0:3,:].numpy()
l2_out_1

array([[[-0.15976997],
        [-0.13272853],
        [-0.04565387]],

       [[-0.16104017],
        [-0.14440563],
        [-0.10488112]],

       [[-0.15947178],
        [-0.10749058],
        [ 0.5363273 ]]], dtype=float32)

## Conv2D

### Get the filter and bias

In [76]:
# Layer 2 is the first Conv2D layer; weights[0] holds the filters (last axis
# indexes the output channel) and weights[1] holds one bias per channel.
l3 = h5_model.layers[2]
# Filter and bias of output channel 0
filter_1 = l3.weights[0].numpy()[:,:,:,0]
bias_1 = l3.weights[1].numpy()[0]
filter_1, bias_1

(array([[[ 0.23206365],
         [ 0.0625558 ],
         [-0.01330692]],
 
        [[ 0.01292816],
         [ 0.19025595],
         [-0.04498325]],
 
        [[-0.30813888],
         [-0.15491289],
         [ 0.21284764]]], dtype=float32),
 0.05159658)

### Run the tensorflow layer and view results

In [200]:
# run the tf conv2d layer and print the channel values for the first entry
# (the wall-clock time is measured so it can be compared with the NumPy
# implementation further down)
t = datetime.now()
l3_out = l3(l2_out.numpy())
dt = datetime.now() - t
print('Time elapsed: %s' % dt)
print(l3_out.numpy().shape)
# All output channels at batch 0, row 0, column 0
l3_out[0,0,0,:]

Time elapsed: 0:00:00.001852
(1, 30, 30, 32)


<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.16193339, 0.05731045, 0.        , 0.        , 0.        ,
       0.07186232, 0.01496371, 0.        , 0.        , 0.        ,
       0.        , 0.12110282, 0.07639576, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.08606026, 0.        ,
       0.08105926, 0.13700518, 0.        , 0.        , 0.        ,
       0.01572211, 0.00423875, 0.        , 0.08994323, 0.        ,
       0.07543847, 0.        ], dtype=float32)>

### Calculate the first entry's value

In [131]:
# calculate the first value: multiply the 3x3 input patch element-wise with
# the channel-0 filter, sum the products and add the bias.
# No clipping at zero is applied here; the result is positive, so it would
# not change the value.
np.sum(l2_out_1 * filter_1) + bias_1

0.16193339

### Calculate the channel values for the first entry

In [141]:
# calculate the channel values for the first entry
# Convert the layer weights to NumPy once, rather than on every iteration.
l3_kernels = l3.weights[0].numpy()
l3_biases = l3.weights[1].numpy()
channels = l3_kernels.shape[-1]
test_output = np.zeros(channels)
for i in range(channels):
    # Index the kernel in place instead of binding it to a notebook-level
    # variable named `filter`, which would shadow the Python built-in.
    test_output[i] = relu(np.sum(l2_out_1 * l3_kernels[:,:,:,i]) + l3_biases[i])
test_output

array([0.16193339, 0.05731045, 0.        , 0.        , 0.        ,
       0.07186231, 0.01496371, 0.        , 0.        , 0.        ,
       0.        , 0.12110282, 0.07639575, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.08606026, 0.        ,
       0.08105926, 0.13700518, 0.        , 0.        , 0.        ,
       0.01572211, 0.00423876, 0.        , 0.08994324, 0.        ,
       0.07543847, 0.        ])

### Implement conv2d

In [203]:
def conv2d(input, filters, biases):
    """Unpadded, stride-1 2-D convolution followed by ReLU, in plain NumPy.

    Args:
        input: 4-D array of shape (groups, rows, cols, in_channels).
        filters: 4-D array of shape
            (filter_rows, filter_cols, in_channels, channels).
        biases: 1-D sequence with one value per output channel, or None to
            use no bias.

    Returns:
        A float64 array of shape
        (groups, rows - filter_rows + 1, cols - filter_cols + 1, channels).

    Raises:
        ValueError: If ``biases`` is given but does not hold exactly one
            value per output channel.
    """
    groups, rows, cols = input.shape[:3]
    window_row_size, window_col_size = filters.shape[:2]
    # number of output channels
    channels = filters.shape[-1]

    if biases is None:
        biases = np.zeros(channels)
    else:
        biases = np.asarray(biases)
        if biases.shape != (channels,):
            raise ValueError(
                'biases must have shape (%d,), got %s'
                % (channels, str(biases.shape)))

    # The edges are not padded, so only positions the filter fully covers are
    # produced. Using size - window + 1 (rather than size - 2 * (window // 2))
    # keeps the output size right for even-sized filters as well.
    out_rows = max(rows - window_row_size + 1, 0)
    out_cols = max(cols - window_col_size + 1, 0)
    output = np.zeros((groups, out_rows, out_cols, channels))

    for g in range(groups):
        for r in range(out_rows):
            for c in range(out_cols):
                input_window = input[g, r:r+window_row_size, c:c+window_col_size, :]

                for ch in range(channels):
                    kernel = filters[:, :, :, ch]
                    val = np.sum(input_window * kernel) + biases[ch]
                    # ReLU: negative responses are clipped to zero
                    output[g, r, c, ch] = np.maximum(val, 0.0)

    return output

### Test the implementation

In [204]:
# Run the NumPy implementation on the same input and weights that were given
# to the TensorFlow layer, and time it for comparison
t = datetime.now()
myl3out = conv2d(l2_out.numpy(), l3.weights[0].numpy(), l3.weights[1].numpy())
dt = datetime.now() - t
print('Time elapsed: %s' % dt)
print(myl3out.shape)
# All output channels at batch 0, row 0, column 0
myl3out[0,0,0,:]

Time elapsed: 0:00:00.598863
(1, 30, 30, 32)


array([0.16193339, 0.05731045, 0.        , 0.        , 0.        ,
       0.07186231, 0.01496371, 0.        , 0.        , 0.        ,
       0.        , 0.12110282, 0.07639575, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.08606026, 0.        ,
       0.08105926, 0.13700518, 0.        , 0.        , 0.        ,
       0.01572211, 0.00423876, 0.        , 0.08994324, 0.        ,
       0.07543847, 0.        ])

In [205]:
# verify values
def verify_conv2d(out1, out2, tolerance=1e-5):
    """Print whether two arrays agree in shape and, element-wise, in value.

    Args:
        out1: First NumPy array.
        out2: Second NumPy array.
        tolerance: Largest absolute element-wise difference (exclusive)
            that still counts as a match.

    Returns:
        None. The outcome is printed; the value check is skipped when the
        shapes differ.
    """
    shapes_equal = out1.shape == out2.shape
    print('Shapes match: %s' % str(shapes_equal))
    if not shapes_equal:
        return

    # Compare the arguments themselves, and use the absolute difference so
    # that errors in either direction are detected.
    values_match = np.all(np.abs(out1 - out2) < tolerance)
    print('Values match: %s' % str(values_match))

In [206]:
# Check the NumPy result for the first conv layer against the TensorFlow output
verify_conv2d(l3_out.numpy(), myl3out)

Shapes match: True
Values match: True


## The next Conv2D layer 

### Run the tensorflow layer and view results

In [199]:
# Layer 3 is the second Conv2D layer; feed it the TensorFlow output of the
# first conv layer and time the call
l4 = h5_model.layers[3]
t = datetime.now()
l4_out = l4(l3_out.numpy())
dt = datetime.now() - t
print('Time elapsed: %s' % dt)
print(l4_out.shape)
# All output channels at batch 0, row 0, column 0
l4_out[0,0,0,:]

Time elapsed: 0:00:00.007967
(1, 28, 28, 64)


<tf.Tensor: shape=(64,), dtype=float32, numpy=
array([0.        , 0.        , 0.        , 0.        , 0.49769944,
       0.        , 0.15165518, 0.        , 0.31784678, 0.42159253,
       0.        , 0.3958354 , 0.        , 0.        , 0.09659711,
       0.        , 0.        , 0.        , 0.        , 0.10127787,
       0.2358518 , 0.        , 0.13675866, 0.15558577, 0.18190861,
       0.        , 0.09301445, 0.08088657, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.25600177, 0.        , 0.4070488 , 0.        ,
       0.        , 0.        , 0.26512337, 0.        , 0.        ,
       0.        , 0.15896116, 0.        , 0.19394155, 0.25666392,
       0.        , 0.06679156, 0.        , 0.14209348, 0.27046356,
       0.        , 0.        , 0.02366456, 0.13287333, 0.        ,
       0.        , 0.        , 0.        , 0.00255454], dtype=float32)>

### Test the implementation

In [207]:
# Run the NumPy implementation with the second conv layer's weights, using
# the NumPy output of the first conv layer as input, and time it
t = datetime.now()
myl4out = conv2d(myl3out, l4.weights[0].numpy(), l4.weights[1].numpy())
dt = datetime.now() - t
print('Time elapsed: %s' % dt)
print(myl4out.shape)
# All output channels at batch 0, row 0, column 0
myl4out[0,0,0,:]

Time elapsed: 0:00:01.096347
(1, 28, 28, 64)


array([0.        , 0.        , 0.        , 0.        , 0.49769943,
       0.        , 0.15165517, 0.        , 0.31784674, 0.42159255,
       0.        , 0.39583546, 0.        , 0.        , 0.09659713,
       0.        , 0.        , 0.        , 0.        , 0.10127787,
       0.23585178, 0.        , 0.13675864, 0.15558576, 0.1819086 ,
       0.        , 0.09301444, 0.08088656, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.25600182, 0.        , 0.40704878, 0.        ,
       0.        , 0.        , 0.26512334, 0.        , 0.        ,
       0.        , 0.15896116, 0.        , 0.19394159, 0.25666391,
       0.        , 0.06679156, 0.        , 0.14209349, 0.27046354,
       0.        , 0.        , 0.02366454, 0.13287334, 0.        ,
       0.        , 0.        , 0.        , 0.00255456])

In [208]:
# Check the NumPy result for the second conv layer against the TensorFlow output
verify_conv2d(l4_out.numpy(), myl4out)

Shapes match: True
Values match: True
