This colab creates an animation of what the [YAMNet](https://tfhub.dev/google/yamnet/1) model does internally with its framing and hops.

In [None]:
import tensorflow as tf
import numpy as np
import tensorflow_hub as hub
import soundfile as sf
from matplotlib.animation import FuncAnimation
import math

import matplotlib.pylab as plt

Download the audio file

In [None]:

# Fetch the sample clip through Keras' get_file helper; with these cache
# settings the file is kept under ./test_data.
testing_wav_file_name = tf.keras.utils.get_file(
    fname='miaow_16k.wav',
    origin='https://storage.googleapis.com/audioset/miaow_16k.wav',
    cache_subdir='test_data',
    cache_dir='./',
)

print(testing_wav_file_name)

In [None]:
# Read the clip as int16 samples together with its sample rate, then derive
# the clip length in seconds from the number of samples.
wav_data, sr = sf.read(testing_wav_file_name, dtype=np.int16)
duration = wav_data.shape[0] / sr

Define some of the parameters for the animation

In [None]:
# Window length and hop between consecutive windows, in seconds. The hop is
# half the window, so neighbouring windows overlap by 50%.
window_size = 0.96
window_hop = 0.48
# The same two quantities expressed in samples (floats, since sr is scaled
# by a fractional number of seconds).
window_size_number = window_size * sr
window_hop_number = window_hop * sr

# Amplitude extremes of the clip, used for the highlight box and the y-limits.
# They are converted to Python ints because the samples were read as int16:
# subtracting two int16 scalars is done in int16 and wraps around as soon as
# the peak-to-peak range exceeds 32767, which would corrupt `height`.
# NOTE: `max`/`min` shadow the builtins, but the animation cell reads these
# exact names, so they are kept.
max = int(wav_data.max())
min = int(wav_data.min())
height = max - min

# Zero-pad the signal so that its length is a whole number of hops.
size_wav_data = len(wav_data)
n = math.ceil(size_wav_data / window_hop_number)
zeros = np.zeros(int(n * window_hop_number) - size_wav_data)
wav_data_complete = np.append(wav_data,  zeros)
# Show the padding length, original length, hop count and padded length.
len(zeros), len(wav_data), n, len(wav_data_complete)

In [None]:
# Display the window length and the hop, both measured in samples.
(window_size_number, window_hop_number)

In [None]:
from matplotlib import rc
from matplotlib import animation
from matplotlib.patches import Rectangle
# Render animations as interactive JavaScript/HTML when displayed in a cell.
rc('animation', html='jshtml')

# One row split 4:1 - the full waveform on the left (ax) and a zoomed view of
# the currently highlighted window on the right (ax2).
fig = plt.figure(figsize=(12, 4))
ax = plt.subplot2grid((1, 5), (0, 0), rowspan=1, colspan=4)
ax2 = plt.subplot2grid((1, 5), (0, 4), rowspan=1, colspan=1)

# One tick per whole second of audio, positioned in samples. Deriving the
# ticks from the clip duration keeps them right for clips of other lengths.
whole_seconds = range(int(duration) + 1)
positions = tuple(second * sr for second in whole_seconds)
labels = tuple(f'{second}s' for second in whole_seconds)

# Set the ticks on the main axes explicitly: plt.xticks() acts on the current
# axes, which at this point is ax2 (the last one created), not the waveform.
ax.set_xticks(positions)
ax.set_xticklabels(labels)

def animateL(i):
  """Draw animation frame `i`.

  Redraws the full waveform on `ax` with a yellow box over the current
  window, and plots the samples inside that window on `ax2`.

  Args:
    i: zero-based frame index; the window starts `i * window_hop` seconds in.

  Returns:
    A one-element tuple holding the waveform line drawn on the main axes.
  """
  # Work with float copies of the amplitude extremes so that none of the
  # arithmetic below is carried out in the int16 sample dtype.
  y_min = float(min)
  y_max = float(max)
  y_bottom = y_min + (y_min*.15)
  y_top = y_max + (y_max*.2)

  start_seconds = i*window_hop
  end_seconds = start_seconds + window_size
  window_label = f'{window_size:.2f}s'

  # draw the main view and move the yellow highlight
  ax.clear()

  line, = ax.plot(wav_data_complete, 'b')
  step = i*window_hop*sr  # window start, in samples
  ax.add_patch(Rectangle((step, y_min), window_size_number, y_max - y_min,
                         label=window_label, alpha=1, edgecolor='k',
                         linewidth=1, facecolor='y'))
  ax.text(step, y_max + 300, f'  {window_label}')
  ax.set_ylim(bottom=y_bottom, top=y_top)
  # clear() dropped the tick setup, so restore it on every frame
  ax.set_xticks(positions)
  ax.set_xticklabels(labels)
  ax.set_title(f'Frame: {i:2} ({start_seconds:.2f}s)')

  # work on the detail view (on the right)
  ax2.clear()
  pos1 = int(step)
  pos2 = int(step+window_size_number)

  wav_split = wav_data_complete[pos1:pos2]

  ax2.plot(wav_split, 'b')
  # The slice spans a full window, so the range ends window_size (not
  # window_hop) after its start.
  ax2.set_title(f'{start_seconds:.2f}s - {end_seconds:.2f}s')
  ax2.set_ylim(bottom=y_bottom, top=y_top)
  ax2.yaxis.set_ticks_position('right')
  ax2.set_xticks([])

  return line,

# One frame per whole hop that fits in the clip.
frames = math.floor(duration / window_hop)
interval_ms = 500  # delay between frames, in milliseconds
anim = FuncAnimation(fig, animateL, frames=frames, interval=interval_ms, blit=True)

# A writer instance keeps its own frame rate (PillowWriter defaults to 5 fps)
# and save() does not apply the animation interval to it, so pass the matching
# rate explicitly; otherwise the GIF plays faster than the inline animation.
writergif = animation.PillowWriter(fps=1000 / interval_ms)
anim.save('./yamnet_animation.gif', writer=writergif)

# Display the animation as the cell output.
anim