# Rewrite .npz parameters into variables in a .py file

so that we can use them in Android Studio.

In [1]:
import numpy as np

In [2]:
# Open the .npz archive holding the VGGish PCA post-processing parameters.
tmp = np.load("vggish_pca_params.npz")

In [3]:
# List the names of the arrays stored in the archive.
print(tmp.files)

['pca_means', 'pca_eigen_vectors']


In [5]:
# Number of entries along the first axis of the PCA means array.
len(tmp['pca_means'])

128

In [9]:
# Length of the first row of the eigenvector matrix.
len(tmp['pca_eigen_vectors'][0])

128

In [13]:
f = open("vggish_pca_params.py", "w")

# Emit the PCA means as one flat Python list literal, comma-separated with
# no spaces. The handle is deliberately left open: the next cell appends
# pca_eigen_vectors to the same file and closes it.
means = tmp['pca_means']
f.write("pca_means = [" + ','.join(map(str, means)) + ']\n\n')
# Report how many values were written.
cnt = len(means)
print(cnt)


128


In [None]:
# Append the eigenvector matrix to the already-open output file as a nested
# Python list literal: one inner list per row of tmp['pca_eigen_vectors'].
# The try/finally guarantees the file is flushed and closed even if a write
# fails part-way through.
try:
    f.write("pca_eigen_vectors = [")
    cnt1 = 0  # number of rows written so far
    for row in tmp['pca_eigen_vectors']:
        if cnt1 != 0:
            f.write(',')
        # join() always emits a balanced "[...]", including for an empty row.
        f.write('[' + ','.join(str(value) for value in row) + ']')
        print(len(row))
        cnt1 += 1
    f.write(']\n\n')
    # Report the row count of this matrix (not the stale `cnt` left over
    # from the pca_means cell).
    print(cnt1)
finally:
    f.close()

# Convert the VGGish slim model into a .tflite model

so that we can use it in Android Studio.

In [1]:
# vggish_slim.py

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import tf_slim as slim

import vggish_params as params


def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The output of the op 'vggish/embedding', whose last dimension is
    params.EMBEDDING_SIZE.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    # The trailing 1 is the single input channel.
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    # conv3 and conv4 each apply two convolutions back to back.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    # (two 4096-unit layers under the 'fc1' scope).
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    # Give the output a stable name so callers can look it up in the graph.
    return tf.identity(net, name='embedding')


def load_vggish_slim_checkpoint(session, checkpoint_path):
  """Restores pre-trained VGGish weights from a checkpoint into a session.

  Intended for use as an initialization function (init_fn in TensorFlow
  documentation), called in a Session after all variables have been
  initialized. Only variables that the VGGish model definition creates are
  restored; any other variables in the graph are left untouched.

  Args:
    session: an active TensorFlow session.
    checkpoint_path: path to a file containing a checkpoint that is
      compatible with the VGGish model definition.
  """
  # Build the model once in a throwaway graph, purely to learn the names of
  # the variables that the inference-mode VGGish definition creates.
  with tf.Graph().as_default():
    define_vggish_slim(training=False)
    expected_names = {var.name for var in tf.global_variables()}

  # Of the variables that exist in the current default graph, keep only
  # those whose names belong to VGGish.
  restorable = [
      var for var in tf.global_variables() if var.name in expected_names
  ]

  # A Saver limited to that subset restores just the VGGish weights.
  saver = tf.train.Saver(
      restorable, name='vggish_load_pretrained', write_version=1)
  saver.restore(session, checkpoint_path)


Instructions for updating:
non-resource variables are not supported in the long term


In [14]:
from __future__ import print_function

import numpy as np
import six
import soundfile
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import vggish_input
import vggish_params
import vggish_postprocess

# vggish_slim is referenced below but is not imported by any visible cell;
# import it here so this cell runs on a fresh kernel.
import vggish_slim

# Inputs: the pre-trained checkpoint and a sample WAV file turned into
# model-ready examples by vggish_input.
ckpt = 'vggish_model.ckpt'
examples_batch = vggish_input.wavfile_to_examples('test.wav')
pproc = vggish_postprocess.Postprocessor()

with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, ckpt)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    # Run inference and postprocessing. The result is kept as the reference
    # output to compare the converted model against.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    postprocessed_batch = pproc.postprocess(embedding_batch)

    # Convert the graph held by this session into a TFLite flatbuffer. This
    # must happen while the session is still open.
    converter = tf.lite.TFLiteConverter.from_session(sess, [features_tensor], [embedding_tensor])
    tflite_model = converter.convert()

    # Write through a context manager so the file is flushed and closed
    # before the interpreter reads it back.
    with open("saved_model/vggish_feature_extraction_model.tflite", "wb") as tflite_file:
        tflite_file.write(tflite_model)

Jerry vggish_input.py: start wavfile_to_examples()
44100
[[789 789]
 [507 507]
 [154 154]
 ...
 [  0   0]
 [  0   0]
 [  0   0]]
(432488, 2)
Jerry vggish_input.py: start waveform_to_examples()
Jerry vggish_input.py: before resample
after resample
(156912,)
Jerry vggish_input.py: before log_mel
Jerry vggish_input.py: after log_mel
Jerry vggish_input.py: finish waveform_to_examples()
(10, 96, 64)
INFO:tensorflow:Restoring parameters from vggish_model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 18 variables.
INFO:tensorflow:Converted 18 variables to const ops.


In [15]:
# Reference ("should be") output: the post-processed embeddings computed by
# the original TensorFlow session.
postprocessed_batch

array([[151,  26, 146, ..., 102, 197, 255],
       [154,  26, 168, ...,   0, 138, 255],
       [155,  25, 164, ...,  79, 131, 255],
       ...,
       [175,   8, 146, ..., 147,  69, 255],
       [175,   8, 146, ..., 147,  69, 255],
       [175,   8, 146, ..., 147,  69, 255]], dtype=uint8)

In [26]:
# Test .tflite model
# Load TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_path="saved_model/vggish_feature_extraction_model.tflite")
interpreter.allocate_tensors()
# Get input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Test the model on the first example of examples_batch (real features, not
# random data), wrapped in a list so it forms a batch of one.
# NOTE(review): input_shape is read here but not used below.
input_shape = input_details[0]['shape']
input_data = np.array([examples_batch[0]], dtype=np.float32)
interpreter.set_tensor(input_details[0]['index'], input_data)

interpreter.invoke()

# The function `get_tensor()` returns a copy of the tensor data.
# Use `tensor()` in order to get a pointer to the tensor.
output_data = interpreter.get_tensor(output_details[0]['index'])
print(output_data)

[[0.         0.51773083 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.6152872  0.         0.10851269 0.
  0.         0.11004636 0.         0.         0.06447921 0.
  0.         0.         0.02334112 0.36665872 0.15915178 0.
  0.         0.         0.         0.38003775 0.11620829 0.31883034
  0.         0.         0.         1.068456   0.35640162 0.
  0.         0.         0.8330307  0.         0.         0.
  0.         0.90416443 0.         0.19184536 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.3652949  0.
  0.         1.1452391  0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.24345502
  0.32170972 0.         0.         0.05670428 0.         0.
  0.         0.6894784  0.         0.13961355 0.2678038  0.
  0.         0.24759701 0.         1.3265727  0.         0.94142413
  0.         0. 

In [27]:
# Post-process the TFLite output with the same Postprocessor so it can be
# compared with the first row of postprocessed_batch.
pproc.postprocess(output_data)

array([[151,  26, 146,  46, 253,  83, 140,  92, 141, 210, 144,  80, 140,
        234, 173, 102,  48, 181, 129, 147,  29, 211,  90, 111, 124, 164,
        220, 187,  41,  23, 126, 152, 141, 103,  52, 141, 160,  57,  76,
        184, 115,  97,   0, 200,   0, 101,  65, 152, 112, 255, 255,  43,
         93, 147, 144, 149, 113,   0, 110, 195, 222,  48,  22, 255, 166,
        146, 119, 156, 113,   0, 241, 101, 255, 128, 138, 128,  99, 203,
        158, 232,  11, 246, 184, 140, 175, 128, 255, 198,  94,  92,   0,
        255, 203, 187,  79, 200,   0,   0, 255, 147, 255,   0, 225,  59,
          0, 255, 182, 135,   0, 236, 230, 239,   0,  83, 145,   0, 126,
         81,   0, 150,  62,   8,   0, 146, 197, 102, 197, 255]],
      dtype=uint8)