- A key idea here is the use of raw audio.
- An important advantage of this is that any type of audio - and indeed, by extension, any type of time series - can potentially be modelled.

In [141]:
import tensorflow as tf

In [203]:
class SimpleWaveNet(tf.keras.models.Model):
    """Evaluate a wrapped sequence model on growing prefixes of its input.

    The wrapped ``model`` is called on copies of the input in which only the
    first ``s`` time steps are visible (the rest are zeroed), and each result
    is zeroed beyond its first ``s + 1`` time steps.

    Args:
        model: Callable mapping a ``(batch, time_steps, tokens)`` tensor to a
            tensor that can be reshaped back to the same shape. It is called
            as ``model(x)``, or as ``model(x, cond)`` when ``cond`` is given.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def masked_forward(self, x, mask_in, mask_out, cond=None):
        """Run the model on masked copies of ``x`` and mask the results.

        Args:
            x: Tensor of shape ``(batch_size, time_steps, tokens)``.
            mask_in: Mask of shape ``(num_steps, time_steps)`` or
                ``(time_steps,)`` multiplied into the input.
            mask_out: Mask of the same shape as ``mask_in`` multiplied into
                the model output.
            cond: Optional conditioning value. It is forwarded to the model
                as a second positional argument only when it is not ``None``.

        Returns:
            Tensor of shape ``(batch_size, num_steps, time_steps, tokens)``.
        """
        # tf.unstack yields the three scalar dimensions without relying on
        # Python iteration over a tensor, which is only available eagerly.
        batch_size, time_steps, tokens = tf.unstack(tf.shape(x))

        # (batch_size, num_steps, time_steps, tokens)
        x_masked = x[:, None] * mask_in[None, ..., None]
        # (batch_size * num_steps, time_steps, tokens)
        x_masked = tf.reshape(x_masked, [-1, time_steps, tokens])

        # Passing ``None`` positionally breaks models whose ``call`` accepts
        # only the inputs (e.g. a bare Conv1D), so only forward a real value.
        # NOTE(review): ``cond`` is forwarded unchanged; if it is per-example
        # it presumably needs tiling to batch_size * num_steps -- confirm.
        if cond is None:
            output = self.model(x_masked)
        else:
            output = self.model(x_masked, cond)

        # (batch_size, num_steps, time_steps, tokens)
        output = tf.reshape(output, (batch_size, -1, time_steps, tokens))
        return output * mask_out[None, ..., None]

    # NOTE(review): this overrides ``__call__`` directly rather than
    # implementing Keras' ``call``; kept so the calling convention is unchanged.
    def __call__(self, x, training, cond=None):
        """Produce one masked model output per prediction step.

        Args:
            x: Tensor of shape ``(batch_size, time_steps, tokens)``.
            training: If truthy, all steps are computed in a single batched
                forward pass from ``x``. Otherwise steps are computed one
                after another, each from the previous step's output.
            cond: Optional conditioning value passed on to ``masked_forward``.

        Returns:
            Tensor of shape ``(batch_size, time_steps, time_steps, tokens)``
            indexed as ``[batch, step, time, token]``. In training mode step 0
            is ``x`` with only its first time step kept; in inference mode
            step 0 is ``x`` itself.
        """
        time_steps = tf.shape(x)[1]
        output_steps = tf.range(2, time_steps + 1)

        # (time_steps - 1, time_steps): row i exposes the first i + 1 inputs.
        mask_in = tf.sequence_mask(
            lengths=output_steps - 1,
            maxlen=time_steps,
            dtype=tf.float32,
        )
        # (time_steps - 1, time_steps): row i keeps the first i + 2 outputs.
        mask_out = tf.sequence_mask(
            lengths=output_steps,
            maxlen=time_steps,
            dtype=tf.float32,
        )

        if training:
            # (batch_size, time_steps - 1, time_steps, tokens)
            outputs = self.masked_forward(x, mask_in, mask_out, cond)
            # (batch_size, 1, time_steps, tokens): only the first sample of x.
            first_step = x[:, None] * mask_in[None, 0, :, None]
            # (batch_size, time_steps, time_steps, tokens)
            return tf.concat([first_step, outputs], axis=1)

        outputs = [x]
        # Each mi / mo is a single (time_steps,) row of the masks.
        for mi, mo in zip(mask_in, mask_out):
            # (batch_size, 1, time_steps, tokens)
            output = self.masked_forward(outputs[-1], mi, mo, cond)
            outputs.append(tf.squeeze(output, 1))

        # (batch_size, time_steps, time_steps, tokens)
        return tf.stack(outputs, axis=1)

In [204]:
# A single 1-D convolution stands in for the real network so that the
# masking logic of SimpleWaveNet can be exercised.
model = tf.keras.layers.Conv1D(filters=1, kernel_size=3, padding='same')
wavenet = SimpleWaveNet(model)


In [208]:
# Six random samples used as a toy input signal.
x = tf.random.uniform(shape=[6])
x

<tf.Tensor: id=8392, shape=(6,), dtype=float32, numpy=
array([0.42109227, 0.49568915, 0.08372366, 0.81927514, 0.15176475,
       0.8476447 ], dtype=float32)>

In [221]:
# Training mode: add batch and token axes to x, run every step in one pass,
# and show which entries of the (step, time) output are non-zero. Row i
# should be non-zero only in its first i + 1 positions.
wavenet(x[None, ..., None], True)[0,...,0].numpy()!= 0

array([[ True, False, False, False, False, False],
       [ True,  True, False, False, False, False],
       [ True,  True,  True, False, False, False],
       [ True,  True,  True,  True, False, False],
       [ True,  True,  True,  True,  True, False],
       [ True,  True,  True,  True,  True,  True]])

In [220]:
# Inference mode: keep only the first sample of x (the rest is zeroed), then
# let the model fill in the remaining steps one at a time. The non-zero
# pattern should match the training-mode pattern.
wavenet((x * tf.sequence_mask(1, 6, dtype=tf.float32))[None, ..., None], False)[0,...,0].numpy() != 0

array([[ True, False, False, False, False, False],
       [ True,  True, False, False, False, False],
       [ True,  True,  True, False, False, False],
       [ True,  True,  True,  True, False, False],
       [ True,  True,  True,  True,  True, False],
       [ True,  True,  True,  True,  True,  True]])

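The gated activation unit below - a tanh "filter" convolution multiplied elementwise by a sigmoid "gate" convolution - was found to work better than ReLU, I think.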

In [None]:
class GatedActivationUnit(tf.keras.models.Model):
    """Elementwise product of a tanh convolution and a sigmoid convolution.

    Keyword Args:
        conv_filter: Dict of keyword arguments for the ``Conv1D`` layer that
            uses the ``tanh`` activation.
        conv_gate: Dict of keyword arguments for the ``Conv1D`` layer that
            uses the ``sigmoid`` activation.
    """

    def __init__(self, **kwargs):
        super().__init__()

        filter_config = kwargs['conv_filter']
        self.conv_filter = tf.keras.layers.Conv1D(
            activation='tanh', **filter_config
        )

        gate_config = kwargs['conv_gate']
        self.conv_gate = tf.keras.layers.Conv1D(
            activation='sigmoid', **gate_config
        )

    def __call__(self, x):
        """Return ``conv_filter(x) * conv_gate(x)``."""
        filtered = self.conv_filter(x)
        gate = self.conv_gate(x)
        return filtered * gate
        
    