In [2]:
import tensorflow as tf
import numpy as np
from keras.models import Model
from keras.layers import Conv1D, MaxPooling1D, Dense, Lambda, concatenate, Input, GlobalMaxPooling1D, GlobalAveragePooling1D
import keras.backend as K




Using TensorFlow backend.


In [3]:
# Dimensions of one input example, used as (num_frames, num_freq_bins).
num_frames = 599
num_freq_bins = 128

# Convolution settings handed to AudioCNNModel further down.
num_conv_filters_1 = 256
kernel_size = 4

# Random placeholder example with the dimensions above; only useful for
# checking that the network accepts an input of this shape.
dummy_data = np.random.random(size=(num_frames, num_freq_bins))

In [4]:
def l2_norm(x):
    """Return the L2 norm of ``x`` computed along axis 1.

    The input is squared element-wise, summed over axis 1, and the square
    root of that sum is returned, so axis 1 is absent from the result.
    """
    squared = x ** 2
    summed = K.sum(squared, axis=1)
    return K.sqrt(summed)

In [5]:
class AudioCNNModel:
    """1-D convolutional network mapping a feature matrix to a latent vector.

    The network takes inputs of shape ``(num_frames, num_freq_bins)``, applies
    three Conv1D + MaxPooling1D stages, summarises the remaining temporal axis
    with global max, global average and L2-norm pooling, concatenates the
    three summaries and feeds them through two ReLU dense layers followed by
    a linear output layer of size ``latent_dim``.

    Parameters
    ----------
    num_frames : int
        Length of the first (temporal) input axis.
    num_freq_bins : int
        Length of the second (feature) input axis.
    num_conv_filters1 : int
        Number of filters in the first convolution.
    pool_size_1 : int
        Pool size of the first max-pooling layer.
    kernel_size : int
        Kernel size shared by all three convolutions.
    num_conv_filters2 : int, optional
        Number of filters in the second convolution (default 256).
    num_conv_filters3 : int, optional
        Number of filters in the third convolution (default 512).
    dense_units : int, optional
        Width of each of the two hidden dense layers (default 2048).
    latent_dim : int, optional
        Size of the output vector (default 50).

    Attributes
    ----------
    model_input
        The Keras ``Input`` tensor of the network.
    net
        The assembled Keras ``Model`` from ``model_input`` to the latent output.
    """

    def __init__(self, num_frames, num_freq_bins, num_conv_filters1, pool_size_1, kernel_size,
                 num_conv_filters2=256, num_conv_filters3=512, dense_units=2048, latent_dim=50):
        self.num_frames = num_frames
        self.num_freq_bins = num_freq_bins
        self.num_conv_filters1 = num_conv_filters1
        self.num_conv_filters2 = num_conv_filters2
        self.num_conv_filters3 = num_conv_filters3
        self.pool_size1 = pool_size_1
        self.kernel_size = kernel_size
        self.dense_units = dense_units
        self.latent_dim = latent_dim
        self.model_input = Input(shape=(self.num_frames, self.num_freq_bins))

        # Convolutional feature extractor. The input shape comes from the
        # Input tensor above, so the layers need no `input_shape` argument.
        # NOTE(review): no `activation` is passed to the Conv1D layers, so
        # they use the Keras default -- confirm that this is intended.
        x = Conv1D(filters=self.num_conv_filters1, kernel_size=self.kernel_size)(self.model_input)
        x = MaxPooling1D(pool_size=self.pool_size1)(x)
        x = Conv1D(filters=self.num_conv_filters2, kernel_size=self.kernel_size)(x)
        x = MaxPooling1D(pool_size=2)(x)
        x = Conv1D(filters=self.num_conv_filters3, kernel_size=self.kernel_size)(x)
        x = MaxPooling1D(pool_size=2)(x)

        # Temporal pooling: collapse axis 1 in three different ways
        # (max, mean, L2 norm) and concatenate the results.
        max_layer = GlobalMaxPooling1D(data_format='channels_last')(x)
        mean_layer = GlobalAveragePooling1D(data_format='channels_last')(x)
        # Pass the named function directly instead of wrapping it in a lambda.
        L2_layer = Lambda(l2_norm)(x)
        x = concatenate([max_layer, mean_layer, L2_layer])

        # Fully connected head ending in a linear layer of size latent_dim.
        x = Dense(self.dense_units, activation='relu')(x)
        x = Dense(self.dense_units, activation='relu')(x)
        latent_factors = Dense(self.latent_dim)(x)
        self.net = Model(inputs=self.model_input, outputs=latent_factors)
        
        


In [6]:
# Smoke test: build the network for the dummy dimensions and display the
# shape of its prediction for a batch holding the single dummy example.
model = AudioCNNModel(
    num_frames=num_frames,
    num_freq_bins=num_freq_bins,
    num_conv_filters1=num_conv_filters_1,
    pool_size_1=4,
    kernel_size=kernel_size,
)
model.net.predict(dummy_data[np.newaxis, ...]).shape

Instructions for updating:
Colocations handled automatically by placer.


(1, 50)

In [14]:
# Load the comma-separated feature matrix from disk. Passing the path lets
# np.loadtxt open and close the file itself; the previous
# open("mfcc.csv", "rb") handle was never closed.
sample = np.loadtxt("mfcc.csv", delimiter=",")
print(sample.shape)

(588, 12)


In [18]:
# Size the network from the loaded array instead of hard-coding its shape,
# so the cell keeps working if the CSV dimensions change. The unpacking
# also fails loudly if `sample` is not two-dimensional.
sample_frames, sample_features = sample.shape
model = AudioCNNModel(sample_frames, sample_features, num_conv_filters_1, 4, kernel_size)
# Add a leading batch axis of length 1 and display the predicted vector.
model.net.predict(np.array([sample]))

array([[-1.9806973e+01, -5.3713846e-01, -4.7246750e+01,  2.5584368e+01,
        -9.9681311e+00,  1.2044224e+01, -2.7165415e+01, -2.3274740e+01,
         1.6299646e+00, -1.4168367e+00,  1.4296337e+01,  1.9536862e+00,
        -2.7003475e+01,  9.3044968e+00, -3.7653595e-01, -3.0715734e+01,
         4.0956821e+01, -1.1072885e+01, -1.9723230e+01, -1.0651125e+01,
         3.1175880e+01,  1.3052781e+01, -8.3524621e-01, -1.4944993e+01,
        -5.1835122e+00, -2.2642338e+01, -3.1236853e+01,  1.3683222e+01,
         3.3727884e-02, -3.0168636e+01, -2.7883751e+01, -2.1144281e+01,
        -1.5089521e+00, -3.6744213e+01, -4.7313204e+00,  2.1528519e+01,
        -1.4222992e+01,  1.3659893e+01, -2.0891195e+01,  7.4002576e-01,
         1.3385822e+01, -2.2731121e+01,  4.0953937e+00,  1.6503340e+01,
         1.5168604e+01, -2.8626774e+01,  1.7193834e+01, -1.0743925e+01,
        -1.0019341e+01,  1.5123607e+01]], dtype=float32)