In [2]:
import tensorflow as tf
import numpy as np
from keras.models import Model
from keras.layers import Conv1D, MaxPooling1D, Dense, Lambda, concatenate, Input, GlobalMaxPooling1D, GlobalAveragePooling1D
import keras.backend as K




Using TensorFlow backend.


In [3]:
# Geometry of the placeholder input and hyper-parameters of the first
# convolution stage; the later cells read these names.
num_frames = 599
num_freq_bins = 128
num_conv_filters_1 = 256
kernel_size = 4

# Draw the placeholder input from a locally seeded generator so that a
# Restart-and-Run-All yields the same array every time, without touching
# NumPy's global random state. Values are uniform in [0, 1), as before.
SEED = 0
dummy_data = np.random.RandomState(SEED).random_sample((num_frames, num_freq_bins))

In [4]:
def l2_norm(x, axis=1):
    """Collapse one axis of ``x`` to its Euclidean (L2) norm.

    Computes ``sqrt(sum(x ** 2))`` along ``axis`` using the Keras backend;
    the reduced axis is dropped from the result.

    Args:
        x: Backend tensor to reduce.
        axis: Axis to reduce over. Defaults to 1, the axis that was
            previously hard-coded.

    Returns:
        Tensor of norms with ``axis`` removed. Values are floored at
        ``sqrt(K.epsilon())`` (see the comment below).
    """
    sum_of_squares = K.sum(K.square(x), axis=axis)
    # The derivative of sqrt is unbounded at 0, so an all-zero slice would
    # back-propagate NaN. Flooring at the backend epsilon keeps the gradient
    # finite while changing the forward value only for near-zero slices.
    return K.sqrt(K.maximum(sum_of_squares, K.epsilon()))

In [5]:
class AudioCNNModel():
    """1-D convolutional network mapping a (num_frames, num_freq_bins) input to 50 outputs.

    Three ``Conv1D`` + ``MaxPooling1D`` stages run along the frame axis. Their
    output is summarised over time by global max, global average and L2-norm
    pooling; the three summaries are concatenated and passed through two
    2048-unit ReLU ``Dense`` layers and a final linear ``Dense(50)``.

    NOTE(review): the convolution layers have no activation while the dense
    layers use ReLU -- confirm that this is intentional.

    Attributes:
        num_frames, num_freq_bins, num_conv_filters1, pool_size1, kernel_size:
            The constructor arguments, stored unchanged.
        model_input: The Keras ``Input`` placeholder of the network.
        net: The assembled Keras ``Model``.
    """

    def __init__(self, num_frames, num_freq_bins, num_conv_filters1, pool_size_1, kernel_size):
        """Build the network.

        Args:
            num_frames: Length of the frame (time) axis of one example, or
                ``None`` to leave it unspecified (no length check is done then).
            num_freq_bins: Number of features per frame.
            num_conv_filters1: Number of filters of the first convolution.
            pool_size_1: Pool size of the first max-pooling layer.
            kernel_size: Kernel size shared by all three convolutions.

        Raises:
            ValueError: If ``num_frames`` is too short to pass through all
                three convolution / pooling stages.
        """
        self.num_frames = num_frames
        self.num_freq_bins = num_freq_bins
        self.num_conv_filters1 = num_conv_filters1
        self.pool_size1 = pool_size_1
        self.kernel_size = kernel_size

        # (filters, pool_size) of each conv/pool stage, in order. Only the
        # first stage is configurable through the constructor.
        stages = (
            (self.num_conv_filters1, self.pool_size1),
            (256, 2),
            (512, 2),
        )

        # Fail early with a readable message instead of a backend
        # "negative dimension" error raised while the layers are being built.
        if num_frames is not None:
            self._check_temporal_length(
                num_frames, kernel_size, [pool_size for _, pool_size in stages]
            )

        # The input shape is fully described by the Input placeholder, so the
        # first convolution does not need an ``input_shape`` argument.
        self.model_input = Input(shape=(num_frames, num_freq_bins))

        x = self.model_input
        for filters, pool_size in stages:
            x = Conv1D(filters=filters, kernel_size=self.kernel_size)(x)
            x = MaxPooling1D(pool_size=pool_size)(x)

        # Temporal pooling: summarise the frame axis in three ways (max,
        # mean, L2 norm) and join the summaries along the feature axis.
        max_layer = GlobalMaxPooling1D(data_format='channels_last')(x)
        mean_layer = GlobalAveragePooling1D(data_format='channels_last')(x)
        L2_layer = Lambda(l2_norm)(x)
        x = concatenate([max_layer, mean_layer, L2_layer])

        x = Dense(2048, activation='relu')(x)
        x = Dense(2048, activation='relu')(x)
        latent_factors = Dense(50)(x)
        self.net = Model(inputs=self.model_input, outputs=latent_factors)

    @staticmethod
    def _as_int(value):
        """Return ``value`` as an int, unwrapping a one-element tuple or list."""
        if isinstance(value, (tuple, list)):
            value = value[0]
        return int(value)

    @classmethod
    def _check_temporal_length(cls, num_frames, kernel_size, pool_sizes):
        """Return the frame-axis length left after all stages, or raise ValueError.

        Mirrors the layer defaults relied on in ``__init__``: unpadded,
        stride-1 convolutions and unpadded pooling whose stride equals the
        pool size.
        """
        kernel = cls._as_int(kernel_size)
        if kernel < 1:
            raise ValueError("kernel_size must be >= 1, got {!r}".format(kernel_size))

        length = int(num_frames)
        for stage, pool_size in enumerate(pool_sizes, start=1):
            pool = cls._as_int(pool_size)
            if pool < 1:
                raise ValueError("pool size must be >= 1, got {!r}".format(pool_size))
            length = length - kernel + 1  # unpadded, stride-1 convolution
            if length < pool:
                raise ValueError(
                    "num_frames={!r} is too short: conv/pool stage {} would be left "
                    "with {} frame(s) but needs at least {} (kernel_size={!r})".format(
                        num_frames, stage, max(length, 0), pool, kernel_size
                    )
                )
            length //= pool  # unpadded pooling, stride == pool size
        return length
        
        


In [6]:
# Smoke test: build the network for the placeholder geometry and push a
# one-example batch through it; the output shape is the cell's result.
model = AudioCNNModel(
    num_frames=num_frames,
    num_freq_bins=num_freq_bins,
    num_conv_filters1=num_conv_filters_1,
    pool_size_1=4,
    kernel_size=kernel_size,
)
model.net.predict(np.expand_dims(dummy_data, axis=0)).shape

Instructions for updating:
Colocations handled automatically by placer.


(1, 50)

In [7]:
# Hand the path straight to loadtxt so that NumPy opens and closes the file
# itself; wrapping it in a bare open(...) left the handle unclosed.
sample = np.loadtxt("mfcc.csv", delimiter=",")
print(sample.shape)

(588, 12)


In [8]:
# Rebuild the network for the MFCC geometry (588 frames x 12 coefficients)
# and run the loaded sample through it as a one-example batch.
model = AudioCNNModel(
    num_frames=588,
    num_freq_bins=12,
    num_conv_filters1=num_conv_filters_1,
    pool_size_1=4,
    kernel_size=kernel_size,
)
model.net.predict(np.expand_dims(sample, axis=0))

array([[-1.3768783e+01,  5.3400726e+00,  1.7182302e+01,  4.7254562e-03,
         1.1371862e+01, -1.1489881e+01, -1.1623225e+01, -5.3368160e+01,
         2.8220667e+01,  2.1207111e+00,  3.0441160e+01, -2.2382311e+01,
         2.0941704e+01, -2.5579355e+01,  4.6951742e+00, -9.3095657e+01,
        -3.8151526e+00,  2.1477886e+01, -3.1216588e+00, -1.1774329e+01,
         3.5614014e+01,  7.8140297e+00, -1.8039911e+00, -1.8990763e+01,
         6.0609093e+01, -2.6195520e+01,  2.7206783e+01, -2.3290064e+01,
         1.4504442e+01, -2.5367855e+01, -6.3717075e+01,  1.2852405e+01,
        -1.7351482e+01,  1.2951219e+01, -2.1215942e+01,  4.4176807e+01,
         1.6976128e+01, -1.8742142e+01, -2.6397219e+00, -5.5945778e+01,
         8.1725750e+00, -2.6101841e+01,  4.0300927e+00, -4.7360023e+01,
        -7.8776133e-01,  5.5466011e+01, -1.3403375e+01, -3.0053848e+01,
         1.3307592e+01,  5.9734659e+00]], dtype=float32)

In [13]:
# The context manager closes the file even if np.load raises; the manual
# open/close pair leaked the handle on failure.
with open("chroma.npy", "rb") as f:
    new_l = np.load(f)
print(new_l.shape)


(7457, 300, 12)


In [15]:
# Rebuild the network for the chroma geometry (300 frames x 12 bins) and
# run the first loaded example through it as a one-example batch.
model = AudioCNNModel(
    num_frames=300,
    num_freq_bins=12,
    num_conv_filters1=num_conv_filters_1,
    pool_size_1=4,
    kernel_size=kernel_size,
)
model.net.predict(np.expand_dims(new_l[0], axis=0))

array([[-10.795193 , -22.549454 ,   9.735798 ,  -8.500256 ,  -5.311801 ,
          4.7889986, -25.51222  , -10.607115 ,   5.5248623,  -0.1869241,
         -5.0444384,   1.5588579,   1.2705004,   3.774472 ,   8.58127  ,
         -5.629212 , -10.317668 ,  33.51322  ,  -5.630941 , -17.618286 ,
         -6.794819 ,   2.2527916, -11.753466 ,  19.634468 ,  -9.071924 ,
         10.407949 ,   2.977954 ,  -7.0899405,  -7.902363 , -20.436295 ,
         -6.8034143, -17.229927 ,   4.1288157,  -1.3273693,   3.5205586,
        -11.960981 , -45.609367 , -14.118712 ,   1.3335799, -20.939522 ,
         18.94703  ,   8.75972  ,  -9.244462 , -13.868502 , -30.433079 ,
          8.901087 ,  -2.844258 ,   4.8732414,   1.0633532,  15.935471 ]],
      dtype=float32)