In [1]:
import librosa
from sequential_model.layers import *
from sequential_model.convnet import * 
from sequential_model.solver import *
import numpy as np

### Skip: these sections are for testing 

In [2]:
## Naive forward pass 
# Dimension checked
# JODO: check correctedness
x = np.random.randn(10, 1, 128, 128)
w = np.random.randn(5, 1, 4, 4)
b = np.random.randn(5,)
conv_param = {'stride': 2, 'pad': 1}

out, cache = conv_forward_naive(x,w,b,conv_param)

In [3]:
out.shape

(10, 5, 64, 64)

In [7]:
## Naive backward pass
# Dimension checked
# JODO: check correcteness

#from sequential_model.fast_layers import *
#from sequential_model.gradient_check import eval_numerical_gradient_array, eval_numerical_gradient

x = np.random.randn(10, 1, 128, 128)
w = np.random.randn(5,1, 4,4)
b = np.random.randn(5,)
dout = np.random.randn(10, 5, 64, 64)
conv_param = {'stride': 2, 'pad': 1}

# dx_num = eval_numerical_gradient_array(lambda x: conv_forward_naive(x, w, b, conv_param)[0], x, dout)
# dw_num = eval_numerical_gradient_array(lambda w: conv_forward_naive(x, w, b, conv_param)[0], w, dout)
# db_num = eval_numerical_gradient_array(lambda b: conv_forward_naive(x, w, b, conv_param)[0], b, dout)

out, cache = conv_forward_naive(x, w, b, conv_param)
dx, dw, db = conv_backward_naive(dout, cache)

# Your errors should be around 1e-9'
# print('Testing conv_backward_naive function')
# print('dx error: ', rel_error(dx, dx_num))
# print('dw error: ', rel_error(dw, dw_num))
# print('db error: ', rel_error(db, db_num))

In [8]:
dx.shape

(10, 1, 128, 128)

In [9]:
## Sandwich layer forward + backward
# Dimension checked
from sequential_model.layers import *
from sequential_model.layer_utils import *
import numpy as np
## Conv_relu_forward/backward
from sequential_model.layer_utils import conv_relu_pool_forward, conv_relu_pool_backward

x = np.random.randn(10, 1, 128, 128)
w = np.random.randn(5,1,4,4)
b = np.random.randn(5,)
dout = np.random.randn(10, 5, 64, 64)
conv_param = {'stride': 2, 'pad': 1}
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

out, cache = conv_relu_pool_forward(x, w, b, conv_param, pool_param)
dx, dw, db = conv_relu_pool_backward(dout, cache)

out, cache = conv_relu_forward(x, w, b, conv_param)
dx, dw, db = conv_relu_backward(dout, cache)

# dx_num = eval_numerical_gradient_array(lambda x: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], x, dout)
# dw_num = eval_numerical_gradient_array(lambda w: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], w, dout)
# db_num = eval_numerical_gradient_array(lambda b: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], b, dout)

# print 'Testing conv_relu_pool'
# print 'dx error: ', rel_error(dx_num, dx)
# print 'dw error: ', rel_error(dw_num, dw)
# print 'db error: ', rel_error(db_num, db)

### Decisions for frame duration 

In [None]:
spec_per_sec= spectrogram.shape[1]/librosa.get_duration(y=y,sr=sr)
print('Number of Spectrogram columns per sec:', spec_per_sec)
duration_per_specrogram_dim = len(y)/(spectrogram.shape[1]*sr) ## inverse of the above

In [None]:
frame_dim = 128 #fixed
print("Note: using %d spectrogram dim for each input, which is %.2f seconds per input" % (frame_dim, duration_per_specrogram_dim*frame_dim)) 
seq_len = 7 #fixed
print("Note: using %d inputs for each sequence, that is %.2f seconds per sequence" % (seq_len, duration_per_specrogram_dim*frame_dim*seq_len)) 

### Formatting into audio data with right dimension 
formatting audio file step by step to:
-> spectrogram of dim (128, _) 
-> around 20 sec sequences of dim (7, 128, 128)
-> padding of each song of irregular length to (k, 7, 128, 128) where k depends on song duration
-> reformatting for cnn (N, 1, 128, 128) where N = 7*k, and 1 is the channel for raw cnn input
TODO Note: N should first be merged to other songs and then build batch from there

In [2]:
def song_to_input_matrix(song_filename, frame_dim = 128, seq_len = 7):
    y, sr = librosa.load(filename)
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    song_db = librosa.power_to_db(spectrogram, ref=np.max) ## now shape is (128, _)
    max_seq_dim = frame_dim * seq_len
    num_seq = int(song_db.shape[1]/max_seq_dim) + 1
    input_matrix = np.zeros((num_seq, seq_len,128, frame_dim)) #128 is frequency dim which is also fixed
    left_over_frame_dim = song_db.shape[1]
    for i in range(num_seq*seq_len):
        seq_index = int(i / 7)
        seq_pos = i % 7
        if left_over_frame_dim < frame_dim:
            left_over_frame = song_db[:, i*frame_dim:]
            padding = np.zeros((128,frame_dim - left_over_frame_dim))
            last_piece = np.concatenate((left_over_frame,padding), axis=1)
            input_matrix[seq_index,seq_pos,:,:] = last_piece
            left_over_frame_dim = max(0, left_over_frame_dim - frame_dim) ## Note: continue to pad till the end of the sequence
        else:
            input_matrix[seq_index,seq_pos,:,:] = song_db[:,i*frame_dim:(i+1)*frame_dim]
            left_over_frame_dim -= frame_dim
    
    cnn_input_matrix = np.expand_dims(input_matrix, axis=2).reshape((-1,1,128,128))
    return cnn_input_matrix
        

In [3]:
filename = 'test.mp3'
check_input = song_to_input_matrix(filename)
print(check_input.shape)

(70, 1, 128, 128)


In [4]:
## Formatting one song into a set of training data
sample_data = check_input
sample_label = np.random.randint(2, size=sample_data.shape[0])
train_ratio = 0.7
train_id = np.random.choice(sample_data.shape[0], int(sample_data.shape[0]*train_ratio))

data = {
    'X_train': sample_data[train_id], # training data
    'y_train': sample_label[train_id], # training labels
    'X_val': sample_data[-train_id],# validation data
    'y_val': sample_label[-train_id]# validation labels
  }

In [5]:
## checking forward pass
model = ConvNet(num_filters = [3,4], input_dim = (1,128,128),filter_sizes = [4,4], hidden_dims = [40], 
                conv_param = {'stride': 2, 'pad': 1}, pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2})

X = check_input #np.random.randn(N, 1, 128, 128)
N = X.shape[0]
y = np.random.randint(2, size=N)

loss, grads = model.loss(X, y)
print('Initial loss (no regularization): ', loss)

model.reg = 0.5
loss, grads = model.loss(X, y)
print('Initial loss (with regularization): ', loss)

Initial loss (no regularization):  0.6931471804405496
Initial loss (with regularization):  0.6931667512749604


In [8]:
## checking for training
model = ConvNet(num_filters = [3,4], input_dim = (1,128,128),filter_sizes = [4,4], hidden_dims = [40], 
                conv_param = {'stride': 2, 'pad': 1}, pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2},
                weight_scale=0.001, reg=0.001)

solver = Solver(model, data,
                num_epochs=4, batch_size=50,
                update_rule='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                verbose=True, print_every=20)
solver.train()

(Iteration 1 / 4) loss: 0.693147
(Epoch 1 / 4) train acc: 0.530612; val_acc: 0.530612
(Epoch 2 / 4) train acc: 0.530612; val_acc: 0.530612
(Epoch 3 / 4) train acc: 0.469388; val_acc: 0.469388
(Epoch 4 / 4) train acc: 0.469388; val_acc: 0.469388
