In [1]:
# first test for polyphonic music
import librosa
import numpy as np
import os
from sporco.admm import cbpdn
import sporco.metric as sm
import pickle 
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.signal import argrelextrema
import scipy.io.wavfile
# Sampling rate requested from librosa.load when the song is read below.
sr = 11025

In [2]:
# Build the note dictionary: one column per pitch number 21..108 (the
# `M<i>` field of the file name), each a 1.0 s excerpt that starts 0.7 s
# into the recording of an isolated note.
D = []
directory = './maps/notes/'

for i in range(21, 109):
    # Every pitch is stored under one of two file-name variants (S0 / S1).
    fname1 = 'MAPS_ISOL_NO_F_S0_M' + str(i) + '_SptkBGCl.wav'
    fname2 = 'MAPS_ISOL_NO_F_S1_M' + str(i) + '_SptkBGCl.wav'

    # Choose the variant that exists on disk. The previous bare `except:`
    # also swallowed KeyboardInterrupt and genuine decoding errors, silently
    # retrying with the other file name.
    path = directory + fname1
    if not os.path.isfile(path):
        path = directory + fname2
    s = librosa.load(path, sr=sr, offset=0.7, duration=1.0)[0]

    # normalize dictionary elements
    # NOTE(review): this divides by the signed maximum, not the peak absolute
    # amplitude, and yields inf/nan for an all-zero excerpt -- confirm that
    # this is the intended normalisation.
    D.append(s / np.amax(s))

# Stack the notes as columns: shape (samples, notes).
D = np.asarray(D).T

with open('mapsdict.pkl', 'wb') as fid:
    pickle.dump(D, fid)

In [13]:
# Set up the CBPDN problem: dictionary, solver options, song excerpt and
# ground-truth annotations.

# NOTE: pickle.load can execute arbitrary code -- only load dictionary files
# that were written by this notebook.
with open('mapsdict.pkl', 'rb') as fid:
    D = pickle.load(fid)
print(D.shape)
# Weight handed to cbpdn.ConvBPDN as its `lmbda` argument.
lmbda = 0.005
# 1 temporal dimension
dimN = 1

opt = cbpdn.ConvBPDN.Options({'Verbose': True, 'MaxMainIter': 500,
                              'HighMemSolve': True, 'LinSolveCheck': True,
                              'RelStopTol': 1e-3, 'AuxVarObj': False})

# load the song (first 5 seconds only)
songdir = "./maps/songs/"
songname = "MAPS_MUS-alb_esp2_SptkBGCl.wav"
song, sr = librosa.load(songdir + songname, sr=sr, duration=5.0)

# load the GT
# The annotation file shares the song's base name, so derive it instead of
# repeating the literal. The result maps the third tab-separated field
# (looked up later as str(note)) to a list of [field0, field1] string pairs,
# which the scoring cell interprets as onset and offset times.
gt_name = os.path.splitext(songname)[0] + '.txt'
GT = {}
with open(songdir + gt_name, 'rb') as f:
    # The first line is skipped (assumed to be a column header).
    lines = f.readlines()[1:]
for row in lines:
    row = row.decode().strip('\r\n').split('\t')
    if len(row) < 3:
        # Blank or truncated line (e.g. trailing newline at end of file).
        continue
    GT.setdefault(row[2], []).append([row[0], row[1]])



(11025, 88)


In [None]:
# sparse coding: run the convolutional BPDN solver on the song excerpt
b = cbpdn.ConvBPDN(D, song, lmbda, opt, dimN=dimN)
# Index 0 on axes 1 and 2 of the solution leaves a 2-D array
# (presumably time samples x dictionary atoms -- confirm against sporco's
# coefficient-array layout).
X = b.solve()[:, 0, 0, :]
# Cache the activations on disk so later cells can reload them without
# re-running the solver.
with open('maps_results_esp2.pkl', 'wb') as fid:
    pickle.dump(X, fid)

In [14]:
# Reload the cached activations and zero every coefficient that does not
# stand out from the bulk of the local maxima.

# NOTE: pickle.load can execute arbitrary code -- only load result files
# that were written by this notebook.
# A separate handle name is used so the file object is not shadowed by the
# array loaded from it.
with open('maps_results_esp2.pkl', 'rb') as fid:
    Y = pickle.load(fid)

duration = song.shape[0] / sr

# Values of all local maxima taken along each row of Y. A single call with
# axis=1 compares every entry with its neighbours in the same row, which is
# what the former per-row loop did, without re-allocating the result array
# on every iteration. float64 matches the dtype the loop produced (it
# started from an empty float64 array).
lmaxes = np.asarray(Y[argrelextrema(Y, np.greater, axis=1)], dtype=np.float64)

# Outlier rule: upper quartile plus a multiple of the inter-quartile range.
# try different values for iqr
IQR_FACTOR = 12
p75, p25 = np.percentile(lmaxes, [75, 25])
threshold = p75 + IQR_FACTOR * (p75 - p25)
# Alternative that was tried: take a fixed high percentile (about the 95th)
# of the sorted peak values as the threshold instead.
# need to select proper threshold value
print('threshold is', threshold)
# Zero out everything below the threshold (in place; Y is reloaded from disk
# at the top of this cell, so re-running the cell is safe).
lowvals = Y < threshold
Y[lowvals] = 0


threshold is 0.00183084793389


In [21]:
# Score the detections window by window.
# peak is OK if it occurs within [onset - tolerance, offset + tolerance]
# of a ground-truth entry stored under the same note number.

# duration of each window (seconds)
window_size = 0.05
# slack allowed on either side of a ground-truth interval (seconds)
tolerance = 0.05
# column 0 of Y is looked up in GT as note number 21
lowest_note = 21
# whole samples per window; int() truncates, so windows are slightly shorter
# than window_size whenever sr * window_size is not an integer
l = int(sr * window_size)
print('finding maxima')

correct = 0
total = 0
for window in range(int(duration / window_size)):
    start = window * l
    end = start + l
    # Total activation of every dictionary column inside this window.
    sumrow = np.sum(Y[start:end, :], axis=0)
    # Columns whose summed activation exceeds both neighbouring columns.
    relex = argrelextrema(sumrow, np.greater)[0]
    # Time of the window's first sample. Derived from the sample index
    # because window_size * window drifts away from the true position when
    # l was truncated above.
    t = start / sr
    for note in relex:
        note = int(note) + lowest_note
        total += 1
        # Match against every annotated interval of this note; unlike an
        # early `break`, this does not depend on the intervals being sorted
        # by onset.
        time_ranges = GT.get(str(note), [])
        if any(float(onset) - tolerance <= t <= float(offset) + tolerance
               for onset, offset in time_ranges):
            correct += 1

# With no detections the ratio is undefined; report NaN instead of raising
# ZeroDivisionError.
correct_percentage = float(correct) / total if total else float('nan')
print(total)
print("Correct percentage is: " + str(correct_percentage))

finding maxima
30
Correct percentage is: 0.8666666666666667
