# Primary feature choices for cover song identification in da-Tacos
Using chroma, crema, hpcp, and a fusion of all, beatsynchronized

### Library importing

In [10]:
import json
import dill
import librosa
import numpy as np
import scipy
import deepdish as dd

### Dill session

In [None]:
dill.dump_session('datacos_reps.db')

In [None]:
dill.load_session('datacos_reps.db')

### Load metadata of subset of performances to work with

In [6]:
#base directory
base = '/Volumes/Extreme SSD/da-tacos/'

#read metadata
with open(base+'da-tacos_metadata/da-tacos_benchmark_subset_metadata.json') as f:
    benchmark_metadata = json.load(f)

#dictionary holding works:performances:metadata
subset_metadata = {}

W_count = 0
#traverse works
for W in benchmark_metadata.keys():
    #if it contains at least 5 covers
    if len(benchmark_metadata[W].keys()) >= 5: 
        W_count += 1
        per = {}
        #get performances
        P_count = 0
        for P in benchmark_metadata[W].keys():
            P_count += 1
            per[P] = benchmark_metadata[W][P]
            if P_count >= 5: #number of performances per work
                break
        subset_metadata[W] = per
    if W_count >= 50: #number of works
        break

### Compute sets of approximations using chroma, crema, hpcp

In [15]:
X = {}

#traverse Works
for W in subset_metadata.keys():

    per = {}
    #traverse Performances
    for P in subset_metadata[W].keys():

        #open performance data
        filepath = base + "da-tacos_benchmark_subset_single_files/" + W + "/" + P + ".h5"
        data = dd.io.load(filepath)

        #Beat synchronize MFCCs
        synced_mfcc = librosa.util.sync(data['mfcc_htk'], data["madmom_features"]["onsets"], aggregate=np.median)

        #Similarity Sequence Matrix using Gaussian Kernel
        path_distance = np.sum(np.diff(synced_mfcc, axis=1)**2, axis=0)
        sigma = np.median(path_distance)
        path_sim = np.exp(-path_distance / sigma)
        Sloc = np.diag(path_sim, k=1) + np.diag(path_sim, k=-1)

        rep_set = {}
        #features to use
        for rep in ["chroma_cens", "crema", "hpcp"]:
            
            #Beat synchronization
            synced = librosa.util.sync(data[rep], data["madmom_features"]["onsets"], aggregate=np.median)

            #Short-term History Embedding
            steps = 4
            stacked = librosa.feature.stack_memory(synced, steps)

            #Weighted Recurrence Matrix
            knn_no = 3
            ssm = librosa.segment.recurrence_matrix(stacked, width=knn_no, mode='affinity', sym=True)

            #Timelag Filter & Path Enchancement
            df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
            Srep = librosa.segment.path_enhance(df(ssm, size=(1,7)), 15)

            #Balanced Combination
            deg_loc = np.sum(Sloc, axis=1)          
            deg_rep = np.sum(Srep, axis=1)
            mu = deg_loc.dot(deg_loc + deg_rep) / np.sum((deg_loc + deg_rep)**2)
            A = mu * Srep + (1 - mu) * Sloc

            #Downsampling
            A_d = cv2.resize(A, (256, 256))

            #Laplacian
            L = scipy.sparse.csgraph.laplacian(A_d, normed=True)

            #Eigendecomposition
            evals, evecs = scipy.linalg.eigh(L)

            #Eigenvector filtering
            evecs = scipy.ndimage.median_filter(evecs, size=(9, 1))

            #Normalization
            Cnorm = np.cumsum(evecs**2, axis=1)**0.5

            #Set of eigenvector distances
            dist_set = []
            for k in range(2, 10):
                X = evecs[:, :k] / Cnorm[:, k-1:k]
                distance = squareform(pdist(X, metric='euclidean'))
                dist_set.append(distance)
                
            #Add representation
            rep_set[rep] = dist_set

        #Add representations to performance
        per[P] = rep_set

    #Add performances to work
    X[W] = per

ParameterError: width=3 must be at least 1 and at most data.shape[-1]=1