In [7]:
import os 

import librosa as li
import matplotlib.pyplot as plt
import numpy as np
import onnxruntime
import panel as pn
pn.extension('ipywidgets')
import sklearn.decomposition
import soundfile as sf
import torch
torch.set_grad_enabled(False)
from tqdm import tqdm
import umap
from IPython.display import display, Audio

# Lower-cased keys of `sf.available_formats()`; the file-collection cell uses
# this list to recognise audio files by their suffix.
_AVAILABLE_EXTS = [format_name.lower() for format_name in sf.available_formats()]

In [3]:
# --- Configuration -----------------------------------------------------------
LOAD = True   # True: read cached embeddings from 'embeddings.npy' instead of running the model
SAVE = False  # True: write freshly computed embeddings to 'embeddings.npy' (only read when LOAD is False)
CLAP_SR = 22050                       # sample rate every clip is decoded at before inference
CLAP_MIN_SAMPLES = int(.5 * CLAP_SR)  # minimal duration for CLAP seems to be 0.5 s, so shorter clips are zero-padded

ort_session = onnxruntime.InferenceSession("/data/genova/msclap_onnx/clap.onnx", providers=["CPUExecutionProvider"])
samples_folder = '/data/genova/datasets/Drum kits/Hip Hop & Lofi/Boom Bap Essentials'
# os.walk() yields nothing for a missing directory, so reject that case here
# rather than failing later with an unrelated error.
if samples_folder is None or not os.path.isdir(samples_folder):
    raise ValueError('Please select a valid folder of samples')

# --- Collect audio files -----------------------------------------------------
# Match on the real file extension, case-insensitively: _AVAILABLE_EXTS is
# lower-cased, so a plain `endswith` would skip e.g. 'KICK.WAV' and would also
# accept names that merely end in the letters 'wav' without a dot.
audio_exts = set(_AVAILABLE_EXTS)
files = []
for root, _dirs, _files in os.walk(samples_folder):
    for f in tqdm(_files, leave=False):
        if os.path.splitext(f)[1][1:].lower() in audio_exts:
            files.append(os.path.join(root, f))
if not files:
    raise ValueError(f'No audio files found under {samples_folder!r}')

# --- Embeddings: one row per entry of `files`, in the same order --------------
# NOTE(review): the cache only stores the array, so its rows line up with
# `files` only as long as os.walk() returns the files in the same order as when
# the cache was written - confirm before reusing a cache on another machine.
if LOAD:
    embeddings = np.load('embeddings.npy')
    if len(embeddings) != len(files):
        raise ValueError(
            f"'embeddings.npy' holds {len(embeddings)} embeddings but "
            f'{len(files)} audio files were found; set LOAD = False to recompute'
        )
else:
    input_name = ort_session.get_inputs()[0].name  # loop-invariant, looked up once
    embeddings = []
    for file in tqdm(files):
        # Ask librosa explicitly for a mono signal at CLAP_SR; the result is
        # 1-D, so no manual down-mix or resample step is needed afterwards.
        audio, _sr = li.load(file, sr=CLAP_SR, mono=True)
        if audio.shape[-1] < CLAP_MIN_SAMPLES:
            audio = np.pad(audio, (0, CLAP_MIN_SAMPLES - audio.shape[-1]))
        audio = audio[None, :].astype(np.float32)  # add the leading batch axis
        ort_outs = ort_session.run(None, {input_name: audio})
        embeddings.append(ort_outs[0])
    embeddings = np.concatenate(embeddings, axis=0)
    if SAVE:
        np.save('embeddings.npy', embeddings)

                                                                                                                                                                                                                              

In [14]:
# UMAP is stochastic: without a fixed seed every run yields a different 2-D
# layout, so the neighbour lists shown further down cannot be reproduced.
# NOTE(review): fixing random_state is expected to make umap-learn fit
# single-threaded; drop it if fitting speed matters more than repeatability.
UMAP_SEED = 42
reducer = umap.UMAP(random_state=UMAP_SEED).fit(embeddings)
embs_2d = reducer.embedding_  # 2-D coordinates of the fitted embeddings (UMAP defaults)

In [15]:
NORM = False  # True: rescale each UMAP axis independently to [-1, 1]
SCALE = 0     # any non-zero value multiplies both axes; 0 leaves them untouched

# Column slices are views into `embs_2d`.
x = embs_2d[:, 0]
y = embs_2d[:, 1]

if NORM:
    # Min-max normalisation per axis; the arithmetic allocates new arrays.
    x = 2 * ((x - x.min()) / (x.max() - x.min())) - 1
    y = 2 * ((y - y.min()) / (y.max() - y.min())) - 1

if SCALE:
    # Multiply out of place: `x *= SCALE` on a view would silently rescale
    # `embs_2d` itself and compound every time this cell is re-run.
    x = x * SCALE
    y = y * SCALE

In [16]:
import random
NUM_SAMPLES = 5

# Pick one file of the collection at random and play it as the query.
rand_idx = random.choice(range(len(files)))
rand_file = files[rand_idx]
print('Target audio')
display(Audio(filename=rand_file))

# Rank the whole collection by Euclidean distance to the query, first in the
# original embedding space and then in the 2-D UMAP space. The slice takes
# NUM_SAMPLES + 1 entries, which leaves room for the query itself (distance 0).
for heading, space in (('Without UMAP', embeddings), ('Using UMAP', embs_2d)):
    print(heading)
    rand_coords = space[rand_idx][None, :]
    dists = np.sqrt(np.square(space - rand_coords).sum(-1))
    sorted_idx = np.argsort(dists)
    for i in sorted_idx[:NUM_SAMPLES + 1]:
        display(Audio(filename=files[i]))

Target audio


Without UMAP


Using UMAP


In [31]:
unseen_file = '/data/genova/datasets/Drum kits/A cappella & loops/Cymatics - Vocal Essentials/Vocal Arps/Cymatics - Vocal Essentials Arp 2 - 100 BPM F Maj.wav'
CLAP_SR = 22050                       # sample rate the clip is decoded at before inference
CLAP_MIN_SAMPLES = int(.5 * CLAP_SR)  # clips shorter than 0.5 s are zero-padded to this length

# Ask librosa explicitly for a mono signal at CLAP_SR; the result is 1-D, so no
# manual down-mix or resample step is needed. The signal is kept in
# `unseen_audio` so that the `y` coordinate array from the NORM/SCALE cell is
# not overwritten.
unseen_audio, _sr = li.load(unseen_file, sr=CLAP_SR, mono=True)
if unseen_audio.shape[-1] < CLAP_MIN_SAMPLES:
    unseen_audio = np.pad(unseen_audio, (0, CLAP_MIN_SAMPLES - unseen_audio.shape[-1]))
unseen_audio = unseen_audio[None, :].astype(np.float32)  # add the leading batch axis

ort_inputs = {ort_session.get_inputs()[0].name: unseen_audio}
ort_outs = ort_session.run(None, ort_inputs)
unseen_emb = ort_outs[0]
# Project the new embedding into the already-fitted 2-D space.
unseen_emb_2d = reducer.transform(unseen_emb)

In [32]:
print('Unseen audio')
display(Audio(filename=unseen_file))

print('Closest audios')
# Euclidean distance in the 2-D UMAP space between every fitted point and the
# projected query (assumes `unseen_emb_2d` broadcasts against `embs_2d`, e.g.
# shape (1, 2) - TODO confirm).
dists = np.sqrt(((embs_2d - unseen_emb_2d)**2).sum(-1))
sorted_idx = np.argsort(dists)
# The query is not part of the collection, so there is no self-match to make
# room for: show exactly NUM_SAMPLES neighbours rather than NUM_SAMPLES + 1.
for i in sorted_idx[:NUM_SAMPLES]:
    display(Audio(filename=files[i]))

Unseen audio


Closest audios
