In [1]:
from transformers import AutoFeatureExtractor, AutoModel
from IPython.display import Audio as player
from datasets import load_dataset, Audio
from panns_inference import AudioTagging
from qdrant_client import QdrantClient
from qdrant_client.http import models
from os.path import join
from glob import glob
import pandas as pd
import numpy as np
import librosa
import openl3
import torch

2024-01-04 22:46:15.970428: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-04 22:46:15.993853: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-04 22:46:15.993879: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-04 22:46:15.994543: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-04 22:46:15.998859: I tensorflow/core/platform/cpu_feature_guar

In [2]:
client = QdrantClient(host="localhost", port=6333)

In [3]:
my_collection = "music_collection"
client.recreate_collection(
    collection_name=my_collection,
    vectors_config=models.VectorParams(size=2048, distance=models.Distance.COSINE)
)

True

# Load dataset

In [4]:
data_path = join("/", "home", "egor", "datasets", "ludwig_music_data")
data_path

'/home/egor/datasets/ludwig_music_data'

In [5]:
music_data = load_dataset(
    "audiofolder", data_dir=join(data_path, "mp3", "latin"), split="train", drop_labels=True
)
music_data

Resolving data files:   0%|          | 0/979 [00:00<?, ?it/s]

Dataset({
    features: ['audio'],
    num_rows: 979
})

In [6]:
music_data[115]

{'audio': {'path': '/home/egor/datasets/ludwig_music_data/mp3/latin/0rXvhxGisD2djBmNkrv5Gt.mp3',
  'array': array([ 0.00000000e+00,  1.24776700e-09, -4.54397187e-10, ...,
         -7.98814446e-02, -8.84955898e-02, -1.05223551e-01]),
  'sampling_rate': 44100}}

In [7]:
ids = [
    (
     music_data[i] # for every sample
     ['audio'] # in this directory
     ['path'] # extract the path
     .split("/") # split it by /
     [-1] # take only the last piece "id.mp3"
     .replace(".mp3", '') # and replace the .mp3 with nothing
    ) 
    for i in range(len(music_data))
]
index = [num for num in range(len(music_data))]
ids[:4]

['0010BnyFuw94XFautS2uJp',
 '00RhgYVH6DrHl0SuZWDp8W',
 '01k69xxIQGL94F8IfIkI5l',
 '02GUIyXZ9RNusgUocEQIzN']

In [8]:
music_data = music_data.add_column("index", index)
music_data = music_data.add_column("ids", ids)
music_data[-1]

{'audio': {'path': '/home/egor/datasets/ludwig_music_data/mp3/latin/7yX4WgUfoPpMKZHgqpaZ0x.mp3',
  'array': array([ 0.00000000e+00, -1.40022882e-09, -4.44221415e-09, ...,
         -9.52053051e-02, -8.90597273e-02, -8.10846481e-02]),
  'sampling_rate': 44100},
 'index': 978,
 'ids': '7yX4WgUfoPpMKZHgqpaZ0x'}

In [9]:
label_path = join(data_path, "labels.json")
labels = pd.read_json(label_path)
labels.head()

Unnamed: 0,tracks
000QWvZpHrBIVrW4dGbaVI,{'otherSubgenres': {'L': [{'S': 'electronic---...
0010BnyFuw94XFautS2uJp,"{'otherSubgenres': {'L': [{'S': ' world'}, {'S..."
0055LRFB7zfdCXDGodyIz3,"{'otherSubgenres': {'L': []}, 'artist': {'S': ..."
005Dlt8Xaz3DkaXiRJgdiS,"{'otherSubgenres': {'L': [{'S': 'rock'}, {'S':..."
006RpKEKItNO4q8TkAUpOv,{'otherSubgenres': {'L': [{'S': 'classical---c...


In [11]:
def get_metadata(x):
    cols = ['artist', 'genre', 'name', 'subgenres']
    list_of_cols = []
    for col in cols:
        try:
            mdata = list(x[col].values())[0]
        except:
            mdata = "Unknown"
        list_of_cols.append(mdata)

    return pd.Series(list_of_cols, index=cols)

In [12]:
clean_labels = labels['tracks'].apply(get_metadata).reset_index()
clean_labels.head()

Unnamed: 0,index,artist,genre,name,subgenres
0,000QWvZpHrBIVrW4dGbaVI,047,electronic,General Error,[{'S': 'electronic---synth-pop'}]
1,0010BnyFuw94XFautS2uJp,Jimmy Buffett,latin,La Vie Dansante,[{'S': 'latin---cubano'}]
2,0055LRFB7zfdCXDGodyIz3,New Order,rock,Doubts Even Here,[{'S': 'rock---new wave'}]
3,005Dlt8Xaz3DkaXiRJgdiS,Ricardo Arjona,rock,Historia de Taxi,[{'S': 'rock---pop rock'}]
4,006RpKEKItNO4q8TkAUpOv,Worrytrain,electronic,They Will Make My Passage Easy,[{'S': 'electronic---ambient'}]


In [13]:
def get_vals(genres):
    genre_list = []
    for dicts in genres:
        if type(dicts) != str:
            for _, val in dicts.items():
                genre_list.append(val)
    return genre_list

clean_labels['subgenres'] = clean_labels.subgenres.apply(get_vals)
clean_labels['subgenres'].head()

0    [electronic---synth-pop]
1            [latin---cubano]
2           [rock---new wave]
3           [rock---pop rock]
4      [electronic---ambient]
Name: subgenres, dtype: object

In [14]:
file_path = join(data_path, "mp3", "latin", "*.mp3")
files = glob(file_path)
ids = [i.split('/')[-1].replace(".mp3", '') for i in files]
music_paths = pd.DataFrame(zip(ids, files), columns=["ids", 'urls'])
music_paths.head()

Unnamed: 0,ids,urls
0,5AiPEB0ibDHZLnVfkOfHJT,/home/egor/datasets/ludwig_music_data/mp3/lati...
1,5irSOlA7zXrwkJHVPgyVZQ,/home/egor/datasets/ludwig_music_data/mp3/lati...
2,6607INWM4gWwj0bGFVo4vz,/home/egor/datasets/ludwig_music_data/mp3/lati...
3,1Z8LrlAVTxVeuKjRa0xg1a,/home/egor/datasets/ludwig_music_data/mp3/lati...
4,4ZhzXzvYuF27CfaOfYT8u2,/home/egor/datasets/ludwig_music_data/mp3/lati...


In [15]:
metadata = (music_data.select_columns(['index', 'ids'])
                     .to_pandas()
                     .merge(right=clean_labels, how="left", left_on='ids', right_on='index')
                     .merge(right=music_paths, how="left", left_on='ids', right_on='ids')
                     .drop("index_y", axis=1)
                     .rename({"index_x": "index"}, axis=1)
        )
metadata.head()

Unnamed: 0,index,ids,artist,genre,name,subgenres,urls
0,0,0010BnyFuw94XFautS2uJp,Jimmy Buffett,latin,La Vie Dansante,[latin---cubano],/home/egor/datasets/ludwig_music_data/mp3/lati...
1,1,00RhgYVH6DrHl0SuZWDp8W,Jimmy Buffett,latin,Brown Eyed Girl,[latin---cubano],/home/egor/datasets/ludwig_music_data/mp3/lati...
2,2,01k69xxIQGL94F8IfIkI5l,Los Delinqüentes,latin,Fumata Del Ladrillo,"[latin---flamenco, rock---punk]",/home/egor/datasets/ludwig_music_data/mp3/lati...
3,3,02GUIyXZ9RNusgUocEQIzN,La Bottine Souriante,latin,Ma Paillasse,[latin---salsa],/home/egor/datasets/ludwig_music_data/mp3/lati...
4,4,02IFfsWwxek6h9qLEH4sRA,Gipsy Kings,latin,Estrellas,[latin---flamenco],/home/egor/datasets/ludwig_music_data/mp3/lati...


In [16]:
payload = metadata.drop(['index', 'ids'], axis=1).to_dict(orient="records")
payload[:3]

[{'artist': 'Jimmy Buffett',
  'genre': 'latin',
  'name': 'La Vie Dansante',
  'subgenres': ['latin---cubano'],
  'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/0010BnyFuw94XFautS2uJp.mp3'},
 {'artist': 'Jimmy Buffett',
  'genre': 'latin',
  'name': 'Brown Eyed Girl',
  'subgenres': ['latin---cubano'],
  'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/00RhgYVH6DrHl0SuZWDp8W.mp3'},
 {'artist': 'Los Delinqüentes',
  'genre': 'latin',
  'name': 'Fumata Del Ladrillo',
  'subgenres': ['latin---flamenco', 'rock---punk'],
  'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/01k69xxIQGL94F8IfIkI5l.mp3'}]

# Embeddings

In [17]:
at = AudioTagging(checkpoint_path=None, device='cuda')

Checkpoint path: /home/egor/panns_data/Cnn14_mAP=0.431.pth
GPU number: 1


In [18]:
def get_panns_embs(batch):
    arrays = [torch.tensor(val['array'], dtype=torch.float64) for val in batch['audio']]
    inputs = torch.nn.utils.rnn.pad_sequence(arrays, batch_first=True, padding_value=0).type(torch.cuda.FloatTensor)
    _, embedding = at.inference(inputs)
    batch['panns_embeddings'] = embedding
    return batch

In [19]:
music_data = music_data.map(get_panns_embs, batched=True, batch_size=8)
music_data

Dataset({
    features: ['audio', 'index', 'ids', 'panns_embeddings'],
    num_rows: 979
})

# Building a Recommendation System

## Insert data to qdrant

In [20]:
client.upsert(
    collection_name=my_collection,
    points=models.Batch(
        ids=music_data['index'],
        vectors=music_data['panns_embeddings'],
        payloads=payload
    )
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [21]:
result = client.retrieve(
    collection_name=my_collection,
    ids=[100],
    with_vectors=True # we can turn this on and off depending on our needs
)
result[0].payload

{'artist': 'La Bottine Souriante',
 'genre': 'latin',
 'name': 'Chant de la luette',
 'subgenres': ['latin---salsa'],
 'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/0lyeChzw7IWf9ytZ7S0jDK.mp3'}

In [22]:
metadata.query("artist == 'Celia Cruz'")

Unnamed: 0,index,ids,artist,genre,name,subgenres,urls
122,122,0v1oaOqkXpubdykx58BQwY,Celia Cruz,latin,Juancito Trucupey,[latin---salsa],/home/egor/datasets/ludwig_music_data/mp3/lati...
150,150,19zWrDlXew0Fzouu7a4qhx,Celia Cruz,latin,Cuando Sali De Cuba,[latin---salsa],/home/egor/datasets/ludwig_music_data/mp3/lati...
178,178,1MYds6o9aN2Wxa4TDxcJPB,Celia Cruz,latin,Mi vida es cantar,[latin---salsa],/home/egor/datasets/ludwig_music_data/mp3/lati...
459,459,3WphzI2fb2NTUsfja51U7P,Celia Cruz,latin,Dile que por mi no tema,[latin---salsa],/home/egor/datasets/ludwig_music_data/mp3/lati...


## Search by vector

In [23]:
search_result = client.search(
    collection_name=my_collection,
    query_vector=music_data[150]['panns_embeddings'],
    limit=10
)
search_result

[ScoredPoint(id=150, version=0, score=1.0, payload={'artist': 'Celia Cruz', 'genre': 'latin', 'name': 'Cuando Sali De Cuba', 'subgenres': ['latin---salsa'], 'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/19zWrDlXew0Fzouu7a4qhx.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=730, version=0, score=0.9206339, payload={'artist': 'Cartola', 'genre': 'latin', 'name': 'Fita meus olhos', 'subgenres': ['latin---samba'], 'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/5iyRJ796USPTXEO4JXO0gC.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=251, version=0, score=0.9088161, payload={'artist': "Oscar D'León", 'genre': 'latin', 'name': 'Volver a Verte', 'subgenres': ['latin---salsa'], 'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/1kD5EOoZ45kjq50NLfhRGc.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=739, version=0, score=0.90296745, payload={'artist': 'Cartola', 'genre': 'latin', 'name': 'Verde que te quero rosa', 'subgenres': ['latin---samba'], 'urls': '/h

## Use recommendation API

In [24]:
client.recommend(
    collection_name=my_collection,
    positive=[178, 122],
    limit=5
)

[ScoredPoint(id=384, version=0, score=0.9668472, payload={'artist': 'Gilberto Santa Rosa', 'genre': 'latin', 'name': 'Perdoname', 'subgenres': ['latin---salsa'], 'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/2qqrgPaRZow7lrLttDL6Im.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=424, version=0, score=0.9633508, payload={'artist': 'Gilberto Santa Rosa', 'genre': 'latin', 'name': 'Amanecer Borincano', 'subgenres': ['latin---salsa'], 'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/39FQfusOwKnPCjOgQHcx6S.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=190, version=0, score=0.9624211, payload={'artist': 'Luigi Texidor', 'genre': 'latin', 'name': 'Mi Testamento', 'subgenres': ['latin---salsa'], 'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/1RIdI5c7RjjagAcMA5ixpv.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=92, version=0, score=0.9598112, payload={'artist': 'Tito Puente', 'genre': 'latin', 'name': 'Mambo Gozón', 'subgenres': ['latin---samba'], '

## Use filters

In [25]:
marc_anthony_valio_la_pena = music_data[301]

samba_songs = models.Filter(
    must=[models.FieldCondition(key="subgenres", match=models.MatchAny(any=['latin---samba']))]
)

results = client.recommend(
    collection_name=my_collection,
    query_filter=samba_songs,
    positive=[marc_anthony_valio_la_pena['index'], 178, 122, 459],
    negative=[385],
    limit=5
)
results

[ScoredPoint(id=540, version=0, score=0.8629072, payload={'artist': 'Tito Puente', 'genre': 'latin', 'name': 'Cual Es La Idea', 'subgenres': ['latin---samba'], 'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/4CNCGwxNp9rnVqo2fzmDYK.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=493, version=0, score=0.8370813, payload={'artist': 'Tito Nieves', 'genre': 'latin', 'name': 'De mi enamórate', 'subgenres': ['latin---samba'], 'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/3nnQUYKWBmHlfm5XpdWqNr.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=92, version=0, score=0.79911536, payload={'artist': 'Tito Puente', 'genre': 'latin', 'name': 'Mambo Gozón', 'subgenres': ['latin---samba'], 'urls': '/home/egor/datasets/ludwig_music_data/mp3/latin/0hk1gSyn3wKgdxqF6qaKUZ.mp3'}, vector=None, shard_key=None),
 ScoredPoint(id=856, version=0, score=0.7816363, payload={'artist': 'Tito Puente', 'genre': 'latin', 'name': 'Son de la Loma', 'subgenres': ['latin---samba'], 'urls': '/home