# Issues:
- AudioDescriptors class takes ~20 hours to compute
- AudioDescriptorsExtended takes a bit less time, but is still too slow

In [None]:
!pip install essentia-tensorflow
!pip install numpy pandas scikit-learn

In [None]:
# Colab-only setup: mount Google Drive and authenticate the user.
# Outside Colab the google.colab package cannot be imported, so the step is
# skipped instead of aborting a "Restart & Run All" on a local machine.
try:
    from google.colab import auth, drive
except ImportError:
    print('google.colab is not available: skipping Drive mount and authentication')
else:
    drive.mount('/content/gdrive')
    auth.authenticate_user()

In [None]:
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import json

import essentia.standard as es
import pandas as pd

import IPython.display as ipd
from tqdm import tqdm

from itertools import chain

In [4]:
# Locations used by the notebook: the current working directory, the `models`
# folder inside it, and the directory holding the MusAV audio chunks.
WORKING_DIR = Path.cwd()
MODELS_HOME = WORKING_DIR.joinpath('models')
DATASET_PATH = Path(
    '/Users/francescopapaleo/GDrive/essentia-playlist/MusAV/audio_chunks/'
)

# Sanity check: change into DATASET_PATH and list its contents.
!cd "$DATASET_PATH" && ls

[34maudio.000[m[m [34maudio.001[m[m [34maudio.002[m[m [34maudio.003[m[m [34maudio.004[m[m [34maudio.005[m[m [34maudio.006[m[m


In [None]:
! mkdir models
! curl -L -o models/voice_instrumental-musicnn-mtt-2.pb "https://essentia.upf.edu/models/classifiers/voice_instrumental/voice_instrumental-musicnn-mtt-2.pb"
! curl -L -o models/discogs-effnet-bs64-1.pb "https://essentia.upf.edu/models/music-style-classification/discogs-effnet/discogs-effnet-bs64-1.pb"
! curl -L -o models/msd-musicnn-1.pb "https://essentia.upf.edu/models/autotagging/msd/msd-musicnn-1.pb"
! curl -L -o models/emomusic-musicnn-msd-2.pb "https://essentia.upf.edu/models/classification-heads/emomusic/emomusic-musicnn-msd-2.pb"
! curl -L -o models/labels.py "https://raw.githubusercontent.com/MTG/essentia-replicate-demos/main/effnet-discogs/labels.py"

!ls models

In [21]:
def file_walker(dir_to_analyse, list_type, extensions=None):
    """List every file found under ``dir_to_analyse``, recursing into sub-folders.

    Parameters
    ----------
    dir_to_analyse : str or os.PathLike
        Root directory to walk.
    list_type : str
        How each entry is reported:

        * ``'abs'``   -- the walked directory joined with the file name (an
          absolute path whenever ``dir_to_analyse`` itself is absolute);
        * ``'rel'``   -- the path relative to ``dir_to_analyse`` (files that sit
          directly in the root are reported as ``./name``);
        * ``'names'`` -- the bare file name.
    extensions : str or iterable of str, optional
        When given, only files whose name ends with one of these suffixes
        (compared case-insensitively, e.g. ``('.mp3', '.wav')``) are kept.
        The default ``None`` keeps every file, including hidden ones.

    Returns
    -------
    list of str or None
        The entries in a deterministic order (directories and file names are
        visited sorted), or ``None`` after printing an error message when
        ``list_type`` is not one of the accepted values.
    """
    if list_type not in ('abs', 'rel', 'names'):
        print('Error: list_type must be either "abs" or "rel" or "names"')
        return None

    if extensions is None:
        suffixes = None
    else:
        if isinstance(extensions, str):
            # A bare string would otherwise be iterated character by character.
            extensions = (extensions,)
        suffixes = tuple(ext.lower() for ext in extensions)

    entries = []
    for dirpath, dirnames, filenames in os.walk(dir_to_analyse):
        # Sorting in place makes os.walk descend in a reproducible order.
        dirnames.sort()

        # The prefix is the same for every file of this directory.
        if list_type == 'abs':
            prefix = dirpath
        elif list_type == 'rel':
            prefix = os.path.relpath(dirpath, dir_to_analyse)
        else:
            prefix = None

        for name in sorted(filenames):
            if suffixes is not None and not name.lower().endswith(suffixes):
                continue
            entries.append(name if prefix is None else os.path.join(prefix, name))
    return entries
    

In [None]:
# Reuse the cached file list when it exists; otherwise walk the dataset and cache the result.

files_list_json = 'data/all_files_list.json'

if not os.path.exists(files_list_json):
    print('Listing all files in the', DATASET_PATH)
    # Walk first, write afterwards: if the walk fails, no empty cache file is
    # left behind for the next run to (unsuccessfully) json.load.
    all_files_list = file_walker(DATASET_PATH, 'abs')
    # open(..., 'w') does not create missing parent directories.
    os.makedirs(os.path.dirname(files_list_json), exist_ok=True)
    with open(files_list_json, 'w') as f:
        json.dump(all_files_list, f)
    print("File list saved to all_files_list.json")
    print("The list contains", len(all_files_list), "files")
else:
    print("File list exists, you can run one of the descriptors clases")
    with open(files_list_json, 'r') as f:
        all_files_list = json.load(f)
    print("The list contains", len(all_files_list), "files")
if len(all_files_list) == 0:
    print("No audio files found in the specified directory and its subdirectories, please check the path and try again")

In [24]:
### CLASS to compute audio descriptors WITH STYLE ACTIVATIONS for 400 STYLES ### 

# voice_instrumental-msd-musicnn-1.pb

class AudioDescriptorsExtended:
    """Compute per-file audio descriptors with Essentia.

    For every file the class reports tempo (BPM), danceability, the mean
    style activations of the Discogs-EffNet model, a voice/instrumental
    score and a valence/arousal pair.

    Parameters
    ----------
    models_dir : str or os.PathLike, optional
        Directory containing the four ``.pb`` graphs loaded in ``__init__``.
        Defaults to ``DEFAULT_MODELS_DIR``, the location the graphs were
        originally loaded from.
    """

    # Used when no ``models_dir`` is passed to the constructor.
    DEFAULT_MODELS_DIR = '/Users/francescopapaleo/Dropbox/Mac/Documents/git-box/streamlit/models'

    def __init__(self, models_dir=None):
        if models_dir is None:
            models_dir = self.DEFAULT_MODELS_DIR

        def graph(filename):
            # os.path.join returns a plain str even when models_dir is a Path.
            return os.path.join(models_dir, filename)

        self.model_effnet = es.TensorflowPredictEffnetDiscogs(
            graphFilename=graph('discogs-effnet-bs64-1.pb'))
        self.model_vi = es.TensorflowPredictMusiCNN(
            graphFilename=graph('voice_instrumental-musicnn-mtt-2.pb'),
            output='model/dense/BiasAdd')
        self.model_av_emb = es.TensorflowPredictMusiCNN(
            graphFilename=graph('msd-musicnn-1.pb'),
            output='model/dense/BiasAdd')
        self.model_av = es.TensorflowPredict2D(
            graphFilename=graph('emomusic-musicnn-msd-2.pb'),
            output='model/Identity')

    def tempo_dance(self, path_to_file):
        """Load ``path_to_file`` at 44.1 kHz and return ``(bpm, danceability)``.

        Only the first output of ``RhythmExtractor2013`` and of
        ``Danceability`` is used; the remaining outputs are discarded.
        """
        audio = es.MonoLoader(filename=path_to_file, sampleRate=44100)()
        bpm, *_ = es.RhythmExtractor2013()(audio)
        danceability, *_ = es.Danceability()(audio)
        return bpm, danceability

    def audio_16(self, path_to_file):
        """Load ``path_to_file`` as mono audio at 16 kHz, the input fed to the three model methods."""
        return es.MonoLoader(filename=path_to_file, sampleRate=16000)()

    def style_ml(self, audio_load_16):
        """Return the style activations averaged over axis 0, as a list of Python floats.

        ``.tolist()`` (rather than ``list(...)``) converts the NumPy scalars to
        built-in floats, matching ``vi_ml`` and keeping ``str()`` of the result
        a plain ``[0.1, 0.2, ...]`` on every NumPy version.
        """
        activations = self.model_effnet(audio_load_16)
        activations_mean = np.mean(activations, axis=0)
        return activations_mean.astype(float).tolist()

    def vi_ml(self, audio_load_16):
        """Return the voice/instrumental activations averaged over axis 0 and mapped through ``(x + 1) / 2``.

        NOTE(review): the mapping yields values in [0, 1] only if the model
        output lies in [-1, 1]; confirm that for the 'model/dense/BiasAdd' node.
        """
        activations = self.model_vi(audio_load_16)
        v_i_mean = np.mean(activations, axis=0)
        vi_scaled = (v_i_mean + 1) / 2
        return vi_scaled.tolist()

    def av_ml(self, audio_load_16):
        """Return ``(valence, arousal)``: embeddings from ``model_av_emb`` are fed to ``model_av`` and averaged over axis 0.

        Index 0 of the averaged output is reported as valence and index 1 as
        arousal -- presumably the model's output order; verify against the
        model's metadata.
        """
        embeddings = self.model_av_emb(audio_load_16)
        activations = self.model_av(embeddings)
        activations_mean = np.mean(activations, axis=0)
        return activations_mean[0], activations_mean[1]

    def compute_descriptors(self, files_list):
        """Compute the descriptors of every path in ``files_list``.

        Returns a list with one dict per file. ``file_path`` is the path
        relative to the current working directory; every other value is
        stored as its ``str()`` representation.
        """
        all_descriptors = []
        for file_path in files_list:
            rel_path = os.path.relpath(file_path)
            # Each file is decoded twice: here at 16 kHz for the models and
            # again at 44.1 kHz inside tempo_dance.
            audio_16 = self.audio_16(file_path)
            activations_list = self.style_ml(audio_16)
            vi_scaled = self.vi_ml(audio_16)
            valence, arousal = self.av_ml(audio_16)
            bpm, danceability = self.tempo_dance(file_path)

            all_descriptors.append({
                'file_path': rel_path,
                'bpm': str(bpm),
                'danceability': str(danceability),
                'style_activations': str(activations_list),
                'vi_scaled': str(vi_scaled),
                'valence': str(valence),
                'arousal': str(arousal)
            })

        return all_descriptors

In [None]:
# Compute the descriptors of every listed file and store them as ONE JSON array.
extended_descriptors = AudioDescriptorsExtended()

# Calling json.dump once per file on the same handle writes back-to-back arrays
# ("[...][...]"), which json.load cannot read. Collect the results instead and
# dump them once.
all_features = []
try:
    for file_path in tqdm(all_files_list):
        all_features.extend(extended_descriptors.compute_descriptors([file_path]))
finally:
    # Written in `finally` so that an interrupted or failed run still keeps the
    # descriptors computed so far.
    with open("extended_descriptors_output.json", "w") as f:
        json.dump(all_features, f, indent=1)