In [3]:
import glob, os
import numpy as np
import pandas as pd
import soundfile as sf
from functools import partial
from pathlib import Path



import yamnet_model.params as yamnet_params
import yamnet_model.yamnet as yamnet_model

from src.features import FeatureExtractor


from omegaconf import OmegaConf

from src.data import load_metadata, find_paths
# from src.features.YAMNetExtractor import YAMNetExtractor
import src.utils

from omegaconf import OmegaConf
conf = OmegaConf.load("./config.yaml")

# Prepare topic task

We need to create audio fragments and yamnet embeddings for the topic task


## Step 1: Create audio files

In [4]:
# TODO

## Step 2: Create yamnet embeddings

In [5]:

class YAMNetExtractor(FeatureExtractor):
    """Class for feature extraction with YAMNet.

    example:
    ex = YAMNetExtractor()
    ex.embedding(input_paths, output_paths, embed_paths)
    """

    def __init__(self, logfile="./log_yamnet", yamnet_path="./yamnet_class_map.csv"):
        """Init method for YAMNetExtractor."""
        super().__init__(logfile=logfile)
        self.model_checkpoint = os.path.join("./yamnet_model/yamnet.h5")
        
        #oud
        #self.class_names = os.path.join(
        #    os.getenv("YAMNET_PATH"), "yamnet_class_map.csv"
        #)
        
        # new
        print("init with yamnet path: ", yamnet_path)
        self.class_names = yamnet_path
        self.sample_rate = 44100

    def embedding(self, input_paths, output_paths):
        """Extract YAMnet features with opensmile using a single process."""
        save_embedding = True

        paths = list(zip(input_paths, output_paths))

        params = yamnet_params.Params(
            sample_rate=self.sample_rate, patch_hop_seconds=0.48
        )

        class_names = yamnet_model.class_names(self.class_names)
        yamnet = yamnet_model.yamnet_frames_model(params)
        yamnet.load_weights(self.model_checkpoint)

        func = partial(
            self._embed,
            yamnet=yamnet,
            params=params,
            class_names=class_names,
            save_embedding=save_embedding,
        )

        self.single_process(func, paths)

    @staticmethod
    def _embed(paths, yamnet, params, class_names, save_embedding=False):
        """Individual YAMnet extraction process."""

        input_path, embed_path = paths
        #print("[del] paths, should exist: ", input_path)
        input_path_exists, output_path_exists = FeatureExtractor.feature_path_checker(
            input_path, embed_path
        )
        
        print("[del] will it be true: ", input_path_exists and not output_path_exists)
        print('Should exist: ', input_path, input_path_exists)
        print('Should not exist: ', embed_path, output_path_exists)
        if input_path_exists and not output_path_exists:
            print("[del] if statement TRUE! reading ", input_path)
            wav_data, sr = sf.read(input_path, dtype=np.int16)
            print("[del] wav_data: ", wav_data)
            waveform = np.mean(wav_data, axis=1) / 32768.0

            approx_size = int(
                len(waveform) / params.sample_rate / params.patch_hop_seconds
            )  # approximate (overestimated) size of output
            embedding = np.zeros((approx_size, 1024))
            score = np.zeros((approx_size, 521))

            waveform_size = len(waveform)
            i = 0
            n_seconds = 300
            di = int(n_seconds * params.sample_rate)  # 5min segments

            real_size = 0
            while i <= waveform_size:
                scores, embeddings, spectrogram = yamnet(
                    waveform[i : i + di + int(0.47 * params.sample_rate)]
                ) # the 0.47s has to do with yamnet windowing
                scores = scores.numpy()
                print("[del] scores: ", scores)
                embeddings = embeddings.numpy()

                embedding[real_size : real_size + len(scores)] = embeddings
                score[real_size : real_size + len(scores)] = scores

                real_size += len(scores)
                i += di

            if save_embedding:
                print("[del] Will also save the embedding, to: ", embed_path)
                _, _ = FeatureExtractor.feature_path_checker(
                    input_path, embed_path
                )  # also create embed path if necessary

                df = pd.DataFrame(embedding)
                df["time (s)"] = np.arange(len(embedding)) * 0.48
                df.set_index("time (s)", inplace=True)
                df.astype(np.float16).to_hdf(
                    embed_path, "embedding", mode="w", complevel=6
                )
                del df

#             df = pd.DataFrame(score, columns=class_names)
#             df["time (s)"] = np.arange(len(score)) * 0.48
#             df.set_index("time (s)", inplace=True)
#             df.astype(np.float16).to_hdf(output_path, "score", mode="w", complevel=6)

            del df
            del embedding
            del score
            del spectrogram



In [17]:


def find_files(directory, ext='.ogg'):
    file_list = []
    for path in Path(directory).rglob('*{}'.format(ext)):
        print(path)
        file_list.append(str(path))
        
    return file_list


def get_output_paths(file_list, output_folder, ext=".h5"):
    
    
    outputs = []
    for f in file_list:
        spl = os.path.normpath(f).split(os.path.sep)
        new_lastname = spl[-1].split('.')[0] + ext

        filepath = os.path.join(output_folder, spl[-3], spl[-2], new_lastname)
        outputs.append(filepath)
    return outputs
        

In [35]:
# Load the configuration
# conf = OmegaConf.load("./config.yaml")
query_audio_dir = conf.query_audio_dir
# query_audio_dir = 'test'
print(query_audio_dir)

output_folder = conf.query_embed_dir
print(output_folder)

# Find paths that need to be converted
# input_paths = find_paths(subset, input_path, ".ogg")

input_paths = find_files(query_audio_dir)
print(input_paths[0])
embed_paths = get_output_paths(input_paths, output_folder, ext=".h5")
print(embed_paths[0])

../data/sp/podcasts-no-audio-13GB/topic_task/query_audio
../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/8.ogg
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/2.ogg
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/3.ogg
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/1.ogg
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/4.ogg
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/5.ogg
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/7.ogg
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/6.ogg
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/8.ogg
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/2.ogg
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/3.ogg
../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/

In [36]:
THE_YAMNET_PATH = './yamnet_model/yamnet_class_map.csv'



# Run the YAMnet feature extraction
ex = YAMNetExtractor(logfile=os.path.join(conf.features_output_path, "log_yamnet"), yamnet_path = THE_YAMNET_PATH)
ex.embedding(input_paths, embed_paths)

init with yamnet path:  ./yamnet_model/yamnet_class_map.csv


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(


[del] will it be true:  False
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/8.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/slow/8.h5 True
[del] will it be true:  False
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/2.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/slow/2.h5 True
[del] will it be true:  True
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/3.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/slow/3.h5 False
[del] if statement TRUE! reading  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/3.ogg
[del] wav_data:  [[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]
[del] scores:  [[9.9473369e-01 9.2977143e-06 3.0056834e-03 ... 3.7579682e-09
  1.2792340e-06 1.8366392e-14]
 [9.7458792e-01 1.7461884e-05 4.0504336e-04 

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
 31%|███▏      | 5/16 [00:00<00:00, 11.69it/s]

[del] will it be true:  True
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/4.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/slow/4.h5 False
[del] if statement TRUE! reading  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/4.ogg
[del] wav_data:  [[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]
[del] scores:  [[9.98899102e-01 3.77660435e-05 2.65842676e-03 ... 9.57007956e-07
  6.74830007e-05 2.10534376e-14]
 [9.98663068e-01 1.03043192e-07 1.91807747e-04 ... 1.67743930e-09
  7.49076717e-05 7.20668039e-18]
 [9.89162028e-01 3.95685434e-04 1.33239329e-02 ... 1.06202824e-04
  6.34640455e-04 1.20161898e-08]
 [9.96349037e-01 3.60966660e-05 8.49747658e-03 ... 1.80073357e-07
  7.28791383e-06 3.10892771e-14]
 [9.98209417e-01 1.16149897e-06 1.44112110e-03 ... 1.61847552e-07
  1.67439630e-05 2.88205230e-14]
 [9.97505307e-01 2.93082885e-05 1.05380416e-02 ... 3.91282665e-05
  2.83509493e-04 1.1052210

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
 44%|████▍     | 7/16 [00:00<00:00, 10.26it/s]

[del] scores:  [[9.9801326e-01 2.6382857e-05 1.1540532e-02 ... 2.8749225e-09
  4.3114073e-06 7.1853199e-19]
 [9.9990159e-01 3.2593327e-11 3.2416825e-05 ... 2.7716506e-16
  3.2807668e-09 4.3076127e-31]
 [9.9966788e-01 2.3373878e-10 3.5276480e-06 ... 2.6522502e-19
  7.0183519e-11 7.5312308e-37]
 [9.9139631e-01 1.5272575e-05 1.9861460e-03 ... 2.2479719e-05
  4.7317147e-04 2.8412103e-10]]
[del] Will also save the embedding, to:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/slow/5.h5
[del] will it be true:  True
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/7.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/slow/7.h5 False
[del] if statement TRUE! reading  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/slow/7.ogg
[del] wav_data:  [[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]
[del] scores:  [[8.57699633e-01 9.91642475e-04 7.81118870e-03 ... 6.49094582e-04
  1.08525157e-0

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
 56%|█████▋    | 9/16 [00:00<00:00,  8.91it/s]

[del] scores:  [[8.8363254e-01 9.7396970e-04 1.1807680e-03 ... 4.8078746e-06
  3.5592914e-04 4.5808610e-11]
 [9.9623990e-01 2.6081654e-05 7.5882673e-04 ... 1.1123211e-08
  1.3887033e-05 3.2435522e-17]
 [9.9894249e-01 1.8069912e-06 1.2069643e-03 ... 1.0478054e-07
  5.2331699e-05 9.4013002e-16]
 [9.9947488e-01 4.3076061e-06 5.2052140e-03 ... 1.6302277e-08
  3.9387323e-06 6.0978957e-18]
 [9.9899781e-01 1.8252637e-06 4.4225454e-03 ... 7.4766069e-07
  6.0734674e-05 1.6118993e-15]]
[del] Will also save the embedding, to:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/slow/6.h5
[del] will it be true:  True
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/8.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/fast/8.h5 False
[del] if statement TRUE! reading  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/8.ogg
[del] wav_data:  [[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]
[de

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
 62%|██████▎   | 10/16 [00:01<00:00,  7.53it/s]

[del] scores:  [[9.9952853e-01 3.2354035e-07 1.0428727e-03 ... 9.5575554e-11
  1.5478200e-06 2.5475272e-23]
 [9.9964201e-01 4.4706441e-10 2.3983728e-05 ... 7.1610865e-16
  3.6975829e-09 1.0464844e-31]
 [9.9993455e-01 6.7102150e-08 1.6756356e-03 ... 6.0860969e-12
  4.9556627e-07 1.7194005e-25]
 [9.9985445e-01 8.5006702e-10 2.0983815e-04 ... 2.8643706e-16
  1.3418701e-10 7.4317339e-32]
 [9.9989557e-01 3.7985908e-09 8.6978078e-04 ... 1.2777541e-11
  8.7906881e-07 1.1423916e-25]]
[del] Will also save the embedding, to:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/fast/2.h5
[del] will it be true:  True
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/3.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/fast/3.h5 False
[del] if statement TRUE! reading  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/3.ogg
[del] wav_data:  [[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]
[de

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
 75%|███████▌  | 12/16 [00:01<00:00,  7.56it/s]

[del] will it be true:  True
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/1.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/fast/1.h5 False
[del] if statement TRUE! reading  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/1.ogg
[del] wav_data:  [[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]
[del] scores:  [[9.9871886e-01 2.7911262e-06 1.2889504e-03 ... 5.1797140e-09
  4.5286570e-06 2.3925018e-18]
 [9.9893486e-01 1.5224514e-05 7.9980195e-03 ... 2.4761434e-07
  2.7993321e-04 5.4072042e-16]
 [9.9928546e-01 9.8560904e-06 5.3869188e-03 ... 4.4705999e-07
  5.2108593e-05 2.0115748e-14]]
[del] Will also save the embedding, to:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/fast/1.h5
[del] will it be true:  True
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/4.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/qu

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
 88%|████████▊ | 14/16 [00:01<00:00,  7.80it/s]

[del] will it be true:  True
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/5.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/fast/5.h5 False
[del] if statement TRUE! reading  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/5.ogg
[del] wav_data:  [[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]
[del] scores:  [[9.9975640e-01 1.0451790e-10 1.8489361e-04 ... 1.3849552e-16
  1.4803180e-09 3.0010038e-35]
 [9.9993134e-01 4.9208992e-10 2.0450354e-04 ... 7.6150571e-17
  8.1239952e-11 7.3004811e-33]
 [9.9585891e-01 5.8103369e-06 5.8653951e-04 ... 3.0132394e-06
  5.9109926e-04 2.1330821e-12]]
[del] Will also save the embedding, to:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/fast/5.h5
[del] will it be true:  True
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/7.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/qu

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->axis0] [items->None]

  df.astype(np.float16).to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_items] [items->None]

  df.astype(np.float16).to_hdf(
100%|██████████| 16/16 [00:01<00:00,  9.08it/s]

[del] will it be true:  True
Should exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/6.ogg True
Should not exist:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/fast/6.h5 False
[del] if statement TRUE! reading  ../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/query/fast/6.ogg
[del] wav_data:  [[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]
[del] scores:  [[9.7387969e-01 9.2241168e-04 4.8935413e-03 ... 4.7194958e-04
  4.3959916e-03 8.2232168e-09]
 [9.9955332e-01 3.0627101e-07 3.8433075e-04 ... 2.3991627e-12
  2.2510739e-07 2.7455943e-24]
 [9.9921536e-01 1.5843457e-06 2.1637976e-03 ... 4.7739906e-08
  7.4129362e-05 1.3662801e-15]
 [9.9594426e-01 1.0232812e-05 2.7319640e-02 ... 1.5385886e-07
  1.0838929e-05 2.6275618e-16]]
[del] Will also save the embedding, to:  ../data/sp/podcasts-no-audio-13GB/topic_task/query_embedding/query/fast/6.h5





In [30]:

test_input_path = 'test/query/fast/8.ogg'

test2 = os.path.abspath(test_input_path)
print(test2)

# test2 = '../8.ogg'

# test2 = '../data/sp/podcasts-no-audio-13GB/topic_task/query_audio/description/slow/8.ogg'
wav_data, sr = sf.read(test2, dtype=np.int16)



# /Users/casper/Documents/UvA\ master/b23456_thesis/msc_thesis/code/data/sp/podcasts-no-audio-13GB/topic_task/query_audio/description/slow/8.ogg




/Users/casper/Documents/UvAmaster/b23456_thesis/msc_thesis/code/scripts/test/query/fast/8.ogg
