In [1]:
# Reload edited project modules automatically, so changes to local .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [48]:
from vggish import mel_features
from vggish import vggish_input
from vggish import vggish_slim

import numpy as np
import inspect

import jina
from jina import Document, DocumentArray

### Prepare VGGish input manually

In [3]:
import librosa

In [4]:
# Decode the sample mp3 into a waveform array plus its sampling rate.
# NOTE(review): librosa.load resamples to its own default rate unless `sr` is
# passed — confirm that is intended here.
x_audio, sample_rate = librosa.load('data/sample.mp3')



In [5]:
# Turn the raw waveform into the stack of examples used as VGGish input.
log_mel_examples = vggish_input.waveform_to_examples(x_audio, sample_rate)

In [6]:
# Shape of the example stack: (number of examples, 96, 64) for this file.
log_mel_examples.shape

(28, 96, 64)

### We can go directly from the path of an mp3 file to the VGGish input

In [7]:
# Build the examples straight from the file path, without loading the
# waveform by hand first.
log_mel_examples = vggish_input.mp3_to_examples('data/sample.mp3')



In [134]:
# Shape of the examples built from the mp3 path.
log_mel_examples.shape

(28, 96, 64)

In [137]:
# The wav helper also takes a path; here it is applied to a different recording.
# Note that this overwrites `log_mel_examples` from the mp3 cells.
log_mel_examples = vggish_input.wavfile_to_examples('data/Beethoven_1.wav')

In [140]:
# Shape of the example stack, and the length of the same data flattened to 1-D.
log_mel_examples.shape, log_mel_examples.flatten().shape

((19, 96, 64), (116736,))

## Segmenting the data

The segmenter in audio-example from jina examples does the following at segment time:

- uses `read_wav(path_to_wav)`, which reads a wav file and returns a numpy array (`data`) and an integer (`sample_rate`)

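- passes `data` and `sample_rate` to `wav2mel`, and appends one chunk `Document` to the input document for each element of the resulting `mel_data`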

The code uses `read_wav` to read the audio from a path.
This function calls `soundfile.read`, which returns a numpy array `data` together with the sample rate.



Example for `data/Beethoven_1.wav`:

- `data` straight after `soundfile.read` has shape (806912, 2) (two channels); after averaging the channels it has shape (806912,)

- `mel_data = wav2mel(data, sample_rate)` has shape (19, 96, 64)




In [174]:
import soundfile as sf

# Read the file as 16-bit PCM so the samples can be rescaled to [-1.0, 1.0) below.
wav_data, sample_rate = sf.read('data/Beethoven_1.wav', dtype='int16')
print(wav_data.shape)
# Down-mix multi-channel audio to mono. A mono file is already 1-D and has no
# axis 1, so only average when a channel axis is present.
if wav_data.ndim > 1:
    wav_data = np.mean(wav_data, axis=1)
# int16 samples span [-32768, 32767]: normalise by the int16 full-scale value,
# not by the sample rate (which is unrelated to the amplitude range).
data = wav_data / 32768.0
print(data.shape)

(806912, 2)
(806912,)


In [188]:
from vggish.vggish_input import waveform_to_examples

# Convert the mono waveform into the stack of examples once...
aux = waveform_to_examples(data, sample_rate)
print(aux.shape)

# ...and squeeze that same result instead of recomputing the whole conversion.
# For this file squeeze() is a no-op (see the identical shapes printed below);
# it would only change the shape if one of the axes had length 1.
aux = aux.squeeze()
print(aux.shape)

(19, 96, 64)
(19, 96, 64)


In [195]:
# Print the source of the segmenter's `segment` method.
# NOTE(review): `segmenter` is only created in a later cell
# (`segmenter = executors.VggishSegmenter()`), so this cell fails on a fresh
# top-to-bottom run unless that cell is executed first.
print(inspect.getsource(segmenter.segment))

    def segment(self, docs, *args, **kwargs):

        for doc in docs:
            data, sample_rate = self.read_wav(doc.uri)
            mel_data = self.wav2mel(data, sample_rate)
            for idx, blob in enumerate(mel_data):
                #self.logger.debug(f'blob: {blob.shape}')
                doc.chunks.append(Document(offset=idx, weight=1.0, blob=blob))



Note that each chunk is a Document containing each element from mel_data.

In particular, this creates as many chunks as `mel_data.shape[0]`.

In [202]:
# Shape of a single element of the example stack, i.e. what one chunk's blob holds.
aux[0].shape

(96, 64)

### Finding matches in a dataset 

We have seen that given an audio input we create a numpy array `(n, 96, 64)` where `n` depends on the input audio.

At index time each segment is indexed separately: the array of shape `(n, 96, 64)` is split into `n` segments of shape `(96, 64)`, and each segment is converted to an embedding:



```
(0, 96, 64)   ->  (96, 64)  -> embedding
(1, 96, 64)   ->  (96, 64)  -> embedding
...
(n-1, 96, 64) ->  (96, 64)  -> embedding
```


Then given a query with shape `(n_q, 96,64)` we will create an embedding for each chunk in the query


```
(0, 96, 64)     ->  (96, 64)  -> embedding
(1, 96, 64)     ->  (96, 64)  -> embedding
...
(n_q-1, 96, 64) ->  (96, 64)  -> embedding
```

Afterwards, for each chunk embedding, we find its closest match.


In [200]:
# Print the source of the segmenter's `read` method (the '/search' handler).
# NOTE(review): `segmenter` is only created in a later cell
# (`segmenter = executors.VggishSegmenter()`), so this cell fails on a fresh
# top-to-bottom run unless that cell is executed first.
print(inspect.getsource(segmenter.read))

    @requests(on='/search')
    def read(self, docs, *args, **kwargs):
        for doc in docs:
            data, sample_rate = self.read_wav(doc.uri)
            mel_data = self.wav2mel(data, sample_rate)
            doc.blob = mel_data[0]



In [116]:
import executors

In [None]:
# Build the example stacks for both Beethoven recordings and print their shapes.
# NOTE(review): this cell was never executed and is repeated verbatim in the
# VGGish encoding section further down — consider keeping only one copy.
data_1 = vggish_input.wavfile_to_examples('data/Beethoven_1.wav')
data_2 = vggish_input.wavfile_to_examples('data/Beethoven_2.wav')
print(data_1.shape)
print(data_2.shape)

In [120]:
# Instantiate the segmenter executor with its default arguments; the
# inspect/read_wav cells in this notebook rely on this object.
segmenter = executors.VggishSegmenter()

In [129]:
# Read the first recording through the segmenter's own reader and show the
# waveform's shape (1-D, see the output).
# Note that this overwrites the `data` / `sample_rate` variables set earlier.
data, sample_rate = segmenter.read_wav('data/Beethoven_1.wav')
data.shape

(806912,)

In [141]:
# Same read for the second recording; it is longer, so the waveform has more samples.
data, sample_rate = segmenter.read_wav('data/Beethoven_2.wav')
data.shape

(1409024,)

### Passing data through VGGish

In [28]:
from executors import VggishEncoder

Note that different wav files will be represented with different numpy arrays:

In [86]:
# Convert both recordings to example stacks; the number of examples differs
# per file because the recordings have different lengths.
wav_paths = ('data/Beethoven_1.wav', 'data/Beethoven_2.wav')
data_1, data_2 = (vggish_input.wavfile_to_examples(wav_path) for wav_path in wav_paths)
for examples in (data_1, data_2):
    print(examples.shape)

(19, 96, 64)
(33, 96, 64)


Now let's define a VggishEncoder and look at how to encode the data

In [10]:
encoder = VggishEncoder()



INFO:tensorflow:Restoring parameters from /Users/davidbuchaca/Documents/git_stuff/neuralsearch/jina_2/audio-search/models/vggish_model.ckpt


In [42]:
# Locate the file that defines the encoder's `_encode` method.
inspect.getsourcefile(encoder._encode)

'/Users/davidbuchaca/Documents/git_stuff/neuralsearch/jina_2/audio-search/executors.py'

We can take a look at how data is encoded

In [40]:
# Print the implementation of `_encode` to see what it expects from its input.
print(inspect.getsource(encoder._encode))

    def _encode(self, docs: DocumentArray, *args, **kwargs):
        blobs = docs.get_attributes('blob')
        [embedding_batch] = self.sess.run([self.embedding_tensor],
                                           feed_dict={self.feature_tensor: blobs})
        result = self.post_processor.postprocess(embedding_batch)
        embedding_matrix = (np.float32(result) - 128.) / 128.
        
        for d,e in zip(docs, embedding_matrix):
            d.embedding = e



In [51]:
# Display the bound method itself (not a call) to confirm it exists on the instance.
encoder._encode

<bound method VggishEncoder._encode of <executors.VggishEncoder object at 0x172aed850>>

In [103]:
# Wrap each recording's full example stack in its own Document.
# NOTE: each blob here is a whole (n, 96, 64) stack, whereas the segmenter
# shown earlier stores a single (96, 64) example per chunk Document.
d1, d2 = Document(), Document()
d1.blob, d2.blob = data_1, data_2

darray = DocumentArray([d1, d2])

In [104]:
# The two blobs differ in their first dimension (number of examples per file).
d1.blob.shape, d2.blob.shape

((19, 96, 64), (33, 96, 64))

In [105]:
# Collect the blob of every Document — the same call `_encode` makes on its input.
# Note that this reuses the name `aux`, replacing the example stack stored earlier.
aux = darray.get_attributes('blob')

In [106]:
# This call is expected to fail: `_encode` feeds all blobs to the model as one
# batch, and the two blobs here have different shapes ((19, 96, 64) vs
# (33, 96, 64)), so presumably they cannot be packed into a single array.
# Catch and display the error so it stays visible without aborting a
# "Run All" of the notebook.
try:
    encoder._encode(darray)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: setting an array element with a sequence.

In [85]:
#Document(data_1)

In [12]:
# NOTE(review): this call raises AttributeError (see the output below). The
# method shown earlier on this executor is `_encode`, and it takes a
# DocumentArray as input; no input is passed here.
encoder.encode()

AttributeError: 'VggishEncoder' object has no attribute 'encode'