# Wav2vec2 embeddings distance

We test how well a siamese network fed with wav2vec2 embeddings can classify whether two audios contain the same spoken word.

In [None]:
#| eval: false
from IPython.display import display, Audio
from glob import glob
import json
from datasets import load_dataset, Features, Value, Audio, ClassLabel
import random


We already trained and ran a model. The results over the test set are in the `emb_sim` folder.  
We precomputed the wav2vec2 embeddings for all the audios. The siamese network was fed pairs of embeddings and predicted whether the audios had the same word spoken or not.  
In the `emb_sim` folder we saved the results for the testing dataset. For each audio pair in the test dataset, if the model judges the pair had the same word spoken, we saved both audios under a subfolder.  
The subfolder is named after the transcribed word of the first audio in the pair.
It is important to note that in this test, we compared pairs of audios the model didn't train on. We expect better results if we compare against audios seen during training.

In [None]:
#| eval: false
from pathlib import Path


# Gather the wav files saved under the `correct_same` and `incorrect_same`
# result folders (one sub-folder level below each) and report how many fall
# into each group, both as a count and as a share of the total.
correct_test_files_same = glob('data/panda/results/emb_sim/correct_same/*/*.wav')
incorrect_test_files_same = glob('data/panda/results/emb_sim/incorrect_same/*/*.wav')
test_files_same = len(correct_test_files_same) + len(incorrect_test_files_same)
print(f"Saved same test audios {test_files_same}")
if test_files_same == 0:
    # No file matched either pattern (e.g. the results folder is missing or the
    # notebook runs from another working directory): say so instead of
    # crashing with ZeroDivisionError in the percentage computation.
    print("No audios found under data/panda/results/emb_sim - percentages cannot be computed")
else:
    print(f"Correctly classified audios: {len(correct_test_files_same)} - {round(len(correct_test_files_same)/test_files_same*100, 2)}%")
    print(f"Incorrectly classified audios: {len(incorrect_test_files_same)} - {round(len(incorrect_test_files_same)/test_files_same*100, 2)}%")

Saved same test audios 2119
Correctly classified audios: 1395 - 65.83%
Incorrectly classified audios: 724 - 34.17%


In [None]:
#| eval: false
from pathlib import Path


# Gather the wav files saved under the `correct_dif` and `incorrect_dif`
# result folders (one sub-folder level below each) and report how many fall
# into each group, both as a count and as a share of the total.
correct_test_files_dif = glob('data/panda/results/emb_sim/correct_dif/*/*.wav')
incorrect_test_files_dif = glob('data/panda/results/emb_sim/incorrect_dif/*/*.wav')
test_files_dif = len(correct_test_files_dif) + len(incorrect_test_files_dif)
print(f"Saved diff test audios {test_files_dif}")
if test_files_dif == 0:
    # No file matched either pattern (e.g. the results folder is missing or the
    # notebook runs from another working directory): say so instead of
    # crashing with ZeroDivisionError in the percentage computation.
    print("No audios found under data/panda/results/emb_sim - percentages cannot be computed")
else:
    print(f"Correctly classified audios: {len(correct_test_files_dif)} - {round(len(correct_test_files_dif)/test_files_dif*100, 2)}%")
    print(f"Incorrectly classified audios: {len(incorrect_test_files_dif)} - {round(len(incorrect_test_files_dif)/test_files_dif*100, 2)}%")

Saved diff test audios 5468
Correctly classified audios: 4281 - 78.29%
Incorrectly classified audios: 1187 - 21.71%


## Sample of correct examples found
### Same pairs

In [None]:
#| eval: false
import random
import numpy
from IPython.display import display, Audio
from scipy.io.wavfile import read

# Play up to 10 randomly chosen clips from `correct_test_files_same`.
# random.sample draws without replacement, so no clip is shown twice, and
# capping the sample size at the list length avoids an error when fewer than
# 10 files (or none at all) were found.
sample_size = min(10, len(correct_test_files_same))
for example in random.sample(correct_test_files_same, sample_size):
    # `example` is a wav file path returned by glob; Audio accepts a filename
    # string through its `data` argument.
    display(Audio(data=example))
    

### Different pairs

In [None]:
#| eval: false
import random
import numpy
from IPython.display import display, Audio
from scipy.io.wavfile import read

# Play up to 10 randomly chosen clips from `correct_test_files_dif`.
# random.sample draws without replacement, so no clip is shown twice, and
# capping the sample size at the list length avoids an error when fewer than
# 10 files (or none at all) were found.
sample_size = min(10, len(correct_test_files_dif))
for example in random.sample(correct_test_files_dif, sample_size):
    # `example` is a wav file path returned by glob; Audio accepts a filename
    # string through its `data` argument.
    display(Audio(data=example))
    

## Sample of incorrect examples found
### Same pairs

In [None]:
#| eval: false
import random
import numpy
from IPython.display import display, Audio
from scipy.io.wavfile import read

# Play up to 10 randomly chosen clips from `incorrect_test_files_same`.
# random.sample draws without replacement, so no clip is shown twice, and
# capping the sample size at the list length avoids an error when fewer than
# 10 files (or none at all) were found.
sample_size = min(10, len(incorrect_test_files_same))
for example in random.sample(incorrect_test_files_same, sample_size):
    # `example` is a wav file path returned by glob; Audio accepts a filename
    # string through its `data` argument.
    display(Audio(data=example))
    

### Different pairs

In [None]:
#| eval: false
import random
import numpy
from IPython.display import display, Audio
from scipy.io.wavfile import read

# Play up to 10 randomly chosen clips from `incorrect_test_files_dif`.
# random.sample draws without replacement, so no clip is shown twice, and
# capping the sample size at the list length avoids an error when fewer than
# 10 files (or none at all) were found.
sample_size = min(10, len(incorrect_test_files_dif))
for example in random.sample(incorrect_test_files_dif, sample_size):
    # `example` is a wav file path returned by glob; Audio accepts a filename
    # string through its `data` argument.
    display(Audio(data=example))
    