In [132]:
import torch
from transformers import Wav2Vec2Model, Wav2Vec2Tokenizer
from pydub import AudioSegment
import librosa
import numpy as np

import json
import io
from tqdm import tqdm

In [3]:
# Load the wav2vec model and tokenizer
model = Wav2Vec2Model.from_pretrained('facebook/wav2vec2-base')
tokenizer = Wav2Vec2Tokenizer.from_pretrained('facebook/wav2vec2-base')

Downloading: 100%|███████████████████████████████████████████████████████████████████████| 1.84k/1.84k [00:00<00:00, 598kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████| 380M/380M [01:03<00:00, 5.96MB/s]
Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2Model: ['project_q.weight', 'project_q.bias', 'project_hid.weight', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_hid.bias', 'quantizer.codevectors']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloadi

In [4]:
dataset_path = "../data/pokemon_data.json"

In [5]:
def load_json(path):
    # Opening JSON file
    with open(path, 'r') as openfile:
        # Reading from json file
        json_object = json.load(openfile)
        return json_object
    
def save_json(object_, path):
    # Serializing json
    json_object = json.dumps(object_, indent=4)
    
    # Writing to sample.json
    with open(path, "w") as outfile:
        outfile.write(json_object)

In [6]:
all_pokemon_data = load_json(dataset_path)

In [131]:
# generate embeddings for all pokedex entries
# compare pokedex entry embeddings
all_embeddings = {}
max_len = 32000
model.eval()
for pid in tqdm(all_pokemon_data):
    try:
        sound_data, s = librosa.load("../data/client_data/cries/%s.mp3"%pid, sr=16000)
    except:
        continue
        
    mfcc = librosa.feature.mfcc(y=sound_data, sr=s)
    all_embeddings[pid] = mfcc
    continue
    
    try:
        # Load the MP3 file
        mp3_file = AudioSegment.from_mp3("../data/client_data/cries/%s.mp3"%pid)
    except:
        continue

    # Convert the MP3 file to WAV format and store the data in a BytesIO object
    wav_data = io.BytesIO()
    mp3_file.export(wav_data, format="wav")

    # Seek to the beginning of the BytesIO object
    wav_data.seek(0)
    sound_data, s = librosa.load(wav_data, sr=16000)
    # sound_data = sound_data[:max_len]
    
    mfcc = librosa.feature.mfcc(y=sound_data, sr=s)
    all_embeddings[pid] = mfcc
    continue
    
    # Encode the sound file data using the tokenizer
    input_ids = tokenizer(
        sound_data, 
        return_tensors = "pt", 
        max_length=max_len, 
        padding="max_length",
        truncation=True
    ).input_values

    with torch.no_grad():
        # Extract the features from the sound file using the wav2vec model
        features = model(input_ids)["extract_features"].mean(dim=2)
                              
    all_embeddings[pid] = features

  return f(*args, **kwargs)
100%|██████████████████████████████████████████████████████████████████████████████████████| 899/899 [00:29<00:00, 30.67it/s]


In [155]:
# compare pokedex entry embeddings
all_similarities = {}
for pid1 in tqdm(all_embeddings):
    sound1_embedding = all_embeddings[pid1]
    
    similarities = []
    for pid2 in all_embeddings:
        if pid1 == pid2:
            continue
        sound2_embedding = all_embeddings[pid2]
        
        # Calculate the cosine similarity between the embeddings
        cosine_similarity = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
        with torch.no_grad():
            # Use DTW to align the MFCCs
            matrix, wp = librosa.sequence.dtw(sound1_embedding, sound2_embedding, backtrack=True)
            cost = matrix[-1][-1]
            # print(len(wp))
            # print(cost / len(wp))
            # print(matrix[-1][-])

            # Compute the similarity score as the sum of the alignment costs along the optimal path
            similarity = cost / len(wp)
        similarities.append((pid2, similarity))
    
    similarities.sort(key = lambda x: x[1], reverse=True)
    all_similarities[pid1] = similarities

  1%|█                                                                                      | 11/898 [00:01<02:25,  6.11it/s]


KeyboardInterrupt: 

In [45]:
save_json(all_similarities, "../data/client_data/cry_similarities.json")

In [156]:
# explore results
import IPython.display as display

chosen_pid = "1"
display.display(display.Audio("../data/client_data/cries/%s.mp3"%chosen_pid, autoplay=False))
print("========")
for pid, similarity in all_similarities[chosen_pid][:20]:
    print(pid, similarity)
    display.display(display.Audio("../data/client_data/cries/%s.mp3"%pid, autoplay=False))
    print()

821 397.4055004650312



741 295.4388838782402



670 283.0808662318921



682 277.47141213953586



492 263.14926248098647



824 259.7858913634579



669 256.7165521885562



802 250.23535283046027



856 250.07194397272627



759 248.56891333570556



509 248.5009587164969



885 244.58151823752777



572 239.9458783426702



397 238.20210559463246



494 235.9534525782992



398 235.80538437560222



674 234.97848539394866



822 228.95684313551035



698 226.36025319909092



829 226.12196031912





In [79]:
display.display(display.Audio("../data/client_data/cries/%s.mp3"%chosen_pid, autoplay=False))
display.display(display.Audio("../data/client_data/cries/%s.mp3"%chosen_pid, autoplay=False))