In [2]:
!pip install torch torchaudio transformers faiss-cpu librosa tqdm

Collecting faiss-cpu
  Using cached faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.1


In [3]:
import os
import requests
from tqdm import tqdm

In [8]:
# Sample WAV files that make up the demo corpus.
AUDIO_URLS = [
    "https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples/AFsp/M1F1-Alaw-AFsp.wav",
    "https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples/AFsp/M1F1-AlawWE-AFsp.wav",
    "https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples/AFsp/M1F1-mulaw-AFsp.wav",
]

os.makedirs("audios", exist_ok=True)
# Only successfully downloaded files are recorded here; later cells index
# exactly this list.
audio_paths = []

for i, url in enumerate(tqdm(AUDIO_URLS, desc="Downloading audio files")):
    # Without a timeout a stalled server would hang the cell indefinitely.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Still best-effort, but report the skip so a short corpus is explainable.
        tqdm.write(f"Skipping {url}: HTTP {response.status_code}")
        continue
    path = f"audios/audio_{i}.wav"
    with open(path, "wb") as f:
        f.write(response.content)
    audio_paths.append(path)

Downloading audio files: 100%|██████████| 3/3 [00:00<00:00,  4.81it/s]


In [9]:
import torch
import torchaudio
from transformers import Wav2Vec2Model, Wav2Vec2Processor

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint used for both the input processor and the encoder.
model_name = "facebook/wav2vec2-base-960h"

processor = Wav2Vec2Processor.from_pretrained(model_name)

# Load the weights first, then move the module to the selected device.
model = Wav2Vec2Model.from_pretrained(model_name)
model = model.to(device)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import librosa
import numpy as np

def get_audio_embedding(path):
    """Embed one audio file as a single vector.

    The file is loaded with librosa at 16 kHz, passed through the module-level
    ``processor`` and ``model``, and the model's last hidden state is averaged
    over the time axis (mean pooling).

    Args:
        path: Location of the audio file to embed.

    Returns:
        NumPy array with the mean-pooled hidden state, singleton dimensions
        squeezed out.
    """
    target_sr = 16000
    samples, _ = librosa.load(path, sr=target_sr)

    batch = processor(samples, sampling_rate=target_sr, return_tensors="pt", padding=True)
    batch = batch.to(device)

    # Inference only: no gradients are needed for embedding extraction.
    with torch.no_grad():
        last_hidden = model(**batch).last_hidden_state  # (batch, time, features)
        pooled = last_hidden.mean(dim=1)  # average over the time axis

    return pooled.squeeze().cpu().numpy()


In [11]:
import faiss

# Embed every downloaded file and stack the vectors into one float32 matrix
# with one row per audio file.
embeddings = np.stack(
    [get_audio_embedding(p) for p in tqdm(audio_paths, desc="Embedding audios")]
).astype("float32")

# Flat L2 index sized to the embedding width.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Persist the index together with the path list; row i of the index
# corresponds to line i of audio_paths.txt.
faiss.write_index(index, "audio_index.faiss")
with open("audio_paths.txt", "w") as paths_file:
    for p in audio_paths:
        paths_file.write(p + "\n")


Embedding audios: 100%|██████████| 3/3 [00:13<00:00,  4.65s/it]


In [12]:
def search_similar_audio(query_path, top_k=3):
    """Print the indexed audio files closest to ``query_path``.

    Reads the FAISS index from ``audio_index.faiss`` and the matching path
    list from ``audio_paths.txt``, embeds the query with
    ``get_audio_embedding`` and prints each match with the distance reported
    by the index.

    Args:
        query_path: Path of the audio file used as the query.
        top_k: Maximum number of matches to print. Values larger than the
            number of indexed files are clamped to that number.

    Returns:
        None. Results are written to stdout.
    """
    index = faiss.read_index("audio_index.faiss")
    with open("audio_paths.txt", "r") as f:
        paths = [line.strip() for line in f]

    print(f"Query audio: {query_path}")
    print("Top matches:")

    # Never request more neighbours than exist: FAISS pads missing results
    # with label -1, and paths[-1] would silently report the last file.
    k = min(top_k, index.ntotal, len(paths))
    if k <= 0:
        return

    query_vec = get_audio_embedding(query_path).astype("float32").reshape(1, -1)
    distances, indices = index.search(query_vec, k)

    for idx, dist in zip(indices[0], distances[0]):
        # Defensive: skip padded (-1) or out-of-range labels.
        if idx < 0 or idx >= len(paths):
            continue
        print(f"{paths[idx]} - Distance: {dist}")


In [13]:
# Sanity check: query with a file written by the download loop (index 1), using the default top_k=3.
search_similar_audio("audios/audio_1.wav")


Query audio: audios/audio_1.wav
Top matches:
audios/audio_0.wav - Distance: 0.0
audios/audio_1.wav - Distance: 0.0
audios/audio_2.wav - Distance: 0.03705073148012161
