In [None]:
!pip install torch torchaudio transformers faiss-cpu librosa tqdm

In [None]:
import os
import requests
from tqdm import tqdm

In [None]:
# Sample WAV clips that make up the demo corpus.
AUDIO_URLS = [
    "https://www2.cs.uic.edu/~i101/SoundFiles/StarWars60.wav",
    "https://www2.cs.uic.edu/~i101/SoundFiles/BabyElephantWalk60.wav",
    "https://www2.cs.uic.edu/~i101/SoundFiles/CantinaBand60.wav",
]

# Seconds to wait on connect/read before giving up on a single download.
# Without a timeout, requests can block forever on an unresponsive server.
DOWNLOAD_TIMEOUT = 30

os.makedirs("audios", exist_ok=True)
audio_paths = []  # local paths of the clips that were fetched successfully

for i, url in enumerate(tqdm(AUDIO_URLS, desc="Downloading audio files")):
    # Downloading is best-effort: a failed clip is reported and skipped so the
    # remaining URLs are still attempted.
    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
    except requests.RequestException as exc:
        tqdm.write(f"Skipping {url}: {exc}")
        continue

    if response.status_code != 200:
        tqdm.write(f"Skipping {url}: HTTP {response.status_code}")
        continue

    # The file name keeps the URL's position in AUDIO_URLS, so numbering is
    # stable even when an earlier download was skipped.
    path = f"audios/audio_{i}.wav"
    with open(path, "wb") as f:
        f.write(response.content)
    audio_paths.append(path)

In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2Model, Wav2Vec2Processor

# Checkpoint shared by the processor and the encoder loaded below.
model_name = "facebook/wav2vec2-base-960h"

# Use the GPU when PyTorch reports one as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor prepares raw waveforms as model inputs; the model is moved to
# the selected device once, up front.
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)
model = model.to(device)


In [None]:
import librosa
import numpy as np

def get_audio_embedding(path, sample_rate=16000):
    """Return a fixed-size embedding vector for one audio file.

    The file is loaded (and resampled) with librosa, passed through the
    module-level ``processor`` and ``model``, and the model's last hidden
    states are mean-pooled over the time axis.

    Args:
        path: Path of the audio file to embed.
        sample_rate: Rate in Hz the audio is resampled to and that is reported
            to the processor. It must match the rate the loaded checkpoint
            expects; the default of 16000 is what this notebook has always used.

    Returns:
        A 1-D NumPy array with one value per hidden feature.

    Raises:
        ValueError: If the file decodes to zero samples.
    """
    waveform, _ = librosa.load(path, sr=sample_rate)
    if waveform.size == 0:
        # Fail here with a clear message rather than inside the model's forward pass.
        raise ValueError(f"Audio file contains no samples: {path}")

    inputs = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True
    ).to(device)

    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state  # shape: (batch, time, features)
    # Mean-pool over time, then drop only the batch axis. A bare squeeze()
    # would also collapse any other axis that happened to have size 1.
    embedding = hidden_states.mean(dim=1).squeeze(0).cpu().numpy()
    return embedding


In [None]:
import faiss

# Stop early with an actionable message if there is nothing to index;
# np.stack on an empty list would otherwise raise a much less helpful error.
if not audio_paths:
    raise RuntimeError(
        "No audio files available to index; check that the download step succeeded."
    )

# Embed every clip, keeping the same order as audio_paths so that row i of the
# index corresponds to line i of audio_paths.txt.
audio_vectors = []
for path in tqdm(audio_paths, desc="Embedding audios"):
    audio_vectors.append(get_audio_embedding(path))

# One row per clip, cast to float32 before being handed to FAISS.
embeddings = np.stack(audio_vectors).astype("float32")

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Persist the index and the row -> file mapping side by side so that a search
# can be run later without re-embedding the corpus.
faiss.write_index(index, "audio_index.faiss")
with open("audio_paths.txt", "w") as f:
    f.writelines(f"{p}\n" for p in audio_paths)


In [None]:
def search_similar_audio(query_path, top_k=3, index_path="audio_index.faiss",
                         paths_file="audio_paths.txt"):
    """Print the indexed audio files closest to a query clip.

    Loads the FAISS index and the row-to-path mapping from disk, embeds the
    query with ``get_audio_embedding`` and prints one line per match.

    Args:
        query_path: Path of the audio file to use as the query.
        top_k: Maximum number of matches to report. Must be at least 1.
        index_path: Location of the saved FAISS index.
        paths_file: Text file with one indexed audio path per line, in the
            same order as the rows of the index.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k}")

    index = faiss.read_index(index_path)
    with open(paths_file, "r") as f:
        paths = [line.strip() for line in f]

    query_vec = get_audio_embedding(query_path).astype("float32").reshape(1, -1)

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with label -1, and paths[-1] would silently report the last
    # file as a match.
    k = min(top_k, index.ntotal)
    if k > 0:
        distances, indices = index.search(query_vec, k)
        matches = zip(indices[0], distances[0])
    else:
        matches = []

    print(f"Query audio: {query_path}")
    print("Top matches:")
    for idx, dist in matches:
        if idx < 0:
            # Defensive: skip any padding label that still slipped through.
            continue
        print(f"{paths[idx]} - Distance: {dist}")


In [None]:
# Demo: use one of the downloaded clips as the query.
search_similar_audio(query_path="audios/audio_1.wav", top_k=3)
