In [None]:
!pip install sounddevice soundfile numpy librosa onnxruntime



In [8]:
import os
import time
import queue
import threading
import numpy as np
import sounddevice as sd
import soundfile as sf
import librosa
import onnxruntime as ort

# === Constants ===
# Audio framing: recordings are cut into CHUNK_DURATION-second windows.
SAMPLE_RATE = 16_000
CHUNK_DURATION = 10
INPUT_LEN = CHUNK_DURATION * SAMPLE_RATE

# Model-side dimensions.
NUM_CLASSES = 206
FUSION_DIM = 4 * NUM_CLASSES  # not referenced in the visible code
EMB_DIM = 2048                # length the pooled mel vector is zero-padded to
MEL_SHAPE = (1, 64, 313)      # indices 1 and 2 are used as mel height and width
# Length the normalized waveform is padded/truncated to before the raw-audio
# model. NOTE(review): this is twice INPUT_LEN, so for 10 s chunks the second
# half of that input is always zero padding -- confirm this is what the
# raw-audio model expects.
WAV_LEN = 320_000

# Benchmarking.
NUM_TRIALS = 10  # Number of inference trials per chunk

# === ONNX Model Paths ===
# One entry per ensemble member; paths are relative to the working directory.
MODEL_PATHS = dict(
    embedding="embedding_quantized.onnx",
    resnet="resnet_quantized.onnx",
    efficientnet="effnet_quantized.onnx",
    rawaudio="rawaudio_quantized.onnx",
    meta="meta_quantized.onnx",
)

# === Load ONNX Sessions ===
# A CPU-only inference session per model, stored under the same key as in
# MODEL_PATHS. Loading happens once, when this cell/module is executed.
sessions = {
    model_key: ort.InferenceSession(model_file, providers=["CPUExecutionProvider"])
    for model_key, model_file in MODEL_PATHS.items()
}

# === Normalize waveform ===
def normalize(wav):
    """Return ``wav`` shifted to zero mean and scaled by its standard deviation.

    The divisor is clamped to 1e-6 so that a constant (or nearly constant)
    signal does not cause a division by zero.
    """
    spread = wav.std()
    # Written as "not greater than" so a NaN deviation also falls back to 1e-6.
    if not spread > 1e-6:
        spread = 1e-6
    centered = wav - wav.mean()
    return centered / spread

# === Extract mel spectrogram ===
def extract_mel_tensor(wav, height=64, width=313):
    """Turn a waveform into a min-max scaled log-mel tensor.

    Args:
        wav: audio samples, analysed at SAMPLE_RATE (presumably a 1-D mono
            signal, since the result is reshaped to a single 2-D image).
        height: number of mel bands.
        width: number of time frames in the result; shorter spectrograms are
            zero-padded on the right, longer ones are truncated.

    Returns:
        float32 array of shape (1, 1, height, width) with values scaled into
        roughly [0, 1] before any padding.
    """
    power = librosa.feature.melspectrogram(
        y=wav,
        sr=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=height,
    )
    log_mel = librosa.power_to_db(power, ref=np.max)

    # Min-max scaling; the epsilon guards against a flat spectrogram.
    lowest = log_mel.min()
    scaled = (log_mel - lowest) / (log_mel.max() - lowest + 1e-6)

    # Force exactly `width` frames along the time axis.
    n_frames = scaled.shape[1]
    if n_frames >= width:
        scaled = scaled[:, :width]
    else:
        scaled = np.pad(scaled, ((0, 0), (0, width - n_frames)))

    return scaled.astype(np.float32).reshape(1, 1, height, width)

# === Extract pooled embedding from mel ===
def extract_embedding_from_mel(mel_tensor):
    """Build a fixed-length vector by averaging a mel tensor over its last axis.

    Args:
        mel_tensor: array whose first two axes have size 1 (e.g. the
            (1, 1, height, width) output of extract_mel_tensor).

    Returns:
        float32 array of shape (1, EMB_DIM): the per-row means followed by
        zeros up to EMB_DIM.
    """
    spectrogram = np.squeeze(mel_tensor, axis=(0, 1))
    band_means = spectrogram.mean(axis=1)
    n_missing = EMB_DIM - band_means.shape[0]
    # zero-pad on the right up to EMB_DIM
    padded = np.pad(band_means, (0, n_missing), mode='constant')
    return padded.astype(np.float32).reshape(1, -1)

# === Inference over multiple trials with timing ===
def run_full_inference(wav_chunk, num_trials=NUM_TRIALS):
    """Run the four branch models plus the meta model on one chunk, timing each pass.

    The same inputs are fed through the whole ensemble ``num_trials`` times so
    that latency statistics can be collected.

    Args:
        wav_chunk: 1-D array of audio samples for one chunk.
        num_trials: number of timed end-to-end passes; must be at least 1.

    Returns:
        A tuple ``(meta_out, stats)`` where ``meta_out`` is the first output of
        the meta model and ``stats`` is a dict with the keys ``median_ms``,
        ``mean_ms``, ``p95_ms`` (per-pass latency in milliseconds) and ``fps``
        (passes per second, derived from the mean latency).

    Raises:
        ValueError: if ``num_trials`` is smaller than 1.
    """
    if num_trials < 1:
        # Without at least one pass there is neither a prediction nor timing data.
        raise ValueError(f"num_trials must be >= 1, got {num_trials}")

    # Raw-audio branch input: normalized waveform, zero-padded or truncated to
    # exactly WAV_LEN samples, with a leading batch axis.
    wav = normalize(wav_chunk).astype(np.float32)
    if wav.shape[0] < WAV_LEN:
        wav = np.pad(wav, (0, WAV_LEN - wav.shape[0]))
    wav = wav[:WAV_LEN].reshape(1, -1)

    # Spectrogram branches use the un-normalized chunk; the mel extraction
    # applies its own scaling.
    mel_tensor = extract_mel_tensor(wav_chunk, height=MEL_SHAPE[1], width=MEL_SHAPE[2])
    emb = extract_embedding_from_mel(mel_tensor)

    def first_input_name(model_key):
        return sessions[model_key].get_inputs()[0].name

    # Insertion order here is the order in which branch outputs are fused.
    # ResNet and EfficientNet receive the same mel tensor.
    feeds = {
        "embedding":    {first_input_name("embedding"): emb},
        "resnet":       {first_input_name("resnet"): mel_tensor},
        "efficientnet": {first_input_name("efficientnet"): mel_tensor},
        "rawaudio":     {first_input_name("rawaudio"): wav},
    }
    # Looked up once so the lookup is not part of the timed region.
    meta_input_name = first_input_name("meta")

    latencies = []
    meta_out = None
    for _ in range(num_trials):
        # perf_counter is monotonic and high resolution, unlike time.time().
        t0 = time.perf_counter()
        branch_outputs = [
            sessions[model_key].run(None, feed)[0]
            for model_key, feed in feeds.items()
        ]
        fused = np.concatenate(branch_outputs, axis=1).astype(np.float32)
        meta_out = sessions["meta"].run(None, {meta_input_name: fused})[0]
        latencies.append((time.perf_counter() - t0) * 1000)

    # Inputs are identical in every pass, so the output of the last timed pass
    # is returned instead of running the meta model one extra time.
    mean_ms = np.mean(latencies)
    return meta_out, {
        "median_ms": np.median(latencies),
        "mean_ms": mean_ms,
        "p95_ms": np.percentile(latencies, 95),
        "fps": 1000.0 / mean_ms
    }

# === Record audio until ENTER ===
print("Recording... Press ENTER to stop.")
audio_q = queue.Queue()
flag = {"stop": False}

def key_listener():
    """Wait for the user to press ENTER, then ask the recording loop to stop."""
    input()
    flag["stop"] = True

def audio_callback(indata, frames, time_info, status):
    """Stream callback: queue a copy of each captured block for the main thread."""
    if status:
        # Report over/underflow flags instead of silently losing samples.
        print(f"Audio status: {status}")
    # Copy before queueing so the block stays valid after the callback returns.
    audio_q.put(indata.copy())

# Daemon thread so a pending input() never keeps the process alive.
threading.Thread(target=key_listener, daemon=True).start()

buffer = []
with sd.InputStream(callback=audio_callback, channels=1, samplerate=SAMPLE_RATE):
    while not flag["stop"]:
        try:
            # Poll with a timeout so the stop flag is re-checked even when the
            # device is not delivering any blocks.
            buffer.append(audio_q.get(timeout=0.1))
        except queue.Empty:
            continue

# The stream is closed now; pick up blocks that were queued but not yet read.
while True:
    try:
        buffer.append(audio_q.get_nowait())
    except queue.Empty:
        break

if not buffer:
    raise RuntimeError("No audio was captured before recording stopped.")

# Flatten (frames, 1) to 1-D; reshape keeps a one-frame recording 1-D as well.
audio = np.concatenate(buffer, axis=0).reshape(-1)
sf.write("recorded.wav", audio, SAMPLE_RATE)
print("Recording saved to recorded.wav")

# === Split into 10s chunks ===
# Consecutive windows of INPUT_LEN samples; a short final window is
# zero-padded on the right so every chunk has the same length.
chunks = []
for i in range(0, len(audio), INPUT_LEN):
    clip = audio[i:i + INPUT_LEN]
    shortfall = INPUT_LEN - len(clip)
    chunks.append(np.pad(clip, (0, shortfall)) if shortfall > 0 else clip)

print(f"Running inference on {len(chunks)} chunks...")

# === Inference per chunk ===
# For each chunk, report the arg-max class, its score and the latency statistics.
for idx, clip in enumerate(chunks):
    meta_out, stats = run_full_inference(clip)
    pred = int(np.argmax(meta_out))
    conf = float(np.max(meta_out))
    report_fields = [
        f"[Chunk {idx+1}] Class: {pred}",
        f"Conf: {conf:.4f}",
        f"Median Latency: {stats['median_ms']:.2f}ms",
        f"Mean Latency: {stats['mean_ms']:.2f}ms",
        f"95th Percentile Latency: {stats['p95_ms']:.2f}ms",
        f"FPS: {stats['fps']:.2f} FPS",
    ]
    print(" | ".join(report_fields))


Recording... Press ENTER to stop.
Recording saved to recorded.wav
Running inference on 1 chunks...
[Chunk 1] Class: 97 | Conf: 0.4661 | Median Latency: 74.23ms | Mean Latency: 90.33ms | 95th Percentile Latency: 161.13ms | FPS: 11.07 FPS
