In [1]:
import soundcard as sc
import soundfile as sf
import cv2
import numpy as np
import time
import keyboard

import torch
import torchaudio

from matplotlib import pyplot as plt

SAMPLE_RATE = 48000  # [Hz] sampling rate requested from the loopback recorder

# NOTE: despite its name, `duration` is the number of capture/display iterations,
# not a length in seconds; every iteration records `window_size` frames.
duration = 200
# Frames captured per iteration. The spectrogram transform below is built with
# torchaudio's defaults, so this is the capture size, not the FFT size.
window_size = 1024
overlap = 512  # not used by this demo loop

# Smallest power fed to log2: silent input yields an all-zero spectrogram, and
# log2(0) = -inf would turn the normalisation below into NaN/inf.
LOG_FLOOR = 1e-10

# Loop-invariant objects are built once instead of once per frame.
spectrogram_transform = torchaudio.transforms.Spectrogram()
loopback_mic = sc.get_microphone(id=str(sc.default_speaker().name), include_loopback=True)

try:
    with loopback_mic.recorder(samplerate=SAMPLE_RATE) as mic:
        for i in range(duration):
            data = mic.record(numframes=window_size)
            data = torch.from_numpy(data).permute(1, 0)  # [frames, channels] -> [channels, frames]

            print(f"{i} of {duration}", end="\r")

            # Log-power spectrogram of the first channel, rescaled to [0, 1] for display.
            audio_spectogram = spectrogram_transform(data)[0, :, :]
            audio_spectogram = audio_spectogram.clamp_min(LOG_FLOOR).log2().numpy()
            audio_spectogram -= audio_spectogram.min()
            peak = audio_spectogram.max()
            if peak > 0:  # a perfectly flat frame stays all-zero instead of dividing by zero
                audio_spectogram /= peak

            cv2.imshow('stream', audio_spectogram)

            if cv2.waitKey(33) == 27:  # Esc key to stop
                break
finally:
    # Close the preview window on normal exit, on Esc, and on errors alike.
    cv2.destroyAllWindows()

77 of 200

In [None]:
import librosa 
import librosa.display

import IPython.display as ipd

In [None]:
# Path of the audio file inspected in the following cells.
sound_path = "space3.wav"

In [None]:
# Render an inline audio player for the file at sound_path.
ipd.Audio(sound_path)

In [None]:
# Load the file as a mono waveform; `sr` is the sample rate returned by librosa.
sound, sr = librosa.load(sound_path, mono=True)

In [None]:
# Sound duration in seconds: number of samples divided by the sample rate.
# NOTE(review): this rebinds `duration`, which the live-capture cell at the top
# of the notebook uses as an iteration count.
duration = len(sound) / sr 
print(f"duration: {duration:3.3f}")

In [None]:
# waveform visualization
# NOTE(review): placeholder output only; no waveform is drawn in this cell.
print("yo")


In [None]:
### converting from mp3 to wav

from os import path
from pydub import AudioSegment

# Input (mp3) and output (wav) files.
# NOTE(review): `src` is a machine-specific absolute path, and `path` is
# imported but not used in this cell.
src = r"C:\Users\Admin\Downloads\MediaHuman\Music\Thip Trong - Lightvessel.mp3"
dst = "test.wav"

# Decode the mp3 and export it as wav.
to_wav = AudioSegment.from_mp3(src)
to_wav.export(dst, format="wav")

In [None]:
# Switch to the converted file and reload it as a mono waveform.
sound_path = "test.wav"
sound, sr = librosa.load(sound_path, mono=True)

# Keep only the first 90000 samples for the analyses below.
sound = sound[:90000]

In [None]:
# Draw the waveform of the truncated clip on a fixed [-1.2, 1.2] amplitude axis.
fig, ax = plt.subplots(nrows=1, sharex=True, sharey=True)
ax.set_ylim((-1.2, 1.2))
librosa.display.waveshow(sound, sr=sr, ax=ax)

In [None]:
# Sanity check: shape of the truncated waveform.
print(sound.shape)

In [None]:
FRAME_SIZE = 1024  # samples per analysis frame
HOP_LENGTH = 512  # samples between the starts of consecutive frames

def amplitude_envelope(signal, frame_size, hop_length):
    """Return the per-frame maximum of ``signal`` as a NumPy array.

    Frames start every ``hop_length`` samples and cover up to ``frame_size``
    samples; frames that run past the end of the signal are simply shorter.
    """
    frame_starts = range(0, len(signal), hop_length)
    frame_peaks = [max(signal[start:start + frame_size]) for start in frame_starts]
    return np.array(frame_peaks)

def amplitude_envelope_np(signal, frame_size, hop_length):
    """Per-frame maximum of ``signal`` computed with ``np.max``.

    A new frame starts every ``hop_length`` samples and spans up to
    ``frame_size`` samples; the result holds one value per frame.
    """
    peaks = []
    for start in range(0, len(signal), hop_length):
        frame = signal[start:start + frame_size]
        peaks.append(np.max(frame))
    return np.array(peaks)

In [None]:
# Amplitude envelope of the clip: one maximum per FRAME_SIZE-sample frame, hopped by HOP_LENGTH.
ae_test = amplitude_envelope_np(sound, FRAME_SIZE, HOP_LENGTH)

In [None]:
# Number of envelope values (one per frame).
print(ae_test.shape)

In [None]:
# Overlay the amplitude envelope on the waveform.
# `sr` is passed explicitly so the frame timestamps follow the clip's actual
# sample rate instead of silently relying on librosa's default.
frames = range(0, ae_test.size)
t = librosa.frames_to_time(frames, sr=sr, hop_length=HOP_LENGTH)

fig, ax = plt.subplots(nrows=1, sharex=True, sharey=True)
ax.set_ylim((-1.2, 1.2))
librosa.display.waveshow(sound, sr=sr, ax=ax)
ax.plot(t, ae_test, color='r', label='amplitude envelope')
ax.set_title('Waveform and amplitude envelope')
ax.set_ylabel('amplitude')
ax.legend()
plt.show()

In [None]:
# Reference per-frame RMS from librosa; [0] takes the first row of the returned array.
rms_test = librosa.feature.rms(y=sound, frame_length=FRAME_SIZE, hop_length=HOP_LENGTH)[0]

def my_rms(signal, frame_size, hop_lenegth):
    """Compute the root-mean-square energy of ``signal`` frame by frame.

    Args:
        signal: 1-D sequence of samples (NumPy array or any array-like).
        frame_size: Maximum number of samples per frame.
        hop_lenegth: Number of samples between the starts of consecutive
            frames (parameter name kept as-is for backward compatibility).

    Returns:
        NumPy array with one RMS value per frame. Frames that run past the
        end of the signal are shorter and averaged over the samples they have.
    """
    samples = np.asarray(signal)
    if not np.issubdtype(samples.dtype, np.inexact):
        # Squaring narrow integer dtypes (e.g. int16 PCM) wraps around silently,
        # so integer input is promoted to float64 before any arithmetic.
        samples = samples.astype(np.float64)

    return np.array([
        np.sqrt(np.mean(samples[start:start + frame_size] ** 2))
        for start in range(0, len(samples), hop_lenegth)
    ])

# Hand-rolled RMS over the same frame grid, for comparison with librosa's result.
rms_test_my = my_rms(sound, FRAME_SIZE, HOP_LENGTH)

# The two shapes must match for the overlay plot that follows.
print(rms_test.shape, rms_test_my.shape)

In [None]:
# Overlay both RMS estimates on the waveform.
# `sr` is passed explicitly so the frame timestamps follow the clip's actual
# sample rate instead of silently relying on librosa's default.
frames = range(0, rms_test.size)
t = librosa.frames_to_time(frames, sr=sr, hop_length=HOP_LENGTH)

fig, ax = plt.subplots(nrows=1, sharex=True, sharey=True)
ax.set_ylim((-1.2, 1.2))
librosa.display.waveshow(sound, sr=sr, ax=ax)
ax.plot(t, rms_test, color='r', label='RMS (librosa)')
ax.plot(t, rms_test_my, color='g', label='RMS (my_rms)')
ax.set_title('Waveform and RMS energy')
ax.set_ylabel('amplitude')
ax.legend()
plt.show()

In [None]:
# Reference zero-crossing rate from librosa (first row of the returned array).
zcr_test = librosa.feature.zero_crossing_rate(
    y=sound,
    frame_length=FRAME_SIZE,
    hop_length=HOP_LENGTH,
)[0]

# Hand-rolled equivalent: per frame, count the positions where the "sample > 0"
# flag flips between neighbours, normalised by the nominal frame size.
zcr_test_my = []
for start in range(0, len(sound), HOP_LENGTH):
    frame_is_positive = sound[start:start + FRAME_SIZE] > 0
    sign_flips = np.abs(np.diff(frame_is_positive)) > 0
    zcr_test_my.append(np.sum(sign_flips) / FRAME_SIZE)





In [None]:
# Print the two estimates side by side. The librosa series is read from index 1
# while the hand-rolled one starts at 0, presumably to compensate for librosa's
# frame centering; TODO confirm.
print(zcr_test[1:21])
print(zcr_test_my[:20])

# Element-wise ratio over the first 100 frames under the same one-frame offset.
# NOTE(review): frames where zcr_test_my is 0 will yield inf/nan here.
print((np.array(zcr_test[1:101]) / np.array(zcr_test_my[:100])))

In [None]:
# Overlay librosa's RMS and zero-crossing rate on the waveform.
# `sr` is passed explicitly so the frame timestamps follow the clip's actual
# sample rate instead of silently relying on librosa's default.
frames = range(0, rms_test.size)
t = librosa.frames_to_time(frames, sr=sr, hop_length=HOP_LENGTH)

fig, ax = plt.subplots(nrows=1, sharex=True, sharey=True)
ax.set_ylim((-1.2, 1.2))
librosa.display.waveshow(sound, sr=sr, ax=ax)
ax.plot(t, rms_test, color='r', label='RMS')
ax.plot(t, zcr_test, color='g', label='zero-crossing rate')
ax.set_title('Waveform, RMS energy and zero-crossing rate')
ax.set_ylabel('amplitude / rate')
ax.legend()
plt.show()

In [2]:
class RealTimeAudioStream:
    """Rolling RMS / zero-crossing-rate tracker for loopback audio.

    Audio is pulled from the loopback device of the default speaker in chunks
    of ``overlap`` frames. The most recent ``buffer_seconds`` of samples are
    kept in a rolling buffer, and after every chunk the RMS and zero-crossing
    rate of the last ``window_size`` samples are appended to rolling feature
    buffers (one row per chunk, one column per channel).

    Args:
        sample_rate: Requested sample rate in Hz; rounded down to a multiple
            of ``window_size``.
        window_size: Number of most-recent samples each feature is computed on.
        overlap: Number of frames recorded per ``step`` call.
        buffer_seconds: Length of the rolling sample buffer in seconds.
        cv2_window_size: ``(H, W)`` size of the preview image in pixels.
    """

    # Handle returned by keyboard.on_press_key; None while no ESC hook is installed.
    _esc_hook = None

    def __init__(self, sample_rate = 44100, window_size = 1024, overlap = 512, buffer_seconds = 5, cv2_window_size = (256, 512)):
        self.cv2_window_size = cv2_window_size # (H, W)

        # Set to True (by the ESC hook or by stream()) to request a stop.
        self.done = False
        self.current_rms = None
        self.current_zcr = None

        self._d = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.sample_rate = (sample_rate // window_size) * window_size # samples per second, a.k.a [Hz]
        print(f"RealTimeAudioStream initialized with {self.sample_rate} sample rate")
        self.window_size = window_size # samples per processing step
        self.overlap = overlap # frames recorded per step

        self._mic = sc.get_microphone(id=str(sc.default_speaker().name), include_loopback=True)
        self._num_channels = self._mic.channels

        self._buffer_size = self.sample_rate * buffer_seconds # samples memory size
        self._buffer_wav = torch.zeros((self._buffer_size, self._num_channels), dtype=float, device=self._d)

        # One feature row per recorded chunk of `overlap` frames.
        self.buffer_rms = torch.zeros((self._buffer_size // overlap, self._num_channels), dtype=float, device=self._d)
        self.buffer_zcr = torch.zeros((self._buffer_size // overlap, self._num_channels), dtype=float, device=self._d)

    def _rms(self):
        """Compute the RMS of the newest window and push it into ``buffer_rms``."""
        current = self._buffer_wav[-self.window_size:, :]
        self.current_rms = current.pow(2).mean(0, keepdim=True).sqrt()
        # Append the new row and drop the oldest so the buffer length stays fixed.
        self.buffer_rms = torch.cat((self.buffer_rms, self.current_rms), dim=0)[1:,:]

    def _zcr(self):
        """Compute the zero-crossing rate of the newest window and push it into ``buffer_zcr``."""
        current = self._buffer_wav[-self.window_size:, :]
        # A crossing is a position where the "sample > 0" flag differs from its neighbour.
        self.current_zcr = (torch.diff(current > 0, dim=0).type(torch.int).abs() > 0).sum(dim=0, keepdim=True) / self.window_size
        self.buffer_zcr = torch.cat((self.buffer_zcr, self.current_zcr), dim=0)[1:,:]

    def _vis(self):
        """Render the first channel of both feature buffers as an ``(H, W, 3)`` image.

        RMS is drawn into channel index 2 and ZCR into channel index 1; a
        value ``v`` maps to row ``H - (v + 1) * H // 2``, clamped to the image.
        """
        show_R = self.buffer_rms
        show_G = self.buffer_zcr

        W1, C = show_R.shape

        H, W = self.cv2_window_size
        sound_R = torch.clamp(  H - ((show_R + 1) * H // 2)    , 0, H-1).type(torch.LongTensor)[:,0] # [W1]
        sound_G = torch.clamp(  H - ((show_G + 1) * H // 2)    , 0, H-1).type(torch.LongTensor)[:,0] # [W1]
        image = torch.zeros((H, W1, 3), dtype=float) # [H, W1, 3]

        columns = torch.arange(0, W1)
        image[sound_R, columns, 2] = 1
        image[sound_G, columns, 1] = 1

        # One column per feature row; stretch to the requested preview size.
        image = cv2.resize(image.cpu().numpy(), (W, H))

        return image

    def _hook_esc(self):
        """Install the global ESC key hook once; later calls are no-ops."""
        if self._esc_hook is None:
            self._esc_hook = keyboard.on_press_key("ESC", lambda _: self._done())

    def _unhook_esc(self):
        """Remove the ESC key hook if one is installed."""
        if self._esc_hook is not None:
            keyboard.unhook(self._esc_hook)
            self._esc_hook = None

    def step(self, mic):
        """Record one chunk from ``mic`` and update all rolling buffers.

        Args:
            mic: An open recorder, as returned by ``get_recorder()``.

        Returns:
            Tuple ``(current_rms, current_zcr)`` for the newest window.
        """
        self._current = torch.from_numpy(mic.record(numframes=self.overlap)).to(self._d) # [overlap, num_channels]

        # Drop exactly as many old rows as were appended so the buffer length never drifts.
        self._buffer_wav = torch.cat((self._buffer_wav, self._current), dim=0)[self._current.shape[0]:,:]
        self._rms()
        self._zcr()

        # Registered once only: hooking on every step would pile up one handler per call.
        self._hook_esc()

        return self.current_rms, self.current_zcr

    def get_recorder(self):
        """Return a recorder for the loopback device at ``self.sample_rate``."""
        return self._mic.recorder(samplerate=self.sample_rate)

    def _done(self):
        """Request the capture loop to stop."""
        self.done = True

    def stream(self):
        """Capture and preview in an OpenCV window until ESC is pressed."""
        self.done = False

        try:
            with self.get_recorder() as mic:
                while not self.done:
                    self.step(mic)
                    cv2.imshow('stream', self._vis())

                    if cv2.waitKey(33) == 27:    # Esc key to stop
                        self.done = True
        finally:
            # Runs on normal exit and on errors alike, so neither the key hook
            # nor the preview window is left behind.
            self._unhook_esc()
            cv2.destroyAllWindows()

In [3]:
# Build the stream with default settings (the constructor prints the effective sample rate).
audio_stream = RealTimeAudioStream()

# Drive the capture loop by hand, without the OpenCV preview. It ends once
# audio_stream.done becomes truthy; step() installs an ESC key hook that sets it.
with audio_stream.get_recorder() as mic:
    while not audio_stream.done:
        out = audio_stream.step(mic)

RealTimeAudioStream initialized with 44032 sample rate


In [None]:
# Shape of the rolling sample buffer: (samples kept, channels).
audio_stream._buffer_wav.shape

In [None]:
# The buffers are torch tensors that may live on the GPU; matplotlib and
# librosa need host NumPy arrays, so convert the first channel explicitly.
stream_wav = audio_stream._buffer_wav[:, 0].cpu().numpy()
stream_rms = audio_stream.buffer_rms[:, 0].cpu().numpy()
stream_zcr = audio_stream.buffer_zcr[:, 0].cpu().numpy()

# One feature row is produced per recorded chunk of `overlap` frames at the
# stream's own sample rate, so that pair defines the time axis exactly.
frames = range(0, stream_rms.shape[0])
t = librosa.frames_to_time(frames, sr=audio_stream.sample_rate, hop_length=audio_stream.overlap)

fig, ax = plt.subplots(nrows=1, sharex=True, sharey=True)
ax.set_ylim((-1.2, 1.2))
librosa.display.waveshow(stream_wav, sr=audio_stream.sample_rate, ax=ax)
ax.plot(t, stream_rms, color='r', label='RMS')
ax.plot(t, stream_zcr, color='g', label='zero-crossing rate')
ax.set_title('Captured buffer, RMS energy and zero-crossing rate')
ax.set_ylabel('amplitude / rate')
ax.legend()
plt.show()