In [38]:
import torchaudio
AUDIO_FILE = '../Data/LibriSpeech/train-clean-wav/19-198-0008.flac'
samples, samplerate = torchaudio.load(AUDIO_FILE)
print("sample :",samples,samples.shape)
print("sample rate :",samplerate)

sample : tensor([[ 0.0041,  0.0032,  0.0022,  ...,  0.0003, -0.0002, -0.0006]]) torch.Size([1, 45760])
sample rate : 16000


In [39]:
import os
import IPython
import scipy
import soundfile as sf
import numpy as np
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import IPython.display as idp
# import matplotlib.pyplot as plt

# %matplotlib inline
TRAIN_PATH='../Data/LibriSpeech/train-clean-wav/'
idp.Audio(TRAIN_PATH + "103-1240-0003.flac")

In [40]:
audio, sample_rate, = sf.read(TRAIN_PATH + "103-1240-0006.flac")
print("Sample rate: {0}Hz".format(sample_rate))
print("Audio duration: {0}s".format(len(audio) / sample_rate))
print("Samples : ",audio)
print("abs",np.max(np.abs(audio)))

Sample rate: 16000Hz
Audio duration: 9.58s
Samples :  [-0.0007019   0.00015259  0.00128174 ...  0.00650024  0.00619507
  0.00692749]
abs 0.33172607421875


In [41]:
def normalize_audio(audio):
    audio = audio / np.max(np.abs(audio))
    return audio
audio = normalize_audio(audio)
# plt.figure(figsize=(12,3))
# plt.plot(np.linspace(0, len(audio) / sample_rate, num=len(audio)), audio)
# plt.grid(True)

In [42]:
def frame_audio(audio, FFT_size=2048, hop_size=10, sample_rate=44100):
    # hop_size in ms
    
    audio = np.pad(audio, int(FFT_size / 2), mode='reflect')
    frame_len = np.round(sample_rate * hop_size / 1000).astype(int)
    frame_num = int((len(audio) - FFT_size) / frame_len) + 1
    frames = np.zeros((frame_num,FFT_size))
    
    for n in range(frame_num):
        frames[n] = audio[n*frame_len:n*frame_len+FFT_size]
    
    return frames

In [43]:
hop_size = 15 #ms
FFT_size = 2048

audio_framed = frame_audio(audio, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)
print("Framed audio shape: {0}".format(audio_framed.shape))
print(audio_framed)

Framed audio shape: (639, 2048)
[[ 0.00423183  0.00800368  0.01674333 ...  0.02943882  0.01674333
   0.00800368]
 [ 0.01122355  0.01113155  0.00321987 ...  0.03063477  0.03339466
   0.02079117]
 [ 0.0101196   0.01287948  0.00533579 ...  0.01131555  0.00662374
   0.00340386]
 ...
 [ 0.0124195   0.01131555  0.0126035  ... -0.01039558 -0.00910764
  -0.01048758]
 [ 0.00193192  0.00266789  0.00036799 ...  0.00800368  0.0075437
   0.00579577]
 [-0.01076357 -0.01048758 -0.01085557 ...  0.01389144  0.01444342
   0.0124195 ]]


In [44]:
window = get_window("hann", FFT_size, fftbins=True)
# plt.figure(figsize=(12,3))
# plt.plot(window)
# plt.grid(True)
audio_win = audio_framed * window

ind = 69
# plt.figure(figsize=(12,3))
# plt.subplot(2, 1, 1)
# plt.plot(audio_framed[ind])
# plt.title('Original Frame')
# plt.grid(True)
# plt.subplot(2, 1, 2)
# plt.plot(audio_win[ind])
# plt.title('Frame After Windowing')
# plt.grid(True)

In [45]:
audio_winT = np.transpose(audio_win)
audio_fft = np.empty((int(1 + FFT_size // 2), audio_winT.shape[1]), dtype=np.complex64, order='F')

for n in range(audio_fft.shape[1]):
    audio_fft[:, n] = fft.fft(audio_winT[:, n], axis=0)[:audio_fft.shape[0]]

audio_fft = np.transpose(audio_fft)


In [46]:
audio_power = np.square(np.abs(audio_fft))
print(audio_power)
print(audio_power.shape)

[[2.3693626e+00 3.8871725e+00 4.6416225e+00 ... 3.1426505e-04
  1.1470420e-04 3.3645019e-06]
 [1.4026076e+00 3.3953984e+00 3.5005693e+00 ... 1.2411583e-04
  2.6415469e-04 2.7761693e-05]
 [7.9567008e-02 2.2812536e+00 1.4054101e+00 ... 5.9289356e-05
  6.8540784e-04 3.7533240e-04]
 ...
 [3.7661681e+00 7.1543294e-01 1.1437394e-01 ... 3.9930022e-04
  4.6464771e-05 2.7861638e-04]
 [4.9928184e+00 1.3010023e+00 5.2544659e-01 ... 8.5882557e-04
  2.7218612e-04 6.8204576e-04]
 [5.9080954e+00 2.4015927e+00 1.0252838e+00 ... 1.4329868e-03
  7.2903273e-04 1.0049335e-03]]
(639, 1025)


In [47]:
freq_min = 0
freq_high = sample_rate / 2
mel_filter_num = 10

print("Minimum frequency: {0}".format(freq_min))
print("Maximum frequency: {0}".format(freq_high))

Minimum frequency: 0
Maximum frequency: 8000.0


In [48]:
def freq_to_mel(freq):
    return 2595.0 * np.log10(1.0 + freq / 700.0)

def met_to_freq(mels):
    return 700.0 * (10.0**(mels / 2595.0) - 1.0)

In [49]:
def get_filter_points(fmin, fmax, mel_filter_num, FFT_size, sample_rate=44100):
    fmin_mel = freq_to_mel(fmin)
    fmax_mel = freq_to_mel(fmax)
    
    print("MEL min: {0}".format(fmin_mel))
    print("MEL max: {0}".format(fmax_mel))
    
    mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num+2)
    freqs = met_to_freq(mels)
    
    return np.floor((FFT_size + 1) / sample_rate * freqs).astype(int), freqs
filter_points, mel_freqs = get_filter_points(freq_min, freq_high, mel_filter_num, FFT_size, sample_rate=44100)
print(filter_points)
print(mel_freqs)


MEL min: 0.0
MEL max: 2840.023046708319
[  0   8  18  32  48  69  96 129 170 223 288 371]
[   0.          180.21928115  406.83711843  691.7991039  1050.12629534
 1500.70701371 2067.29249375 2779.74887082 3675.63149949 4802.16459006
 6218.73051459 8000.        ]


In [50]:
def get_filters(filter_points, FFT_size):
    filters = np.zeros((len(filter_points)-2,int(FFT_size/2+1)))
    
    for n in range(len(filter_points)-2):
        filters[n, filter_points[n] : filter_points[n + 1]] = np.linspace(0, 1, filter_points[n + 1] - filter_points[n])
        filters[n, filter_points[n + 1] : filter_points[n + 2]] = np.linspace(1, 0, filter_points[n + 2] - filter_points[n + 1])
    
    return filters
filters = get_filters(filter_points, FFT_size)

# plt.figure(figsize=(12,3))
# for n in range(filters.shape[0]):
#     plt.plot(filters[n])

In [51]:
enorm = 2.0 / (mel_freqs[2:mel_filter_num+2] - mel_freqs[:mel_filter_num])
filters *= enorm[:, np.newaxis]
# plt.figure(figsize=(15,4))
# for n in range(filters.shape[0]):
#     plt.plot(filters[n])

In [52]:
audio_filtered = np.dot(filters, np.transpose(audio_power))
audio_log = 10.0 * np.log10(audio_filtered)
audio_log.shape


(10, 639)

In [53]:
def dct(dct_filter_num, filter_len):
    basis = np.empty((dct_filter_num,filter_len))
    basis[0, :] = 1.0 / np.sqrt(filter_len)
    
    samples = np.arange(1, 2 * filter_len, 2) * np.pi / (2.0 * filter_len)

    for i in range(1, dct_filter_num):
        basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_len)
        
    return basis
dct_filter_num = 40

dct_filters = dct(dct_filter_num, mel_filter_num)

cepstral_coefficents = np.dot(dct_filters, audio_log)
cepstral_coefficents.shape

(40, 639)

In [54]:
print(audio_log.shape)
print(dct_filters.shape)

(10, 639)
(40, 10)


In [55]:
cepstral_coefficents[:, [490,500]]

array([[-1.75987542e+01, -5.83986895e-01],
       [ 1.88503422e+01,  4.93746517e+00],
       [-2.12419455e+01, -1.41421112e+01],
       [-5.43431366e-01,  3.74440910e+00],
       [-2.09981527e-01,  3.41614556e+00],
       [ 6.61745821e+00,  1.86719658e+00],
       [ 7.30577697e-01, -2.84829089e+00],
       [ 8.61695090e+00,  4.59704696e+00],
       [ 1.14360941e+01, -2.04001106e+00],
       [ 4.23579766e+00,  1.15046905e+00],
       [ 5.30850841e-14,  1.09818847e-14],
       [-4.23579766e+00, -1.15046905e+00],
       [-1.14360941e+01,  2.04001106e+00],
       [-8.61695090e+00, -4.59704696e+00],
       [-7.30577697e-01,  2.84829089e+00],
       [-6.61745821e+00, -1.86719658e+00],
       [ 2.09981527e-01, -3.41614556e+00],
       [ 5.43431366e-01, -3.74440910e+00],
       [ 2.12419455e+01,  1.41421112e+01],
       [-1.88503422e+01, -4.93746517e+00],
       [ 2.48883968e+01,  8.25882187e-01],
       [-1.88503422e+01, -4.93746517e+00],
       [ 2.12419455e+01,  1.41421112e+01],
       [ 5.