In [1]:
import librosa
import numpy as np
import sys
import pickle
import time
import IPython.display as ipd
from codec import encode, decode
import scipy.signal as signal
from scipy.fftpack import fft, ifft, dct, idct
from scipy.signal import butter, lfilter
import matplotlib.pyplot as plt
import CosineTransformTools as ctt 

In [2]:
def load_cd_quality_audio(filename):
    """Load an audio file as signed 16-bit samples at the CD sample rate.

    The file is decoded with ``librosa.load`` using ``sr=44100`` and the
    floating-point samples are scaled by ``2**15 - 1`` before being cast to
    ``int16``.

    Parameters
    ----------
    filename : str or path-like
        Path handed straight to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        The decoded samples as ``int16``. The sample rate returned by
        librosa is discarded because it is fixed to 44100 by the call.
    """
    # np.float64 is the portable spelling of the legacy 'float_' alias
    # (NOTE(review): the alias is believed to be gone in NumPy 2.0 — confirm).
    audio, _ = librosa.load(filename, sr=44100, dtype=np.float64)

    int16_range = np.iinfo(np.int16)
    max_int_value = 2**15 - 1
    scaled = audio * max_int_value

    # Samples can overshoot +/-1.0 (e.g. after resampling); clip them so the
    # int16 cast below cannot overflow.
    np.clip(scaled, int16_range.min, int16_range.max, out=scaled)
    return scaled.astype('int16')
    
# Load the source track as int16 samples (load_cd_quality_audio requests sr=44100).
x = load_cd_quality_audio("taxman.wav")
# Sample rate in Hz; must stay in sync with the rate used by load_cd_quality_audio.
fs = 44100

# ipd.Audio(x, rate=44100)

In [3]:
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Parameters
    ----------
    lowcut, highcut : float
        Lower and upper band edges, in the same units as ``fs``.
    fs : float
        Sampling rate; the edges are normalised by the Nyquist rate ``fs / 2``.
    order : int, optional
        Order passed to ``scipy.signal.butter`` (default 5).

    Returns
    -------
    tuple
        ``(b, a)`` numerator and denominator coefficients from ``butter``.
    """
    nyquist = fs / 2
    band_edges = [edge / nyquist for edge in (lowcut, highcut)]
    return butter(order, band_edges, btype='band')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass ``data`` with a Butterworth filter.

    The coefficients come from ``butter_bandpass(lowcut, highcut, fs, order)``
    and are applied with ``scipy.signal.lfilter``.

    Returns
    -------
    numpy.ndarray
        The filtered signal as returned by ``lfilter``.
    """
    numerator, denominator = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(numerator, denominator, data)

In [4]:
def divide_into_frames(signal, npoints):
    """Split a 1-D signal into consecutive frames of exactly ``npoints`` samples.

    The signal is zero-padded at the end up to the next multiple of
    ``npoints`` and reshaped so that each row is one frame.

    Parameters
    ----------
    signal : array-like
        1-D sequence of samples. (Inside this function the name shadows the
        ``scipy.signal`` alias imported at the top of the notebook.)
    npoints : int
        Number of samples per frame.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(ceil(len(signal) / npoints), npoints)``; an empty
        signal yields shape ``(0, npoints)``.
    """
    samples = np.asarray(signal)
    # Zeros needed to reach the next multiple of npoints (0 if already aligned).
    # The padding must be computed against the frame length, not the frame
    # count, otherwise frames do not come out npoints long.
    pad_length = (-len(samples)) % npoints
    padded = np.pad(samples, (0, pad_length), 'constant', constant_values=0.0)
    nframes = len(padded) // npoints
    # Each row of the reshaped array is one frame.
    return padded.reshape((nframes, npoints))

# Number of samples in each analysis frame.
npoints = 1152
# Number of band-pass filters applied to every frame (see filter_banks).
nbands = 32
# Number of subbands taken from each band by the MDCT stage (see divide_into_subbands).
nsubbands = 18

# Divide the original signal into frames of npoints samples each
x_frames = divide_into_frames(x, npoints)

# For each frame, compute the fft
# x_frames_fft = fft(x_frames)

# # Compute the signal-to-masking ratio for each frame -- this will be used as a scale
# # value when it comes to quantizing the subbands
# x_smr = psychoacoustical_analyzer(x_frames_fft)

# # For each frame, compute its filter banks -- bands
# x_bands = filter_banks(x_frames, nbands)

# # For each filter bank in each frame, compute the subbands using MDCT
# x_subbands = divide_into_subbands(x_bands, nsubbands)

# # Quantize subbands using the SMRs gathered in the psychoacoustical analyzer
# x_quantized = quantize_subbands(x_subbands, x_smr)


In [5]:
def filter_banks(frames, nbands, fs, order=5):
    """Band-pass every frame through a bank of Butterworth filters.

    The range from 1 Hz to ``fs/2 - 1`` Hz is split into ``nbands`` adjacent,
    equally wide bands. A band-pass filter of the requested ``order`` is
    designed for each band with ``butter_bandpass`` and applied to every
    frame independently.

    Parameters
    ----------
    frames : numpy.ndarray
        2-D array whose rows are frames (``frames.shape[0]`` frames of
        ``frames.shape[1]`` samples).
    nbands : int
        Number of frequency bands.
    fs : float
        Sampling rate in Hz.
    order : int, optional
        Butterworth order forwarded to ``butter_bandpass`` (default 5).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(nbands, frames.shape[0], frames.shape[1])``
        holding the filtered frames, indexed by band first.
    """
    # nbands + 1 equally spaced edges; consecutive pairs delimit one band.
    band_edges = np.linspace(1, fs/2-1, nbands+1)

    x_bands = np.zeros((nbands, frames.shape[0], frames.shape[1]))

    for band, (lowcut, highcut) in enumerate(zip(band_edges[:-1], band_edges[1:])):
        # Forward the caller's order instead of a hard-coded 5.
        b, a = butter_bandpass(lowcut, highcut, fs, order=order)
        # lfilter works along the last axis, so each row (frame) is filtered
        # on its own, starting from a zero filter state.
        x_bands[band] = lfilter(b, a, frames, axis=-1)

    return x_bands

# Band-pass every frame into nbands bands; the result is indexed (band, frame, sample).
x_bands = filter_banks(x_frames, nbands, fs)
# Show the resulting shape as the cell output.
x_bands.shape

(32, 6084, 1152)

In [13]:
def divide_into_subbands(x_bands, nsubbands):
    """Apply a windowed MDCT to every (band, frame) pair.

    A 128-point Kaiser-Bessel-derived (KBD) window with alpha = 5 is built
    once and handed to ``ctt.mdct`` together with each filtered frame.

    Parameters
    ----------
    x_bands : numpy.ndarray
        Array indexed as ``(band, frame, sample)``.
    nsubbands : int
        Number of output rows reserved for each band.
        NOTE(review): ``ctt.mdct`` is defined outside this notebook; its
        transposed output is assumed to have shape ``(nsubbands, 64)`` for
        the frames passed in — confirm against CosineTransformTools.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_bands * nsubbands, n_frames, 64)``; rows
        ``j*nsubbands`` to ``(j+1)*nsubbands - 1`` belong to band ``j``.
    """
    window_length = 128
    alpha_value = 5
    half_length = window_length // 2
    n_bands, n_frames = x_bands.shape[0], x_bands.shape[1]

    # KBD window (as used in the AC-3 audio coding format): running sum of a
    # Kaiser window, mirrored, normalised by the full sum, then square-rooted.
    kaiser = np.kaiser(half_length + 1, alpha_value * np.pi)
    rising_half = np.cumsum(kaiser[:half_length])
    mirrored = np.concatenate((rising_half, rising_half[::-1]))
    kbd_window = np.sqrt(mirrored / np.sum(kaiser))

    x_subbands = np.zeros((n_bands * nsubbands, n_frames, half_length))

    # Visit frames in the outer position and bands in the inner position.
    for frame, band in np.ndindex(n_frames, n_bands):
        rows = slice(band * nsubbands, (band + 1) * nsubbands)
        x_subbands[rows, frame, :] = ctt.mdct(x_bands[band, frame, :], kbd_window).T

    return x_subbands

# Time the MDCT subband decomposition; the elapsed wall-clock seconds are printed.
t1 = time.time()
x_subbands = divide_into_subbands(x_bands, nsubbands)
print(time.time() - t1)

44.4410502910614


In [12]:
# Shape is (nbands * nsubbands, number of frames, 64 coefficients per subband).
x_subbands.shape

(576, 6084, 64)

In [8]:
def psychoacoustical_analyzer(fft_array):
    """Placeholder for the psychoacoustic model.

    Not implemented yet: ``fft_array`` is ignored and the constant ``0`` is
    returned.
    """
    placeholder_result = 0
    return placeholder_result

In [9]:
def powerlaw_quantizer(x):
    """Compress samples with a logarithmic (mu-law, mu = 255) curve.

    Computes ``sgn(x) * log(1 + 255 * |x|) / log(1 + 255)`` element-wise.
    Despite the function name this is a logarithmic companding curve; the
    output lies in [-1, 1] only when the input does.

    Parameters
    ----------
    x : array-like or scalar
        Samples to compress. Integer input is converted to float first.

    Returns
    -------
    numpy.ndarray or numpy.float64
        The compressed values as floats.
    """
    # Work in floating point: with integer dtypes such as int16,
    # 255 * |x| overflows and the logarithm yields NaN or garbage.
    samples = np.asarray(x, dtype='float')
    return sgn(samples)*np.log(1 + 255*np.abs(samples))/np.log(1 + 255)

def sgn(x):
    """Return the sign of each element as a float array.

    Negative values map to -1.0 and everything greater than or equal to zero
    (including zero itself) maps to +1.0. NaN entries match neither
    comparison and are left as NaN. The input is not modified.
    """
    signs = np.array(x, dtype='float', copy=True)
    signs[signs < 0] = -1.0
    signs[signs >= 0] = 1.0
    return signs

In [10]:
# def mdct4(x):
#     N = x.shape[0]
#     if N%4 != 0:
#         raise ValueError("MDCT4 only defined for vectors of length multiple of four.")
#     M = N // 2
#     N4 = N // 4
    
#     rot = np.roll(x, N4)
#     rot[:N4] = -rot[:N4]
#     t = np.arange(0, N4)
#     w = np.exp(-1j*2*np.pi*(t + 1./8.) / N)
#     c = np.take(rot,2*t) - np.take(rot, N-2*t-1) \
#         - 1j * (np.take(rot, M+2*t) - np.take(rot,M-2*t-1))
#     c = (2./np.sqrt(N)) * w * np.fft.fft(0.5 * c * w, N4)
#     y = np.zeros(M)
#     y[2*t] = np.real(c[t])
#     y[M-2*t-1] = -np.imag(c[t])
#     return y


# def imdct4(x):
#     N = x.shape[0]
#     if N%2 != 0:
#         raise ValueError("iMDCT4 only defined for even-length vectors.")
#     M = N // 2
#     N2 = N*2
    
#     t = np.arange(0,M)
#     w = np.exp(-1j*2*np.pi*(t + 1./8.) / N2)
#     c = np.take(x,2*t) + 1j * np.take(x,N-2*t-1)
#     c = 0.5 * w * c
#     c = np.fft.fft(c,M)
#     c = ((8 / np.sqrt(N2))*w)*c
    
#     rot = np.zeros(N2)
    
#     rot[2*t] = np.real(c[t])
#     rot[N+2*t] = np.imag(c[t])
    
#     t = np.arange(1,N2,2)
#     rot[t] = -rot[N2-t-1]
    
#     t = np.arange(0,3*M)
#     y = np.zeros(N2)
#     y[t] = rot[t+M]
#     t = np.arange(3*M,N2)
#     y[t] = -rot[t-3*M]
#     return y