In [1]:
import librosa
import numpy as np
import sys
import pickle
import time
import IPython.display as ipd
from codec import encode, decode
import scipy.signal as signal
from scipy.fftpack import fft, ifft, dct, idct
from scipy.signal import butter, lfilter
import matplotlib.pyplot as plt

In [2]:
%matplotlib 

Using matplotlib backend: TkAgg


In [3]:
def load_cd_quality_audio(filename, sample_rate=44100):
    """Load an audio file and return it as 16-bit integer PCM samples.

    The file is decoded by ``librosa.load`` (resampled to ``sample_rate``),
    scaled from librosa's floating-point range to the int16 range and cast
    to ``int16``.

    Parameters
    ----------
    filename : str or path-like
        Audio file to read; passed straight to ``librosa.load``.
    sample_rate : int, optional
        Target sampling rate in Hz handed to ``librosa.load``.
        Defaults to 44100 (CD rate).

    Returns
    -------
    numpy.ndarray
        The samples as ``int16``. The cast truncates toward zero.
    """
    # np.float64 is spelled out explicitly instead of the legacy 'float_'
    # alias (np.float_ was removed in NumPy 2.0).
    audio, _ = librosa.load(filename, sr=sample_rate, dtype=np.float64)

    max_int_value = 2**15 - 1
    audio *= max_int_value

    # Saturate before the cast: values outside the int16 range (e.g. overshoot
    # beyond +/-1.0 in the decoded signal) would otherwise overflow the
    # integer conversion instead of clipping.
    np.clip(audio, -max_int_value - 1, max_int_value, out=audio)

    return audio.astype('int16')
    
# CD sampling rate in Hz; the same rate load_cd_quality_audio resamples to
sample_rate = 44100

# Test excerpt used throughout the notebook, as int16 PCM samples
audio_signal = load_cd_quality_audio("taxman.wav")

In [12]:
# Forward MDCT, windowed with the slope function used by the Vorbis codec:
#   w[n] = sin(pi/2 * sin^2(pi * (n + 0.5) / N)),  n = 0 .. N-1
window_length = 2048
window_function = np.sin(
    np.pi / 2
    * np.power(np.sin(np.pi / window_length * np.arange(0.5, window_length + 0.5)), 2)
)
audio_mdct = mdct(audio_signal, window_function)

# Resynthesize with the inverse MDCT and keep only the first len(audio_signal)
# samples so the result lines up sample-for-sample with the input
audio_signal2 = imdct(audio_mdct, window_function)[:len(audio_signal)]
print(audio_mdct.shape, audio_signal2.shape)

# Reconstruction error between the input and its resynthesis
error_signal = audio_signal - audio_signal2
print(error_signal.shape)

(1024, 6845) (7008248,)
(7008248,)


In [None]:
# Original, resynthesized, and error signals, one panel each
plt.rc('font', size=10)

# One x tick per whole second. The labels are derived from the tick positions
# so both arrays always have the same length; computing them independently
# gives one label too many when the signal lasts an exact number of seconds.
second_ticks = np.arange(sample_rate, len(audio_signal), sample_rate)
second_labels = second_ticks // sample_rate

fig, axes = plt.subplots(3, 1, figsize=(20, 10))
panels = (
    (audio_signal, "Original Signal"),
    (audio_signal2, "Resynthesized Signal"),
    (error_signal, "Error Signal"),
)
for ax, (samples, title) in zip(axes, panels):
    ax.plot(samples)
    ax.autoscale(tight=True)
    ax.set_title(title)
    ax.set_xticks(second_ticks)
    ax.set_xticklabels(second_labels)
    ax.set_xlabel('Time (s)')

plt.show()

In [9]:
def mdct(audio_signal, window_function):
    """Compute the modified discrete cosine transform (MDCT) of a signal via the FFT.

    The signal is zero-padded, cut into frames of ``len(window_function)``
    samples that advance by half a window, and each windowed frame ``x`` is
    transformed to

        X[k] = sum_n x[n] * cos(2*pi/N * (n + 0.5 + N/4) * (k + 0.5)),

    for ``k = 0 .. N/2 - 1`` with ``N = len(window_function)``. No extra
    scaling is applied.

    Parameters
    ----------
    audio_signal : sequence or numpy.ndarray
        Samples to transform (indexed as a 1-D signal).
    window_function : numpy.ndarray
        Analysis window; its length sets the frame length.

    Returns
    -------
    numpy.ndarray
        Real array of shape ``(window_length // 2, number_times)`` with one
        column of coefficients per frame.
    """
    signal_length = len(audio_signal)
    window_length = len(window_function)
    half_window = int(window_length / 2)

    # Number of half-overlapping frames
    number_times = int(np.ceil(2 * signal_length / window_length) + 1)

    # Half a window of zeros in front, and enough zeros behind for the last frame
    leading_zeros = half_window
    trailing_zeros = int((number_times + 1) * window_length / 2 - signal_length)
    padded_signal = np.pad(audio_signal, (leading_zeros, trailing_zeros),
                           'constant', constant_values=0)

    # Complex factors applied before and after the FFT; together they turn the
    # DFT kernel into the shifted cosine kernel of the MDCT
    pre_twiddle = np.exp(-1j * np.pi / window_length * np.arange(0, window_length))
    post_twiddle = np.exp(-1j * np.pi / window_length * (window_length / 2 + 1)
                          * np.arange(0.5, window_length / 2 + 0.5))

    coefficients = np.zeros((half_window, number_times))
    for frame_index in range(number_times):
        # Windowed frame starting half a window after the previous one
        start = frame_index * half_window
        frame = padded_signal[start:start + window_length] * window_function

        # Only the first half of the spectrum is needed; keep its real part
        spectrum = np.fft.fft(frame * pre_twiddle)
        coefficients[:, frame_index] = np.real(spectrum[:half_window] * post_twiddle)

    return coefficients

In [8]:
def imdct(audio_mdct, window_function):
    """Compute the inverse MDCT via the FFT and overlap-add the frames.

    Every column of ``audio_mdct`` is turned back into a frame of
    ``2 * number_frequencies`` samples

        y[n] = (2 / K) * w[n] * sum_k X[k] * cos(pi/K * (n + 0.5 + K/2) * (k + 0.5)),

    with ``K = number_frequencies``. Consecutive frames are added with a hop of
    ``K`` samples, which cancels the time-domain aliasing (TDAC) when the
    window is suitable for it.

    Parameters
    ----------
    audio_mdct : numpy.ndarray
        MDCT coefficients of shape ``(number_frequencies, number_times)``.
    window_function : numpy.ndarray
        Synthesis window; must broadcast against frames of
        ``2 * number_frequencies`` samples.

    Returns
    -------
    numpy.ndarray
        Signal of ``number_frequencies * (number_times - 1)`` samples: the
        overlap-added frames with the first and last ``number_frequencies``
        samples removed. Callers that know the original length should trim
        the result to it.
    """
    # Number of frequency channels and time frames
    number_frequencies, number_times = np.shape(audio_mdct)
    window_length = 2 * number_frequencies

    # Overlap-add buffer covering every frame
    number_samples = number_frequencies * (number_times + 1)
    audio_signal = np.zeros(number_samples)

    # Complex factors applied before and after the FFT; together they turn the
    # DFT kernel into the shifted cosine kernel of the inverse MDCT
    preprocessing_array = np.exp(-1j * np.pi / (2 * number_frequencies)
                                 * (number_frequencies + 1) * np.arange(0, number_frequencies))
    postprocessing_array = np.exp(-1j * np.pi / (2 * number_frequencies)
                                  * np.arange(0.5 + number_frequencies / 2,
                                              2 * number_frequencies + number_frequencies / 2 + 0.5)) \
        / number_frequencies

    # One zero-padded FFT per frame (rows), then window the real part
    frames = np.fft.fft(np.asarray(audio_mdct).T * preprocessing_array, n=window_length, axis=1)
    frames = 2 * (np.real(frames * postprocessing_array) * window_function).T

    # Overlap-add the frames with a hop of half a window
    for time_index in range(number_times):
        sample_index = time_index * number_frequencies
        audio_signal[sample_index:sample_index + window_length] += frames[:, time_index]

    # Drop exactly number_frequencies samples at each end. The previous slice
    # ended at -number_frequencies - 1 and lost one extra sample, so a signal
    # whose length is a multiple of number_frequencies came back one sample
    # short. An explicit end index is also safe when number_frequencies is 0.
    return audio_signal[number_frequencies:number_samples - number_frequencies]