# Vocoders 


Vocoders (voice coders) originated in the early days of speech signal processing. The main idea is to break the spectrum into subbands and then analyze each subband to extract information about it (in early vocoders, just
the energy in each subband). For each short frame of audio (typically 10-50 milliseconds) one calculates the gain for each subband, plus the parameters of an artificial source: a pitch estimate for a pitched oscillator (to encode vowels) and a power level for a noise generator (to encode consonants). So if we have something like 40 frames per second and each frame consists of 8 subband gains, a pitch estimate, and a noise power level (10 numbers per frame), we need about 400 numbers per second instead of the 8000 or 16000 samples per second that are common rates for speech. The resynthesized speech sounds artificial and robotic but is still intelligible. 

In the 1970s-80s musicians started using cross-synthesis vocoders that replaced the synthetic parametric source with a spectrally rich signal - for example, one can feed speech to the vocoder's analysis filterbank and a distorted electric guitar into the audio inputs of the synthesis filterbank. 

One can view a discrete Fourier transform (DFT) as a way to perform a filterbank decomposition of the audio signal using very narrow bandwidth filters. The magnitudes and phases of the corresponding sinusoidal basis functions
can be used to create an additive model of the sound. 

By careful use of windowing, overlap-add, and maintaining phase information one can achieve good sounding modifications such as pitch shifting and time scaling. 






In [None]:
import IPython.display as ipd 
import sys
import pyaudio
import numpy as np
import scipy
from bokeh.io import output_notebook
from bokeh.plotting import figure, output_file, show

# Direct Bokeh output to the notebook; this runs regardless of the backend selected below.
output_notebook()

# Backend selector read by the conditional plot() definitions that follow.
# Exactly one assignment should be active; the alternatives are kept commented out.
plot_library = 'matplotlib'
#plot_library = 'matplotlib_xkcd'
#plot_library = 'bokeh'


if (plot_library=='bokeh'):
    import bokeh 
    from bokeh.io import output_notebook
    from bokeh.plotting import figure, output_file, show
    output_notebook()

    def plot(data_list):
        """Draw every sequence in data_list as a line on a single Bokeh figure and show it."""
        canvas = figure(plot_height=300, plot_width=600, title='Synthesizers')
        for series in data_list:
            # the x coordinate of each point is simply its sample index
            sample_index = np.arange(0, len(series))
            canvas.line(sample_index, series)
        show(canvas)
        
if (plot_library=='matplotlib'): 
    %matplotlib notebook 
    import matplotlib.pyplot as plt
    def plot(data_list, label_list=None, xlabel='', ylabel='', title=''):
        """Plot each sequence in data_list against its sample index on one figure.

        data_list  -- iterable of 1-D sequences to draw.
        label_list -- optional legend labels; label_list[i] names data_list[i].
                      Series without a matching label are still drawn, just
                      without a legend entry.
        xlabel, ylabel, title -- axis labels and the title suffix appended to
                      'Synth-CS: '.
        """
        # None replaces the shared mutable [] default; copying keeps the
        # caller's list untouched.
        labels = list(label_list) if label_list else []

        fig, ax = plt.subplots(figsize=(8,4))
        ax.set_title('Synth-CS: '+title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

        for idx, data in enumerate(data_list):
            # Iterating over zip(data_list, label_list) used to skip every
            # series that had no label, so plot([a, b]) drew nothing at all.
            label = labels[idx] if idx < len(labels) else None
            ax.plot(np.arange(0, len(data)), data, label=label)

        if labels:
            ax.legend()
        plt.ion()
        plt.show()
        
if (plot_library=='matplotlib_xkcd'): 
    %matplotlib notebook 
    import matplotlib.pyplot as plt

    def plot(data_list, label_list=None, xlabel='', ylabel='', title=''):
        """Plot each sequence in data_list against its sample index in xkcd style.

        data_list  -- sequence of 1-D sequences to draw.
        label_list -- optional legend labels; any series without a label is
                      labelled with its position in data_list ('0', '1', ...).
        xlabel, ylabel, title -- axis labels and the title suffix appended to
                      'Synth-CS: '.
        """
        # Switch the sketch style on before the figure exists so that the axes
        # created below pick it up as well.
        plt.xkcd()
        fig, ax = plt.subplots(figsize=(5,3))

        # Work on a private copy: appending to the shared [] default made the
        # labels pile up from one call to the next (and mutated caller lists).
        labels = list(label_list) if label_list else []
        labels.extend(str(i) for i in range(len(labels), len(data_list)))

        ax.set_title('Synth-CS: '+title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

        drawn = False
        for (data, label) in zip(data_list, labels):
            ax.plot(np.arange(0, len(data)), data, label=label)
            drawn = True
        if drawn:
            # one legend for the whole figure instead of one call per series
            ax.legend()
        plt.ion()
        plt.show()

In [None]:
import librosa
import IPython.display as ipd

In [None]:
# Load the speech example (samples in y, sampling rate in sr) and play it back
y, sr = librosa.load(path="lpc_example.wav")
ipd.Audio(data=y, rate=sr)

In [None]:


def overlap_add_spectral(input, win_size, hop_size, mode="full"):
    """Resynthesize a signal frame by frame through the DFT with overlap-add.

    Each Hann-windowed frame is transformed, its phases are optionally
    replaced, and the inverse transform is added back at the same position.

    input    -- 1-D sequence of samples.
    win_size -- frame / FFT length in samples (must be positive).
    hop_size -- distance between consecutive frames in samples (must be positive).
    mode     -- "full" keeps the analysed phases, "random" draws uniform
                phases in [-pi, pi) for every frame, and any other value
                uses a constant phase of pi for every bin.

    Returns an array of length len(input) + win_size scaled by 0.5. That
    fixed gain matches the overlap gain of Hann windows at
    hop_size == win_size / 4 (the setting used in this notebook); other
    ratios are not renormalized.

    Side effect: the frame starting at sample 10 * hop_size is handed to the
    notebook's plot() helper together with its reconstruction.
    """
    if win_size <= 0 or hop_size <= 0:
        raise ValueError("win_size and hop_size must be positive")

    signal = np.asarray(input)
    # np.hanning is the symmetric Hann window; the scipy.signal.hann alias
    # used before is not available in recent SciPy releases.
    window = np.hanning(win_size)
    output = np.zeros(len(signal) + win_size)

    # "+ 1" keeps the frame that ends exactly on the last input sample
    for i in range(0, len(signal) - win_size + 1, hop_size):
        # slice the audio into overlapping, windowed frames
        frame = window * signal[i:i + win_size]
        complex_spectrum = np.fft.fft(frame)

        # split into magnitude and (possibly replaced) phase
        magnitudes = np.abs(complex_spectrum)
        if mode == "full":
            phases = np.angle(complex_spectrum)
        elif mode == "random":
            phases = np.random.uniform(-np.pi, np.pi, size=win_size)
        else:
            phases = np.full(win_size, np.pi)

        # back to a complex spectrum, then to the time domain; the imaginary
        # part is discarded because modified phases need not be Hermitian
        mod_spectrum = magnitudes * np.exp(1j * phases)
        reconstructed_frame = np.fft.ifft(mod_spectrum).real

        if i == 10 * hop_size:
            plot([frame, reconstructed_frame], ['frame', 'reco'])

        output[i:i + win_size] += reconstructed_frame
    return 0.5 * output

# Resynthesize with random phases per frame and listen to the result
output = overlap_add_spectral(y, win_size=2048, hop_size=512, mode="random")
ipd.Audio(data=output, rate=sr, normalize=False)

In [None]:
# Resynthesize with the constant-phase fallback (any mode other than "full"/"random")
output = overlap_add_spectral(y, win_size=2048, hop_size=512, mode="zeros")
ipd.Audio(data=output, rate=sr, normalize=False)

In [None]:
# Random-phase resynthesis once more, for comparison with the previous cell
output = overlap_add_spectral(y, win_size=2048, hop_size=512, mode="random")
ipd.Audio(data=output, rate=sr, normalize=False)

In [None]:
def overlap_add_time_stretch(input, win_size, hop_size, mode="full", time_scale = 2.0):
    """Time-scale a signal by overlap-adding frames at a different hop size.

    Frames are read every int(hop_size / time_scale) samples (at least 1) and
    written every hop_size samples, so the output is roughly time_scale times
    as long as the input. No phase correction is applied between frames.

    input      -- 1-D sequence of samples.
    win_size   -- frame / FFT length in samples (must be positive).
    hop_size   -- synthesis hop in samples (must be positive).
    mode       -- "full" keeps the analysed phases, "random" draws uniform
                  phases in [-pi, pi) for every frame, any other value uses
                  zero phase for every bin.
    time_scale -- stretch factor (> 1 lengthens, < 1 shortens; must be
                  positive). Because the analysis hop cannot drop below one
                  sample, factors above hop_size are not reached.

    Returns an array of length int(time_scale * len(input) + win_size) scaled
    by 0.5, the overlap gain of Hann windows at hop_size == win_size / 4.
    """
    if win_size <= 0 or hop_size <= 0:
        raise ValueError("win_size and hop_size must be positive")
    if time_scale <= 0:
        raise ValueError("time_scale must be positive")

    signal = np.asarray(input)
    # np.hanning is the symmetric Hann window; the scipy.signal.hann alias
    # used before is not available in recent SciPy releases.
    window = np.hanning(win_size)
    output = np.zeros(int(time_scale * len(signal) + win_size))

    # clamp to 1: a truncated hop of 0 would make range() raise ValueError
    ana_hop_size = max(1, int(hop_size / time_scale))
    syn_hop_size = hop_size

    k = 0  # write position in the output
    # "+ 1" keeps the frame that ends exactly on the last input sample
    for i in range(0, len(signal) - win_size + 1, ana_hop_size):
        if k + win_size > len(output):
            # k only grows, so no later frame can fit either
            break

        complex_spectrum = np.fft.fft(window * signal[i:i + win_size])

        # split into magnitude and (possibly replaced) phase
        magnitudes = np.abs(complex_spectrum)
        if mode == "full":
            phases = np.angle(complex_spectrum)
        elif mode == "random":
            phases = np.random.uniform(-np.pi, np.pi, size=win_size)
        else:
            phases = np.zeros(win_size)

        # back to a complex spectrum, then to the time domain
        mod_spectrum = magnitudes * np.exp(1j * phases)
        reconstructed_frame = np.fft.ifft(mod_spectrum).real

        output[k:k + win_size] += reconstructed_frame
        k += syn_hop_size

    return 0.5 * output

# Halve the duration with plain overlap-add and zero phases, then listen
output = overlap_add_time_stretch(y, win_size=2048, hop_size=512, mode="zero", time_scale=0.5)
ipd.Audio(data=output, rate=sr, normalize=False)

In [None]:
def phasevocoder(input, win_size, hop_size, time_scale = 2.0, mode="delta"):
    """Time-stretch a signal with an STFT phase vocoder.

    Frames are analysed every int(hop_size / time_scale) samples (at least 1)
    and overlap-added every hop_size samples.

    input      -- 1-D sequence of samples.
    win_size   -- frame / FFT length in samples (must be positive).
    hop_size   -- synthesis hop in samples (must be positive).
    time_scale -- stretch factor (> 1 lengthens, < 1 shortens; must be positive).
    mode       -- phase used for resynthesis:
                  'delta'    estimate each bin's instantaneous frequency from
                             the phase change between analysis frames and
                             advance the synthesis phase by that frequency
                             over the synthesis hop;
                  'original' reuse the analysed phases unchanged;
                  'random'   uniform random phases in [-pi, pi) per frame;
                  any other value uses zero phase for every bin.

    Returns an array of length int(time_scale * len(input) + win_size) scaled
    by 0.5, the overlap gain of Hann windows at hop_size == win_size / 4.
    """
    if win_size <= 0 or hop_size <= 0:
        raise ValueError("win_size and hop_size must be positive")
    if time_scale <= 0:
        raise ValueError("time_scale must be positive")

    signal = np.asarray(input)
    # np.hanning is the symmetric Hann window; the scipy.signal.hann alias
    # used before is not available in recent SciPy releases.
    window = np.hanning(win_size)
    output = np.zeros(int(time_scale * len(signal) + win_size))

    # clamp to 1: a truncated hop of 0 would make range() raise ValueError
    ana_hop_size = max(1, int(hop_size / time_scale))
    syn_hop_size = hop_size

    two_pi = 2.0 * np.pi
    # centre frequency of every DFT bin in radians per sample, and the phase
    # a sinusoid exactly at that frequency gains over one analysis hop
    bin_freqs = two_pi * np.arange(win_size) / win_size
    expected_advance = bin_freqs * ana_hop_size

    accum_phases = np.zeros(win_size)
    prev_phases = None

    k = 0  # write position in the output
    for i in range(0, len(signal) - win_size + 1, ana_hop_size):
        if k + win_size > len(output):
            # k only grows, so no later frame can fit either
            break

        complex_spectrum = np.fft.fft(window * signal[i:i + win_size])
        magnitudes = np.abs(complex_spectrum)
        phases = np.angle(complex_spectrum)

        if mode == 'delta':
            if prev_phases is None:
                # the first frame starts from its own analysed phases
                accum_phases = phases.copy()
            else:
                # deviation of the measured phase change from the bin-centre
                # prediction, wrapped back into [-pi, pi)
                deviation = phases - prev_phases - expected_advance
                deviation = (deviation + np.pi) % two_pi - np.pi
                true_freqs = bin_freqs + deviation / ana_hop_size
                # advance by the synthesis hop (not the analysis hop) so that
                # overlapping output frames stay phase-coherent; the modulo
                # keeps the running sum small on long signals
                accum_phases = (accum_phases + syn_hop_size * true_freqs) % two_pi
            prev_phases = phases
            synth_phases = accum_phases
        elif mode == 'original':
            synth_phases = phases
        elif mode == 'random':
            synth_phases = np.random.uniform(-np.pi, np.pi, size=win_size)
        else:
            synth_phases = np.zeros(win_size)

        # back to a complex spectrum, then to the time domain
        mod_spectrum = magnitudes * np.exp(1j * synth_phases)
        reconstructed_frame = np.fft.ifft(mod_spectrum).real

        output[k:k + win_size] += reconstructed_frame
        k += syn_hop_size

    return 0.5 * output


#pitch_ratio = 5.0/4.0 
pitch_ratio = 4.0/5.0

# Time-stretch the loaded signal by the reciprocal of pitch_ratio and listen
output = phasevocoder(y, win_size=2048, hop_size=512, time_scale=1.0/pitch_ratio, mode='delta')
ipd.Audio(data=output, rate=sr, normalize=True)

In [None]:
import resampy
# Resample the stretched signal from sr to sr * pitch_ratio, then play it at the original rate
pitch_shifted = resampy.resample(output, sr_orig=sr, sr_new=sr * pitch_ratio)
ipd.Audio(data=pitch_shifted, rate=sr, normalize=True)


In [None]:
# Time stretch only (factor 1.25, default phase mode); no resampling afterwards
output = phasevocoder(y, win_size=2048, hop_size=512, time_scale=1.25)
ipd.Audio(data=output, rate=sr, normalize=True)

In [None]:
# pitch shift with resampling 

In [None]:
import resampy
# Resample the unprocessed signal from sr to sr * pitch_ratio and play it at the original rate
pitch_shifted = resampy.resample(y, sr_orig=sr, sr_new=sr * pitch_ratio)
ipd.Audio(data=pitch_shifted, rate=sr, normalize=True)

In [None]:
# Replace the working signal with the disco example (samples in y, sampling rate in sr) and play it
y, sr = librosa.load(path="disco.00000.wav")
ipd.Audio(data=y, rate=sr)

In [None]:
# Time-stretch the loaded signal by the reciprocal of pitch_ratio (default phase mode) and listen
pitch_ratio = 5.0/4.0
output = phasevocoder(y, win_size=2048, hop_size=512, time_scale=1.0/pitch_ratio)
ipd.Audio(data=output, rate=sr, normalize=True)

In [None]:
import resampy
# Resample the stretched signal from sr to sr * pitch_ratio, then play it at the original rate
pitch_shifted = resampy.resample(output, sr_orig=sr, sr_new=sr * pitch_ratio)
ipd.Audio(data=pitch_shifted, rate=sr, normalize=True)

In [None]:
def phasevocoder(input, win_size, hop_size, time_scale = 2.0, mode="delta", num_sinusoids=100):
    """Time-stretch a signal with an STFT phase vocoder that keeps only the strongest bins.

    Frames are analysed every int(hop_size / time_scale) samples (at least 1)
    and overlap-added every hop_size samples. Before resynthesis, every DFT
    bin whose magnitude is below the num_sinusoids-th largest magnitude of
    the frame is set to zero.

    input         -- 1-D sequence of samples.
    win_size      -- frame / FFT length in samples (must be positive).
    hop_size      -- synthesis hop in samples (must be positive).
    time_scale    -- stretch factor (> 1 lengthens, < 1 shortens; must be positive).
    mode          -- phase used for resynthesis:
                     'delta'    estimate each bin's instantaneous frequency
                                from the phase change between analysis frames
                                and advance the synthesis phase by that
                                frequency over the synthesis hop;
                     'original' reuse the analysed phases unchanged;
                     'random'   uniform random phases in [-pi, pi) per frame;
                     any other value uses zero phase for every bin.
    num_sinusoids -- number of DFT bins kept per frame, counted over the full
                     two-sided spectrum. Values are clamped to
                     [0, win_size]; ties at the threshold are all kept.

    Returns an array of length int(time_scale * len(input) + win_size) scaled
    by 0.5, the overlap gain of Hann windows at hop_size == win_size / 4.
    """
    if win_size <= 0 or hop_size <= 0:
        raise ValueError("win_size and hop_size must be positive")
    if time_scale <= 0:
        raise ValueError("time_scale must be positive")

    signal = np.asarray(input)
    # np.hanning is the symmetric Hann window; the scipy.signal.hann alias
    # used before is not available in recent SciPy releases.
    window = np.hanning(win_size)
    output = np.zeros(int(time_scale * len(signal) + win_size))

    # clamp to 1: a truncated hop of 0 would make range() raise ValueError
    ana_hop_size = max(1, int(hop_size / time_scale))
    syn_hop_size = hop_size

    # Clamp so that oversized requests no longer index past the sorted array
    # and 0 really keeps nothing (index -0 used to select the minimum).
    keep = min(max(int(num_sinusoids), 0), win_size)

    two_pi = 2.0 * np.pi
    # centre frequency of every DFT bin in radians per sample, and the phase
    # a sinusoid exactly at that frequency gains over one analysis hop
    bin_freqs = two_pi * np.arange(win_size) / win_size
    expected_advance = bin_freqs * ana_hop_size

    accum_phases = np.zeros(win_size)
    prev_phases = None

    k = 0  # write position in the output
    for i in range(0, len(signal) - win_size + 1, ana_hop_size):
        if k + win_size > len(output):
            # k only grows, so no later frame can fit either
            break

        complex_spectrum = np.fft.fft(window * signal[i:i + win_size])
        magnitudes = np.abs(complex_spectrum)
        phases = np.angle(complex_spectrum)

        # keep only the strongest bins of this frame
        if keep == 0:
            magnitudes[:] = 0
        else:
            mag_threshold = np.sort(magnitudes)[-keep]
            magnitudes[magnitudes < mag_threshold] = 0

        if mode == 'delta':
            if prev_phases is None:
                # the first frame starts from its own analysed phases
                accum_phases = phases.copy()
            else:
                # deviation of the measured phase change from the bin-centre
                # prediction, wrapped back into [-pi, pi)
                deviation = phases - prev_phases - expected_advance
                deviation = (deviation + np.pi) % two_pi - np.pi
                true_freqs = bin_freqs + deviation / ana_hop_size
                # advance by the synthesis hop (not the analysis hop) so that
                # overlapping output frames stay phase-coherent; the modulo
                # keeps the running sum small on long signals
                accum_phases = (accum_phases + syn_hop_size * true_freqs) % two_pi
            prev_phases = phases
            synth_phases = accum_phases
        elif mode == 'original':
            synth_phases = phases
        elif mode == 'random':
            synth_phases = np.random.uniform(-np.pi, np.pi, size=win_size)
        else:
            synth_phases = np.zeros(win_size)

        # back to a complex spectrum, then to the time domain
        mod_spectrum = magnitudes * np.exp(1j * synth_phases)
        reconstructed_frame = np.fft.ifft(mod_spectrum).real

        output[k:k + win_size] += reconstructed_frame
        k += syn_hop_size

    return 0.5 * output


pitch_ratio = 2.0/4.0 
#pitch_ratio = 4.0/5.0

# Time-stretch by the reciprocal of pitch_ratio, keeping at most 2000 bins per frame, and listen
output = phasevocoder(y, win_size=2048, hop_size=512, time_scale=1.0/pitch_ratio,
                      mode='delta', num_sinusoids=2000)
ipd.Audio(data=output, rate=sr, normalize=True)