In [None]:
using DSP;
using PyPlot;
import Statistics: mean;
using Dierckx;
using FFTW;
using WAV;

This notebook covers rendering code for:
- tones with time-varying frequency and amplitude
- noises with time-varying spectra

Along the way, it introduces gammatonegrams for visualization, the necessary signal processing utilities, and finally specific stimulus generation code. Both renderers are based on an excitation-filter split.

### Gammatonegram: time-frequency representation of sound
Julia implementation based on https://labrosa.ee.columbia.edu/matlab/gammatonegram/

In [None]:
function fft2gammatonemx(nfft, sr, nfilts, width, minfreq, maxfreq, maxlen)
    #=Generates a matrix of weights to combine FFT bins into Gammatone bins

    Input
    -----
    nfft: source fft size at sampling rate sr
    sr (samples/sec): sampling rate
    nfilts: number of output bands
    width: multiplier applied to the ERB bandwidth of every band
    minfreq, maxfreq (Hz): range covered
    maxlen: number of FFT bins (columns) kept in the weight matrix

    Returns
    -------
    wts (nfilts, maxlen): weight matrix
    cfreqs: centre frequencies of the gammatone bands in Hz, in ascending order
    =#

    #Constants, after Slaney's MakeERBFilters
    EarQ = 9.26449
    minBW = 24.7
    T = 1/sr
    nFr = 1:nfilts
    em = EarQ*minBW
    cfreqs = (maxfreq + em) * exp.(((-1*log(maxfreq + em) + log(minfreq + em))/nfilts)*nFr).-em
    cfreqs = cfreqs[end:-1:1] #computed high-to-low; flip to ascending

    ERB = width*( cfreqs/EarQ .+ minBW )
    B = 1.019 * 2 * pi * ERB
    r = exp.(-B*T)
    cre = 2 * pi * cfreqs * T; cim = im*cre;

    ebt = exp.(B*T)
    ccpt = 2*T*cos.(cre); scpt = 2*T*sin.(cre)
    xm = sqrt(3 - 2^1.5); xp = sqrt(3 + 2^1.5)
    #Each A(...) is a (1, nfilts) row; As stacks the four variants as rows
    A(f::Function, x) = transpose(sr*0.5*( f.(ccpt./ebt, x*scpt./ebt) ))
    As = vcat(A(+, xp), A(-, xp), A(+, xm), A(-, xm))

    G(f::Function, x) = (-2*T*exp.(2*cim) .+ 2*T*exp.(-B.*T .+ cim).*( f.(cos.(cre), x*sin.(cre))))
    G_denom = ( -2*exp.(-2*B*T) - 2*exp.(2*cim) + 2*r.*(1 .+ exp.(2*cim)) ).^4
    gain = abs.(G(-, xm).*G(+, xm).*G(-, xp).*G(+, xp)./G_denom)

    #Points on the unit circle for FFT bins 0 .. floor(nfft/2), as a (1, nbins) row.
    #Built elementwise so that no complex-valued range arithmetic is involved.
    bins = 0:floor(Int, nfft/2)
    ucirc = transpose([exp(2*im*pi*k/nfft) for k in bins])
    pole = r .* exp.(cim)

    #Column vectors of length nfilts broadcast against the (1, nbins) row, so every
    #term below is (nfilts, nbins) for any nfilts (previously hard-coded to 64 rows)
    U(n) = abs.(ucirc .- As[n,:])
    Utg = (T^4)./gain
    Upole = abs.(((pole .- ucirc).*( conj.(pole) .- ucirc)).^(-4))

    wts = Utg.*U(1).*U(2).*U(3).*U(4).*Upole
    wts = wts[:,1:maxlen]

    return wts, cfreqs

end

function gammatonegram(x, sr)
    #=Fast cochleagram: a spectrogram whose channels are pooled with gammatone weights

    Input
    -----
    x: audio signal (passed directly to DSP's spectrogram)
    sr (Hz): audio sampling rate

    Fixed parameters (set in the function body)
    ----------------
    twin (sec): duration of each analysis window
    thop (sec): time between successive analysis windows
    nfilts: number of channels in the gammatone filterbank
    fmin (Hz), fmax (Hz): range of frequencies covered; fmax is sr/2
    width: bandwidth scale passed to fft2gammatonemx
    log_constant: floor applied before the log so zero-valued cells are defined
    dB_threshold: floor applied to the result after conversion to dB

    Returns
    -------
    gammatonegram (in dB), one row per channel
    vector of timepoints from the spectrogram
    centre frequencies (Hz) returned by fft2gammatonemx

    =#

    #Analysis parameters
    twin = 0.025; thop = 0.010;
    nfilts = 64; fmin = 50; fmax = sr/2; width = 1.0;
    log_constant = 1e-10; dB_threshold = 0;

    #Spectrogram geometry: fft size is the power of two covering two windows
    pow2_len = floor(Int, 2^(ceil(log(2*twin*sr)/log(2))))
    nfft = DSP.Util.nextfastfft(pow2_len)
    nwin = round(Int, twin*sr)
    nhop = round(Int, thop*sr)
    noverlap = nwin - nhop
    nbins = floor(Int, nfft/2 + 1)

    #Pool spectrogram bins into gammatone channels
    wts, cfreqs = fft2gammatonemx(nfft, sr, nfilts, width, fmin, fmax, nbins)
    # I think this spectrogram might not be quite right
    # NOTE(review): the weights are applied to power(sg) and then scaled by 20*log10;
    # confirm whether magnitude (or 10*log10) was intended
    sg = DSP.Periodograms.spectrogram(x, nwin, noverlap, nfft=nfft, fs=sr, window=hanning(nwin) )
    pooled = (1/nfft)*(wts*power(sg))

    #Convert to dB with a floor before and after the log
    floored = max.(pooled, log_constant)
    gtg_dB = max.(20*log.(10, floored), dB_threshold)

    return gtg_dB, time(sg), cfreqs

end

function plot_gtg(gtg, t, f)
    #Show the output of 'gammatonegram' as an image with a colourbar.
    #Axes run from 0 to the last timepoint (x) and from 0 to the last entry of f (y).
    bounds = (0, t[end], 0, f[end])
    colorbar(imshow(gtg, cmap="Blues", origin="lower", extent=bounds, aspect="auto"))
end

### General signal utilities

In [None]:
#Frequency scale conversions
function freq_to_ERB(freq)
    #Map frequency in Hz (scalar or array) onto the ERB scale used in this notebook
    ear_q = 9.265
    min_bw = 24.7
    scaled = freq ./ (min_bw*ear_q)
    return ear_q * log.(scaled .+ 1)
end

function ERB_to_freq(ERB)
    #Inverse of freq_to_ERB: map ERB-scale values (scalar or array) back to Hz
    ear_q = 9.265
    min_bw = 24.7
    scale = min_bw*ear_q
    return scale * (exp.(ERB ./ ear_q) .- 1)
end

In [None]:
function hann_ramp(x, sr)
    #= Returns a copy of soundwave x with half-Hann onset and offset ramps applied
    x: soundwave (left unmodified; integer input is converted to floating point)
    sr (samples/sec)

    Parameters
    ----------
    ramp_duration (sec): duration of onset = duration of offset

    Throws
    ------
    ArgumentError if x is shorter than one ramp

    =#

    #Set parameters
    ramp_duration = 0.010

    #Make ramps
    t = 0:1/sr:ramp_duration
    n_samples = length(t)
    if length(x) < n_samples
        throw(ArgumentError("signal has $(length(x)) samples but each ramp needs $(n_samples)"))
    end
    off_ramp = 0.5*(1 .+ cos.( (pi/ramp_duration)*t )) #falls from 1 to 0
    on_ramp = reverse(off_ramp)                        #rises from 0 to 1

    #Apply ramps to a fresh floating-point copy so the caller's array is untouched
    y = float.(x)
    y[1:n_samples] .*= on_ramp
    y[end-n_samples+1:end] .*= off_ramp

    return y

end

Creates windows in order to segment and differentially amplify the excitation signal:

In [None]:
function time_win_shape(x)
    #Raised cosine: 1 at x = 0, falling to 0 at x = -pi and x = pi
    return 0.5 .* (1 .+ cos.(x))
end

function make_overlapping_time_windows(filt_length, tstep, sr)
    #=Builds a bank of half-overlapping cos-shaped windows, evenly spaced in time

    Input
    -----
    filt_length: length of time dimension of filter array (= number of windows)
    tstep (s): duration of one filter element
    sr (Hz): audio sampling rate

    Output
    ------
    win_arr (n_samples_in_scene, n_win):
        one window per column; n_samples_in_scene = (filt_length - 1)*floor(tstep*sr).
        Interior columns hold a full window, the first and last hold half windows.

    =#

    win_len = 2*(floor(tstep*sr))
    half_len = win_len/2
    sig_len = (filt_length - 1)*half_len

    # Check for erroneous inputs
    hop_divides_signal = sig_len % half_len == 0
    win_is_even = win_len % 2 == 0
    if !(hop_divides_signal && win_is_even)
        error("win_len must be even and sig_len must be divisible by half of win_len")
    end

    n_win = Int(floor(2*sig_len/win_len + 1))
    hop = Int(floor(half_len))
    last_start = Int(sig_len - floor(win_len/2))
    full = Int(win_len)

    window = time_win_shape(range(-pi, stop=pi, length=full))
    win_arr = zeros(Int(sig_len), n_win)

    #Full windows go in columns 2, 3, ...; each starts one hop after the previous
    for (k, lo) in enumerate(1:hop:last_start)
        win_arr[lo:lo + full - 1, k + 1] = window
    end

    #Half windows at the edges
    win_arr[1:hop+1, 1] = window[end-hop:end]
    win_arr[end-hop:end, end] = window[1:hop+1]

    return win_arr

end

function freq_win_shape(x)
    #Half-period cosine: 1 at x = 0, falling to 0 at x = -pi and x = pi
    return cos.(0.5 .* x)
end

function make_overlapping_freq_windows(filt_length, lowf, highf, estep, tstep, sr)
    #=Computes a bank of overlapping cos-shaped windows, evenly spaced on the ERB scale

    Input
    -----
    filt_length: length of time dimension of filter array
    lowf (Hz), highf (Hz), estep (ERB): defines cutoffs, widths of frequency channels;
        highf is capped at sr/2
    tstep (s): duration of one filter element
    sr (Hz): audio sampling rate

    Output
    ------
    win_arr (n_samples_in_scene, n_channels):
        one window per column, sampled on n_samples_in_scene frequency bins that are
        linearly spaced from 0 to sr/2;
        n_samples_in_scene = (filt_length - 1)*floor(tstep*sr)

    =#

    n_freqs = Int((filt_length - 1)*floor(tstep*sr))
    max_freq = sr/2
    loERB = freq_to_ERB(lowf)
    hiERB = freq_to_ERB(min(max_freq, highf))
    #Channel i spans cutoffs i and i+2, so two more cutoffs than channels are needed
    n_cutoffs = length(range(loERB, stop=hiERB, step=estep)) + 2
    n_channels = n_cutoffs - 2
    freqs = range(0, stop=max_freq, length=n_freqs)
    ERB_cutoffs = range(loERB, stop=hiERB, length=n_cutoffs)

    win_arr = zeros(n_freqs, n_channels)
    for curr_channel in 1:n_channels

        curr_lo_ERB = ERB_cutoffs[curr_channel]
        curr_hi_ERB = ERB_cutoffs[curr_channel + 2]
        curr_lo_freq = ERB_to_freq(curr_lo_ERB)
        curr_hi_freq = ERB_to_freq(curr_hi_ERB)

        #First bin strictly above the low cutoff and first bin at or above the high
        #cutoff. If no bin qualifies the index falls one past the end, so a cutoff that
        #lands just above sr/2 (float round-trip) keeps the channel's top bins instead
        #of producing an empty window.
        curr_lo_freq_idx = something(findfirst(f -> f > curr_lo_freq, freqs), n_freqs + 1)
        curr_hi_freq_idx = something(findfirst(f -> f >= curr_hi_freq, freqs), n_freqs + 1)
        curr_bins = curr_lo_freq_idx:curr_hi_freq_idx-1

        curr_mean_ERB = (curr_hi_ERB + curr_lo_ERB)/2
        ERB_bandwidth = curr_hi_ERB - curr_lo_ERB

        #Map the channel's bins to [-1, 1] on the ERB scale, then to the window shape
        curr_ERBs = freq_to_ERB(freqs[curr_bins])
        normalized_domain = 2*(curr_ERBs .- curr_mean_ERB)/ERB_bandwidth
        win_arr[curr_bins, curr_channel] = freq_win_shape(pi.*normalized_domain)

    end

    return win_arr

end

Generating and manipulating frequency subbands for creating time-varying spectra:

In [None]:
function generate_subbands(x, filterbank)
    #=Split a sound into frequency subbands by weighting its DCT coefficients

    Input
    -----
    x: 1D signal
    filterbank (length(x), n_freq_channels): one frequency window per column

    Returns
    -------
    subbands (length(x), n_freq_channels): one time-domain subband per column
    =#

    n_samples = length(x)
    n_bins, n_channels = size(filterbank)
    n_samples == n_bins || error("Signal length must equal filter length.")

    #Weight a copy of the spectrum by each channel's window, then invert column-wise
    spectrum = FFTW.dct(x)
    weighted = filterbank .* repeat(spectrum, 1, n_channels)
    return FFTW.idct(weighted, 1)

end

function collapse_subbands(subbands, filterbank)
    #=Re-filter each subband with its own window and sum them into one sound

    Input
    -----
    subbands (length(x), n_channels): subbands from "generate_subbands"
    filterbank (length(x), n_channels): one frequency window per column

    Returns
    -------
    (length(x), 1) array holding the sum over channels

    =#

    if size(subbands, 1) != size(filterbank, 1)
        error("Signal length must equal filter length.")
    end

    #Column-wise DCT, weight by the matching window, column-wise inverse DCT
    spectra = FFTW.dct(subbands, 1)
    refiltered = FFTW.idct(filterbank .* spectra, 1)
    return sum(refiltered, dims=2)

end

function modulate_subbands(subbands, win_arr, energy_grid; win_len=nothing)
    #=Change the amplitude of subbands in each time element & frequency subband

    Every (channel, window) cell is rescaled so that its RMS level equals the
    matching entry of energy_grid, then the windowed pieces are summed over windows.

    Input
    -----
    subbands (length(x), n_channels): subbands from "generate_subbands"
    win_arr (length(x), n_windows): linearly-spaced cosine windows in time
    energy_grid (n_channels, n_windows): target amplitude of each cell
    win_len (samples, optional): length of a full time window. When omitted it is
        derived from the array shapes as 2*length(x)/(n_windows - 1), which is the
        window length "make_overlapping_time_windows" uses for these dimensions.

    Returns
    -------
    modified subbands (length(x), n_channels)

    Throws
    ------
    ArgumentError if win_len is omitted and win_arr has a single window

    =#

    sig_len, n_channels = size(subbands)
    wlen, n_windows = size(win_arr)
    if sig_len != wlen
        error("Incorrect array lengths.")
    end

    if win_len === nothing
        if n_windows < 2
            throw(ArgumentError("win_len must be given when win_arr has a single window"))
        end
        win_len = 2*sig_len/(n_windows - 1)
    end

    #Samples under each window; the first and last are half windows
    divisor = fill(float(win_len), (1, 1, n_windows))
    divisor[1] /= 2
    if n_windows > 1
        divisor[end] /= 2
    end
    energy = reshape(energy_grid, 1, n_channels, n_windows)

    #tsw[t, c, w]: subband c under time window w
    tsw = reshape(subbands, sig_len, n_channels, 1) .* reshape(win_arr, sig_len, 1, n_windows)
    rms = sqrt.(sum(abs2, tsw, dims=1) ./ divisor)
    gain = energy ./ rms
    gain[rms .== 0] .= 0 #a silent cell stays silent instead of becoming Inf/NaN

    return dropdims(sum(tsw .* gain, dims=3), dims=3)

end

### Noisy and tonal excitation

In [None]:
function white_excitation(duration, sr)
    #= Generate band-limited white noise with unit RMS
    Input
    -----
    duration (s)
    sr (sampling rate, Hz)

    Returns
    -------
    1D vector of floor(sr*duration) noise samples, scaled to unit RMS.
    Also prints the size of the result.
    =#

    nyq_freq = floor(Int, sr/2)
    lo_lim_freq = 20
    hi_lim_freq = nyq_freq - 1
    sig_len = floor(Int, sr*duration)

    #DCT bins corresponding to the band limits
    lo_idx = ceil(Int, lo_lim_freq/float(nyq_freq)*sig_len)
    hi_idx = floor(Int, hi_lim_freq/float(nyq_freq)*sig_len)

    #Random coefficients inside the band, zeros elsewhere
    noise_spec = zeros(sig_len)
    noise_spec[lo_idx:hi_idx-1] = randn(hi_idx - lo_idx)

    raw = FFTW.idct(noise_spec)
    rms = (mean(raw.^2)).^0.5
    source = raw / rms
    println("size of excitation", size(source))
    return source

end

function FM_excitation(erbf0, duration, tstep, sr)
    #= Generate a frequency modulated tone

    Input
    -----
    erbf0 (vector of ERB elements): specification of frequency in each window
    duration (s): overall duration of tone
    tstep (s): duration of one time window in erbf0 vector
    sr (Hz): audio sampling rate

    Returns
    -------
    1D vector of frequency modulated tone, scaled to unit RMS

    =#

    #Piecewise-linear (k=1) frequency trajectory, one knot every tstep seconds
    timepoints = 0:1/sr:duration
    erbSpl = Spline1D(tstep.*(0:length(erbf0)-1), erbf0, k=1)
    f0 = ERB_to_freq(erbSpl(timepoints))

    #Keep the instantaneous frequency between 20 Hz and just below Nyquist
    f0 = clamp.(f0, 20, sr/2. - 1)

    #Integrate frequency to get phase
    source = sin.(2*pi*cumsum(f0*(1/sr)))
    source /= (mean(source.^2)).^0.5

    return source
end

## Tone generation

In [None]:
function generate_tone(erbf0, filt, duration, tstep, sr)
    #= Render a tone whose frequency and amplitude both vary over time

    Input
    -----
    erbf0: 1D vector of ERB values, one per time step
    filt: 1D vector of log10 amplitude offsets, one per time step
        (amplitude = 10^(filt + cell_mean) - log_const)
    duration (s): overall duration of sound
    tstep (s): duration of one element of erbf0 or filt
    ---> Right now, erbf0 and filt have the same tstep,
         but it could be different depending on what we
         want the GP sampling rates to be.
    sr (Hz): audio sampling rate

    Returns
    -------
    1D vector of am/fm tone with ramps

    =#

    #Set parameters
    cell_mean = 3; log_const = 1e-8;

    #Excitation: unit-RMS tone following the erbf0 trajectory
    carrier = FM_excitation(erbf0, duration, tstep, sr)

    #Filter: one overlapping time window per element, each scaled by its amplitude
    win = make_overlapping_time_windows(length(erbf0), tstep, sr)
    amplitudes = 10 .^(filt .+ cell_mean) .- log_const
    envelope_bank = win .* reshape(amplitudes, 1, length(amplitudes))

    #Apply every scaled window to the carrier and sum over windows
    modulated = sum(carrier .* envelope_bank, dims=2)

    return hann_ramp(modulated[:,1], sr)

end

In [None]:
#Temporal settings: sampling rate (samples/sec), then times in seconds
sr=20000;
onset = 0.1
offset = 0.3
duration = offset-onset;
tstep = 0.025;
#Number of time steps: one element every tstep from onset to offset
nw = length(range(onset, stop=offset, step=tstep))
#Tone frequencies and amplitudes: frequency from 10 to 16 on the ERB scale,
#log10 amplitude offset from -0.3 to 0.3, both linear across the nw steps
erbf0 = range(10.0,stop=16.0,length=nw);
filt = range(-0.3,stop=0.3,length=nw)
#Generate waveform
tone_segment = generate_tone(erbf0, filt, duration, tstep, sr)
#NOTE(review): assumes 0:1/sr:duration has exactly length(tone_segment) points - confirm
plot(0:1/sr:duration,tone_segment)

In [None]:
#Compute and display the gammatonegram of the rendered tone
(gtg, t, f) = gammatonegram(tone_segment, sr)
plot_gtg(gtg, t, f)

### Listen!

In [None]:
#Scale by the peak magnitude (not the largest positive sample) so that negative
#peaks also stay within [-1, 1]
wavplay(tone_segment/maximum(abs, tone_segment),sr)

## Noise generation
We need to sequence the subband operations for amplitude modulation of noise:

In [None]:
function modulate_noise(subbands, corr_grid, win_arr, filterbank; colour="pink")
    #=modulate noise with gp-sampled amplitudes for each (t,f) bin

    Input
    -----
    subbands (length(x), n_channels): subbands from "generate_subbands"
    corr_grid (n_windows, n_channels): log10 amplitude offset of each (t,f) bin
    win_arr (length(x), n_windows): overlapping time windows
    filterbank (length(x), n_channels): overlapping frequency windows
    colour (keyword): mean log10 amplitude per channel;
        "pink" (default) falls linearly from 0 to -0.1 across channels,
        "white" is 0 for every channel

    Returns
    -------
    1D vector of amplitude modulated noise

    Throws
    ------
    ArgumentError if colour is neither "pink" nor "white"

    =#

    log_const = 1e-8
    n_channels = size(subbands, 2)

    if colour == "pink"
        mean_grid = reshape(range(0, stop=-0.1, length=n_channels), 1, n_channels)
    elseif colour == "white"
        mean_grid = zeros(1, n_channels)
    else
        throw(ArgumentError("colour must be \"pink\" or \"white\", got \"$(colour)\""))
    end

    #Target amplitude per cell, transposed to (n_channels, n_windows)
    energy_grid = transpose(10 .^(corr_grid .+ mean_grid) .- log_const)
    mod_subbands = modulate_subbands(subbands, win_arr, energy_grid)
    noise = collapse_subbands(mod_subbands, filterbank)

    return noise[:,1]

end

function generate_noise(duration, filt, lowf, highf, estep, tstep,  sr)
    #=Render noise whose spectrum varies over time

    Input
    -----
    duration (s): overall duration of noise segment
    filt (n_windows, n_channels): 2D GP sampled amplitudes
    lowf (Hz), highf (Hz), estep (ERB): defines cutoffs, widths of frequency channels
    tstep (s): duration of one filter element
    sr (Hz): audio sampling rate

    Returns
    -------
    1D integer vector: the ramped, amplitude modulated noise scaled by 32767 and rounded

    =#

    n_steps = size(filt, 1)

    #Time & freq windows for amplitude modulation
    win_arr = make_overlapping_time_windows(n_steps, tstep, sr)
    filterbank = make_overlapping_freq_windows(n_steps, lowf, highf, estep, tstep, sr)

    #Source & subbands
    excitation = white_excitation(duration, sr)
    subbands = generate_subbands(excitation, filterbank)

    #Amplitude modulate noise, then apply onset/offset ramps
    AM_noise = modulate_noise(subbands, filt, win_arr, filterbank)
    ramped = hann_ramp(AM_noise[:,1], sr)

    return round.(Int, ramped .* 32767)

end

In [None]:
#Temporal settings: sampling rate (samples/sec), then times in seconds
sr = 20000;
onset = 0.1;
offset = 0.9;
duration = offset - onset;
tstep = 0.025;
#Frequency settings: band limits in Hz with their ERB-scale equivalents, step in ERB
lowf = 20; lowe = freq_to_ERB(lowf);
highf = 9999; highe = freq_to_ERB(highf);
estep = 2;
#Spectrum specification over time: filt is (tpoints, fpoints)
tpoints = length(range(onset, stop=offset, step=tstep));
fpoints = length(range(lowe, stop=highe, step=estep));
#"constant": all zeros; "increasing": every channel rises linearly from -1 to 1 over time
grid_type = "increasing"
if grid_type == "constant"
    filt = zeros(tpoints,fpoints);
elseif grid_type == "increasing"
    filt=repeat(range(-1,stop=1,length=tpoints), 1, fpoints);
end

#Generate and plot noise
noise_segment = generate_noise(duration, filt, lowf, highf, estep, tstep,  sr);
#Time axis stops one sample short of duration
#NOTE(review): assumes this matches length(noise_segment) - confirm
plot(0:1/sr:duration-1/sr,noise_segment);

In [None]:
#Compute and display the gammatonegram of the rendered noise
(gtg, t, f) = gammatonegram(noise_segment, sr);
plot_gtg(gtg, t, f);

### Listen!

In [None]:
#Scale by the peak magnitude (not the largest positive sample) so that negative
#peaks also stay within [-1, 1]
wavplay(noise_segment/maximum(abs, noise_segment),sr)