In [10]:
import librosa
import numpy as np

In [11]:
# STFT parameters used by the time <-> frame conversion helpers in this notebook.
SAMPLE_RATE = 44100            # audio samples per second
N_FFT = 1024
WIN_LENGTH = N_FFT             # window length is tied to N_FFT
HOP_LENGTH = WIN_LENGTH // 4   # a quarter of the window length (256 samples)

In [12]:
def time_to_stft_frame(time):
    """Return the STFT frame (column) index at which a given time starts.

    The conversion is delegated to ``librosa.core.time_to_frames`` using the
    notebook's ``SAMPLE_RATE`` and ``HOP_LENGTH``. It mirrors the manual
    calculation::

        sample_ix = time * SAMPLE_RATE
        stft_col  = int(sample_ix / HOP_LENGTH)

    ``time`` is given in seconds. A list of times may be passed as well, in
    which case an array of frame indices comes back.
    """
    # librosa is handed a one-element batch; indexing [0] unwraps the result again.
    batch = np.array([time])
    frames = librosa.core.time_to_frames(
        batch,
        sr=SAMPLE_RATE,
        hop_length=HOP_LENGTH,
    )
    return frames[0]

In [13]:
def stft_frame_to_time(col_ix):
    """Return the time, in seconds, that corresponds to an STFT frame (column) index.

    The conversion is delegated to ``librosa.core.frames_to_time`` using the
    notebook's ``SAMPLE_RATE`` and ``HOP_LENGTH``. It mirrors the manual
    calculation::

        sample_ix = col_ix * HOP_LENGTH
        time      = sample_ix / SAMPLE_RATE

    A list of column indices may be passed as well, in which case an array of
    times comes back.
    """
    # librosa is handed a one-element batch; indexing [0] unwraps the result again.
    batch = np.array([col_ix])
    seconds = librosa.core.frames_to_time(
        batch,
        sr=SAMPLE_RATE,
        hop_length=HOP_LENGTH,
    )
    return seconds[0]

In [20]:
# The original model fixes the length of each audio sample at 1720 STFT columns,
# which is just under 10 seconds of audio (about 9.98 s).
# Working assumption: longer clips should give better predictions, whereas
# shorter clips mean faster training and faster predictions.
original_cols = 1720
stft_frame_to_time(original_cols)

9.984580498866213

In [21]:
# Candidate audio lengths in seconds, converted to the number of STFT columns
# each one corresponds to.
times = [1.0, 3.0, 5.0, 10.0, 15.0, 20.0, 30.0]
time_to_stft_frame(times)

array([ 172,  516,  861, 1722, 2583, 3445, 5167])

In [22]:
# The reverse view: candidate column counts (powers of two), converted to the
# audio length in seconds each one corresponds to.
sizes = [64, 128, 256, 512, 1024, 2048, 4096]
stft_frame_to_time(sizes)

array([ 0.37151927,  0.74303855,  1.4860771 ,  2.9721542 ,  5.94430839,
       11.88861678, 23.77723356])