In [None]:
import time
start_time = time.time()

import tensorflow_datasets as tfds
import tensorflow as tf
# LJ Speech dataset: https://keithito.com/LJ-Speech-Dataset/
# Each audio file is a single-channel 16-bit PCM WAV with a sample rate of 22050 Hz.
# Load the 'train' split in batches of 16 examples.
ds = tfds.load('ljspeech', split='train', batch_size=16)

def convert_to_spectogram(speech):
    """Return the magnitude spectrogram of integer PCM audio.

    The samples are cast to float32 and divided by 32767.0, then run through
    a 1024-point STFT (frame length 1024, hop 256). The absolute value of the
    complex STFT is returned.
    """
    scaled_audio = tf.cast(speech, tf.float32) / 32767.0
    return tf.abs(
        tf.signal.stft(
            scaled_audio,
            frame_length=1024,
            frame_step=256,
            fft_length=1024,
        )
    )

def update(data):
    """Add a 'speech2' magnitude spectrogram to a dataset element.

    Meant to be passed to ``Dataset.map``.

    Args:
        data: Feature dictionary holding the raw audio under 'speech'.

    Returns:
        A new dictionary with every original entry plus 'speech2', the
        result of ``convert_to_spectogram`` applied to ``data['speech']``.
    """
    # convert_to_spectogram is built purely from TensorFlow ops, so it can be
    # traced directly into the tf.data graph. Routing it through
    # tf.py_function would execute it under the Python GIL (defeating
    # num_parallel_calls) and drop the static shape of the result.
    spectrogram = convert_to_spectogram(data['speech'])
    # Build a fresh dict rather than mutating the element that was passed in.
    return {**data, 'speech2': spectrogram}

def calculate(data):
    """Placeholder consumer for a dataset element; does nothing and returns None."""
    return None

# Map `update` over every batch with 6 parallel calls and keep 2 batches
# prefetched.
ds = ds.map(update, num_parallel_calls=6).prefetch(2)

# Pull a single batch through the pipeline and hand it to `calculate`.
for spectogram in ds.take(1):
    calculate(spectogram)

# Wall-clock time elapsed since `start_time` was recorded.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [91]:
# Carve the single 'train' split into 80% / 10% / 10% subsets for training,
# validation and testing, each batched 16 examples at a time. with_info=True
# makes tfds.load also return the dataset info object.
# shuffle_files is left unset.
(train_ds, val_ds, test_ds), metadata = tfds.load(
    'ljspeech',
    split=['train[:80%]', 'train[80%:90%]', 'train[90%:100%]'],
    with_info=True,
    batch_size=16,
)

# Sample rate taken from the dataset metadata; read by convert_to_spectogram.
sr = metadata.metadata['sample_rate']

def convert_log_mel_spectrograms(spectrograms,
                                 sample_rate,
                                 num_spectrogram_bins,
                                 lower_edge_hertz=80.0,
                                 upper_edge_hertz=7600.0,
                                 num_mel_bins=26):
    """Project linear-frequency spectrograms onto the mel scale and take the log.

    Args:
        spectrograms: Magnitude spectrograms whose last axis holds the
            linear-frequency bins.
        sample_rate: Sample rate of the source audio, forwarded to the mel
            weight matrix.
        num_spectrogram_bins: Number of linear-frequency bins in the last
            axis of `spectrograms`.
        lower_edge_hertz: Lower edge of the mel filterbank.
        upper_edge_hertz: Upper edge of the mel filterbank.
        num_mel_bins: Number of mel bands to produce.

    Returns:
        log(mel + 1e-6), where `mel` is `spectrograms` contracted with the
        mel weight matrix along the last axis.
    """
    mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=num_mel_bins,
        num_spectrogram_bins=num_spectrogram_bins,
        sample_rate=sample_rate,
        lower_edge_hertz=lower_edge_hertz,
        upper_edge_hertz=upper_edge_hertz)

    # Contract the last axis of the spectrograms with the first axis of the
    # filterbank.
    mel = tf.tensordot(spectrograms, mel_filterbank, 1)

    # Re-attach the static shape: the input's leading dimensions followed by
    # the filterbank's last dimension.
    static_shape = spectrograms.shape[:-1].concatenate(mel_filterbank.shape[-1:])
    mel.set_shape(static_shape)

    # The small offset keeps the log finite where the mel energy is zero.
    return tf.math.log(mel + 1e-6)

def convert_to_spectogram(speech):
    """Compute MFCCs for integer PCM audio and return those at index 0.

    Steps: divide by 32767.0 as float32, take a 1024-point STFT (hop 256),
    take the magnitude, convert to a log-mel spectrogram with
    `convert_log_mel_spectrograms` using the module-level sample rate `sr`,
    then derive MFCCs.

    NOTE(review): only index 0 along the leading axis is returned. If
    `speech` is a batch, every other entry is discarded. TODO confirm this
    is intended.
    """
    waveform = tf.cast(speech, tf.float32) / 32767.0
    stft_frames = tf.signal.stft(
        waveform, frame_length=1024, frame_step=256, fft_length=1024)
    magnitudes = tf.abs(stft_frames)
    # The number of frequency bins is read from the STFT's last dimension.
    log_mel = convert_log_mel_spectrograms(
        magnitudes, sr, stft_frames.shape[-1])
    return tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[0]

def update(data):
    """Replace the raw audio of a dataset element with its extracted features.

    Meant to be passed to ``Dataset.map``.

    Args:
        data: Feature dictionary holding the raw audio under 'speech'.

    Returns:
        A new dictionary identical to `data` except that 'speech' holds the
        result of ``convert_to_spectogram`` applied to the original value.
    """
    # The feature extraction consists solely of TensorFlow ops, so it can be
    # traced directly into the tf.data graph. Going through tf.py_function
    # would run it under the Python GIL (limiting what num_parallel_calls can
    # achieve) and drop the static shape of the result.
    features = convert_to_spectogram(data['speech'])
    # Build a fresh dict rather than mutating the element that was passed in.
    return {**data, 'speech': features}

def process_ds(ds):
    """Return `ds` with `update` mapped over every element.

    The level of parallelism is delegated to tf.data through AUTOTUNE.
    """
    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(update, num_parallel_calls=autotune)
#     .cache().prefetch(1)

# Run feature extraction on each split. Every split must be derived from its
# own raw dataset: building val/test from `train_ds` would both leak training
# data into evaluation and push already-extracted features through the
# extraction a second time.
# NOTE: this cell rebinds the split names, so re-running it without reloading
# the data would process the splits again.
train_ds = process_ds(train_ds)
val_ds = process_ds(val_ds)
test_ds = process_ds(test_ds)

# Sanity check: print the feature shape of the first training element.
for features in train_ds.take(1):
    tf.print(features['speech'].shape)

TensorShape([858, 26])


In [81]:
# Print the 'speech' feature shape of the first two training elements.
for features in train_ds.take(2):
    tf.print(features['speech'].shape)
    

TensorShape([858, 26])
TensorShape([814, 26])


In [9]:
import tensorflow_io as tfio
import tensorflow as tf
# Open a LibriSpeech FLAC file with tensorflow-io and print its samples.
# NOTE(review): this is an absolute, machine-specific path; it will only
# resolve on the original author's machine.
test = tfio.audio.AudioIOTensor(
    '/Users/gabriel.t.nishimura/projects/masters/datasets/LibriSpeech/dev-clean-new/5694-64025-0000.flac', dtype=None
)
tf.print(test.to_tensor())

[[2]
 [5]
 [5]
 ...
 [4]
 [4]
 [2]]


In [16]:
import tensorflow_io as tfio
# NOTE(review): absolute, machine-specific path, and a different directory
# from the one used in the AudioIOTensor cell.
path = '/Users/gabriel.t.nishimura/projects/masters/ctc/data_dir/LibriSpeech/dev-clean-new/5694-64025-0000.flac'
raw = tf.io.read_file(path)
decoded = tfio.audio.decode_flac(raw, dtype=tf.int16)
# decoded holds one sample per row: [[2], [5], [5], ..., [4], [4], [2]]
normalized = tf.cast(decoded, tf.float32) / 32767.0
# [[6.10370189e-05] [0.000152592547] [0.000152592547] ... [0.000122074038] [0.000122074038] [6.10370189e-05]]
# Transpose so the samples run along the last axis before taking the STFT.
spectogram = tf.signal.stft(tf.transpose(normalized), frame_length=512, frame_step=128, fft_length=512)
# The recorded output of this cell has shape (1, 205, 257), i.e. it is not empty.
tf.abs(spectogram)


<tf.Tensor: shape=(1, 205, 257), dtype=float32, numpy=
array([[[8.3223893e-04, 3.7094706e-03, 1.3722545e-03, ...,
         2.1628932e-04, 5.4637465e-05, 1.3773417e-04],
        [3.8308473e-03, 4.2985152e-03, 3.4092239e-03, ...,
         9.9384692e-05, 1.5655435e-04, 1.0576041e-04],
        [8.3755178e-04, 2.4562459e-03, 2.9933178e-03, ...,
         9.1293485e-05, 1.4219768e-04, 9.8375516e-05],
        ...,
        [2.4218033e-03, 1.3274888e-03, 2.1213035e-03, ...,
         9.0504291e-05, 1.4621654e-04, 3.4237746e-05],
        [5.4896288e-03, 4.8878239e-03, 2.9209652e-03, ...,
         2.4263980e-04, 1.8727286e-04, 3.5183039e-05],
        [7.9418123e-03, 6.6004079e-03, 4.1228645e-03, ...,
         2.1570237e-04, 1.6987286e-04, 1.4330028e-04]]], dtype=float32)>

In [52]:
batch_size, num_samples, sample_rate = 10, 32000, 16000.0
# A [batch_size, num_samples] float32 tensor standing in for mono PCM audio.
# NOTE: tf.random.normal is unbounded, so the values are not confined to
# [-1, 1] (the printed row below contains values such as 1.69 and 2.43).
pcm = tf.random.normal([batch_size, num_samples], dtype=tf.float32)
tf.print(pcm.shape)
tf.print(pcm[0])
# A 1024-point STFT; at 16 kHz each frame spans 64 ms, and a hop of 256
# samples gives 75% overlap.
stfts = tf.signal.stft(pcm, frame_length=1024, frame_step=256,
                       fft_length=1024)
spectrograms = tf.abs(stfts)
# Show the magnitude spectrograms as the cell output.
spectrograms

TensorShape([10, 32000])
[1.09662044 1.69534135 -0.899237037 ... 2.43812156 -0.435678 1.52866685]


<tf.Tensor: shape=(10, 122, 513), dtype=float32, numpy=
array([[[36.42891   , 28.344164  ,  6.5406055 , ...,  8.846258  ,
         20.897167  , 29.22834   ],
        [42.59881   , 38.289036  , 21.444187  , ...,  8.97204   ,
         16.172142  , 16.06566   ],
        [ 5.5205526 , 24.158653  , 19.721468  , ..., 20.894777  ,
         14.119332  , 15.314309  ],
        ...,
        [ 3.6069183 , 13.671073  ,  2.679152  , ..., 35.30774   ,
         32.572678  , 20.800205  ],
        [ 9.032834  , 11.374646  , 14.75252   , ..., 37.143196  ,
         22.568562  ,  0.1557169 ],
        [ 7.8620396 ,  4.4548864 , 13.004418  , ..., 21.504335  ,
          8.973442  ,  6.6855507 ]],

       [[ 5.0369244 ,  6.093818  , 21.205082  , ..., 23.319431  ,
         35.89395   , 46.70936   ],
        [23.914188  , 25.074112  , 25.329699  , ...,  9.3750105 ,
         12.089165  , 27.278046  ],
        [11.466986  , 26.49086   , 14.000098  , ...,  3.8226857 ,
          5.81519   , 17.588863  ],
        ...