In [13]:
import librosa
import numpy as np
import os
import tensorflow as tf
import soundfile as sf

In [14]:
audio_path = 'audio'
# get all wav files in the audio folder
audio_files = [f for f in os.listdir(audio_path) if f.endswith('.wav')]


In [15]:
for af in audio_files:
    ob = sf.SoundFile(os.path.join(audio_path, af))
    print(f'{ob.name}: {ob.samplerate} Hz, {ob.channels} channels, {ob.subtype}')

    # bit depth



audio/Ben Direct FP.wav: 44100 Hz, 1 channels, FLOAT
audio/ts9_test1_in_FP32.wav: 44100 Hz, 1 channels, FLOAT
audio/ht1-input_train.wav: 44100 Hz, 1 channels, PCM_16
audio/Ben Vox Mono.wav: 48000 Hz, 1 channels, PCM_16
audio/tanh5.wav: 48000 Hz, 1 channels, PCM_16
audio/tanh2.wav: 48000 Hz, 1 channels, PCM_16
audio/tanh4.wav: 48000 Hz, 1 channels, PCM_16
audio/ht1-target_test.wav: 44100 Hz, 1 channels, PCM_16
audio/Ben Direct Mono.wav: 48000 Hz, 1 channels, PCM_16
audio/tanh20.wav: 48000 Hz, 1 channels, PCM_16
audio/ts9_test1_out_FP32.wav: 44100 Hz, 1 channels, FLOAT
audio/ht1-input_test.wav: 44100 Hz, 1 channels, PCM_16
audio/ht1-target_train.wav: 44100 Hz, 1 channels, PCM_16
audio/Ben Vox FP.wav: 44100 Hz, 1 channels, FLOAT
audio/Ben Test Mono.wav: 44100 Hz, 1 channels, PCM_16
audio/tanh.wav: 48000 Hz, 1 channels, PCM_16
audio/tanh3.wav: 48000 Hz, 1 channels, PCM_16


In [16]:
for af in audio_files:
    # get wav file info, sample rate and audio data format, bit depth
    y, sr = librosa.load(os.path.join(audio_path, af), sr=None, mono=True)
    bit_depth = librosa.get_samplerate(os.path.join(audio_path, af))
    print(f'{af} sample rate: {sr} audio data format: {y.dtype} bit depth: {bit_depth}')


Ben Direct FP.wav sample rate: 44100 audio data format: float32 bit depth: 44100
ts9_test1_in_FP32.wav sample rate: 44100 audio data format: float32 bit depth: 44100
ht1-input_train.wav sample rate: 44100 audio data format: float32 bit depth: 44100
Ben Vox Mono.wav sample rate: 48000 audio data format: float32 bit depth: 48000
tanh5.wav sample rate: 48000 audio data format: float32 bit depth: 48000
tanh2.wav sample rate: 48000 audio data format: float32 bit depth: 48000
tanh4.wav sample rate: 48000 audio data format: float32 bit depth: 48000
ht1-target_test.wav sample rate: 44100 audio data format: float32 bit depth: 44100
Ben Direct Mono.wav sample rate: 48000 audio data format: float32 bit depth: 48000
tanh20.wav sample rate: 48000 audio data format: float32 bit depth: 48000
ts9_test1_out_FP32.wav sample rate: 44100 audio data format: float32 bit depth: 44100
ht1-input_test.wav sample rate: 44100 audio data format: float32 bit depth: 44100
ht1-target_train.wav sample rate: 44100 audi

In [17]:
for af in audio_files:
    y, sr = librosa.load(os.path.join(audio_path, af))
    # convert to numpy
    y = np.array(y)
    print(f'{af} Max value: {np.max(y)}, Min value: {np.min(y)}, dtype: {y.dtype}')
    y = y.reshape(1, -1, 1)
    print(f'{af} data type: {y.dtype}, shape: {y.shape}')
    # convert to tensor
    y_t = tf.convert_to_tensor(y, dtype=tf.float32)
    print(f'{af} tensor data type: {y_t.dtype}, shape: {y_t.shape}')

Ben Direct FP.wav Max value: 0.9937713146209717, Min value: -1.0031177997589111, dtype: float32
Ben Direct FP.wav data type: float32, shape: (1, 6879600, 1)
Ben Direct FP.wav tensor data type: <dtype: 'float32'>, shape: (1, 6879600, 1)
ts9_test1_in_FP32.wav Max value: 0.6736345887184143, Min value: -0.7237271666526794, dtype: float32
ts9_test1_in_FP32.wav data type: float32, shape: (1, 4117442, 1)
ts9_test1_in_FP32.wav tensor data type: <dtype: 'float32'>, shape: (1, 4117442, 1)
ht1-input_train.wav Max value: 0.5497934222221375, Min value: -0.5439480543136597, dtype: float32
ht1-input_train.wav data type: float32, shape: (1, 7497001, 1)
ht1-input_train.wav tensor data type: <dtype: 'float32'>, shape: (1, 7497001, 1)
Ben Vox Mono.wav Max value: 0.7597728371620178, Min value: -0.7977100610733032, dtype: float32
Ben Vox Mono.wav data type: float32, shape: (1, 6879600, 1)
Ben Vox Mono.wav tensor data type: <dtype: 'float32'>, shape: (1, 6879600, 1)
tanh5.wav Max value: 1.0939185619354248, 

In [18]:
for af in audio_files:
    # load audio file
    y, sr = librosa.load(os.path.join(audio_path, af))
    # get the tempo
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
    print(f'{af}: tempo {tempo}')
    # get the beats
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    # get the beat times
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    print(f'{af}: beat times {beat_times}')
    # get the chroma features
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    print(f'{af}: chroma shape {chroma.shape}')
    # get the spectral centroid
    spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    print(f'{af}: spectral centroids shape {spectral_centroids.shape}')
    # get the spectral rolloff
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
    print(f'{af}: spectral rolloff shape {spectral_rolloff.shape}')
    # get the spectral bandwidth
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
    print(f'{af}: spectral bandwidth shape {spectral_bandwidth.shape}')
    # get the spectral contrast
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    print(f'{af}: spectral contrast shape {spectral_contrast.shape}')
    # get the spectral flatness
    spectral_flatness = librosa.feature.spectral_flatness(y=y)
    print(f'{af}: spectral flatness shape {spectral_flatness.shape}')
    # get the zero crossing rate
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
    print(f'{af}: zero crossing rate shape {zero_crossing_rate.shape}')
    # get the mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    print(f'{af}: mel spectrogram shape {mel_spectrogram.shape}')
    # get the mfccs
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    print(f'{af}: mfccs shape {mfccs.shape}')
    # get the rms
    rms = librosa.feature.rms(y=y)
    print(f'{af}: rms shape {rms.shape}')
    # get the poly features
    poly_features = librosa.feature.poly_features(y=y, sr=sr)
    print(f'{af}: poly features shape {poly_features.shape}')

Ben Direct FP.wav: tempo 123.046875
Ben Direct FP.wav: beat times [  1.81115646   2.11301587   2.41487528   2.71673469   3.0185941
   3.32045351   3.64553288   3.97061224   4.27247166   4.59755102
   4.89941043   5.20126984   5.50312925   5.80498866   6.10684807
   6.40870748   6.71056689   7.03564626   7.38394558   7.7322449
   8.05732426   8.38240363   8.70748299   9.05578231   9.40408163
   9.729161    10.05424036  10.37931973  10.70439909  11.02947846
  11.35455782  11.67963719  12.00471655  12.32979592  12.65487528
  12.97995465  13.30503401  13.63011338  13.95519274  14.28027211
  14.60535147  14.93043084  15.2555102   15.58058957  15.90566893
  16.25396825  16.57904762  16.88090703  17.18276644  17.5078458
  17.83292517  18.20444444  18.55274376  18.87782313  19.20290249
  19.52798186  19.87628118  20.17814059  20.50321995  20.82829932
  21.15337868  21.4552381   21.78031746  22.12861678  22.45369615
  22.77877551  23.10385488  23.4521542   23.80045351  24.14875283
  24.52027211

KeyboardInterrupt: 