In [1]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Try out a song from Training Data
# data = the song data
# sr = sample rate
data, sr = librosa.load('/Users/ewais/Documents/Github/tensor-hero/Training Data/Audioslave - Exploder (Chezy)/song.ogg')


In [6]:

# This is a 1D array, with as many elements as there are samples
print('The original data shape is', data.shape)
print('The original sampling rate is', sr)       # sr is 22050 for .ogg files I believe

# Let's upsample to 44.1k so that mp3 files can be handled and to make the 10ms window more accurate when computing STFT w/ hop length
data_new = librosa.resample(data, sr, 44100)
print('After resampling, data shape is', data_new.shape)
print('And the new sr is 44100')

# Take the STFT
S = np.abs(librosa.stft(data_new, n_fft = 2048, hop_length = 441))
print('After taking the STFT w/ 10 ms stride, the shape of the data is', S.shape)

# Create mel filter
melfilter = librosa.filters.mel(44100, n_fft = 2048, n_mels = 80)
print('The shape of the mel filter is', melfilter.shape)

# Let's transform the STFT matrix to the mel filterbank, reducing the dimensionality of the columns to 80
S_filtered = np.matmul(melfilter,S)
print('The shape of the new filtered data is', S_filtered.shape)

# Take the log of the data to better represent human perception
S_filtered = librosa.amplitude_to_db(S_filtered, ref=np.max)

# Prepend and append 7 columns of zeros (corresponding to 70ms of silence before and after song starts)
S_for_parsing = np.c_[np.zeros((80,7)), S_filtered, np.zeros((80,7))]


#S_for_parsing = np.insert(S_for_parsing, range(np.size(S_for_parsing,1)-1,(np.size(S_for_parsing,1)+6)), 0)
print('Before appending zeros, the shape was', S_filtered.shape)
print('After appending zeros, the shape is', S_for_parsing.shape)

The original data shape is (4594459,)
The original sampling rate is 22050
After resampling, data shape is (9188918,)
And the new sr is 44100
After taking the STFT w/ 10 ms stride, the shape of the data is (1025, 20837)
The shape of the mel filter is (80, 1025)
The shape of the new filtered data is (80, 20837)
Before appending zeros, the shape was (80, 20837)
After appending zeros, the shape is (80, 20851)


In [19]:
import sys
np.set_printoptions(threshold=sys.maxsize)  # Unlimited printing size!

# Explore the matrices a bit
print(S_for_parsing[:, 400:500])

# Plot the new filtered dat
#fig, ax = plt.subplots()
#img = librosa.display.specshow(librosa.amplitude_to_db(S_filtered,
#                                                       ref=np.max),
#                               y_axis='log', 
#                               x_axis='time',
#                               ax=ax)
#ax.set_title('Power spectrogram')
#fig.colorbar(img, ax=ax, format="%+2.0f dB")
#plt.show()

1665268 -64.86380005 -64.97917175 -66.66468048
  -68.70995331 -70.19618225 -72.98231506 -74.13946533 -74.69958496
  -75.48220825 -76.4489975  -76.08320618 -75.63806152 -74.31404114
  -74.92678833 -75.54262543 -76.16004944 -75.38430023 -75.83485413
  -78.126297   -78.10687256 -73.7625351  -53.20932007 -46.64245987
  -46.80385208 -48.90982437 -45.29634094 -44.30635071 -44.86243439
  -43.68917847 -43.49992371 -43.39573669 -42.61407471 -43.65540314
  -46.12940216 -46.88053894 -49.37664795 -49.98428726 -50.54374695
  -53.47943115 -55.0037117  -54.7381897  -55.34784698 -57.25799179
  -58.48280334 -59.76126862 -59.71506882 -59.42715836 -58.84620667]
 [-65.61181641 -60.37825775 -51.32759094 -48.89262772 -48.09093475
  -45.64431    -45.95082092 -48.52090073 -50.58997345 -53.03075409
  -53.52177429 -56.22829819 -60.39527512 -63.13809204 -64.63257599
  -66.17304993 -67.45446014 -67.55741882 -67.7453537  -57.90546036
  -53.26990891 -56.17216873 -61.92281723 -61.78857803 -63.2892952
  -65.84571838 