In [1]:
import scipy.io
import librosa
import numpy as np

In [4]:
sr, audio = scipy.io.wavfile.read('/home/dante_gates/repos/music-rec/data/raw/01 - Wal.wav')
l_channel, r_channel = audio[:, 0], audio[:, 1]
sr, l_channel.shape, r_channel.shape

(44100, (11703168,), (11703168,))

In [17]:
S_l = librosa.feature.melspectrogram(l_channel, sr, hop_length=1024)

In [15]:
S_l.shape

(128, 22858)

In [18]:
S_l.shape

(128, 11429)

In [6]:
window_length = 3

def song_length(sr, n_samples):
    return (n_samples / sr)

def load_wav(filename):
    sr, audio = scipy.io.wavfile.read(filename)
    if sr != 44100:
        # consider using:
        # https://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.signal.resample.html
        raise ValueError('%s does not have a sample rate of 44100' % filename)
    if not song_length(sr, audio.shape[0]) > 30:
        print(sr, audio.shape)
        raise ValueError('%s is less than 30s' % filename)
    try:
        l_channel, r_channel = audio[:, 0], audio[:, 1]
    except IndexError:
        raise ValueError('%s is not a stereo file' % filename)
    return sr, l_channel, r_channel

_n_samples = 44100 * window_length
def extract_window(samples):
    mid = int(len(samples))
    return samples[mid-_n_samples:mid+_n_samples]

def make_features(sr, l_channel, r_channel):
    subsample = extract_window(l_channel)
    melspec = librosa.feature.melspectrogram(subsample, sr)
    return np.reshape(melspec, -1)

In [7]:
import glob

break_after = 10
for i, f in enumerate(glob.glob('/home/dante_gates/music_rec/*.wav')):
    sr, audio = scipy.io.wavfile.read(f)
    l_channel, r_channel = audio[:, 0], audio[:, 1]
    S_l = librosa.feature.melspectrogram(l_channel, sr)
    print(f, sr, l_channel.shape, r_channel.shape, S_l.shape, make_features(sr, l_channel, r_channel).shape)
    if i == break_after:
        break

/home/dante_gates/music_rec/10 Cheye.wav 44100 (17718912,) (17718912,) (128, 34608) (33152,)
/home/dante_gates/music_rec/04 Resol.wav 44100 (7699968,) (7699968,) (128, 15040) (33152,)
/home/dante_gates/music_rec/06 Jammi.wav 44100 (9310464,) (9310464,) (128, 18185) (33152,)
/home/dante_gates/music_rec/03 Death.wav 44100 (10864512,) (10864512,) (128, 21220) (33152,)
/home/dante_gates/music_rec/08 Magnu.wav 44100 (22741632,) (22741632,) (128, 44418) (33152,)
/home/dante_gates/music_rec/06 Stars.wav 44100 (24930432,) (24930432,) (128, 48693) (33152,)
/home/dante_gates/music_rec/12 Snake.wav 22050 (6761088,) (6761088,) (128, 13206) (33152,)
/home/dante_gates/music_rec/21 Let's.wav 44100 (7814016,) (7814016,) (128, 15262) (33152,)
/home/dante_gates/music_rec/10 My De.wav 44100 (9395712,) (9395712,) (128, 18352) (33152,)
/home/dante_gates/music_rec/15 Rocki.wav 44100 (9933696,) (9933696,) (128, 19402) (33152,)
/home/dante_gates/music_rec/06 I Use.wav 44100 (9591552,) (9591552,) (128, 18734) 

In [32]:
def gen(filenames, n_features, batch_size=30):
    pos = 0
    for f in filenames:
        if pos == 0:
            arr = np.zeros((batch_size, n_features))
        try:
            contents = load_wav(f)
        except ValueError as e:
            print(e)
        else:
            arr[pos] = make_features(*contents)
            pos += 1
        if pos == batch_size - 1:
            pos = 0
            yield arr, arr

In [9]:
from sklearn.model_selection import train_test_split
files = list(glob.glob('/home/dante_gates/music_rec/*.wav'))
train, test = train_test_split(files, test_size=0.3)

In [10]:
train[:10], test[:10]

(['/home/dante_gates/music_rec/21 Camar.wav',
  '/home/dante_gates/music_rec/12 Good .wav',
  '/home/dante_gates/music_rec/07 I Sta.wav',
  '/home/dante_gates/music_rec/16 The G.wav',
  '/home/dante_gates/music_rec/11 Are Y.wav',
  '/home/dante_gates/music_rec/05 Ginge.wav',
  '/home/dante_gates/music_rec/09 Whack.wav',
  '/home/dante_gates/music_rec/06 My Ch.wav',
  '/home/dante_gates/music_rec/08 Yello.wav',
  '/home/dante_gates/music_rec/12 Why C.wav'],
 ['/home/dante_gates/music_rec/03 The R.wav',
  '/home/dante_gates/music_rec/01 Intro.wav',
  '/home/dante_gates/music_rec/04 Track.wav',
  '/home/dante_gates/music_rec/11 Stuck.wav',
  '/home/dante_gates/music_rec/13 Who W.wav',
  '/home/dante_gates/music_rec/02 Sonne.wav',
  '/home/dante_gates/music_rec/04 Right.wav',
  '/home/dante_gates/music_rec/09 Freed.wav',
  '/home/dante_gates/music_rec/03 Cross.wav',
  '/home/dante_gates/music_rec/07 - Hop.wav'])

In [11]:
from keras.layers import Dense, Input
from keras.models import Model


class AutoEncoder:
    """Autoencoder.

    Wrapper around several `keras.models.Model`s exposing methods for
    training an autoencoder and then encoding and decoding input vectors.
    """
    def __init__(self, input_dim, latent_dim, intermediate_dims):
        """Initialize an ``AutoEncoder``.

        Args:
            input_dim (int): Dimension of the input.
            latent_dim (int): Dimension of the "latent representation" or
            intermediate_dims (list): List of `int`s representing the
                dimmension of the hidden layers up to, but not including, the
                latent layer. See the example below.

        Example
        -------
        The instance

        >>> autoencoder = AutoEncoder(784, 32, [256, 128])

        will have the following architecture ::
            
            |--------- 784 ---------|       INPUT

               |------ 256 ------|

                  |--- 128 ---|

                    |-  32 -|               CODE

                  |--- 128 ---|

               |------ 256 ------|

            |--------- 784 ---------|       OUTPUT


        Usage
        -----
        >>> autoencoder.fit(x_train, validation_data=x_test)
        >>> encodings = autoencoder.encode(x_test)
        >>> decodings = autoencoder.decode(encodings)
        """
        self._encoder = self._decoder = self._model = None
        self._init_encoders(input_dim, latent_dim, intermediate_dims)

    def _init_encoders(self, input_dim, latent_dim, intermediate_dims):
        """Create ``self._model`` for training the autoencoder as well as
        ``self._encoder`` and ``self._decoder`` for encoding/decoding
        output/codes.
        """
        input_ = Input(shape=(input_dim,))
        prev_layer = input_
        for dim in intermediate_dims:
            encoding = Dense(dim, activation='relu')(prev_layer)
            prev_layer = encoding
        encoding = Dense(latent_dim, activation='relu')(prev_layer)

        prev_layer = encoding
        for dim in reversed(intermediate_dims):
            decoding = Dense(dim, activation='relu')(prev_layer)
            prev_layer = decoding
        decoding = Dense(input_dim, activation='sigmoid')(prev_layer)

        # create model used to train autoencoder, ``AutoEncoder.fit()``
        self._model = Model(input_, decoding)
        # create model for encoding images, ``AutoEncoder.encode()``
        self._encoder = Model(input_, encoding)
        # create model for decoding images, ``AutoEncoder.decode()``        
        self._decoder = self._make_decoder(latent_dim, intermediate_dims, self._model)
        self._model.compile(optimizer='adadelta', loss='binary_crossentropy')

    @staticmethod
    def _make_decoder(latent_dim, intermediate_dims, model):
        decoding_layers = model.layers[-1 * len(intermediate_dims) -1:]
        latent = Input(shape=(latent_dim,))
        prev_layer = latent
        for layer in decoding_layers:
            decoding = layer(prev_layer)
            prev_layer = decoding
        return Model(latent, prev_layer)

    def fit(self, input_data, *args, validation_data=None, **kwargs):
        validation_data = (validation_data, validation_data)
        self._model.fit(input_data, input_data,
            validation_data=validation_data, *args, **kwargs)

    def encode(self, x):
        return self._encoder.predict(x)

    def decode(self, x):
        return self._decoder.predict(x)


Using Theano backend.


In [33]:
ae = AutoEncoder(128 * 259, 32, [1024, 256, 128, 64])
ae._model.fit_generator(gen(train, 128*259), 30, validation_data=gen(test, 128*259), validation_steps=30)

Epoch 1/1
/home/dante_gates/music_rec/21 Camar.wav does not have a sample rate of 44100
/home/dante_gates/music_rec/09 Love .wav does not have a sample rate of 44100
 1/30 [>.............................] - ETA: 662s - loss: -610304704.0000/home/dante_gates/music_rec/12 Ms. P.wav does not have a sample rate of 44100
/home/dante_gates/music_rec/02 Vcr.wav does not have a sample rate of 44100
 2/30 [=>............................] - ETA: 625s - loss: -3119611488.0000/home/dante_gates/music_rec/09 Shimm.wav does not have a sample rate of 44100
 3/30 [==>...........................] - ETA: 614s - loss: -4206163349.3333/home/dante_gates/music_rec/07 Shelt.wav does not have a sample rate of 44100
 4/30 [===>..........................] - ETA: 597s - loss: -6423087152.0000/home/dante_gates/music_rec/09 I'm S.wav does not have a sample rate of 44100
 5/30 [====>.........................] - ETA: 575s - loss: -8632033728.0000/home/dante_gates/music_rec/06 No Ti.wav does not have a sample rate of 

Exception in thread Thread-10:
Traceback (most recent call last):
  File "/home/dante_gates/anaconda3/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/home/dante_gates/anaconda3/lib/python3.5/threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "/home/dante_gates/anaconda3/lib/python3.5/site-packages/keras/utils/data_utils.py", line 568, in data_generator_task
    generator_output = next(self._generator)
StopIteration



StopIteration: 