In [12]:
from __future__ import print_function
import scipy.io.wavfile as wavfile
import scipy
import scipy.fftpack
import numpy as np
from matplotlib import pyplot as plt
import glob
import librosa
import pickle
import os
from music21 import converter, instrument, note, chord, stream
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, CuDNNLSTM
from keras.layers import Activation
from keras.layers import BatchNormalization as BatchNorm
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [79]:
# Build the raw dataset: one FFT spectrum per WAV file in the dataset folder,
# collected in a single list so the spectra can later be fed to the LSTM.
output = []
# glob returns paths in an unspecified, OS-dependent order; sorting keeps the
# order of the spectra reproducible between runs and machines.
for file in sorted(glob.glob('wav/*.wav')):
    y, sr = librosa.load(file)  # y is the signal, sr is the sample rate
    # np.fft.fft is the function the old top-level `scipy.fft` alias pointed to.
    # In current SciPy `scipy.fft` is a module, so calling it raises TypeError.
    fft = np.fft.fft(y)
    fft = list(fft)
    output.append(fft)

In [None]:
# Pack the collected spectra into a single array and store it as a .npy file
# so that training can be run on Google Colab.
output = np.array(output)
np.save(file='output.npy', arr=output)

In [None]:
fft = np.load('/content/gdrive/My Drive/1.npy')
# Determine the distinct values present in the array, in sorted order.
# Unfortunately, most of the frequencies are unique, which is challenging for the neural network.
# np.unique sorts and de-duplicates in one vectorised pass instead of hashing
# and comparing every element as a separate Python object.
# assumes `fft` is a 1-D array — TODO confirm against the saved file
vocab = list(np.unique(fft))
data_size, vocab_size = len(fft), len(vocab)
print('data has %d freq, %d unique freq.' % (data_size, vocab_size))

In [None]:
# Give every vocabulary entry an integer code (its position in `vocab`) and
# keep lookup tables for both directions.
note_to_int = {value: index for index, value in enumerate(vocab)}
int_to_note = dict(enumerate(vocab))

In [None]:
# Cut `fft` into training pairs: X holds windows of `seq_len` integer codes and
# Y holds the same-length window that starts `target_shift` positions later.
seq_len = 1000        # number of values in every input / target window
stride = 1000         # distance between the starts of consecutive input windows
target_shift = 500    # how far the target window is moved ahead of its input
X=[]
Y=[]
# The target window ends at i + target_shift + seq_len, so the last usable
# start is len(fft) - seq_len - target_shift. Stopping at len(fft) - seq_len
# (as before) could emit a final target shorter than seq_len, which makes Y
# ragged. Windows that already fit are produced exactly as before.
last_start = len(fft) - seq_len - target_shift
for i in range(0, last_start + 1, stride):
    x = fft[i:i + seq_len]
    y = fft[i + target_shift:i + target_shift + seq_len]
    X.append([note_to_int[m] for m in x])
    Y.append([note_to_int[m] for m in y])
n_patterns = len(X)
print("Total Patterns: ", n_patterns)

In [None]:
# Convert the windows to arrays and scale them by the number of windows.
# NOTE(review): the divisor is the pattern count, not the vocabulary size, so
# the scaled codes are not confined to [0, 1) — confirm this is intended. The
# prediction cell multiplies by len(X) to undo this scaling, so both places
# must change together.
# NOTE(review): re-running this cell divides the already-scaled arrays again.
n_inputs = len(X)
n_targets = len(Y)
X = np.array(X) / n_inputs
Y = np.array(Y) / n_targets

In [None]:
# Add a trailing feature axis of size 1 so X has the 3-D shape the LSTM's
# input_shape is built from. The shape is taken from X itself instead of the
# hard-coded (6825, 1000, 1), which only matched one particular dataset.
# The cell is safe to re-run.
X = np.reshape(X, (X.shape[0], X.shape[1], 1))

In [None]:
# Network: three stacked 512-unit recurrent layers followed by a small dense
# head that emits 1000 values per input window.
# NOTE(review): the head ends in softmax and is trained with categorical
# cross-entropy, while Y is built from scaled integer codes rather than
# one-hot vectors — confirm this loss/activation pairing is intended.
# NOTE(review): the output width 1000 presumably mirrors seq_len — verify.
model = Sequential([
    # First recurrent layer; its input shape is read from X (axes 1 and 2).
    LSTM(
        512,
        input_shape=(X.shape[1], X.shape[2]),
        recurrent_dropout=0.3,
        return_sequences=True
    ),
    CuDNNLSTM(512, return_sequences=True),
    Dropout(0.3),
    # Last recurrent layer returns only its final output, not the sequence.
    CuDNNLSTM(512, return_sequences=False),
    BatchNorm(),
    Dropout(0.3),
    Dense(256),
    Activation('relu'),
    BatchNorm(),
    Dropout(0.3),
    Dense(1000),
    Activation('softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics = ['accuracy'])

In [None]:
# ModelCheckpoint is already imported in the imports cell at the top of the
# notebook, so the wildcard `from keras.callbacks import *` that used to sit
# here is dropped: it added nothing this notebook uses and silently rebound
# whatever public names that module happens to expose.
# The checkpoint file name embeds the 'acc' value of the epoch that wrote it.
filepath="/content/gdrive/My Drive/fftacc:{acc:.3f}.hdf5"
# Save only when the monitored 'acc' value improves (mode='max').
checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [None]:
# Train for a single epoch; the checkpoint callback decides whether to save.
model.fit(
    X,
    Y,
    epochs=1,
    verbose=1,
    batch_size=128,
    callbacks=callbacks_list,
)

In [None]:
# Run the network on the training windows and multiply by len(X), the same
# factor the inputs and targets were divided by in the scaling cell.
out = model.predict(X) * len(X)

In [None]:
# Map every prediction row back to a vocabulary entry and save the resulting
# sequence for the post-processing cells.
# The loop variable is no longer called `note`: that name rebound the
# music21 `note` module imported at the top of the notebook.
# NOTE(review): the lookup key is the largest *value* in the row (truncated to
# int), not the position of the largest value (argmax) — confirm intent.
laji = []
for sample in out:
  decoded_value = int_to_note[int(max(sample))]
  laji.append(decoded_value)
laji = np.array(laji)
np.save('laji.npy', laji)

In [8]:
# Post-processing: load the sequence written by the decoding cell ('laji.npy').
# The following cells take its inverse FFT and write the result to a WAV file.
# Note that this rebinds `fft`, which earlier held the training spectrum.
fft = np.load('laji.npy')

In [9]:
# Inverse FFT of the generated spectrum. np.fft.ifft is the function the old
# top-level `scipy.ifft` alias pointed to; the alias is no longer available in
# current SciPy releases, while the NumPy function works everywhere.
inv = np.fft.ifft(fft)

In [10]:
# Keep only the real part of every inverse-FFT value and turn it into an array.
# NOTE(review): `* 10000` is applied to a Python list, so it repeats the whole
# signal 10000 times back to back; it does NOT scale the amplitude. Confirm
# which of the two was intended.
rinv = np.array([value.real for value in inv] * 10000)

In [11]:
# Write the inverse FFT to a WAV file. The exact sample rate matters less here
# than the signal itself, which is the main component of the tune and sound.
# NOTE(review): `sr` is only assigned inside the WAV-loading loop, so this cell
# depends on that loop having run in the same kernel session.
wavfile.write('laji.wav', sr, rinv)