In [None]:
import scipy.io.wavfile
import glob
import pretty_midi
import os
import IPython.display as ipd 
import librosa
import time
import matplotlib.pyplot as plt
import librosa.display
import numpy as np
import scipy

In [None]:
# Configuration: dataset location, soundfont and the sample rate shared by the later cells.
midi_root_dir = "/home/faraaz/workspace/music-transcription/data/clean_midi/"
# Lazy iterator over the dataset's .mid files; it is not consumed by this cell.
midi_files = glob.iglob(os.path.join(midi_root_dir, '**', '*.mid'))
sf2_path = "/usr/share/sounds/sf2/FluidR3_GM.sf2"

num = 0

# Track used for this experiment, resolved relative to the dataset root so the
# absolute prefix is written only once.
midi_file = os.path.join(midi_root_dir, "Redbone", "Come and Get Your Love.mid")
fs = 44100

pm = pretty_midi.PrettyMIDI(midi_file=midi_file)

# Time only the synthesis call, which is the expensive step of this cell.
start = time.time()
pm_samples = pm.fluidsynth(fs=fs, sf2_path=sf2_path)
print("wav gen {}s".format(time.time()-start))
print("pm_samples {}".format(len(pm_samples)))
print("end time {}s".format(pm.get_end_time()))

In [None]:
# Build a PrettyMIDI object containing a single note so it can be rendered alone.
# Fresh Instrument/Note objects are created instead of reusing the ones owned by
# `pm`; reusing them would strip `pm.instruments[0]` down to one note and rewrite
# that note's timing, which also makes this cell fail when it is run a second time.
src_instrument = pm.instruments[0]
src_note = src_instrument.notes[1]
print(src_note)

# Properties of the note as it appears in the full song.
note_start = src_note.start
note_end = src_note.end
note_pitch = src_note.pitch
note_velocity = src_note.velocity
print("velocity {}".format(note_velocity))
note_iso_duration = note_end - note_start
print("duration {}s".format(note_iso_duration))

# The isolated note starts at t=0, keeps its duration and pitch, and is rendered
# with a fixed velocity of 60 rather than its original velocity.
iso_note = pretty_midi.Note(velocity=60, pitch=note_pitch,
                            start=0.0, end=note_iso_duration)
# NOTE(review): pitch bends / control changes of the source instrument are not
# copied, since their timestamps refer to the full-song timeline - confirm that
# this is the desired behaviour.
iso_instrument = pretty_midi.Instrument(program=src_instrument.program,
                                        is_drum=src_instrument.is_drum,
                                        name=src_instrument.name)
iso_instrument.notes = [iso_note]

pm_iso = pretty_midi.PrettyMIDI()
pm_iso.instruments = [iso_instrument]
print(pm_iso.instruments)

start = time.time()
pm_iso_samples = pm_iso.fluidsynth(fs=fs, sf2_path=sf2_path)
print("wav gen {}s".format(time.time()-start))
print("pm_iso_samples {}".format(len(pm_iso_samples)))
print(pm_iso.instruments[0].notes)

In [None]:
# cut out irrelevant sections
# Convert the note's span from seconds to sample indices. The duration is
# (end - start) scaled by the sample rate; one extra second is kept after the
# note-off so the tail of the note stays in both clips.
# NOTE: this cell overwrites pm_samples / pm_iso_samples with their trimmed
# versions, so re-run the synthesis cells before running it again.
tail_samples = fs
sample_start = int(note_start * fs)
sample_duration = int((note_end - note_start) * fs)
sample_end = sample_start + sample_duration

pm_iso_samples = pm_iso_samples[:sample_duration + tail_samples]
pm_samples = pm_samples[sample_start:sample_end + tail_samples]

In [None]:
# Play the trimmed full-mix clip at the rate it was synthesized with.
ipd.Audio(data=pm_samples, rate=fs)

In [None]:
# Waveform of the trimmed full-mix clip.
plt.plot(pm_samples)
plt.show()

In [None]:
# Play the isolated-note clip at the rate it was synthesized with.
ipd.Audio(data=pm_iso_samples, rate=fs)

In [None]:
# Waveform of the isolated-note clip.
plt.plot(pm_iso_samples)
plt.show()
# are the relative audio levels affected by synthesizing alone?
# or is this just pitch being more accurate and less averaged

In [None]:
# make the spectrograms
# Mel power spectrogram of the full-mix clip.
spec_og = librosa.feature.melspectrogram(y=pm_samples, sr=fs)
plt.figure(figsize=(10, 4))
# specshow is given the real sample rate so the time axis is correct, and no
# fmax override so the mel axis matches the filter bank used above.
librosa.display.specshow(librosa.power_to_db(spec_og, ref=np.max),
                         sr=fs, y_axis='mel', x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()

In [None]:
# Mel power spectrogram of the isolated-note clip.
spec_iso = librosa.feature.melspectrogram(y=pm_iso_samples, sr=fs)
plt.figure(figsize=(10, 4))
# specshow is given the real sample rate so the time axis is correct, and no
# fmax override so the mel axis matches the filter bank used above.
librosa.display.specshow(librosa.power_to_db(spec_iso, ref=np.max),
                         sr=fs, y_axis='mel', x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()

# why does the note look relatively louder?

In [None]:
print(spec_og.shape)
print(spec_iso.shape)
# Subtract over the frames both spectrograms have, so a small length mismatch
# between the two clips does not raise a broadcasting error.
n_frames = min(spec_og.shape[1], spec_iso.shape[1])
spec_diff = spec_og[:, :n_frames] - spec_iso[:, :n_frames]
plt.figure(figsize=(10, 4))
# NOTE(review): bins where the isolated note has more power than the mix are
# negative; power_to_db presumably floors them at its minimum - confirm.
librosa.display.specshow(librosa.power_to_db(spec_diff, ref=np.max),
                         sr=fs, y_axis='mel', x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()

# how can we convert this back to audio once we have found the mask?
# should the mask predict binary values?

In [None]:
print(note_pitch)  # 81, A5, 880ish
note_hz = pretty_midi.note_number_to_hz(note_pitch)

# Rows of a mel spectrogram are mel bands, not MIDI pitches, so look up the band
# whose centre frequency is closest to the note's fundamental.
# Assumes spec_og was computed with melspectrogram's default fmin/fmax (0..sr/2).
n_mels = spec_og.shape[0]
mel_centers = librosa.mel_frequencies(n_mels=n_mels + 2, fmin=0.0, fmax=fs / 2)[1:-1]
note_bin = int(np.argmin(np.abs(mel_centers - note_hz)))

# Work on a copy so spec_og itself is left untouched for other cells.
spec_indicator = spec_og.copy()
spec_indicator[note_bin, :] = 0
plt.figure(figsize=(10, 4))
librosa.display.specshow(librosa.power_to_db(spec_indicator, ref=np.max),
                         sr=fs, y_axis='mel', x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()

In [None]:

# at the end, we have the relative note onset / offset, the audio with background, and the isolated-note audio
# still need to generate the mask
# TODO: split this code into manageable chunks

In [None]:
# Complex STFT of the full-mix clip.
og_stft = librosa.core.stft(y=pm_samples)
plt.figure(figsize=(10, 4))
# STFT rows are linearly spaced frequency bins, so a frequency axis derived from
# the real sample rate is used instead of a mel axis.
librosa.display.specshow(librosa.power_to_db(np.abs(og_stft)**2, ref=np.max),
                         sr=fs, y_axis='log', x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('STFT')
plt.tight_layout()
print(og_stft.shape)

In [None]:
# Round-trip check: invert the STFT and listen to the result.
# `length` makes the reconstruction exactly as long as the clip it came from.
new_pm_samples = librosa.core.istft(stft_matrix=og_stft, length=len(pm_samples))
ipd.Audio(data=new_pm_samples, rate=fs)

In [None]:
def normalize(a):
    """Shift a (possibly complex) array to the origin and scale it to unit peak magnitude.

    The minimum real part and the minimum imaginary part are subtracted from
    every element, then the result is divided by its largest magnitude.

    Parameters
    ----------
    a : array_like
        Real or complex values; must be non-empty.

    Returns
    -------
    numpy.ndarray
        Complex array with the same shape as `a` whose largest magnitude is 1.
        If every element of `a` is identical, the shifted array is all zeros
        and is returned as such instead of dividing by zero.
    """
    a = np.asarray(a)
    a_oo = a - a.real.min() - 1j*a.imag.min() # origin offsetted
    peak = np.abs(a_oo).max()
    if peak == 0:
        # Constant input: nothing to scale, avoid producing NaNs from 0/0.
        return np.zeros_like(a_oo)
    return a_oo/peak

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)), evaluated element-wise via numpy."""
    exp_term = np.exp(-x)
    denominator = 1 + exp_term
    return 1 / denominator

In [None]:
# Complex STFT of the isolated-note clip.
iso_stft = librosa.core.stft(y=pm_iso_samples)
# Min/max are reported on magnitudes; ordering complex values directly does not
# describe how large the bins are.
iso_mag = np.abs(iso_stft)
print(iso_stft[0][0])
print(iso_mag.min())
print(iso_mag.max())
plt.figure(figsize=(10, 4))
# STFT rows are linearly spaced frequency bins, so a frequency axis derived from
# the real sample rate is used instead of a mel axis.
librosa.display.specshow(librosa.power_to_db(iso_mag**2, ref=np.max),
                         sr=fs, y_axis='log', x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('STFT')
plt.tight_layout()
print(iso_stft.shape)

# Peak magnitudes of the isolated note and of the full mix, for comparison.
print(np.max(np.abs(iso_stft)))
print(np.max(np.abs(og_stft)))

In [None]:
# try making the mask binary mask and then see if it works reasonably

# Compare only the frames both STFTs have, in case the clips differ in length.
n_frames = min(iso_stft.shape[1], og_stft.shape[1])
iso_part = iso_stft[:, :n_frames]
og_part = og_stft[:, :n_frames]

# Complex ratio mask iso / mix. Bins where the mix is (numerically) silent are
# left at 0 instead of dividing by zero, which would put inf/NaN into the mask
# and into the reconstructed STFT below.
has_energy = np.abs(og_part) > 1e-10
stft_diff = np.divide(iso_part, og_part,
                      out=np.zeros(iso_part.shape,
                                   dtype=np.result_type(iso_part, og_part, np.complex64)),
                      where=has_energy)
# Applying the mask to the mix should give back the isolated note.
new_iso_stft = og_part * stft_diff
print(stft_diff[0][0])
print(np.max(np.abs(stft_diff)))
#stft_diff = sigmoid(normalize((og_stft / iso_stft)))
plt.figure(figsize=(10, 4))
# STFT rows are linearly spaced frequency bins, so a frequency axis derived from
# the real sample rate is used instead of a mel axis.
librosa.display.specshow(librosa.power_to_db(np.abs(new_iso_stft)**2, ref=np.max),
                         sr=fs, y_axis='log', x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('STFT')
plt.tight_layout()
print(stft_diff.shape)

In [None]:
# Invert the masked STFT and listen to the recovered note.
new_iso_samples = librosa.core.istft(stft_matrix=new_iso_stft)
ipd.Audio(data=new_iso_samples, rate=fs)

In [None]:
# NOTE(review): `diff_samples` was not defined by any earlier cell, so this cell
# failed on a fresh kernel. It is assumed here to be the time-domain residual
# (mix minus isolated note) over the samples both clips have - confirm.
n_common = min(len(pm_samples), len(pm_iso_samples))
diff_samples = pm_samples[:n_common] - pm_iso_samples[:n_common]

spec_diff = librosa.feature.melspectrogram(y=diff_samples, sr=fs)
plt.figure(figsize=(10, 4))
# specshow is given the real sample rate so the time axis is correct, and no
# fmax override so the mel axis matches the filter bank used above.
librosa.display.specshow(librosa.power_to_db(spec_diff, ref=np.max),
                         sr=fs, y_axis='mel', x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()

In [None]:
# Magnitude of the ratio mask as an image: frequency bin 0 at the bottom, and
# pixels stretched to fill the axes since there are far more bins than frames.
plt.imshow(np.abs(stft_diff), origin='lower', aspect='auto')
plt.colorbar()
plt.xlabel('frame')
plt.ylabel('frequency bin')
plt.title('Mask magnitude')
plt.show()


In [None]:
# Same mask image without interpolation between bins: frequency bin 0 at the
# bottom, pixels stretched to fill the axes.
plt.imshow(np.abs(stft_diff), interpolation='nearest', origin='lower', aspect='auto')
plt.colorbar()
plt.xlabel('frame')
plt.ylabel('frequency bin')
plt.title('Mask magnitude (nearest)')
plt.show()