In [1]:
from __future__ import print_function

import wave

import librosa
import numpy as np
import vamp
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

ModuleNotFoundError: No module named 'vamp'

In [3]:
def wavwrite(x, filename, fs=44100, N=16):
    """Write a mono signal to an uncompressed PCM WAV file.

    Parameters
    ----------
    x : array-like of float
        Samples, nominally in [-1.0, 1.0]. Values outside that range are
        clipped instead of overflowing the integer sample format.
    filename : str
        Destination path, passed to ``wave.open``.
    fs : int, optional
        Sampling rate in Hz written to the header (default 44100).
    N : int, optional
        Bits per sample: 8 (unsigned), 16 or 32 (signed little-endian).
        Default 16.

    Raises
    ------
    ValueError
        If ``N`` is not one of the supported bit depths.
    """
    bits = int(N)
    # WAV stores 8-bit PCM as unsigned bytes centred on 128 and wider
    # samples as signed little-endian integers.
    if bits == 8:
        sample_dtype, offset = np.dtype('u1'), 128
    elif bits in (16, 32):
        sample_dtype, offset = np.dtype('<i%d' % (bits // 8)), 0
    else:
        raise ValueError(
            "unsupported bit depth N=%r (expected 8, 16 or 32)" % (N,))

    max_vol = 2 ** (bits - 1) - 1.0  # maximum amplitude for this bit depth
    samples = np.asarray(x, dtype=np.float64).ravel()
    # Clip before scaling so out-of-range input saturates rather than
    # overflowing; truncating toward zero matches int() applied per sample.
    scaled = np.trunc(np.clip(samples, -1.0, 1.0) * max_vol)
    frames = (scaled + offset).astype(sample_dtype).tobytes()

    # The context manager closes the file even if writing fails; the frame
    # count in the header is filled in by the wave module on close.
    with wave.open(filename, 'w') as wv:
        wv.setnchannels(1)
        wv.setsampwidth(bits // 8)  # in bytes
        wv.setframerate(fs)
        wv.setcomptype('NONE', 'not compressed')
        wv.writeframes(frames)

In [4]:
def melosynth(melody, timestamps, outputfile, fs, nHarmonics, square, useneg):
    """Synthesize a pitch sequence as a harmonic tone and write it to a WAV file.

    Every interval between consecutive timestamps is rendered as a sum of
    ``nHarmonics`` sinusoids whose amplitudes fall off as 1/harmonic number.
    Phase is carried across intervals, pitch changes between two voiced
    frames are interpolated over 10 ms, and voiced/unvoiced boundaries are
    faded in/out over the same duration. The result is peak-normalised to
    0.8 and handed to ``wavwrite``. The input arrays are not modified.

    Parameters
    ----------
    melody : array-like of float
        Frequency in Hz for each timestamp; 0 means silence.
    timestamps : array-like of float
        Times in seconds, one per ``melody`` entry.
    outputfile : str or None
        Output prefix; the file written is ``<outputfile>_melosynth.wav``,
        or ``melosynth.wav`` when None.
    fs : int or numeric string
        Sampling rate of the synthesized audio in Hz.
    nHarmonics : int
        Number of sinusoids per tone.
    square : bool
        If true use odd harmonics only (1, 3, 5, ...), otherwise 1, 2, 3, ...
    useneg : bool
        If true, negative frequencies are synthesized at their absolute
        value; otherwise they are treated as silence.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    # Preprocess input parameters
    fs = int(float(fs))
    nHarmonics = int(nHarmonics)
    if outputfile is None:
        outputfile = "melosynth.wav"
    else:
        outputfile = outputfile + "_melosynth.wav"

    # Work on private float copies so the caller's arrays are never modified
    # (negative entries are overwritten below).
    times = np.array(timestamps, dtype=float).ravel()
    freqs = np.array(melody, dtype=float).ravel()
    if times.size == 0:
        raise ValueError("melosynth needs at least one timestamp")
    if times.size != freqs.size:
        raise ValueError(
            "melody and timestamps must have the same length (got %d and %d)"
            % (freqs.size, times.size))

    # Preprocess pitch sequence
    if useneg:
        freqs = np.abs(freqs)
    else:
        freqs[freqs < 0] = 0

    # Impute silence if start time > 0
    if times[0] > 0:
        # With a single timestamp there is no spacing to estimate.
        estimated_hop = np.median(np.diff(times)) if times.size > 1 else 0.0
        prev_time = max(times[0] - estimated_hop, 0)
        times = np.insert(times, 0, prev_time)
        freqs = np.insert(freqs, 0, 0)

    translen = 0.010  # duration (in seconds) for fade in/out and freq interp
    # Harmonic numbers: h+1 for sawtooth, 2h+1 for square
    harmonic_nums = [2 * h + 1 if square else h + 1 for h in range(nHarmonics)]
    phase = np.zeros(nHarmonics)  # running phase of every harmonic
    chunks = []  # one sample array per synthesized interval
    f_prev = 0.0  # previous frequency
    t_prev = 0.0  # previous timestamp
    for t, f in zip(times, freqs):

        # Compute number of samples to synthesize
        nsamples = int(np.round((t - t_prev) * fs))

        if nsamples > 0:
            # Transition length in samples, at least 1 so the ramp below
            # never divides by zero at very low sampling rates.
            translen_sm = float(max(1, min(np.round(translen * fs), nsamples)))
            # 0 -> 1 ramp shared by the frequency glide and the fades
            ramp = np.minimum(np.arange(nsamples) / translen_sm, 1)

            # Frequency series: glide between two voiced frames, otherwise
            # hold the voiced one (or the previous value when f is 0).
            if f_prev > 0 and f > 0:
                freq_series = f_prev + ramp * (f - f_prev)
            elif f > 0:
                freq_series = np.full(nsamples, f, dtype=float)
            else:
                freq_series = np.full(nsamples, f_prev, dtype=float)

            # Repeat for each harmonic
            samples = np.zeros(nsamples)
            for h, hnum in enumerate(harmonic_nums):
                # Accumulate per-sample phase increments from the stored phase
                phases = phase[h] + np.cumsum(
                    2 * np.pi * hnum * freq_series / float(fs))
                samples += np.sin(phases) / hnum
                # Keep the phase continuous into the next interval
                phase[h] = phases[-1]

            # Fade in/out and silence
            if f_prev == 0 and f > 0:
                samples *= ramp
            if f_prev > 0 and f == 0:
                samples *= 1 - ramp
            if f_prev == 0 and f == 0:
                samples *= 0

            chunks.append(samples)

        t_prev = t
        f_prev = f

    # Normalize to a peak magnitude of 0.8; an empty or all-silent signal is
    # left untouched instead of being divided by zero.
    signal = np.concatenate(chunks) if chunks else np.zeros(0)
    peak = float(np.max(np.abs(signal))) if signal.size else 0.0
    if peak > 0:
        signal *= 0.8 / peak

    wavwrite(signal, outputfile, fs)

In [5]:
# First input recording; the relative path is resolved against the notebook's working directory.
audio_file = 'all_star.wav'

In [6]:
# Load the recording as a mono signal at 44.1 kHz; `sr` is the sampling rate librosa returns.
audio, sr = librosa.load(audio_file, sr=44100, mono=True)

In [7]:
# Run the "mtg-melodia:melodia" Vamp plugin on the signal; no plugin parameters are passed.
data_noparams = vamp.collect(audio, sr, "mtg-melodia:melodia")

In [7]:
# The 'vector' entry unpacks into two values; the second is the per-frame
# pitch array displayed by this cell.
# NOTE(review): the first value is presumably the frame hop reported by the
# Vamp host — it is not used anywhere in the visible notebook.
hop_nopar, melody_nopar = data_noparams['vector']
melody_nopar

array([-220., -220., -220., ..., -220., -220., -220.], dtype=float32)

In [8]:
# One timestamp (in seconds) per pitch frame: frames are spaced 128/44100 s
# apart and the first one sits at 8 * 128/44100 s.
# NOTE(review): 128 and 8 look like the plugin's hop size (in samples at
# 44.1 kHz) and its initial frame offset — confirm against the Melodia docs.
timestamps_nopar = 8 * 128/44100.0 + np.arange(len(melody_nopar)) * (128/44100.0)
timestamps_nopar

array([2.32199546e-02, 2.61224490e-02, 2.90249433e-02, ...,
       2.03197823e+02, 2.03200726e+02, 2.03203628e+02])

In [9]:
# Render the extracted pitch track as a single sinusoid (one harmonic) at
# 16 kHz, without synthesizing negative frequencies; melosynth appends
# "_melosynth.wav" to the 'original' prefix.
melosynth(
    melody=melody_nopar,
    timestamps=timestamps_nopar,
    outputfile='original',
    fs=16000,
    nHarmonics=1,
    square=False,
    useneg=False,
)

In [9]:
# Second input recording; the relative path is resolved against the notebook's working directory.
audio_file2 = 'yo.wav'

In [10]:
# Load the second recording with the same settings as the first (mono, 44.1 kHz).
audio2, sr2 = librosa.load(audio_file2, sr=44100, mono=True)

In [11]:
# Run the "mtg-melodia:melodia" Vamp plugin on the second signal; no plugin parameters are passed.
data_noparams2 = vamp.collect(audio2, sr2, "mtg-melodia:melodia")

In [12]:
# Unpack the 'vector' entry; the second value is the per-frame pitch array
# displayed by this cell (the first value is not used in the visible notebook).
hop_nopar2, melody_nopar2 = data_noparams2['vector']
melody_nopar2

array([-110., -110., -110., ..., -110., -110., -110.], dtype=float32)

In [14]:
# One timestamp (in seconds) per pitch frame of the second recording:
# spacing 128/44100 s, first frame at 8 * 128/44100 s.
# NOTE(review): 128 and 8 look like the plugin's hop size and initial frame
# offset — confirm against the Melodia docs.
timestamps_nopar2 = 8 * 128/44100.0 + np.arange(len(melody_nopar2)) * (128/44100.0)
timestamps_nopar2

array([ 0.02321995,  0.02612245,  0.02902494, ..., 14.92172336,
       14.92462585, 14.92752834])

In [15]:
# Render the second pitch track as a single sinusoid (one harmonic) at
# 16 kHz, without synthesizing negative frequencies; melosynth appends
# "_melosynth.wav" to the 'hum' prefix.
melosynth(
    melody=melody_nopar2,
    timestamps=timestamps_nopar2,
    outputfile='hum',
    fs=16000,
    nHarmonics=1,
    square=False,
    useneg=False,
)

In [13]:
# DTW alignment between the 'yo.wav' and 'all_star.wav' pitch tracks.
# Each track is passed as an (n, 1) column so that `euclidean` receives 1-D
# vectors, which is the input SciPy documents for it; for single values the
# distance is still |a - b|, so the alignment cost is unchanged.
# NOTE(review): recent SciPy releases appear to reject scalar inputs to
# `euclidean` — confirm on the installed version.
distance, path = fastdtw(
    np.reshape(melody_nopar2, (-1, 1)),
    np.reshape(melody_nopar, (-1, 1)),
    dist=euclidean,
)

In [14]:
# Display the alignment cost returned by the fastdtw call.
distance

18206396.885017395

In [15]:
# Third recording, processed with the same steps as the earlier ones:
# load as 44.1 kHz mono, run the Melodia plugin without parameters, and
# unpack the per-frame pitch array from the 'vector' entry.
audio_file3 = 'cant_stop.wav'
audio3, sr3 = librosa.load(audio_file3, sr=44100, mono=True)
data_noparams3 = vamp.collect(audio3, sr3, "mtg-melodia:melodia")
hop_nopar3, melody_nopar3 = data_noparams3['vector']

In [16]:
# DTW alignment between the 'yo.wav' and 'cant_stop.wav' pitch tracks.
# Each track is passed as an (n, 1) column so that `euclidean` receives 1-D
# vectors, which is the input SciPy documents for it; for single values the
# distance is still |a - b|, so the alignment cost is unchanged.
# NOTE(review): recent SciPy releases appear to reject scalar inputs to
# `euclidean` — confirm on the installed version.
distance2, path2 = fastdtw(
    np.reshape(melody_nopar2, (-1, 1)),
    np.reshape(melody_nopar3, (-1, 1)),
    dist=euclidean,
)
distance2

31572157.43793869

In [17]:
# Fourth recording, processed with the same steps as the earlier ones:
# load as 44.1 kHz mono, run the Melodia plugin without parameters, and
# unpack the per-frame pitch array from the 'vector' entry.
audio_file4 = 'portugal.wav'
audio4, sr4 = librosa.load(audio_file4, sr=44100, mono=True)
data_noparams4 = vamp.collect(audio4, sr4, "mtg-melodia:melodia")
hop_nopar4, melody_nopar4 = data_noparams4['vector']

In [18]:
# DTW alignment between the 'yo.wav' and 'portugal.wav' pitch tracks.
# Each track is passed as an (n, 1) column so that `euclidean` receives 1-D
# vectors, which is the input SciPy documents for it; for single values the
# distance is still |a - b|, so the alignment cost is unchanged.
# NOTE(review): recent SciPy releases appear to reject scalar inputs to
# `euclidean` — confirm on the installed version.
distance3, path3 = fastdtw(
    np.reshape(melody_nopar2, (-1, 1)),
    np.reshape(melody_nopar4, (-1, 1)),
    dist=euclidean,
)
distance3

20057704.570510864