In [1]:
import numpy as np
from tqdm import tqdm
import random

In [2]:
# Load the three formatted arrays in one pass; the shapes printed below show
# that they share the same leading dimension.
# NOTE: allow_pickle=True will unpickle arbitrary objects -- only use it on trusted files.
dataset, midi_dataset, meta_dataset = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in (
        "../data/formatted/dataset.npy",
        "../data/formatted/midi_dataset.npy",
        "../data/formatted/meta_augmented.npy",
    )
)

print(dataset.shape, midi_dataset.shape, meta_dataset.shape)

(48060, 1024) (48060, 1024, 8) (48060,)


In [None]:
# Peek at the first row to check the token layout of a sequence.
print(dataset[0])

['<style>' 'Jazz' 'Tonality' ... '<pad>' '<pad>' '<pad>']


In [4]:
# Vocabulary: flatten every row of the dataset and keep the sorted unique tokens
tokens = np.unique(
    np.concatenate(dataset.tolist())
)
print(tokens.size)

# Persist the vocabulary next to the formatted data
np.save("../data/formatted/tokens.npy", tokens.tolist())

198


In [None]:
# Lookup tables: itos maps an integer id (position in `tokens`) to its token,
# stoi is the inverse mapping.
itos = dict(enumerate(tokens))
stoi = {tk: i for i, tk in itos.items()}

print(stoi)

{'.': 0, '/': 1, '0.3997395833333333': 2, '0.4440104166666667': 3, '0.5': 4, '0.5703125': 5, '0.6666666666666666': 6, '0.75': 7, '0.7994791666666666': 8, '0.8880208333333334': 9, '1.0': 10, '1.1419270833333333': 11, '1.3333333333333333': 12, '1.5': 13, '1.5989583333333333': 14, '1.7135416666666667': 15, '128 Feel': 16, '2.0': 17, '2.25': 18, '2.3997395833333335': 19, '2.6666666666666665': 20, '3.0': 21, '4.0': 22, ':|': 23, '<end>': 24, '<pad>': 25, '<start>': 26, '<style>': 27, 'A': 28, 'A major': 29, 'A minor': 30, 'A#': 31, 'A##': 32, 'Ab': 33, 'Ab major': 34, 'Ab minor': 35, 'Abb': 36, 'Afoxé': 37, 'Afro': 38, 'B': 39, 'B major': 40, 'B minor': 41, 'B#': 42, 'B##': 43, 'Baião': 44, 'Ballad': 45, 'Bb': 46, 'Bb major': 47, 'Bb minor': 48, 'Bbb': 49, 'Blues': 50, 'Bolero': 51, 'Bolero-Cha': 52, 'Bossa': 53, 'C': 54, 'C major': 55, 'C minor': 56, 'C#': 57, 'C##': 58, 'Calypso': 59, 'Cb': 60, 'Cbb': 61, 'Cha Cha': 62, 'Chacarera': 63, 'Choro': 64, 'Country Ballad': 65, 'D': 66, 'D major

In [6]:
# Pick a held-out set of whole songs and expand it to row indices.
# NOTE(review): the code treats every 12 consecutive rows as one "real" song
# (presumably one row per transposition) -- confirm against the data pipeline.
ROWS_PER_SONG = 12
TEST_FRACTION = 0.1
SPLIT_SEED = 42  # fixed seed so that re-running the notebook yields the same split

#Get the number of real songs (integer division: an incomplete trailing group is ignored)
realSongs = len(dataset) // ROWS_PER_SONG
tenPercent = int(TEST_FRACTION * realSongs)

#draw distinct song numbers from a private RNG so the global `random` state is untouched
split_rng = random.Random(SPLIT_SEED)

#convert every sampled song number into the index of that song's first row
randomList = [song * ROWS_PER_SONG for song in split_rng.sample(range(realSongs), tenPercent)]

#expand each first-row index into the ROWS_PER_SONG consecutive rows of that song
final_random_list = [
    first_row + offset
    for first_row in randomList
    for offset in range(ROWS_PER_SONG)
]

#sanity check: no row may be selected twice (single pass instead of a pairwise scan)
if len(set(final_random_list)) != len(final_random_list):
    print("duplicated number")

In [7]:
#save the random list (the held-out row indices) to disk as a .npy file
np.save('../data/formatted/final_random_list.npy', final_random_list)

In [8]:
# Held-out part: the rows listed in final_random_list, taken from all three arrays
dataset_test, midi_test, meta_test = (
    source[final_random_list]
    for source in (dataset, midi_dataset, meta_dataset)
)

# Training part: everything that remains once those rows are removed
# (only the token and MIDI arrays get a training counterpart here)
dataset_train, midi_train = (
    np.delete(source, final_random_list, axis=0)
    for source in (dataset, midi_dataset)
)

In [9]:
import voicing as vc 
import importlib
# Reload so that edits made to voicing.py are picked up without restarting the kernel
importlib.reload(vc)
voicing = vc.Voicing()

# Index of the held-out song to render. Named `song_index` rather than `id`
# so the builtin id() is not shadowed for the rest of the session.
song_index = 4
test_this_song = dataset_test[song_index]
print(test_this_song)

# Only the first element of the returned pair is used here
midi, _ = voicing.convert_chords_to_voicing(test_this_song)

voicing.export_to_midi(midi, "test_"+ str(song_index))

['<style>' 'Samba' 'Tonality' ... '<pad>' '<pad>' '<pad>']
song: 114711_7_10_2024_test_4.mid
file: 114711_7_10_2024_test_4.txt
MIDI file created! 
---------------------------------


'114711_7_10_2024_test_4.mid'

In [10]:
# Write the split to disk: three test arrays first, then the two train arrays
for npy_path, array_to_save in (
    ('../data/formatted/dataset_test.npy', dataset_test),
    ('../data/formatted/midi_test.npy', midi_test),
    ('../data/formatted/meta_test.npy', meta_test),
    ('../data/formatted/dataset_train.npy', dataset_train),
    ('../data/formatted/midi_train.npy', midi_train),
):
    np.save(npy_path, array_to_save)

In [11]:
import random
#create a file with shuffled reference index
def createWindowedShuffleReference(type, size, window, save = False, seed = None):
    """Build a shuffled list of row indices in which blocks of `window`
    consecutive indices stay together.

    The range is cut into ``size // window`` blocks, the blocks are put in a
    random order, and each block is expanded to its `window` consecutive
    indices. When `size` is not a multiple of `window`, the blocks are shifted
    so that the last one ends at ``size - 1``; the first ``size % window``
    indices are therefore left out.

    Parameters
    ----------
    type : str
        Suffix of the output file name (``../data/shuffle_<type>.txt``).
        The name shadows the builtin but is kept for existing callers.
    size : int
        Number of rows to index.
    window : int
        Number of consecutive indices kept together (>= 1).
    save : bool, optional
        When true, write the indices with ``np.savetxt``; the header line
        records the number of indices actually written.
    seed : int or None, optional
        Seed for a private random generator. ``None`` (default) uses the
        global ``random`` module state, as before.

    Returns
    -------
    list of int
        The shuffled indices, ``(size // window) * window`` of them.

    Raises
    ------
    ValueError
        If `window` is smaller than 1 or larger than `size`.
    """
    if window < 1:
        raise ValueError("window must be a positive integer, got " + str(window))

    n = int(size // window)
    if n < 1:
        raise ValueError(
            "size (" + str(size) + ") must be at least window (" + str(window) + ")"
        )

    # A private generator keeps a seeded call from disturbing the global RNG state
    rng = random if seed is None else random.Random(seed)
    numlist = np.array(rng.sample(range(n), n)) * window

    m = np.max(numlist)
    l_ref = size-window
    print('real:', size, 'max:', m, 'length_ref:',l_ref)

    # Align the last block with the end of the data (adds 0 when size is a multiple of window)
    numlist = numlist + (l_ref - m)

    # Block starts are distinct, so at most one of them can be 0
    if (numlist == 0).any():
        print("OK")

    # Expand every block start into its `window` consecutive indices
    ref = (numlist[:, None] + np.arange(window)).ravel().tolist()

    #return the shuffled list
    if save:
        # The header must state the number of rows really written; readers reshape with it
        np.savetxt("../data/shuffle_" + type + ".txt", ref, fmt='%i', delimiter=" ", header='Array shape: ('+str(len(ref))+', 1)')
    return ref

In [12]:
def getData(folder, name):
    """Read a text file written by ``np.savetxt`` together with the shape
    recorded in its header line.

    Parameters
    ----------
    folder : str
        Directory containing the file.
    name : str
        File name inside `folder`.

    Returns
    -------
    data : numpy.ndarray
        The values parsed by ``np.loadtxt``.
    shape : numpy.ndarray of int
        The integers found in the first line, which is expected to look like
        ``# Array shape: (N, 1)``.
    """
    data_path = folder + '/' + name
    data = np.loadtxt(data_path)

    # The context manager closes the file even if reading the header fails
    with open(data_path, "r") as header_file:
        header = header_file.readline()

    # Strip the decoration around the numbers: '# Array shape: (N, 1)' -> 'N, 1'
    shape_text = header.replace('# Array shape: (', '').replace('\n', '').replace(')', '')
    shape = np.array(shape_text.split(', ')).astype(int)
    return data, shape

In [13]:
# Reload the train/test split from disk
train_dataset, test_dataset = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in ('../data/formatted/dataset_train.npy', '../data/formatted/dataset_test.npy')
)
train_midi, test_midi = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in ('../data/formatted/midi_train.npy', '../data/formatted/midi_test.npy')
)

print(train_dataset.shape, train_midi.shape, test_dataset.shape, test_midi.shape)

# Generate one shuffled index file per split (save=True writes ../data/shuffle_<type>.txt)
BATCH_SHUFFLE_SIZE = 1
ref = createWindowedShuffleReference("train", len(train_dataset), BATCH_SHUFFLE_SIZE, True)
ref_test = createWindowedShuffleReference("test", len(test_dataset), BATCH_SHUFFLE_SIZE, True)

# Training rows: read the saved order back and apply it to both arrays
shuffle_train, format_train = getData('../data', 'shuffle_train.txt')
shuffle_train = shuffle_train.reshape(format_train[0], ).astype(int).tolist()
# NOTE: `dataset` is rebound here; from now on it holds the shuffled training rows
dataset, midiDataset = train_dataset[shuffle_train], train_midi[shuffle_train]

# Test rows: same procedure with the test index file
shuffle_test, format_test = getData('../data', 'shuffle_test.txt')
shuffle_test = shuffle_test.reshape(format_test[0], ).astype(int).tolist()
validation, midi_validation = test_dataset[shuffle_test], test_midi[shuffle_test]

(43260, 1024) (43260, 1024, 8) (4800, 1024) (4800, 1024, 8)
real: 43260 max: 43259 length_ref: 43259
OK
real: 4800 max: 4799 length_ref: 4799
OK


In [19]:
#if the output folder for the shuffled arrays does not exist, create it
import os

# Define the folder path -- it has to match the directory that the
# np.save('../data/shuffled/...') calls write to
folder_path = '../data/shuffled'

# Create the folder when missing; exist_ok avoids a failure if it appears
# between the check and the creation
folder_was_present = os.path.isdir(folder_path)
os.makedirs(folder_path, exist_ok=True)
if folder_was_present:
    print("Folder already exists")

Folder already exists


In [16]:
# Write the shuffled train and test arrays to ../data/shuffled
for npy_path, shuffled_array in (
    ('../data/shuffled/dataset_train.npy', dataset),
    ('../data/shuffled/midi_train.npy', midiDataset),
    ('../data/shuffled/dataset_test.npy', validation),
    ('../data/shuffled/midi_test.npy', midi_validation),
):
    np.save(npy_path, shuffled_array)