In [1]:
import miditoolkit
import remi_utils as utils
from collections import Counter
import pickle
import glob
import json
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
# Helper that converts a MIDI file into an event sequence; later cells use it to build the pickled vocabulary and dataset.

def extract_events(input_path, chord=False):
    """Turn the MIDI file at ``input_path`` into a list of events.

    The heavy lifting is delegated to ``remi_utils`` (imported as ``utils``):
    items are read, note items are quantized, everything is grouped up to the
    end of the last note item, and the groups are converted to events.

    Args:
        input_path: Path handed to ``utils.read_items``.
        chord: When true, items from ``utils.extract_chords`` are placed in
            front of the tempo and note items before grouping.

    Returns:
        Whatever ``utils.item2event`` returns; callers in this notebook read
        ``.name`` and ``.value`` from each element.

    Raises:
        IndexError: if the quantized note items form an empty list, because
            the last element is accessed unconditionally.
    """
    notes, tempos = utils.read_items(input_path)
    notes = utils.quantize_items(notes)
    # The end of the last note item bounds the grouping step.
    last_end = notes[-1].end
    leading_items = utils.extract_chords(notes) + tempos if chord else tempos
    grouped = utils.group_items(leading_items + notes, last_end)
    return utils.item2event(grouped)

In [3]:
# Build the token vocabulary from every melody and piano section, then pickle it.
with open('solos.json') as json_file:
    data = json.load(json_file)

all_elements = []
# Traversal order is significant: vocabulary ids are assigned by first
# appearance, so instruments (melody, then piano) and sections
# (intro, middle, outro) are visited in a fixed order.
for instrument in ("melody", "piano"):
    for i in range(1, 910):
        filename = str(i).zfill(3)
        if filename not in data:
            continue
        for j in range(len(data[filename])):
            for section in ("intro", "middle", "outro"):
                midi_path = "./data_dynamic/{}/{}/{}_solo_{}.mid".format(
                    instrument, section, filename, j
                )
                # If you're analyzing chords, use `extract_events(midi_path, chord=True)`
                events = extract_events(midi_path)
                all_elements.extend(
                    '{}_{}'.format(event.name, event.value) for event in events
                )

counts = Counter(all_elements)
event2word = {c: i for i, c in enumerate(counts.keys())}
word2event = {i: c for i, c in enumerate(counts.keys())}
# Use a context manager so the pickle file is flushed and closed deterministically.
with open('dictionary_dynamic.pkl', 'wb') as dictionary_file:
    pickle.dump((event2word, word2event), dictionary_file)

KeyboardInterrupt: 

In [4]:
# NOTE(review): this reads 'dictionary.pkl', while the vocabulary-building cell
# writes 'dictionary_dynamic.pkl' — confirm which vocabulary is intended.
# pickle can execute arbitrary code while loading; only open trusted files.
with open('dictionary.pkl', 'rb') as dictionary_file:
    event2word, word2event = pickle.load(dictionary_file)

In [5]:
# Tokenise the melody track of every solo into three parallel lists.
# `count` tallies how many (file, solo) pairs were processed.
count = 0
intros, outros, solos = [], [], []
with open('solos.json') as json_file:
    data = json.load(json_file)
for i in range(1, 910):
    filename = str(i).zfill(3)
    if filename in data:
        for j in range(len(data[filename])):
            count += 1
            midi_name = filename + "_solo_" + str(j) + ".mid"
            # Sections are processed intro -> outro -> middle (the solo itself).
            for section_dir, bucket in (
                ("./data_dynamic/melody/intro/", intros),
                ("./data_dynamic/melody/outro/", outros),
                ("./data_dynamic/melody/middle/", solos),
            ):
                section_events = extract_events(section_dir + midi_name)
                bucket.append(utils.event_to_word(section_events, event2word))
        

In [6]:
# Sanity check: number of melody intro sequences collected so far.
len(intros)

750

In [11]:
# Tokenise the piano track of every solo into three parallel lists.
# `count` tallies how many (file, solo) pairs were processed.
count = 0
intros_piano, outros_piano, solos_piano = [], [], []
with open('solos.json') as json_file:
    data = json.load(json_file)
for i in range(1, 910):
    filename = str(i).zfill(3)
    if filename in data:
        for j in range(len(data[filename])):
            count += 1
            midi_name = filename + "_solo_" + str(j) + ".mid"
            # Sections are processed intro -> outro -> middle (the solo itself).
            for section_dir, bucket in (
                ("./data_dynamic/piano/intro/", intros_piano),
                ("./data_dynamic/piano/outro/", outros_piano),
                ("./data_dynamic/piano/middle/", solos_piano),
            ):
                section_events = extract_events(section_dir + midi_name)
                bucket.append(utils.event_to_word(section_events, event2word))
        

In [12]:
# Bundle the six token-sequence lists in a fixed order (melody/piano pairs for
# intro, outro and solo). Note that `data` is rebound here: it previously held
# the parsed solos.json content.
data = [intros, intros_piano, outros, outros_piano, solos, solos_piano]
# Context manager guarantees the pickle file is flushed and closed.
with open('./solo_generation_dataset_dynamic/solo_generation_dataset.pkl', 'wb') as dataset_file:
    pickle.dump(data, dataset_file)

In [7]:
# Reload the six-list dataset; the context manager closes the file handle.
# pickle can execute arbitrary code while loading; only open trusted files.
with open('./solo_generation_dataset_dynamic/solo_generation_dataset.pkl', 'rb') as dataset_file:
    data = pickle.load(dataset_file)

In [8]:
def find_max_length(series):
    """Return the length of the longest element of ``series`` (0 if empty)."""
    longest = 0
    for sequence in series:
        longest = max(longest, len(sequence))
    return longest

def pad_dataset(dataset, word2event):
    """Right-pad every sequence in ``dataset`` to a common length, in place.

    Args:
        dataset: Collection of groups, each group a collection of mutable
            sequences (lists) of token ids.
        word2event: Vocabulary mapping; its size is used as the padding id.

    Returns:
        The same ``dataset`` object that was passed in, now padded. The
        caller's data is mutated; no copy is made.

    Side effects:
        Prints the common (maximum) length.
    """
    pad_value = len(word2event)
    max_length = max((find_max_length(group) for group in dataset), default=0)
    print(max_length)
    for group in dataset:
        for sequence in group:
            shortfall = max_length - len(sequence)
            if shortfall > 0:
                sequence.extend([pad_value] * shortfall)
    return dataset

In [268]:
# pad_dataset pads in place and returns the object it was given, so after this
# call `data_padded` and `data` refer to the same (now padded) nested list.
data_padded = pad_dataset(data,word2event)

1637


In [240]:
# Persist the padded dataset and read it straight back; context managers make
# sure the write is flushed and both handles are closed.
padded_path = './solo_generation_dataset_dynamic/solo_generation_dataset_padded.pkl'
with open(padded_path, 'wb') as padded_file:
    pickle.dump(data_padded, padded_file)
with open(padded_path, 'rb') as padded_file:
    data_padded = pickle.load(padded_file)
# data = data_padded

In [9]:
# Serialise every token sequence as a space-separated string.
# A fresh nested list is built rather than aliasing `data`: the integer
# sequences in `data` stay intact and re-running this cell yields the same
# result (re-joining already-joined strings would split them per character).
data_text = [
    [' '.join(str(num) for num in sequence) for sequence in group]
    for group in data
]

In [10]:
# Unpack in the order the dataset list was assembled:
# melody/piano intros, melody/piano outros, melody/piano solos.
intros_t, intros_piano_t, outros_t, outros_piano_t, solos_t, solos_piano_t = data_text

In [15]:
# Hold out 224 examples. All six lists go through a single call so the same
# indices are selected from each of them and rows stay aligned.
# random_state is fixed so the split is reproducible across kernel restarts.
(
    intros_train, intros_t_subset,
    intros_piano_train, intros_piano_t_subset,
    outros_train, outros_t_subset,
    outros_piano_train, outros_piano_t_subset,
    solos_train, solos_t_subset,
    solos_piano_train, solos_piano_t_subset,
) = train_test_split(
    intros_t, intros_piano_t, outros_t, outros_piano_t, solos_t, solos_piano_t,
    test_size=224,
    random_state=42,
)

In [16]:
# Split the held-out subset again: the 112-example second output of each pair
# becomes the validation set and the remainder becomes the test set.
# random_state is fixed so the split is reproducible across kernel restarts.
(
    intros_test, intros_valid,
    intros_piano_test, intros_piano_valid,
    outros_test, outros_valid,
    outros_piano_test, outros_piano_valid,
    solos_test, solos_valid,
    solos_piano_test, solos_piano_valid,
) = train_test_split(
    intros_t_subset, intros_piano_t_subset, outros_t_subset,
    outros_piano_t_subset, solos_t_subset, solos_piano_t_subset,
    test_size=112,
    random_state=42,
)

In [17]:
# Re-assemble each split as a list of rows. Every row is a six-element list:
# intro, intro_piano, outro, outro_piano, solo, solo_piano.
# The six lists of a split come out of one train_test_split call, so they
# have equal length and zip pairs them index by index.
train = [
    list(row)
    for row in zip(intros_train, intros_piano_train, outros_train,
                   outros_piano_train, solos_train, solos_piano_train)
]

val = [
    list(row)
    for row in zip(intros_valid, intros_piano_valid, outros_valid,
                   outros_piano_valid, solos_valid, solos_piano_valid)
]

test = [
    list(row)
    for row in zip(intros_test, intros_piano_test, outros_test,
                   outros_piano_test, solos_test, solos_piano_test)
]
    

In [18]:
# One column per element of the six-item rows in train / val / test.
split_columns = ['intro', 'intro_piano', 'outro', 'outro_piano', 'solo', 'solo_piano']
df_train = pd.DataFrame(train, columns=split_columns)
# Each frame is built from its own split: df_val from `val`, df_test from `test`
# (these two were previously crossed over).
df_val = pd.DataFrame(val, columns=split_columns)
df_test = pd.DataFrame(test, columns=split_columns)

In [19]:
# Write each split to its own CSV file (no index column) in the dataset folder.
destination_folder = "solo_generation_dataset_dynamic"
for split_frame, csv_suffix in (
    (df_train, '/train_torchtext.csv'),
    (df_val, '/val_torchtext.csv'),
    (df_test, '/test_torchtext.csv'),
):
    split_frame.to_csv(destination_folder + csv_suffix, index=False)

In [218]:
def remove_padding(series, word2event):
    """Return the items of ``series`` that are not the padding id.

    The padding id is the size of ``word2event``, the same value
    ``pad_dataset`` appends. A new list is returned and ``series`` is left
    untouched.
    """
    unpadded = []
    for token in series:
        if token != len(word2event):
            unpadded.append(token)
    return unpadded

In [211]:
# Round-trip check: take the first cell of the training frame (row 0,
# column 0), parse it back into token ids, drop padding and write it as MIDI.
first_cell_text = df_train.values[0][0]
lst_int = list(map(int, first_cell_text.split(' ')))
utils.write_midi(remove_padding(lst_int, word2event), word2event, 'test.midi')