# Imports

In [None]:
import os
import json
import music21 as m21
import numpy as np
import tensorflow.keras as keras

## Functions

In [None]:

def load_songs(dataset_path):
    """Parse every ``.krn`` file found under ``dataset_path``.

    The directory tree is walked recursively with ``os.walk``.

    Args:
        dataset_path: Root directory to search.

    Returns:
        A list with the result of ``m21.converter.parse`` for each matching
        file, in the order ``os.walk`` yields them. Empty when nothing
        matches or the directory does not exist.
    """
    songs = []
    for path, _, files in os.walk(dataset_path):
        for file in files:
            # Match the actual extension: the former ``file[-3:] == "krn"``
            # check also accepted names that merely end in "krn".
            if file.endswith(".krn"):
                songs.append(m21.converter.parse(os.path.join(path, file)))
    return songs


# Allowed note/rest durations, expressed in quarter lengths
# (compared against ``duration.quarterLength`` by has_acceptable_durations).
ACCEPTABLE_DURATIONS = [
    0.25, # 16th note
    0.5, # 8th note
    0.75, # dotted 8th note
    1.0, # quarter note
    1.5, # dotted quarter note
    2, # half note
    3, # dotted half note
    4 # whole note
]



def has_acceptable_durations(song, acceptable_durations):
    """Tell whether every note and rest in ``song`` has an allowed duration.

    Args:
        song: Object whose ``flat.notesAndRests`` yields events exposing
            ``duration.quarterLength``.
        acceptable_durations: Container of allowed quarter-length values.

    Returns:
        True when each event's quarter length is a member of
        ``acceptable_durations`` (including when there are no events),
        False as soon as one is not.
    """
    return all(
        event.duration.quarterLength in acceptable_durations
        for event in song.flat.notesAndRests
    )


def transpose(song):
    """Transpose ``song`` to C major or A minor.

    The key is the first ``m21.key.Key`` found in the first measure of the
    first part. When no such key exists, or its mode is neither "major" nor
    "minor", the key is estimated with ``song.analyze("key")`` instead.

    Args:
        song: music21 stream to transpose.

    Returns:
        The value of ``song.transpose(interval)``, where ``interval`` leads
        from the key's tonic to C (major mode) or to A (otherwise).
    """
    key = _notated_key(song)

    # Estimate the key when none is notated or its mode cannot be mapped to
    # one of the two targets below.
    if key is None or key.mode not in ("major", "minor"):
        key = song.analyze("key")

    target = "C" if key.mode == "major" else "A"
    interval = m21.interval.Interval(key.tonic, m21.pitch.Pitch(target))
    return song.transpose(interval)


def _notated_key(song):
    """Return the first Key in the first measure of the first part, or None."""
    first_part = next(iter(song.getElementsByClass(m21.stream.Part)), None)
    if first_part is None:
        return None

    measures = first_part.getElementsByClass(m21.stream.Measure)
    first_measure = next(iter(measures), None)
    if first_measure is None:
        return None

    # Scan the whole measure rather than relying on a fixed element index,
    # which breaks when the measure holds fewer or differently ordered items.
    for element in first_measure:
        if isinstance(element, m21.key.Key):
            return element
    return None


def encode_song(song, time_step=0.25):
    """Encode ``song`` as a space-separated time-series string.

    Every event from ``song.flat.notesAndRests`` occupies
    ``int(quarterLength / time_step)`` steps: the first step holds the
    event's symbol and each remaining step holds ``"_"``.

    Symbols:
        * note  -> its MIDI pitch number
        * chord -> the MIDI number of its highest pitch
        * rest or any other event -> ``"r"``

    Args:
        song: Object whose ``flat.notesAndRests`` yields music21 events.
        time_step: Length of one step in quarter lengths (0.25 = 16th note).

    Returns:
        The encoded song, e.g. ``"60 _ _ _ r _ 62"``; ``""`` for no events.
    """
    encoded_song = []
    for event in song.flat.notesAndRests:
        # Every event gets its own symbol. Previously a chord (or any other
        # non-Note, non-Rest event) reused the preceding event's symbol, or
        # raised UnboundLocalError when it came first.
        if event.isNote:
            symbol = event.pitch.midi
        elif event.isChord:
            symbol = max(pitch.midi for pitch in event.pitches)
        else:
            # Encoding unknown events as silence keeps the time grid aligned.
            symbol = "r"

        # note/rest into time series
        steps = int(event.duration.quarterLength / time_step)
        if steps > 0:
            encoded_song.append(symbol)
            encoded_song.extend(["_"] * (steps - 1))

    return " ".join(map(str, encoded_song))


def preprocess(dataset_path, save_dir="dataset"):
    """Encode every usable song under ``dataset_path`` into its own text file.

    Songs are loaded with ``load_songs``; those containing a duration outside
    ``ACCEPTABLE_DURATIONS`` are skipped. Each remaining song is transposed,
    encoded and written to ``save_dir`` in a file named after the song's
    index in the loaded list.

    Args:
        dataset_path: Root directory holding the source songs.
        save_dir: Directory receiving the encoded files. It is created when
            missing. Defaults to ``"dataset"``.
    """
    songs = load_songs(dataset_path)

    # Without this the first open() below fails on a fresh checkout.
    os.makedirs(save_dir, exist_ok=True)

    for i, song in enumerate(songs):
        # duration filter
        if not has_acceptable_durations(song, ACCEPTABLE_DURATIONS):
            continue

        encoded_song = encode_song(transpose(song))

        # save songs to text file
        save_path = os.path.join(save_dir, str(i))
        with open(save_path, "w") as fp:
            fp.write(encoded_song)

        if i % 10 == 0:
            print(f"Song {i} out of {len(songs)} processed")


def load(file_path):
    """Return the entire text content of the file at ``file_path``."""
    with open(file_path, "r") as handle:
        return handle.read()


def create_single_file_dataset(dataset_path, file_dataset_path, sequence_length):
    """Merge all encoded songs under ``dataset_path`` into a single string.

    Each song is followed by a space and ``sequence_length`` copies of the
    ``"/ "`` delimiter. The last character of the merged text (the trailing
    space) is dropped, and the result is written to ``file_dataset_path``.

    Args:
        dataset_path: Directory tree containing one encoded song per file.
        file_dataset_path: Destination file for the merged text.
        sequence_length: Number of ``"/"`` delimiter symbols after each song.

    Returns:
        The merged text that was written to ``file_dataset_path``.
    """
    new_song_delimiter = "/ " * sequence_length

    # Collect the pieces and join once; appending to a growing string in the
    # loop re-copies everything accumulated so far on each iteration.
    pieces = []
    for path, _, files in os.walk(dataset_path):
        for file in files:
            song = load(os.path.join(path, file))
            pieces.append(song + " " + new_song_delimiter)

    # remove the trailing space
    songs = "".join(pieces)[:-1]

    with open(file_dataset_path, "w") as fp:
        fp.write(songs)

    return songs


def create_mapping(songs, mapping_path):
    """Write a symbol -> integer lookup table for ``songs`` as JSON.

    Args:
        songs: Whitespace-separated string of symbols.
        mapping_path: Destination path of the JSON file.
    """
    # Sort the vocabulary so the same input always yields the same mapping.
    # Iterating a bare set of strings can change order between interpreter
    # runs, which would make a regenerated mapping disagree with a saved one.
    vocabulary = sorted(set(songs.split()))
    mappings = {symbol: i for i, symbol in enumerate(vocabulary)}

    with open(mapping_path, "w") as fp:
        json.dump(mappings, fp, indent=4)


def convert_songs_to_int(songs, mapping_path="mapping.json"):
    """Translate a string of symbols into their integer codes.

    Args:
        songs: Whitespace-separated string of symbols.
        mapping_path: JSON file holding the symbol -> integer table.
            Defaults to ``"mapping.json"``.

    Returns:
        A list with one integer per symbol, in input order.

    Raises:
        KeyError: If a symbol is missing from the mapping.
    """
    # load mappings
    with open(mapping_path, "r") as fp:
        mappings = json.load(fp)

    # map songs to int
    return [mappings[symbol] for symbol in songs.split()]


def generate_training_sequences(sequence_length, file_dataset_path="file_dataset"):
    """Build one-hot input windows and integer targets for training.

    The merged dataset is loaded, converted to integers, and cut into
    sliding windows: each input is ``sequence_length`` consecutive symbols
    and its target is the symbol that follows.

    Args:
        sequence_length: Number of symbols in each input window.
        file_dataset_path: Path of the merged dataset file. Defaults to
            ``"file_dataset"``.

    Returns:
        A tuple ``(x, y)``: ``x`` is the one-hot encoding of the windows,
        sized (# of sequences, sequence length, vocabulary size); ``y`` is a
        NumPy array holding the integer target of each window.
    """
    songs = load(file_dataset_path)
    int_songs = convert_songs_to_int(songs)

    num_sequences = len(int_songs) - sequence_length
    x = [int_songs[i:i + sequence_length] for i in range(num_sequences)]
    y = [int_songs[i + sequence_length] for i in range(num_sequences)]

    # one-hot
    # NOTE(review): this counts the distinct codes present in the data, so it
    # presumably relies on every mapped symbol occurring at least once --
    # confirm against how the mapping file is produced.
    vocabulary_size = len(set(int_songs))
    x = keras.utils.to_categorical(x, num_classes=vocabulary_size)
    y = np.array(y)

    return x, y



In [None]:
# 1) Encode each usable song under deutschl/test into its own file in ./dataset.
preprocess("deutschl/test")
# 2) Merge the encoded songs into ./file_dataset, each followed by 64 "/" symbols.
songs = create_single_file_dataset(dataset_path="dataset", file_dataset_path="file_dataset", sequence_length=64)
# 3) Save the symbol -> integer lookup table for the merged text.
create_mapping(songs, "mapping.json")