In [None]:
#!wget 'https://kern.humdrum.org/cgi-bin/ksdata?l=essen/europa/deutschl&format=recursive' --wait 200

--2024-03-06 04:30:22--  https://kern.humdrum.org/cgi-bin/ksdata?l=essen/europa/deutschl&format=recursive
Resolving kern.humdrum.org (kern.humdrum.org)... 171.67.229.81
Connecting to kern.humdrum.org (kern.humdrum.org)|171.67.229.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/x-zip-compressed]
Saving to: ‘ksdata?l=essen%2Feuropa%2Fdeutschl&format=recursive’

ksdata?l=essen%2Feu     [  <=>               ]   3.19M  9.36MB/s    in 0.3s    

2024-03-06 04:30:38 (9.36 MB/s) - ‘ksdata?l=essen%2Feuropa%2Fdeutschl&format=recursive’ saved [3342261]



In [None]:
#!unzip '/content/drive/MyDrive/Colab Notebooks/melody_generator_lstm/Data_pre_processing/ksdata?l=essen%2Feuropa%2Fdeutschl&format=recursive'

In [None]:
#!mv '/content/drive/MyDrive/Colab Notebooks/melody_generator_lstm/Data_pre_processing/essen/europa/deutschl' '/content/drive/MyDrive/Colab Notebooks/melody_generator_lstm/Data_pre_processing'

### Load songs
### Filter out songs that contain unacceptable durations
### Transpose songs to C major / A minor
### Encode songs with a music time-series representation
### Save each encoded song to a text file

In [None]:
# Install music21 with the %pip magic so the package lands in the environment
# of the running kernel (a plain "!pip" shell call may target another Python).
# TODO: pin the music21 version the experiments were run with.
%pip install -q music21



In [None]:
import os
import music21 as m21
import json
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

In [None]:
from typing import Sequence
# Root folder (on the mounted Drive) holding the raw corpus and generated files.
_DATA_ROOT = "/content/drive/MyDrive/Colab Notebooks/melody_generator_lstm/Data_pre_processing"

# Folder that is walked recursively for kern files.
KERN_DATASET_PATH = _DATA_ROOT + "/deutschl/erk"

# Note/rest lengths, expressed in quarter notes, that a song may contain;
# songs with any other length are skipped by preprocess().
ACCEPTABLE_DURATION = [0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 3.0, 4.0]

# Folder that receives one encoded text file per kept song.
SAVE_DIR = _DATA_ROOT + "/dataset"

# Relative paths: both are resolved against the current working directory.
SINGLE_FILE_DATASET = "file_dataset"
MAPPING_PATH = "mapping.json"

# Length of each training input window, and the number of "/" symbols
# written between two songs in the single-file dataset.
SEQUENCE_LENGTH = 64

In [None]:
# Work from the preprocessing folder: SINGLE_FILE_DATASET and MAPPING_PATH are
# relative paths, so they are read and written in the current directory.
%cd "/content/drive/MyDrive/Colab Notebooks/melody_generator_lstm/Data_pre_processing"

/content/drive/MyDrive/Colab Notebooks/melody_generator_lstm/Data_pre_processing


In [None]:
def load_songs_in_kern(dataset_path):
  """Parse every ``.krn`` file found under ``dataset_path`` with music21.

  The directory tree is walked recursively. Directories and files are visited
  in sorted order so that the position of each song in the returned list
  (which ``preprocess`` uses as the output file name) is the same on every run.

  Args:
    dataset_path: Root directory to search.

  Returns:
    A list holding the result of ``m21.converter.parse`` for each ``.krn``
    file; empty when no such file exists.
  """
  songs = []

  for path, subdirs, files in os.walk(dataset_path):
    # os.walk yields entries in arbitrary order; sorting the directory list
    # in place also fixes the order in which sub-folders are descended into.
    subdirs.sort()
    for file in sorted(files):
      # Require the dot: comparing only the last three characters also
      # matched extension-less names such as "notakrn".
      if file.endswith('.krn'):
        songs.append(m21.converter.parse(os.path.join(path, file)))
  return songs

In [None]:
def has_acceptable_durations(song, acceptable_duration):
  """Tell whether every note and rest of ``song`` has an allowed length.

  Args:
    song: Object whose ``flatten().notesAndRests`` yields events exposing
      ``duration.quarterLength``.
    acceptable_duration: Container of allowed quarter-length values.

  Returns:
    True when all events have a length found in ``acceptable_duration``
    (also when there are no events at all), False otherwise.
  """
  return all(
      event.duration.quarterLength in acceptable_duration
      for event in song.flatten().notesAndRests
  )

In [None]:
def transpose(song):
  """Transpose ``song`` to C when its key is major, to A otherwise.

  The key is taken from the first ``m21.key.Key`` object found in the first
  measure of the first part. When no such object exists (or the song has no
  part/measure), the key is estimated with ``song.analyze("key")``.

  Args:
    song: Parsed music21 score.

  Returns:
    The result of ``song.transpose(interval)``.
  """
  key = _find_notated_key(song)
  if key is None:
    key = song.analyze("key")

  # Major keys are moved onto C, every other mode onto A.
  target_tonic = 'C' if key.mode == 'major' else 'A'
  interval = m21.interval.Interval(key.tonic, m21.pitch.Pitch(target_tonic))
  return song.transpose(interval)


def _find_notated_key(song):
  """Return the first Key object of the first measure of part 0, or None."""
  try:
    first_part = song.getElementsByClass(m21.stream.Part)[0]
    first_measure = first_part.getElementsByClass(m21.stream.Measure)[0]
  except IndexError:
    # No part or no measure: nothing notated to read.
    return None

  # Scan the measure instead of reading a fixed position: the key is not
  # guaranteed to sit at index 4, and short measures made that lookup fail.
  for element in first_measure:
    if isinstance(element, m21.key.Key):
      return element
  return None


In [None]:
def encode_song(song, time_step= 0.25):
  """Encode ``song`` as a space-separated time-series string.

  Each note becomes its MIDI pitch number and each rest becomes ``r``; the
  symbol is followed by one ``_`` for every further ``time_step`` the event
  lasts. Example: a quarter-note C4 then an eighth rest, with the default
  step, gives ``"60 _ _ _ r _"``.

  Args:
    song: Object whose ``flatten().notesAndRests`` yields the events.
    time_step: Length of one step in quarter notes.

  Returns:
    The encoded song as a single string.

  Raises:
    ValueError: If an event is neither a ``m21.note.Note`` nor a
      ``m21.note.Rest``. Previously such an event reused the symbol of the
      preceding event, or failed with UnboundLocalError when it came first.
  """
  encoded_song = []
  for event in song.flatten().notesAndRests:
    if isinstance(event, m21.note.Note):
      symbol = event.pitch.midi
    elif isinstance(event, m21.note.Rest):
      symbol = 'r'
    else:
      raise ValueError(
          f"Cannot encode event of type {type(event).__name__}: "
          "only single notes and rests are supported")

    # Convert the note/rest into time-series notation: the symbol itself on
    # the first step, then "_" for each step the event is held.
    steps = int(event.duration.quarterLength / time_step)
    if steps > 0:
      encoded_song.append(symbol)
      encoded_song.extend(["_"] * (steps - 1))
  return " ".join(map(str, encoded_song))



In [None]:
def preprocess(dataset_path, save_dir=None):
  """Load, filter, transpose, encode and save every song of a kern dataset.

  Songs containing a duration outside ``ACCEPTABLE_DURATION`` are skipped.
  Each kept song is written to ``save_dir`` in a file named after the song's
  index in the loaded list, so skipped songs leave gaps in the numbering.

  Args:
    dataset_path: Root folder handed to ``load_songs_in_kern``.
    save_dir: Output folder; defaults to the module-level ``SAVE_DIR``.
  """
  if save_dir is None:
    save_dir = SAVE_DIR

  print("Loading songs...")
  songs = load_songs_in_kern(dataset_path)
  print(f"Loaded {len(songs)} songs.")

  # Create the output folder up front: on a fresh run it does not exist yet
  # and the first open() would fail with FileNotFoundError.
  # NOTE: files left over from an earlier run are not removed.
  os.makedirs(save_dir, exist_ok=True)

  for i, song in enumerate(songs):
    if not has_acceptable_durations(song, ACCEPTABLE_DURATION):
      continue

    song = transpose(song)

    encoded = encode_song(song)
    save_path = os.path.join(save_dir, str(i))
    with open(save_path, 'w') as fp:
      fp.write(encoded)
  print("Done")






In [None]:
def load(file_path):
  """Return the whole text content of the file at ``file_path``."""
  with open(file_path, 'r') as handle:
    return handle.read()

In [None]:
def create_single_file_dataset(dataset_path, file_dataset_path, sequence_length):
  """Merge all encoded songs under ``dataset_path`` into one delimited string.

  Consecutive songs are separated by a space followed by ``sequence_length``
  repetitions of ``"/ "``. No delimiter follows the last song; the result
  ends with a single space after it. The string is also written to
  ``file_dataset_path``.

  Args:
    dataset_path: Folder walked recursively; every file in it is read with
      ``load``.
    file_dataset_path: Destination file for the merged string.
    sequence_length: Number of ``"/"`` symbols between two songs.

  Returns:
    The merged string ("" when no file is found).
  """
  new_song_delimiter = "/ " * sequence_length

  # Load encoded songs in sorted order so the merged file is reproducible.
  encoded_songs = []
  for path, subdirs, files in os.walk(dataset_path):
    subdirs.sort()
    for file in sorted(files):
      encoded_songs.append(load(os.path.join(path, file)))

  # Joining places the delimiter only between songs. The former approach of
  # appending it everywhere and slicing it off with [:-len(delimiter)]
  # emptied the whole string when sequence_length was 0 ([:-0] == [:0]).
  songs = (" " + new_song_delimiter).join(encoded_songs)
  if encoded_songs:
    songs += " "

  # Save the string that contains the whole dataset.
  with open(file_dataset_path, "w") as f:
    f.write(songs)
  return songs

In [None]:
def create_mapping(songs, mapping_path):
  """Build the symbol -> integer vocabulary and save it as JSON.

  Symbols are the whitespace-separated tokens of ``songs``. They are numbered
  in sorted order so that the same dataset always yields the same mapping
  (iterating a plain ``set`` of strings can give a different order on every
  interpreter run).

  Args:
    songs: String holding the whole encoded dataset.
    mapping_path: File the JSON mapping is written to.

  Returns:
    The mapping dict that was written.
  """
  # Identify the vocabulary.
  vocabulary = sorted(set(songs.split()))
  mappings = {symbol: i for i, symbol in enumerate(vocabulary)}

  # Save the mapping.
  with open(mapping_path, "w") as f:
    json.dump(mappings, f, indent = 4)
  return mappings


In [None]:
def convert_songs_to_int(songs):
  """Translate an encoded dataset string into a list of integers.

  The symbol -> integer table is read from the JSON file at ``MAPPING_PATH``.

  Args:
    songs: Whitespace-separated symbols.

  Returns:
    One integer per symbol, in order.

  Raises:
    KeyError: If a symbol is missing from the mapping file.
  """
  with open(MAPPING_PATH, "r") as mapping_file:
    symbol_to_int = json.load(mapping_file)

  return [symbol_to_int[token] for token in songs.split()]


In [None]:

def generating_training_sequences(sequence_length):
  """Build one-hot input windows and integer targets for next-symbol training.

  The dataset is read from ``SINGLE_FILE_DATASET`` and mapped to integers.
  A window of ``sequence_length`` symbols slides over it one step at a time;
  the symbol right after each window is its target, e.g. with length 2:
  [11, 12, 13, 14] -> ([11, 12], 13), ([12, 13], 14).

  Args:
    sequence_length: Number of symbols per input window.

  Returns:
    ``(inputs, targets)``: ``inputs`` is the output of
    ``keras.utils.to_categorical`` over the windows, with one class per
    distinct integer present in the dataset; ``targets`` is a NumPy array of
    the integer symbol following each window.
  """
  # Load the songs and map them to integers.
  encoded_text = load(SINGLE_FILE_DATASET)
  int_songs = convert_songs_to_int(encoded_text)

  # N symbols give N - sequence_length windows (e.g. 100 - 64 = 36).
  window_starts = range(len(int_songs) - sequence_length)
  inputs = [int_songs[start:start + sequence_length] for start in window_starts]
  targets = [int_songs[start + sequence_length] for start in window_starts]

  # One-hot encode the inputs: (number of windows, sequence length, vocab size)
  # [[0, 1, 2], [1, 1, 2]] -> [[[1, 0, 0], [0, 1, 0], [0, 0, 1]], [[0, 1, 0], [0, 1, 0], [0, 0, 1]]]
  vocabulary_size = len(set(int_songs))
  inputs = keras.utils.to_categorical(np.array(inputs), num_classes = vocabulary_size)
  targets = np.array(targets)
  return inputs, targets



In [None]:
# Run the whole pipeline: encode every song, merge them into one file,
# build the vocabulary, then cut the training sequences.
# preprocess() loads the corpus itself, so it is not loaded separately here
# (doing so parsed every kern file twice and the result was never used).
preprocess(KERN_DATASET_PATH)
songs = create_single_file_dataset(SAVE_DIR, SINGLE_FILE_DATASET, SEQUENCE_LENGTH)
create_mapping(songs, MAPPING_PATH)
inputs, targets = generating_training_sequences(SEQUENCE_LENGTH)
print(inputs)
print(targets)


Loading songs...
Loaded 1700 songs.
Done
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0