In [3]:
import music21 as m21
from pathlib import Path
import pickle
import pandas as pd
import json
from collections import Counter

ModuleNotFoundError: No module named 'pandas'

In [None]:
#helper functions and data structures

pitches_dict = {
    0 : ["C","B#","D--"], # nn.Linear(input_size+context_size, 3)
    1 : ["C#","B##","D-"], # nn.Linear(input_size+context_size, 2)
    2 : ["D","C##","E--"], # nn.Linear(input_size+context_size, 3)
    3 : ["D#","E-","F--"],
    4 : ["E","D##","F-"],
    5 : ["F","E#","G--"],
    6 : ["F#","E##","G-"],
    7 : ["G","F##","A--"],
    8 : ["G#","A-"],
    9 : ["A","G##","B--"],
    10 : ["A#","B-","C--"],
    11 : ["B","A##","C-"]
}

accepted_pitches = [ii for i in pitches_dict.values() for ii in i]
print([e for e in enumerate(accepted_pitches)])

double_acc_pitches = [ii for i in pitches_dict.values() for ii in i if ii.endswith("##") or  ii.endswith("--") ]
print(double_acc_pitches)

def score2midi_numbers(score):
    return [p.midi%12 for n in score.flat.notes for p in n.pitches]

def score2pitches(score):
    return [p.name for n in score.flat.notes for p in n.pitches]

def score2onsets(score):
    return [n.offset for n in score.flat.notes for p in n.pitches]

def score2durations(score):
    return [n.duration.quarterLength for n in score.flat.notes for p in n.pitches]

interval_dict = {
    0 : ["P1","d2","A7"], 
    1 : ["m2","A1"], 
    2 : ["M2","d3","AA1"], 
    3 : ["m3","A2"],
    4 : ["M3","d4","AA2"],
    5 : ["P4","A3"],
    6 : ["d5","A4"],
    7 : ["P5","d6","AA4"],
    8 : ["m6","A5"],
    9 : ["M6","d7","AA5"],
    10 : ["m7","A6"],
    11 : ["M7","d1","AA6"]
}

accepted_intervals = [ii for i in interval_dict.values() for ii in i]
print([e for e in enumerate(accepted_intervals)])

def transp_score(score):
    """ For each input return len(accepted_intervals) transposed scores"""
    return [score.transpose(interval) for interval in accepted_intervals]

def transp_note_list(note_list):
    """ For each input return len(accepted_intervals) transposed list of notes"""
    return [[n.transpose(interval) for n in note_list] for interval in accepted_intervals]

def acc_simple_enough(score,accepted_ratio = 0.2 ):
    pitches = score2pitches(score)
    double_acc = sum(el in double_acc_pitches for el in pitches)
    if double_acc/len(pitches) < accepted_ratio:
        return True
    else:
        return False

# OPND dataset

In [3]:
opnd_basepath = Path("datasets/opnd/")

In [82]:
opnd_dataset_dict = []

def parenthetic_contents(string):
    """Parse nested parentheses."""
    stack = []
    for i, c in enumerate(string):
        if c == '(':
            stack.append(i)
        elif c == ')' and stack:
            if len(stack) ==2: # only consider elements at depth 2
                start = stack.pop()
                yield string[start + 1: i]


print("Total scores to process:", len(list(opnd_basepath.iterdir())))
for ifile,file in enumerate(opnd_basepath.iterdir()):
    with open(file,'r') as f:
        file_content = f.read()
    print("Processing file", ifile, str(file))
    strings_list = list(parenthetic_contents(file_content))
    quadruples_list = [s.split(" ") for s in strings_list ]
    # sort by start input and pitch
    quadruples_list = sorted(quadruples_list, key=lambda tup: int(tup[0]))
    # put the information in a list of dicts
    pitches = [q[1].strip('"').replace("n","").replace("s","#").replace("f","-") for q in quadruples_list]
    # transform pitches in music21 notes
    m21_notes = [m21.note.Note(p) for p in pitches]
    # transpose to all intervals
    for interval in accepted_intervals:
        transp_m21_notes = [n.transpose(interval) for n in m21_notes ]
        transp_pitches = [n.pitch.name for n in transp_m21_notes]
        if all(p in accepted_pitches for p in transp_pitches): #consider only ok pitches
            opnd_dataset_dict.append({
                'onset': [int(q[0]) for q in quadruples_list],
                'duration' : [int(q[2]) for q in quadruples_list],
                'pitches': transp_pitches,
                'midi_number' : [n.pitch.midi%12 for n in transp_m21_notes],
                'transposed_of' : interval,
                'key_signature' : None,
                'original_ks' : None,
                'original_path' : str(file)
            })

Total scores to process: 216
Processing file 0 datasets\opnd\bachbgcant000905m.opnd-m
Processing file 1 datasets\opnd\bachbgcant002003m.opnd-m
Processing file 2 datasets\opnd\bachbgcant004206m.opnd-m
Processing file 3 datasets\opnd\bachbgcant004407m.opnd-m
Processing file 4 datasets\opnd\bachbgcant004902m.opnd-m
Processing file 5 datasets\opnd\bachbgcant012201m.opnd-m
Processing file 6 datasets\opnd\bachbgcant013606m.opnd-m
Processing file 7 datasets\opnd\bachbgcant015303m.opnd-m
Processing file 8 datasets\opnd\bachbgcant015308m.opnd-m
Processing file 9 datasets\opnd\bachbgcant016503m.opnd-m
Processing file 10 datasets\opnd\bachbgcant017104m.opnd-m
Processing file 11 datasets\opnd\bachbgchoral027601m.opnd-m
Processing file 12 datasets\opnd\bachbgchoral030201m.opnd-m
Processing file 13 datasets\opnd\bachbgchoral032701m.opnd-m
Processing file 14 datasets\opnd\bachbgchoral035201m.opnd-m
Processing file 15 datasets\opnd\bachbgchoral037701m.opnd-m
Processing file 16 datasets\opnd\bachbgchor

Processing file 131 datasets\opnd\haydndoverquartop55n101m.opnd-m
Processing file 132 datasets\opnd\haydndoverquartop55n302m.opnd-m
Processing file 133 datasets\opnd\haydndoverquartop64n102m.opnd-m
Processing file 134 datasets\opnd\haydndoverquartop64n202m.opnd-m
Processing file 135 datasets\opnd\haydndoversyms-09902m.opnd-m
Processing file 136 datasets\opnd\haydndoversyms-10004m.opnd-m
Processing file 137 datasets\opnd\haydndoversyms-10202m.opnd-m
Processing file 138 datasets\opnd\haydndoversyms-10302m.opnd-m
Processing file 139 datasets\opnd\mozartbhconck45903m.opnd-m
Processing file 140 datasets\opnd\mozartbhconck62202m.opnd-m
Processing file 141 datasets\opnd\mozartbhduosk42302m.opnd-m
Processing file 142 datasets\opnd\mozartbhduosk42402m.opnd-m
Processing file 143 datasets\opnd\mozartbhqrtetsk08003m.opnd-m
Processing file 144 datasets\opnd\mozartbhqrtetsk15602m.opnd-m
Processing file 145 datasets\opnd\mozartbhqrtetsk15802m.opnd-m
Processing file 146 datasets\opnd\mozartbhqrtetsk16

In [78]:
for d in opnd_dataset_dict:
    print(d["midi_number"][:6],d["pitches"][:6],d["transposed_of"],d["original_path"])

[9, 4, 6, 4, 2, 1] ['A', 'E', 'F#', 'E', 'D', 'C#'] P1 datasets\opnd\bachbgcant000905m.opnd-m
[9, 4, 6, 4, 2, 1] ['B--', 'F-', 'G-', 'F-', 'E--', 'D-'] d2 datasets\opnd\bachbgcant000905m.opnd-m
[10, 5, 7, 5, 3, 2] ['B-', 'F', 'G', 'F', 'E-', 'D'] m2 datasets\opnd\bachbgcant000905m.opnd-m
[10, 5, 7, 5, 3, 2] ['A#', 'E#', 'F##', 'E#', 'D#', 'C##'] A1 datasets\opnd\bachbgcant000905m.opnd-m
[11, 6, 8, 6, 4, 3] ['B', 'F#', 'G#', 'F#', 'E', 'D#'] M2 datasets\opnd\bachbgcant000905m.opnd-m
[11, 6, 8, 6, 4, 3] ['C-', 'G-', 'A-', 'G-', 'F-', 'E-'] d3 datasets\opnd\bachbgcant000905m.opnd-m
[0, 7, 9, 7, 5, 4] ['C', 'G', 'A', 'G', 'F', 'E'] m3 datasets\opnd\bachbgcant000905m.opnd-m
[1, 8, 10, 8, 6, 5] ['C#', 'G#', 'A#', 'G#', 'F#', 'E#'] M3 datasets\opnd\bachbgcant000905m.opnd-m
[1, 8, 10, 8, 6, 5] ['D-', 'A-', 'B-', 'A-', 'G-', 'F'] d4 datasets\opnd\bachbgcant000905m.opnd-m
[2, 9, 11, 9, 7, 6] ['D', 'A', 'B', 'A', 'G', 'F#'] P4 datasets\opnd\bachbgcant000905m.opnd-m
[3, 10, 0, 10, 8, 7] ['E-', 'B-

In [85]:
# save dataset
with open('musedata.pkl', 'wb') as fid:
     pickle.dump(opnd_dataset_dict, fid)

# ASAP Dataset

In [8]:
asap_basepath = Path("../asap-dataset/")

In [11]:
# load the dataset info
df = pd.read_csv(Path(asap_basepath,'metadata.csv'))
df = df.drop_duplicates(subset=["title","composer"])

with open(Path(asap_basepath,'asap_annotations.json')) as json_file:
      json_data = json.load(json_file)

# accidentals_list = [list(json_data[m]["midi_score_key_signatures"].values())[0][1] for m  in df["midi_performance"]]
xml_score_paths = list(df["xml_score"])

print("N of pieces: ", len(xml_score_paths))

N of pieces:  222


In [12]:
asap_dataset_dict = []

for i,path in enumerate(xml_score_paths):
    print("About to process",path)
    score = m21.converter.parse(Path(asap_basepath,path))
    # generate the transpositions for the piece
    all_scores = transp_score(score)
    #delete the pieces with non accepted pitches (e.g. triple sharps)
    intervals= []
    scores = []
    for s,interval in zip(all_scores,accepted_intervals):
        if all(pitch in accepted_pitches for pitch in score2pitches(s)):
            scores.append(s)
            intervals.append(interval)
    #append all information to the dictionary 
    asap_dataset_dict.extend([
        {
            'onset':score2onsets(s),
            'duration' : score2durations(s),
            'pitches': score2pitches(s),
            'transposed_of': interval,
            'midi_number' : score2midi_numbers(s),
            'key_signature' : s.parts[0].flat.getElementsByClass(m21.key.KeySignature)[0].sharps,
            'original_ks' : score.parts[0].flat.getElementsByClass(m21.key.KeySignature)[0].sharps,
            'original_path' : str(path),
            'composer' : str(path).split("/")[0]
            } 
        for s,interval in zip(scores,intervals)])

About to process Bach/Fugue/bwv_846/xml_score.musicxml
About to process Bach/Fugue/bwv_848/xml_score.musicxml
About to process Bach/Fugue/bwv_854/xml_score.musicxml
About to process Bach/Fugue/bwv_856/xml_score.musicxml
About to process Bach/Fugue/bwv_857/xml_score.musicxml
About to process Bach/Fugue/bwv_858/xml_score.musicxml
About to process Bach/Fugue/bwv_860/xml_score.musicxml
About to process Bach/Fugue/bwv_862/xml_score.musicxml
About to process Bach/Fugue/bwv_863/xml_score.musicxml
About to process Bach/Fugue/bwv_864/xml_score.musicxml
About to process Bach/Fugue/bwv_865/xml_score.musicxml
About to process Bach/Fugue/bwv_866/xml_score.musicxml
About to process Bach/Fugue/bwv_867/xml_score.musicxml
About to process Bach/Fugue/bwv_868/xml_score.musicxml
About to process Bach/Fugue/bwv_870/xml_score.musicxml
About to process Bach/Fugue/bwv_873/xml_score.musicxml
About to process Bach/Fugue/bwv_874/xml_score.musicxml
About to process Bach/Fugue/bwv_875/xml_score.musicxml
About to p

About to process Chopin/Etudes_op_25/2/xml_score.musicxml
About to process Chopin/Etudes_op_25/4/xml_score.musicxml
About to process Chopin/Etudes_op_25/5/xml_score.musicxml
About to process Chopin/Etudes_op_25/8/xml_score.musicxml
About to process Chopin/Polonaises/53/xml_score.musicxml
About to process Chopin/Scherzos/20/xml_score.musicxml
About to process Chopin/Scherzos/31/xml_score.musicxml
About to process Chopin/Scherzos/39/xml_score.musicxml
About to process Chopin/Sonata_2/1st_no_repeat/xml_score.musicxml
About to process Chopin/Sonata_2/2nd/xml_score.musicxml
About to process Chopin/Sonata_2/3rd/xml_score.musicxml
About to process Chopin/Sonata_2/4th/xml_score.musicxml
About to process Chopin/Sonata_3/2nd/xml_score.musicxml
About to process Chopin/Sonata_3/3rd/xml_score.musicxml
About to process Chopin/Sonata_3/4th/xml_score.musicxml
About to process Debussy/Images_Book_1/1_Reflets_dans_lEau/xml_score.musicxml
About to process Debussy/Pour_le_Piano/1/xml_score.musicxml
About 

In [13]:
# save dataset
with open('asap.pkl', 'wb') as fid:
     pickle.dump(asap_dataset_dict, fid)

In [7]:
#import dataset
with open(Path('datasets','asap.pkl'), 'rb') as fid:
     asap_dataset_dict = pickle.load(fid)
        

[<class 'list'>, <class 'list'>, <class 'list'>, <class 'str'>, <class 'list'>, <class 'int'>, <class 'int'>, <class 'str'>, <class 'str'>]


In [14]:
author_dict = Counter()
for e in asap_dataset_dict:
    author_dict[e["composer"]] +=1

print(author_dict)

Counter({'Bach': 1232, 'Beethoven': 1028, 'Chopin': 508, 'Schubert': 220, 'Haydn': 217, 'Liszt': 212, 'Schumann': 161, 'Mozart': 114, 'Rachmaninoff': 83, 'Ravel': 56, 'Debussy': 28, 'Scriabin': 24, 'Glinka': 21, 'Brahms': 17, 'Prokofiev': 14, 'Balakirev': 7})


In [27]:
#dataset is too big for github, we need to split it in multiple files

baroque_composers = ['Bach','Haydn','Mozart']
classical_composers = ['Beethoven']
romantic_composers = ['Schubert','Schumann','Chopin', 'Brahms']

baroque_asap_dataset_dict = [e for e in asap_dataset_dict
                              if e["composer"] in baroque_composers]

classical_asap_dataset_dict = [e for e in asap_dataset_dict
                              if e["composer"] in classical_composers]

romantic_asap_dataset_dict = [e for e in asap_dataset_dict
                              if e["composer"] in romantic_composers]

In [28]:
with open(Path('datasets','classical_asap.pkl'), 'wb') as fid:
     pickle.dump(classical_asap_dataset_dict, fid)
        
with open(Path('datasets','baroque_asap.pkl'), 'wb') as fid:
     pickle.dump(baroque_asap_dataset_dict, fid)
        
with open(Path('datasets','romantic_asap.pkl'), 'wb') as fid:
     pickle.dump(romantic_asap_dataset_dict, fid)

In [2]:
#load

# load the asap datasets
with open(Path(basepath,'datasets','baroque_asap.pkl'), 'rb') as fid:
     dataset_baroque = pickle.load( fid)

with open(Path(basepath,'datasets','classical_asap.pkl'), 'rb') as fid:
     dataset_classical = pickle.load( fid)

with open(Path(basepath,'datasets','romantic_asap.pkl'), 'rb') as fid:
     dataset_romantic = pickle.load( fid)

# merge the three files together
full_dict_dataset = dataset_baroque + dataset_classical + dataset_romantic
# full_dict_dataset = dataset_baroque + dataset_classical

NameError: name 'basepath' is not defined