# Dataset Split for the ASAP dataset

We take five Chopin compositions from the [ASAP dataset](https://github.com/fosfrancesco/asap-dataset): three of the compositions are used for training, one for validation, and one for testing.

We use the [v1.2 release](https://github.com/fosfrancesco/asap-dataset/releases) of the ASAP dataset.


In [None]:
import os
import pandas as pd
import pretty_midi as pm
import warnings
warnings.filterwarnings("ignore")


# Root folder of the ASAP dataset. Set the ASAP_DATASET_PATH environment variable to point at
# your own copy; otherwise the default below is used (please change it to your ASAP dataset path).
dataset_path = os.environ.get('ASAP_DATASET_PATH', '/mnt/c/Users/lel79bc/Desktop/Datasets/asap-dataset-1.2')

# A composition is kept only if it has strictly more than this many performances
MIN_PERFORMANCES = 15


#################################################
# Create dataset split
#################################################

# Get metadata
metadata = pd.read_csv(os.path.join(dataset_path, 'metadata.csv'))

# Get only the Chopin compositions.
# reset_index returns a fresh frame, so later column assignments do not write into a filtered slice.
metadata = metadata[metadata['composer'] == 'Chopin'].reset_index(drop=True)

# Count the performances of every composition once, keeping the order of first appearance
performance_counts = metadata.groupby('title', sort=False).size()

# Get the Chopin compositions with more than MIN_PERFORMANCES performances
compositions = [title for title, count in performance_counts.items() if count > MIN_PERFORMANCES]
if len(compositions) < 3:
    raise ValueError(
        f'Need at least 3 compositions with more than {MIN_PERFORMANCES} performances '
        f'to build train/val/test splits, found {len(compositions)}.'
    )

# Set the last composition to the test set, the second last to the validation set,
# and every remaining composition to the train set
split_by_composition = {composition: 'train' for composition in compositions}
split_by_composition[compositions[-1]] = 'test'
split_by_composition[compositions[-2]] = 'val'

# Report the split that is actually assigned (not one guessed from the loop position)
for composition in compositions:
    print('Composition:', composition, '\tNumber of Performances:', performance_counts[composition], '\tSplit:', split_by_composition[composition])

# Filter the metadata to only include these compositions
metadata = metadata[metadata['title'].isin(compositions)].reset_index(drop=True)

# Expand a "split" column to the right of the dataframe
metadata['split'] = metadata['title'].map(split_by_composition)

# Save the metadata with the new split column
metadata.to_csv('ASAP_dataset_split.csv', index=False)
metadata

Composition: Ballades_1 	Number of Performances: 18 	Split: train
Composition: Etudes_op_10_1 	Number of Performances: 25 	Split: train
Composition: Etudes_op_10_4 	Number of Performances: 22 	Split: train
Composition: Etudes_op_10_8 	Number of Performances: 28 	Split: val
Composition: Etudes_op_25_11 	Number of Performances: 24 	Split: test


Unnamed: 0,composer,title,folder,xml_score,midi_score,midi_performance,performance_annotations,midi_score_annotations,maestro_midi_performance,maestro_audio_performance,start,end,audio_performance,split
0,Chopin,Ballades_1,Chopin/Ballades/1,Chopin/Ballades/1/xml_score.musicxml,Chopin/Ballades/1/midi_score.mid,Chopin/Ballades/1/Ali01.mid,Chopin/Ballades/1/Ali01_annotations.txt,Chopin/Ballades/1/midi_score_annotations.txt,,,,,,train
1,Chopin,Ballades_1,Chopin/Ballades/1,Chopin/Ballades/1/xml_score.musicxml,Chopin/Ballades/1/midi_score.mid,Chopin/Ballades/1/BuiJL04M.mid,Chopin/Ballades/1/BuiJL04M_annotations.txt,Chopin/Ballades/1/midi_score_annotations.txt,{maestro}/2017/MIDI-Unprocessed_046_PIANO046_M...,{maestro}/2017/MIDI-Unprocessed_046_PIANO046_M...,,,Chopin/Ballades/1/BuiJL04M.wav,train
2,Chopin,Ballades_1,Chopin/Ballades/1,Chopin/Ballades/1/xml_score.musicxml,Chopin/Ballades/1/midi_score.mid,Chopin/Ballades/1/Day04.mid,Chopin/Ballades/1/Day04_annotations.txt,Chopin/Ballades/1/midi_score_annotations.txt,,,,,,train
3,Chopin,Ballades_1,Chopin/Ballades/1,Chopin/Ballades/1/xml_score.musicxml,Chopin/Ballades/1/midi_score.mid,Chopin/Ballades/1/Dossin05.mid,Chopin/Ballades/1/Dossin05_annotations.txt,Chopin/Ballades/1/midi_score_annotations.txt,,,,,,train
4,Chopin,Ballades_1,Chopin/Ballades/1,Chopin/Ballades/1/xml_score.musicxml,Chopin/Ballades/1/midi_score.mid,Chopin/Ballades/1/JIA06M.mid,Chopin/Ballades/1/JIA06M_annotations.txt,Chopin/Ballades/1/midi_score_annotations.txt,{maestro}/2004/MIDI-Unprocessed_SMF_12_01_2004...,{maestro}/2004/MIDI-Unprocessed_SMF_12_01_2004...,,,Chopin/Ballades/1/JIA06M.wav,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,Chopin,Etudes_op_25_11,Chopin/Etudes_op_25/11,Chopin/Etudes_op_25/11/xml_score.musicxml,Chopin/Etudes_op_25/11/midi_score.mid,Chopin/Etudes_op_25/11/SunJay03.mid,Chopin/Etudes_op_25/11/SunJay03_annotations.txt,Chopin/Etudes_op_25/11/midi_score_annotations.txt,,,,,,test
113,Chopin,Etudes_op_25_11,Chopin/Etudes_op_25/11,Chopin/Etudes_op_25/11/xml_score.musicxml,Chopin/Etudes_op_25/11/midi_score.mid,Chopin/Etudes_op_25/11/Tan03.mid,Chopin/Etudes_op_25/11/Tan03_annotations.txt,Chopin/Etudes_op_25/11/midi_score_annotations.txt,,,,,,test
114,Chopin,Etudes_op_25_11,Chopin/Etudes_op_25/11,Chopin/Etudes_op_25/11/xml_score.musicxml,Chopin/Etudes_op_25/11/midi_score.mid,Chopin/Etudes_op_25/11/Tang03.mid,Chopin/Etudes_op_25/11/Tang03_annotations.txt,Chopin/Etudes_op_25/11/midi_score_annotations.txt,,,,,,test
115,Chopin,Etudes_op_25_11,Chopin/Etudes_op_25/11,Chopin/Etudes_op_25/11/xml_score.musicxml,Chopin/Etudes_op_25/11/midi_score.mid,Chopin/Etudes_op_25/11/WangA02M.mid,Chopin/Etudes_op_25/11/WangA02M_annotations.txt,Chopin/Etudes_op_25/11/midi_score_annotations.txt,{maestro}/2015/MIDI-Unprocessed_R1_D2-13-20_mi...,{maestro}/2015/MIDI-Unprocessed_R1_D2-13-20_mi...,,,Chopin/Etudes_op_25/11/WangA02M.wav,test


In [44]:
#################################################
# Print dataset statistics
#################################################

# Collect one record per split and build the DataFrame once at the end, instead of growing it
# row by row through the private `DataFrame._append` method.
stats_records = []

for split in ['train', 'val', 'test']:
    print(f'Calculating statistics for {split} split...')
    metadata_split = metadata[metadata['split'] == split]

    num_compositions = metadata_split['title'].nunique()
    num_performances = len(metadata_split)

    # Calculate the total duration by summing the end time of every performance MIDI file
    duration = 0
    for i, row in metadata_split.iterrows():
        # `i` is the row's index label in the full metadata frame, hence the total is len(metadata)
        print(f'Get duration for row {i+1}/{len(metadata)}', end='\r')
        midi_performance = os.path.join(dataset_path, row['midi_performance'])
        midi_data = pm.PrettyMIDI(midi_performance)
        duration += midi_data.get_end_time()
    print()  # Move past the carriage-return progress line

    stats_records.append({
        'split': split,
        'num_compositions': num_compositions,
        'num_performances': num_performances,
        'duration (hours)': duration / 60 / 60  # Convert to hours
    })

# Passing `columns` fixes the column order and keeps the header even if no record was collected
stats_df = pd.DataFrame(stats_records, columns=['split', 'num_compositions', 'num_performances', 'duration (hours)'])

stats_df

Calculating statistics for train split...
Get duration for row 65/117
Calculating statistics for val split...
Get duration for row 93/117
Calculating statistics for test split...
Get duration for row 117/117


Unnamed: 0,split,num_compositions,num_performances,duration (hours)
0,train,3,65,4.211898
1,val,1,28,1.108869
2,test,1,24,1.435413
