**TODO**

<ol>
<li>Map the note names to a MIDI number like scale for analyzing intervals</li>
<li>Create a tonic dict for existing makamlar</li>
<li>Relation between measures are indicated how? (Extensions, glissando, my vocab is poor here)</li>
<li>Deal with Multi-part scores. Probably not possible in makam music</li>
</ol>

**Questions**
- XML
    - What is alter? I think numeric value indicating accidental?
    - What is "divisions" attribute?
- Makam Music
    - Can makam music have time signature change?

```Measure
    0: attributes ( even if no note is present) 
        sometimes empty, o.w.
        'divisions'
        'key'
        'time'
    1,.. notes ```

In [1]:
# Code Improved from https://github.com/burakuyar/Tools/blob/master/musicxml_player.py
# Data from https://github.com/sertansenturk/SymbTr

import os
import glob
from collections import defaultdict

import numpy as np

import xml.etree.ElementTree as ET

DATA_DIR=os.path.join(os.getcwd(), 'data')

In [20]:
def read_score(score_path):
    tree=ET.parse(score_path)
    root=tree.getroot()
    return root

# TODO: get from xml not path
def get_composer_info(score_path):
    file_name=os.path.splitext(os.path.basename(score_path))[0]
    composition_name=file_name.split('--')[-2]
    composer_name=file_name.split('--')[-1]
    return composition_name,composer_name 

def get_makam_form_usul(root):
    makam_form_usul=root.find('part/measure/direction/direction-type/words').text.split(', ')
    makam=makam_form_usul[0].split(': ')[-1]
    form=makam_form_usul[1].split(': ')[-1]
    usul=makam_form_usul[2].split(': ')[-1][:-1]
    return makam, form, usul

def parse_notes(root, record_embellishment=True):
    """
    Returns a 2D array of [[measure_idx,note_idx,note_duration,note_name]]
    Note name can only be "PitchClass Octave", "PitchClass Octave Accidental" or "Rest"
    If a note_name is not Rest and it has zero duration, that note is an embellishment.
    """
    notes=[]
    for m_idx,measure in enumerate(root.findall('part/measure')):   
        if len(measure.findall('note'))>0: # Check if the measure contains at least one note
            grace_count=0 # Count grace notes in case you don't want to record them
            for n_idx,note in enumerate(measure.findall('note')):
                dur=note.find('duration')
                if dur is None:
                    if not record_embellishment:
                        grace_count+=1
                        continue # skip the grace note
                    else:
                        dur='0' # Embellishment/Grace Note
                else:
                    dur=dur.text
                step=note.find('pitch/step')
                if step is not None:
                    step=step.text
                    octave=note.find('pitch/octave').text
                    acc = note.find('accidental')
                    if acc is None:
                        n='{}{}'.format(step, octave)
                    else:
                        n='{}{} {}'.format(step, octave, acc.text)
                else:
                    rest = note.find('rest')
                    assert rest is not None, "The note doesn't have a pitch and is not a rest!"
                    n='Rest'
                note = [m_idx, n_idx-grace_count, dur, n]
                notes.append(note)
    return np.array(notes)

def get_time_signatures(root):
    """Returns all time signatures in the score as a list of tupples.
    Assumes it is possible to have time change in makam pieces."""
    beats = [t.text for t in root.findall('part/measure/attributes/time/beats')]
    types = [t.text for t in root.findall('part/measure/attributes/time/beat-type')]
    all_time_signatures=[(int(b),int(t)) for b,t in zip(beats,types)]
    return all_time_signatures

def get_bpm(root):
    return float(root.find('part/measure/direction/sound').attrib['tempo'])

def get_divisions(root):
    return float(root.find('part/measure/attributes/divisions').text)

def find_key_signature_accidentals(root):
    notes, accidentals = [], []
    for k in root.iter('key'):
        for ks in k.findall('key-step'):
            notes.append(ks.text)
        for ka in k.findall('key-accidental'):
            accidentals.append(ka.text)
    return ['{} {}'.format(n,k) for n,k in zip(notes,accidentals)]

def find_all_accidentals(root):
    return set([a.text for a in root.iter('accidental')])

# 1) Find all xml files in the DATA_DIR

In [13]:
score_paths = glob.glob(DATA_DIR+'/*.xml')
print(f'There are {len(score_paths)} scores in the directory.')

There are 2200 scores in the directory.


# 2) Process

## A) Process 1 XML File

In [6]:
# Read one xml file to a tree structure
score_path=score_paths[0]
root=read_score(score_path)

In [7]:
# Get necessary information
makam,form,usul=get_makam_form_usul(root)
print(f'Makam: {makam}')
print(f'Form: {form}')
print(f'Usul: {usul}')
time_signatures=get_time_signatures(root)
print(f'Time Signature(s): {time_signatures}')
bpm=get_bpm(root)
print(f'BPM: {bpm}')
divs=get_divisions(root)
print(f'Divisions: {divs}')
key_signature_accidentals=find_key_signature_accidentals(root)
print(f'Accidental(s) in the key signature: {key_signature_accidentals}')
notes=parse_notes(root, record_embellishment=False)

Makam: Uşşak
Form: Şarkı
Usul: Aksak
Time Signature(s): [(9, 8)]
BPM: 60.0
Divisions: 96.0
Accidental(s) in the key signature: ['B quarter-flat']


In [8]:
print('Measure, Position, Duration, Name')
print('='*len('Measure, Position, Duration, Name'))
for n in notes:
    print(n)   

Measure, Position, Duration, Name
['0' '0' '96' 'Rest']
['0' '1' '72' 'G4']
['0' '2' '24' 'A4']
['0' '3' '96' 'A4']
['0' '4' '96' 'C5']
['0' '5' '24' 'C5']
['0' '6' '24' 'B4 quarter-flat']
['1' '0' '36' 'B4 quarter-flat']
['1' '1' '12' 'A4']
['1' '2' '24' 'G4']
['1' '3' '24' 'A4']
['1' '4' '24' 'B4 quarter-flat']
['1' '5' '24' 'A4']
['1' '6' '24' 'D5']
['1' '7' '24' 'C5']
['1' '8' '24' 'C5']
['1' '9' '24' 'B4 quarter-flat']
['1' '10' '24' 'B4 quarter-flat']
['1' '11' '24' 'A4']
['1' '12' '48' 'A4']
['1' '13' '48' 'G4']
['1' '14' '24' 'B4 quarter-flat']
['1' '15' '24' 'A4']
['2' '0' '96' 'A4']
['2' '1' '72' 'A4']
['2' '2' '24' 'F5']
['2' '3' '24' 'E5']
['2' '4' '24' 'D5']
['2' '5' '24' 'D5']
['2' '6' '24' 'C5']
['2' '7' '144' 'D5']
['3' '0' '48' 'D5']
['3' '1' '24' 'E5']
['3' '2' '24' 'F5 sharp']
['3' '3' '48' 'G5']
['3' '4' '24' 'F5 sharp']
['3' '5' '24' 'E5']
['3' '6' '48' 'A5']
['3' '7' '24' 'G5']
['3' '8' '24' 'F5']
['3' '9' '24' 'E5']
['3' '10' '24' 'D5']
['3' '11' '24' 'D5']
['3' 

## B) Process All XML Files

In [23]:
# Create a dictionary containing all the necessary information.
dataset=defaultdict(list)
for score_path in score_paths:
    root=read_score(score_path)
    makam,form,usul=get_makam_form_usul(root)
    composition_name,composer_name =get_composer_info(score_path)
    score_dict={'composition': composition_name,
                'composer': composer_name,
                'form': form,
                'usul': usul,
                'time_signatures': get_time_signatures(root),
                'bpm': 0,
                'divs': get_divisions(root),
                'key_signature_accidentals': find_key_signature_accidentals(root),
                'notes': parse_notes(root, record_embellishment=False)
                }
    dataset[makam].append(score_dict)
#print(list(dataset.keys()))    

In [27]:
for makam,dct in dataset.items():
    print(f'{makam}: {len(dct)}')

Uşşak: 118
Nikrîz: 25
Hüzzam: 96
Nihâvent: 130
Muhayyer: 67
Hicaz: 157
Sûzidilârâ: 7
Şerefnümâ: 1
Segâh: 92
Sûzinâk-Zirgüle: 26
Eviç: 34
Şevk-ı-Dil: 2
Rast: 109
Hicazkâr: 79
Tâhir: 31
Hisarbûselik: 20
Kürdîlihicazkâr: 70
Mâhur: 88
Karcığar: 53
Isfahân: 20
Sabâ: 66
Acemkürdî: 37
Dilnişîn: 1
Bûselik: 57
Yegâh: 22
Sultânîyegâh: 22
Beyâtî: 62
Rûy-i-Dilârâ: 1
Hüseynî: 92
Beste-Isfahan: 3
Büzürk: 2
Nihâvend-i-Rûmî: 1
Nişâburek: 26
Segâh-Mâye: 4
Pesendîde: 4
Müstear: 6
Tâhirbûselik: 3
Hicaz-Zirgüle: 7
Hicaz-Hümâyûn: 38
Rûy-i-Irâk: 2
Ferahfezâ: 17
Hicaz-Uzzâl: 12
Şevk'efzâ: 11
Sûzidil: 19
Nevâ: 21
Bestenigâr: 13
Acemaşîrân: 63
Şehnâzbûselik: 8
Kürdî: 14
Hicazaşîrân: 3
Çargâh: 2
Canfezâ: 2
Şehnâz: 25
Evcârâ: 9
Muhayyerkürdî: 32
Beyâtî-Arabân: 7
Sûzinâk: 19
Zâvil: 9
Nihâvend-i-Kebîr: 2
Ferahnümâ: 4
Şedarabân: 15
Dilkeşhâveran: 4
Irak: 5
Pençgâh-ı-Zâid: 2
Çargâh(Yeni): 7
Hüseynîaşîrân: 5
Acem: 7
Nev'eser: 8
Rast-ı-Cedîd: 1
Nühüft: 3
Müberka: 1
Gerdâniye: 26
Sabâ-Zemzeme: 3
Hicazkâr-Kürdî: 1
Zîref

In [30]:
# Remove makamlar without enough scores
THRESHOLD=50
subset={k:v for k,v in dataset.items() if len(v)>=THRESHOLD}
count=0
for makam,dct in subset.items():
    print(f'{makam}: {len(dct)}')
    count+=len(dct)
print(count)

Uşşak: 118
Hüzzam: 96
Nihâvent: 130
Muhayyer: 67
Hicaz: 157
Segâh: 92
Rast: 109
Hicazkâr: 79
Kürdîlihicazkâr: 70
Mâhur: 88
Karcığar: 53
Sabâ: 66
Bûselik: 57
Beyâtî: 62
Hüseynî: 92
Acemaşîrân: 63
1399


In [25]:
single_makam=dataset['Muhayyer']