In [5]:
import zipfile
from pathlib import Path
import partitura
import pandas as pd
import xml.etree.ElementTree as ET

In [5]:
# Extract every archive found in the "zips" folder into the "unzipped" folder.

for archive_path in Path("zips").iterdir():
    with zipfile.ZipFile(archive_path, mode='r') as archive:
        archive.extractall(path=Path("unzipped"))

In [7]:
# Report every unzipped folder that holds no file whose name starts with "TS".
for folder in Path("unzipped").iterdir():
    if any(entry.name.startswith("TS") for entry in folder.iterdir()):
        continue
    print("not a TS tree in ", folder)

In [None]:
# Verify that each unzipped folder holds exactly one score file and try to
# load it with partitura; loading failures are printed, not raised.
for folder in Path("unzipped").iterdir():
    msc_matches = []
    prefix_matches = []
    for entry in folder.iterdir():
        # Both tests are applied independently, so a file matching both
        # prefixes is counted twice.
        if entry.name.startswith("MSC"):
            msc_matches.append(entry)
        if entry.name.startswith(folder.name[:2]):
            prefix_matches.append(entry)
    assert len(msc_matches) + len(prefix_matches) == 1
    if len(msc_matches) == 1:
        score_file = msc_matches[0]
    else:
        score_file = prefix_matches[0]
    try:
        score = partitura.load_musicxml(str(score_file))
    except Exception as err:
        print("error loading ", score_file, err)

In [3]:
# build a pandas dataframe with all the scores paths
def get_score_path(folder):
    """Return, as a string, the path of the single score file in `folder`.

    A file counts as a score when its name starts with "MSC" or with the
    first two characters of the folder's own name. The two prefixes are
    checked independently and the total number of matches must be exactly
    one, otherwise the assertion fails.
    """
    folder_prefix = folder.name[:2]
    msc_matches = []
    prefix_matches = []
    for entry in folder.iterdir():
        if entry.name.startswith("MSC"):
            msc_matches.append(entry)
        if entry.name.startswith(folder_prefix):
            prefix_matches.append(entry)
    assert len(msc_matches) + len(prefix_matches) == 1
    if len(msc_matches) == 1:
        return str(msc_matches[0])
    return str(prefix_matches[0])

def get_ts_path(folder):
    """Return, as a string, the path of the single "TS"-prefixed file in `folder`.

    The assertion fails unless exactly one file name in the folder starts
    with "TS".
    """
    ts_matches = []
    for entry in folder.iterdir():
        if entry.name.startswith("TS"):
            ts_matches.append(entry)
    assert len(ts_matches) == 1
    (only_match,) = ts_matches
    return str(only_match)

# One (score path, TS path) row for every folder found under "data".
list_of_tuples = [
    (get_score_path(piece_dir), get_ts_path(piece_dir))
    for piece_dir in Path("data").iterdir()
]

df = pd.DataFrame(list_of_tuples, columns=['score', 'ts'])

In [4]:
# Display the DataFrame holding the score and TS file paths.
df

Unnamed: 0,score,ts
0,data\01\01_Waltz in E flat Grande Valse Brilla...,data\01\TS-01.xml
1,data\02\02_Moments Musicaux.xml,data\02\TS-02.xml
2,data\03\03_Bagatelle 'Fur Elise' WoO.59.xml,data\03\TS-03.xml
3,data\04\04_The Preludes Op.28 No.15.xml,data\04\TS-04.xml
4,data\05\05_Turkish March.xml,data\05\TS-05.xml
...,...,...
328,data\95\95_12 Variationen uber ein franzosisch...,data\95\TS-95.xml
329,data\96\96_Lieder ohne Worte Heft 5 Op.62-6 Fr...,data\96\TS-96.xml
330,data\97\97_Les Patineurs Op.183.xml,data\97\TS-97.xml
331,data\98\98_Sonate fur Klavier Nr.8 c moll Path...,data\98\TS-98.xml


# Parse tree annotations from the xml file

In [6]:
# Parse the TS annotation file of the first piece and keep its root element.
# The path is assembled with pathlib rather than written with literal
# backslashes, so the cell also finds the file on non-Windows systems.
tree = ET.parse(str(Path("data") / "01" / "TS-01.xml"))
root = tree.getroot()

In [7]:
# Peek at the top of the XML tree: the first child of the root, that child's
# first child, and every child of the latter.
first_child = next(iter(root), None)
if first_child is not None:
    print(first_child.tag, first_child.attrib)
    first_grandchild = next(iter(first_child), None)
    if first_grandchild is not None:
        print(first_grandchild.tag, first_grandchild.attrib)
        for great_grandchild in first_grandchild:
            print(great_grandchild.tag, great_grandchild.attrib)

ts {'timespan': '24.0', 'leftend': '0.0', 'rightend': '24.0'}
head {}
chord {'duration': '3.0', 'velocity': '90'}


In [28]:
# Attributes of the <ts> element found inside the <primary> child of the
# root's first <ts> element.
root.find("ts").find("primary").find("ts").attrib

{'timespan': '13.0', 'leftend': '11.0', 'rightend': '24.0'}

In [45]:
# def iterative_parse(xml_elem):
#     primary_children = xml_elem.find("ts").find("primary")
#     secondary_children = xml_elem.find("ts").find("secondary")
#     # recursion ending condition
#     if primary_children is None:
#         assert secondary_children is None
#         return xml_elem.find("ts").find("head").find("chord").find("note").attrib["id"]
#     else:
#         assert secondary_children is not None
#         return [iterative_parse(primary_children), iterative_parse(secondary_children)]


# iterative_parse(root)

In [46]:
def iterative_parse(xml_elem):
    """Recursively flatten a TS tree into a list of note-id dependency arcs.

    Parameters
    ----------
    xml_elem : xml.etree.ElementTree.Element
        Element that has a ``<ts>`` child. That ``<ts>`` either has both a
        ``<primary>`` and a ``<secondary>`` child (internal node) or neither
        (leaf). A leaf must contain ``head/chord/note`` with an ``id``
        attribute.

    Returns
    -------
    tuple
        ``(arcs, head_id)``. ``arcs`` is a list of ``(primary_head_id,
        secondary_head_id)`` pairs: first the arcs of the primary subtree,
        then those of the secondary subtree, then the arc of this node.
        ``head_id`` is the note id heading this node, i.e. the head of its
        primary branch (or the leaf's own note id).

    Raises
    ------
    AssertionError
        If a ``<ts>`` has only one of ``<primary>`` / ``<secondary>``.
    AttributeError
        If an expected child element is missing (``find`` returned ``None``).
    """
    ts_node = xml_elem.find("ts")
    primary_children = ts_node.find("primary")
    secondary_children = ts_node.find("secondary")

    if primary_children is None:  # recursion ending condition: leaf node
        assert secondary_children is None
        return [], ts_node.find("head").find("chord").find("note").attrib["id"]

    assert secondary_children is not None
    primary_arcs, primary_head = iterative_parse(primary_children)
    secondary_arcs, secondary_head = iterative_parse(secondary_children)

    # Merge the arcs computed deeper, then add the arc for the current node.
    out_list = [*primary_arcs, *secondary_arcs, (primary_head, secondary_head)]

    # The current node is headed by its primary branch. Reuse the id already
    # computed above: parsing the primary subtree a second time here would
    # double the work at every level of the tree.
    return out_list, primary_head


# Keep the dependency list of the whole tree; the second item returned is the
# id of the note heading the root.
arcs, root_head_id = iterative_parse(root)
arcs

[('P1-7-2', 'P1-7-3'),
 ('P1-7-4', 'P1-7-2'),
 ('P1-7-4', 'P1-7-1'),
 ('P1-8-1', 'P1-7-4'),
 ('P1-6-2', 'P1-6-3'),
 ('P1-6-4', 'P1-6-2'),
 ('P1-6-4', 'P1-6-1'),
 ('P1-5-2', 'P1-5-3'),
 ('P1-5-4', 'P1-5-2'),
 ('P1-4-2', 'P1-4-3'),
 ('P1-5-1', 'P1-4-2'),
 ('P1-5-4', 'P1-5-1'),
 ('P1-6-4', 'P1-5-4'),
 ('P1-8-1', 'P1-6-4'),
 ('P1-3-2', 'P1-3-3'),
 ('P1-3-1', 'P1-3-2'),
 ('P1-3-4', 'P1-3-1'),
 ('P1-4-1', 'P1-3-4'),
 ('P1-1-2', 'P1-1-3'),
 ('P1-1-1', 'P1-1-2'),
 ('P1-1-4', 'P1-1-1'),
 ('P1-2-2', 'P1-2-3'),
 ('P1-2-1', 'P1-2-2'),
 ('P1-2-4', 'P1-2-1'),
 ('P1-1-4', 'P1-2-4'),
 ('P1-4-1', 'P1-1-4'),
 ('P1-8-1', 'P1-4-1')]

In [47]:
import numpy as np

# Read the id of every <note> element anywhere below the root, then keep each
# distinct id once (np.unique also returns them sorted).
all_note_ids = [note.attrib['id'] for note in root.findall('.//note')]
note_ids = list(np.unique(all_note_ids))

In [43]:
# Convert the note ids and the parsed dependency pairs into the word / arc
# dictionaries of displaCy's manual input format.
spacy_words = [{"text": word, "tag": ""} for word in note_ids]

# The converted arcs get their own list. Rebinding `arcs` to an empty list
# here would make the loop below iterate over that empty list instead of the
# parsed dependencies, so no arc would ever be produced.
spacy_arcs = []
for start_id, end_id in arcs:
    start_ix = note_ids.index(start_id)
    end_ix = note_ids.index(end_id)
    # "start" is always the smaller position; "dir" records on which side the
    # second note of the pair lies.
    if start_ix < end_ix:
        spacy_arcs.append({"start": start_ix, "end": end_ix, "label": "", "dir": "right"})
    else:
        spacy_arcs.append({"start": end_ix, "end": start_ix, "label": "", "dir": "left"})


# Hand-written example of the dictionary layout: a list of word entries plus
# a list of arcs that refer to the words by position.
example_tokens = ["This", "is", "a", "sentence"]
example_links = [(0, 1, "right"), (2, 3, "left"), (1, 3, "right")]

spacy_dict = {
    "words": [{"text": token_text, "tag": ""} for token_text in example_tokens],
    "arcs": [
        {"start": first, "end": last, "label": "", "dir": direction}
        for first, last, direction in example_links
    ],
}

In [18]:
import spacy
from spacy import displacy

# Load the language model
nlp = spacy.load("en_core_web_sm")

sentence = 'Deemed universities charge huge fees'

# Calling nlp on the text returns a Doc; its tokens expose the dependency
# label, head and children used below.
doc = nlp(sentence)

row_format = "{:<15} | {:<8} | {:<15} | {:<20}"
print (row_format.format('Token','Relation','Head', 'Children'))
print ("-" * 70)

for token in doc:
    # One row per token: text, dependency label, head text, direct children.
    children_repr = str(list(token.children))
    print (row_format.format(str(token.text), str(token.dep_), str(token.head.text), children_repr))

# Draw the dependency parse inline with displaCy.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})

Token           | Relation | Head            | Children            
----------------------------------------------------------------------
Deemed          | amod     | universities    | []                  
universities    | nsubj    | charge          | [Deemed]            
charge          | ROOT     | charge          | [universities, fees]
huge            | amod     | fees            | []                  
fees            | dobj     | charge          | [huge]              


In [22]:
# Collect the Doc's tokens in a plain list, then show the Doc's type.
l = list(doc)
type(doc)

spacy.tokens.doc.Doc

In [31]:
# Minimal hand-made parse in the dictionary layout accepted by displaCy when
# called with manual=True.
obj = dict(
    words=[dict(text=text, tag="") for text in ("This", "is", "a", "sentence")],
    arcs=[
        dict(start=0, end=1, label="", dir="right"),
        dict(start=2, end=3, label="", dir="left"),
        dict(start=1, end=3, label="", dir="right"),
    ],
)

displacy.render(obj, style='dep', manual=True, jupyter=True, options={'distance': 120})