# Working with BatchXSLT

This is a tutorial for the Python wrapper batchxslt. This package is designed to transform language resources from the DGD metafile XML format to the CLARIN metadata format using XSL stylesheets and Saxon. I wrote this wrapper to make corpus transformation less painful and more extensible. Though processing with lxml is easy, I decided to leave the transformation to the XSLT processor and its management to batchxslt, so users not familiar with lxml can stick to XSLT and rely on this tutorial for their needs.


## 1. Transform metadata from dgd to cmdi format

* define resource locators (corpus, event, speakers directories and saxon and xsl directories)
* define an output directory 

In [1]:
# using absolute paths
import os

# Input directories with the DGD metadata files, one per resource type.
corpus_dir = "/home/kuhn/Data/IDS/svn_rev1233/dgd2_data/metadata/corpora/extern"
event_dir = "/home/kuhn/Data/IDS/svn_rev1233/dgd2_data/metadata/events/extern"
speakers_dir = "/home/kuhn/Data/IDS/svn_rev1233/dgd2_data/metadata/speakers/extern"

# Stylesheets, one per resource type. os.path.join keeps the result correct
# whether or not xsl_dir ends with a path separator.
xsl_dir = "/home/kuhn/Data/IDS/svn/dgd2_data/dgd2cmdi/xslt/"
corpus_xsl = os.path.join(xsl_dir, "dgdCorpus2cmdi.xsl")
event_xsl = os.path.join(xsl_dir, "dgdEvent2cmdi.xsl")
speaker_xsl = os.path.join(xsl_dir, "dgdSpeaker2cmdi.xsl")

# Saxon jar handed to XSLBatchProcessor.
saxon_jar = "/home/kuhn/Data/IDS/svn/dgd2_data/dgd2cmdi/batchxslt/saxon/saxon9he.jar"

# Output directories, one per resource type.
out_corp = "/tmp/cmdi2/corpus"
out_event = "/tmp/cmdi2/events"
out_speaker = "/tmp/cmdi2/speakers"

In [2]:
# import the xsl wrapper
from batchxslt import processor

In [3]:
xsl_processor = processor.XSLBatchProcessor(saxon_jar)

In [4]:
# show me the api doc of the start method
xsl_processor.start?

In [4]:
%time xsl_processor.start(corpus_xsl, corpus_dir, "cmdi_", out_corp)

stylesheet: /home/kuhn/Data/IDS/svn/dgd2_data/dgd2cmdi/xslt/dgdCorpus2cmdi.xsl
outputdir not readable: /tmp/cmdi2/corpus
xmldata: /home/kuhn/Data/IDS/svn_rev1233/dgd2_data/metadata/corpora/extern
CPU times: user 0 ns, sys: 19.7 ms, total: 19.7 ms
Wall time: 15.5 s


In [5]:
%time xsl_processor.start(event_xsl, event_dir, "cmdi_", out_event)

stylesheet: /home/kuhn/Data/IDS/svn/dgd2_data/dgd2cmdi/xslt/dgdEvent2cmdi.xsl
outputdir not readable: /tmp/cmdi2/events
xmldata: /home/kuhn/Data/IDS/svn_rev1233/dgd2_data/metadata/events/extern
cannot create directory /tmp/cmdi2/events/SR
Maybe it already exists...
CPU times: user 516 ms, sys: 5.15 s, total: 5.67 s
Wall time: 2h 29min 58s


In [6]:
%time xsl_processor.start(speaker_xsl, speakers_dir, "cmdi_", out_speaker)

stylesheet: /home/kuhn/Data/IDS/svn/dgd2_data/dgd2cmdi/xslt/dgdSpeaker2cmdi.xsl
outputdir not readable: /tmp/cmdi2/speakers
xmldata: /home/kuhn/Data/IDS/svn_rev1233/dgd2_data/metadata/speakers/extern
cannot create directory /tmp/cmdi2/speakers/SR
Maybe it already exists...
CPU times: user 607 ms, sys: 6.13 s, total: 6.74 s
Wall time: 2h 16min 53s


## 2. Defining Resource Proxies

Once the original metadata files have been transformed to CMDI, we can go on to build a resource tree structure from them. For this purpose, we can use the module **cmdiresource**.  

In [3]:
from batchxslt import cmdiresource
import os
import logging

We define the paths to our recently transformed data

In [4]:
# Directories holding the CMDI files written by the transformation step
# (same locations as the out_* directories used there).
corpus = "/tmp/cmdi2/corpus/"
event = "/tmp/cmdi2/events/"
speakers = "/tmp/cmdi2/speakers/"
# Transcript directory; passed as the fourth argument to ResourceTreeCollection.
transcripts = "/home/kuhn/Data/IDS/svn_rev1233/dgd2_data/transcripts/"

# Root directory under which the final per-corpus .cmdi files are written.
cmdi_final = '/home/kuhn/Data/IDS/cmdi2/'

Now define a ResourceTreeCollection instance.

In [5]:
resourcetree = cmdiresource.ResourceTreeCollection(corpus, event, speakers, transcripts)

In [4]:
# Give every node an 'id' attribute of the form <corpus label>_<running index>.
counter = 0
for node in resourcetree.nodes_iter():
    # Corpus label: the part of the node name before the first '_', with any
    # trailing '-' removed (e.g. 'ZW--_S_02073' -> 'ZW').
    name_parts = node.split('_')
    corpuslabel = name_parts[0].rstrip('-')
    node_attributes = resourcetree.node.get(node)
    node_attributes['id'] = corpuslabel + '_' + str(counter)
    counter += 1

ResourceTreeCollection inherits from networkx.DiGraph and builds up a resource tree for all resources of the dgd2.
Let's look at a random resource node.

In [6]:
resourcetree.build_resourceproxy()

AttributeError: 'NoneType' object has no attribute 'getroot'

In [20]:
resourcetree.node.get()

{'corpusroot': False,
 'etreeobject': False,
 'filename': '0',
 'id': '0_5298',
 'repopath': None,
 'type': 'audio'}

In [5]:
# Call define_parts on every event node and every corpus node; nodes of any
# other type are skipped.
for nodename in resourcetree.nodes_iter():
    if resourcetree.node.get(nodename).get('type') in ('event', 'corpus'):
        resourcetree.define_parts(nodename)

In [6]:
# Call speaker2event on every node whose type is 'speaker'.
for nodename in resourcetree.nodes_iter():
    node_type = resourcetree.node.get(nodename).get('type')
    if node_type != 'speaker':
        continue
    resourcetree.speaker2event(nodename)

In [None]:
# Write one .cmdi file per event node and per corpus node to
# <cmdi_final>/<corpus label>/<node name>.cmdi.
# NOTE(review): cmdiheader is not imported in any cell shown in this
# notebook -- it presumably needs an import before this cell runs; confirm
# its module path.
for nodename in resourcetree.nodes_iter():
    # Events and corpora are handled identically; all other node types are
    # skipped.
    if resourcetree.node.get(nodename).get('type') not in ('event', 'corpus'):
        continue

    cmdiheader.define_header(nodename, resourcetree)

    # Named corpus_label rather than corpus so that the `corpus` input path
    # defined earlier in the notebook is not overwritten by this loop.
    corpus_label = nodename.split('_')[0].rstrip('-')
    target_dir = os.path.abspath(os.path.join(cmdi_final, corpus_label))

    # Many nodes share one corpus directory: create it only when it is
    # missing, and log only genuine failures instead of every repeat visit.
    if not os.path.isdir(target_dir):
        try:
            os.mkdir(target_dir)
        except OSError:
            logging.error('cannot create directory: ' + corpus_label)

    resourcetree.write_cmdi(
        nodename,
        os.path.join(cmdi_final, corpus_label, nodename + '.cmdi'))

[('FOLK', 'FOLK_E_00001')]

[('FOLK_E_00001', 'FOLK_E_00001_SE_01_T_01_DF_01'),
 ('FOLK_E_00001', 'FOLK_E_00001_SE_01_T_02_DF_01'),
 ('FOLK_E_00001', 'FOLK_S_00001'),
 ('FOLK_E_00001', 'FOLK_S_00003'),
 ('FOLK_E_00001', 'FOLK_S_00002'),
 ('FOLK_E_00001', 'FOLK_S_00005'),
 ('FOLK_E_00001', 'FOLK_S_00004'),
 ('FOLK_E_00001', 'FOLK_S_00007'),
 ('FOLK_E_00001', 'FOLK_S_00006'),
 ('FOLK_E_00001', 'FOLK_S_00023'),
 ('FOLK_E_00001', 'FOLK_S_00008'),
 ('FOLK_E_00001', 'FOLK_S_00021'),
 ('FOLK_E_00001', 'FOLK_S_00020'),
 ('FOLK_E_00001', 'FOLK_S_00026'),
 ('FOLK_E_00001', 'FOLK_S_00024'),
 ('FOLK_E_00001', 'FOLK_S_00009'),
 ('FOLK_E_00001', 'FOLK_S_00022'),
 ('FOLK_E_00001', 'FOLK_S_00016'),
 ('FOLK_E_00001', 'FOLK_S_00017'),
 ('FOLK_E_00001', 'FOLK_S_00014'),
 ('FOLK_E_00001', 'FOLK_S_00015'),
 ('FOLK_E_00001', 'FOLK_S_00012'),
 ('FOLK_E_00001', 'FOLK_S_00013'),
 ('FOLK_E_00001', 'FOLK_S_00010'),
 ('FOLK_E_00001', 'FOLK_S_00011'),
 ('FOLK_E_00001', 'FOLK_S_00018'),
 ('FOLK_E_00001', 'FOLK_S_00019')]

In [5]:
resourcetree.find_speakers('FOLK_E_00022')



['FOLK_S_00046',
 'FOLK_S_00047',
 'FOLK_S_00048',
 'FOLK_S_00050',
 'FOLK_S_00051']

In [18]:
# now build up the resource proxy tree and put it into the etree 

In [21]:
resourcetree.build_resourceproxy()

In [22]:
resourcetree[9]

KeyError: 9

In [24]:
resource = [i for i in resourcetree.node.iteritems()]

In [25]:
resource[2][1].get("etreeobject")

<lxml.etree._ElementTree at 0x7f451f710b90>

In [24]:
x = [i for i in resourcetree.nodes_iter()]

In [25]:
x[0]

'ZW--_S_02073'

In [6]:
resourcetree.out_edges('FOLK_E_00001')

[('FOLK_E_00001', 'FOLK_E_00001_SE_01_T_01_DF_01'),
 ('FOLK_E_00001', 'FOLK_E_00001_SE_01_T_02_DF_01'),
 ('FOLK_E_00001', 'FOLK_S_00001'),
 ('FOLK_E_00001', 'FOLK_S_00003'),
 ('FOLK_E_00001', 'FOLK_S_00002'),
 ('FOLK_E_00001', 'FOLK_S_00005'),
 ('FOLK_E_00001', 'FOLK_S_00004'),
 ('FOLK_E_00001', 'FOLK_S_00007'),
 ('FOLK_E_00001', 'FOLK_S_00006'),
 ('FOLK_E_00001', 'FOLK_S_00009'),
 ('FOLK_E_00001', 'FOLK_S_00008'),
 ('FOLK_E_00001', 'FOLK_S_00021'),
 ('FOLK_E_00001', 'FOLK_S_00020'),
 ('FOLK_E_00001', 'FOLK_E_00001_SE_01_A_01_DF_01'),
 ('FOLK_E_00001', 'FOLK_S_00024'),
 ('FOLK_E_00001', 'FOLK_S_00023'),
 ('FOLK_E_00001', 'FOLK_S_00022'),
 ('FOLK_E_00001', 'FOLK_S_00026'),
 ('FOLK_E_00001', 'FOLK_S_00016'),
 ('FOLK_E_00001', 'FOLK_S_00017'),
 ('FOLK_E_00001', 'FOLK_S_00014'),
 ('FOLK_E_00001', 'FOLK_S_00015'),
 ('FOLK_E_00001', 'FOLK_S_00012'),
 ('FOLK_E_00001', 'FOLK_S_00013'),
 ('FOLK_E_00001', 'FOLK_S_00010'),
 ('FOLK_E_00001', 'FOLK_S_00011'),
 ('FOLK_E_00001', 'FOLK_S_00018'),
 ('F

In [28]:
resourcetree.in_edges('FOLK_S_00001')

[('FOLK_E_00009', 'FOLK_S_00001'),
 ('FOLK_E_00008', 'FOLK_S_00001'),
 ('FOLK_E_00005', 'FOLK_S_00001'),
 ('FOLK_E_00001', 'FOLK_S_00001'),
 ('FOLK_E_00007', 'FOLK_S_00001'),
 ('FOLK_E_00006', 'FOLK_S_00001'),
 ('FOLK', 'FOLK_S_00001'),
 ('FOLK_E_00004', 'FOLK_S_00001')]

In [31]:
in_edges = [i[0] for i in resourcetree.in_edges('FOLK_S_00001')]

In [32]:
in_edges

['FOLK_E_00009',
 'FOLK_E_00008',
 'FOLK_E_00005',
 'FOLK_E_00001',
 'FOLK_E_00007',
 'FOLK_E_00006',
 'FOLK',
 'FOLK_E_00004']

In [33]:
out_edges = [i[1] for i in resourcetree.out_edges('FOLK_S_00001')]

In [35]:
out_edges

[]

In [36]:
in_edges

['FOLK_E_00009',
 'FOLK_E_00008',
 'FOLK_E_00005',
 'FOLK_E_00001',
 'FOLK_E_00007',
 'FOLK_E_00006',
 'FOLK',
 'FOLK_E_00004']

In [37]:
in_edges.extend(out_edges)

In [38]:
in_edges

['FOLK_E_00009',
 'FOLK_E_00008',
 'FOLK_E_00005',
 'FOLK_E_00001',
 'FOLK_E_00007',
 'FOLK_E_00006',
 'FOLK',
 'FOLK_E_00004']

In [39]:
set(in_edges)

{'FOLK',
 'FOLK_E_00001',
 'FOLK_E_00004',
 'FOLK_E_00005',
 'FOLK_E_00006',
 'FOLK_E_00007',
 'FOLK_E_00008',
 'FOLK_E_00009'}

In [40]:
out_edges.extend(in_edges)

In [41]:
out_edges

['FOLK_E_00009',
 'FOLK_E_00008',
 'FOLK_E_00005',
 'FOLK_E_00001',
 'FOLK_E_00007',
 'FOLK_E_00006',
 'FOLK',
 'FOLK_E_00004']

In [45]:
resourcetree.nbunch_iter?

In [46]:
resourcetree.add_star?

In [48]:
resourcetree.adj?

In [5]:
# Materialise all node names of the graph into a list so they can be counted
# and indexed in the following cells.
tuples = list(resourcetree.nodes_iter())

In [6]:
len(tuples)

20777

In [7]:
tuples[1]

'ZW--_S_02072'

In [8]:
tuples[0]

'ZW--_S_02073'

In [5]:
resourcetree.build_resourceproxy()

TypeError: Argument must be bytes or unicode, got 'NoneType'

In [5]:
e = None

In [8]:
list(e)

TypeError: 'NoneType' object is not iterable

In [4]:
e = []

In [5]:
set(e)

set()

In [5]:
resourcetree.node.get('FOLK_E_00009').get('etreeobject')

<lxml.etree._ElementTree at 0x7f7427905a28>

In [8]:
# Count the graph nodes whose key is None.
notcount = 0
for node in resourcetree.nodes_iter():
    notcount += 1 if node is None else 0

In [9]:
notcount

0

In [4]:
resourcetree.define_resourceproxy('FOLK_E_00009')

TypeError: Argument must be bytes or unicode, got 'tuple'

In [26]:
speakers = resourcetree.find_speakers('FOLK_E_00009')
events = resourcetree.find_events('FOLK_E_00009')

In [27]:
speakers + events

['FOLK_S_00001',
 'FOLK_S_00003',
 'FOLK_S_00004',
 'FOLK_S_00005',
 'FOLK_S_00006',
 'FOLK_S_00007',
 'FOLK_S_00008',
 'FOLK_S_00009',
 'FOLK_S_00010',
 'FOLK_S_00011',
 'FOLK_S_00012',
 'FOLK_S_00013',
 'FOLK_S_00014',
 'FOLK_S_00015',
 'FOLK_S_00016',
 'FOLK_S_00017',
 'FOLK_S_00018',
 'FOLK_S_00019',
 'FOLK_S_00020',
 'FOLK_S_00021',
 'FOLK_S_00022',
 'FOLK_S_00023',
 'FOLK_S_00024',
 'FOLK_S_00026']

In [12]:
events

[]

In [18]:
joint = list.extend?

In [37]:
joint = speakers + events

TypeError: cannot concatenate 'str' and 'list' objects

In [21]:
joint

['FOLK_S_00001',
 'FOLK_S_00003',
 'FOLK_S_00004',
 'FOLK_S_00005',
 'FOLK_S_00006',
 'FOLK_S_00007',
 'FOLK_S_00008',
 'FOLK_S_00009',
 'FOLK_S_00010',
 'FOLK_S_00011',
 'FOLK_S_00012',
 'FOLK_S_00013',
 'FOLK_S_00014',
 'FOLK_S_00015',
 'FOLK_S_00016',
 'FOLK_S_00017',
 'FOLK_S_00018',
 'FOLK_S_00019',
 'FOLK_S_00020',
 'FOLK_S_00021',
 'FOLK_S_00022',
 'FOLK_S_00023',
 'FOLK_S_00024',
 'FOLK_S_00026']

In [7]:
resourcetree.node.get('FOLK').get("filename")

'cmdi_FOLK_extern.xml'

In [None]:
resourcetree.