# Exporting PAGE XML files from a Transkribus collection

In [None]:
import json
import re

# Load the issue description from ./issue-parser-result.json.
with open('./issue-parser-result.json', encoding='utf-8') as issue_json:
    issue_desc = json.load(issue_json)

to_export_transkribus = issue_desc['document-id']

# The collection ID is the first parenthesised run of word characters in the
# 'source-collection' value; capture group 1 yields it without the brackets.
collection_match = re.search(r"\((\w+)\)", issue_desc['source-collection'])
if collection_match is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError(
        "No collection ID in parentheses found in 'source-collection': "
        f"{issue_desc['source-collection']!r}"
    )
collectionId = collection_match.group(1)

print(to_export_transkribus)
print(collectionId)

## Setup

In [None]:
# Short aliases for the collection and document IDs used by the REST calls below.
collId, docId = collectionId, to_export_transkribus

In [None]:
!pip install lxml_html_clean
!pip install lxml[html_clean]
!pip install requests-toolbelt

import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder
import os
from lxml import etree
import json

In [None]:
# NOTE(review): `secretsPath` is not defined in this cell; presumably it is
# injected by an earlier/parameters cell - confirm.
if secretsPath:
    # Copy every key/value pair of the secrets file into the process environment.
    with open(secretsPath, 'r') as secretsFile:
        secrets = json.load(secretsFile)
    os.environ.update(secrets)

# TRANSKRIBUS_CREDENTIALS holds a JSON object that is sent as the login form data.
creds = json.loads(os.environ["TRANSKRIBUS_CREDENTIALS"])

# The session keeps whatever cookies the login sets for the later requests.
s = requests.Session()
login_response = s.post('https://transkribus.eu/TrpServer/rest/auth/login', data=creds)
# Stop here on a failed login instead of failing later with a confusing error.
login_response.raise_for_status()
login_response

## Initialisation

<!-- Does it make sense to get the METS files, too?

http://transkribus.eu/TrpServer/rest/collections/{collId}/{id}/mets

-->

In [None]:
# Fetch the page list of the document from the Transkribus REST API.
docs = s.get(f'https://transkribus.eu/TrpServer/rest/collections/{collId}/{docId}/pages')
docs.raise_for_status()
# Do not bind the result to the name `json`: that would shadow the json module
# and break every later (or re-run) json.loads / json.load call.
pages = json.loads(docs.content)

# One entry per page: our image file name plus the URL of the newest transcript.
transcript_urls = []

for page in pages:
    # Pick the transcript with the highest timestamp; None if the page has none.
    most_recent_transcript = max(
        page['tsList']['transcripts'],
        key=lambda transcript: transcript['timestamp'],
        default=None,
    )

    # Append once per page, after all transcripts were compared. Appending inside
    # the comparison loop would add the same page several times.
    if most_recent_transcript is not None:
        transcript_urls.append({
            'imgFileName': page['imgFileName'],
            'transcript_url': most_recent_transcript['url']
        })

transcript_urls

In [None]:
import requests
import xml.etree.ElementTree as ET

# transcript_urls defined above

# Register the PAGE namespace as the default namespace once, so serialised
# elements are written without a generated prefix.
PAGE_NS = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
ET.register_namespace("", PAGE_NS)

xml_files = []

for entry in transcript_urls:
    response = requests.get(entry['transcript_url'])
    if response.status_code != 200:
        # Report pages that could not be downloaded instead of dropping them silently.
        print(f"Warning: could not download transcript for {entry['imgFileName']} "
              f"(HTTP {response.status_code}); page skipped")
        continue

    # Parse the raw bytes so the XML parser applies the encoding named in the
    # XML declaration rather than the charset guessed from the HTTP headers.
    root = ET.fromstring(response.content)

    # Fix image renaming by Transkribus: use our name instead of the Transkribus internal name.
    page_element = root.find(f'.//{{{PAGE_NS}}}Page')
    if page_element is not None:
        current_imageFilename = page_element.get('imageFilename')
        if current_imageFilename and current_imageFilename != entry['imgFileName']:
            page_element.set('imageFilename', entry['imgFileName'])

    xml_files.append(ET.tostring(root, encoding='unicode'))

# Concatenate all page documents and wrap them in a single root element that
# carries the xml-model reference to the Schematron schema.
xml_files_all = "".join(xml_files)

validation_payload = (
    '<?xml version="1.0" encoding="UTF-8"?><?xml-model href="dse-as-transkribus.sch" type="application/xml" schematypens="http://purl.oclc.org/dsdl/schematron"?>\n<validation-wrapper>'
    + xml_files_all
    + '</validation-wrapper>\n'
)

# Print and save the XML string
print(validation_payload)

with open("./validation_input.xml", "w", encoding="utf-8") as f:
    f.write(validation_payload)

# Set up pyschematron

In [None]:
!pip install pyschematron

from pathlib import Path

from lxml import etree

from pyschematron import DirectModeSchematronValidatorFactory, validate_document
from pyschematron.direct_mode.schematron.ast_visitors import ResolveExtendsVisitor, ResolveAbstractPatternsVisitor, PhaseSelectionVisitor
from pyschematron.direct_mode.schematron.parsers.xml.parser import SchemaParser, ParsingContext
from pyschematron.direct_mode.xml_validation.results.svrl_builder import DefaultSVRLReportBuilder
from pyschematron.direct_mode.xml_validation.validators import SimpleSchematronXMLValidator
from pyschematron.utils import load_xml_document

# Note that it is possible to include custom Python functions (see the pyschematron repo); however, this led to buggy evaluation
# of is_valid() in cases where the functions did not apply (or when they were deactivated).

In [None]:

# Relative locations of the Schematron schema and of the XML document to validate.
schematron_base_path = Path('.')
schematron_schema_path = schematron_base_path.joinpath('transkribus-export', 'dse-as-transkribus.sch')
xml_document_path = schematron_base_path.joinpath('validation_input.xml')


def pyschematron_functional_interface():
    """Validate the XML document against the Schematron schema and store the outcome.

    Uses PySchematron's functional interface (``validate_document``). The
    pretty-printed SVRL report is written to ``./validation_output-svrl.xml``
    and the string form of the validity flag to ``./validation_is_valid.txt``;
    both are also printed.
    """
    outcome = validate_document(xml_document_path, schematron_schema_path)

    # Serialise the SVRL report tree to text.
    svrl_text = etree.tostring(outcome.get_svrl(), pretty_print=True).decode('utf-8')
    with open("./validation_output-svrl.xml", "w", encoding="utf-8") as svrl_file:
        svrl_file.write(svrl_text)

    # Store the overall verdict as text in its own file.
    verdict_text = str(outcome.is_valid())
    with open("./validation_is_valid.txt", "w", encoding="utf-8") as verdict_file:
        verdict_file.write(verdict_text)

    print(svrl_text)
    print(outcome.is_valid())

# Run the validation: writes the SVRL report and the validity flag to disk and prints both.
pyschematron_functional_interface()


In [None]:
from lxml import etree

# Stylesheet that turns the SVRL report into a markdown bullet list: one bullet
# per failed assert / successful report, framed by a heading and a closing hint.
xslt_str = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  xmlns:svrl="http://purl.oclc.org/dsdl/svrl">
  <xsl:output method="text" omit-xml-declaration="yes" indent="no"/>
  <xsl:template match="/">

    <xsl:text>&#xA;</xsl:text>
    <xsl:text>## Problems detected:&#xA;&#xA;</xsl:text>

    <xsl:apply-templates select="//svrl:failed-assert"/>
    <xsl:apply-templates select="//svrl:successful-report"/>

    <xsl:text>&#xA;**Try to resolve the problems in Transkribus and re-run the validation by replying to this issue or opening a new issue.**</xsl:text>

  </xsl:template>

  <xsl:template match="svrl:failed-assert|svrl:successful-report">
    <xsl:value-of select="concat('* ',svrl:text,'&#xA;')"/>
  </xsl:template>

</xsl:stylesheet>
"""

# Load the SVRL report written by the validation step.
xml_data = etree.parse('./validation_output-svrl.xml')

# Compile the stylesheet (parsed from bytes) into a callable transform.
xslt_data = etree.fromstring(xslt_str.encode('utf-8'))
transform = etree.XSLT(xslt_data)

# Run the transformation and show the resulting text.
result = transform(xml_data)
print(result)

# Persist the text to ./issue-reply.txt.
with open("./issue-reply.txt", "w", encoding="utf-8") as reply_file:
    reply_file.write(str(result))