Skip to content

Commit

Permalink
Old fix for tei:colloc formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
humenda committed Nov 21, 2017
1 parent fdd9c4b commit 0a125a0
Show file tree
Hide file tree
Showing 2 changed files with 242 additions and 1 deletion.
241 changes: 241 additions & 0 deletions buildhelpers/changelog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
#!/usr/bin/env python3
"""Releasing a bunch of dictionaries can be tedious because of the change log
editing, copyright adjusting and more. This script helps to automate this
process as much as possible.
Please see the usage from parse_args() for more details."""

# To Do: the classes in this file are a *fork* of rm_duplicates.py. We
# ought to find a way to properly ship this as a module.

import datetime
import io
import os
import re
import shutil
import sys
import xml.etree.ElementTree as ET

# TEI name space, LXML's parser doesn't handle them nicely
TEI_NS = '{http://www.tei-c.org/ns/1.0}'
# findall/iter with TEI namespace removed
findall = lambda x,y: x.findall(TEI_NS + y)
tei_iter = lambda x,y: x.iter(TEI_NS + y)


class CommentedTreeBuilder(ET.TreeBuilder):
"""A TreeBuilder subclass that retains XML comments from the source. It can
be treated as a black box saving the contents before and after the root tag,
so that it can be re-added when writing back a XML ElementTree to disk. This
is necessary because of lxml/ElementTree's inability to handle declarations
nicely."""
def comment(self, data):
self.start(ET.Comment, {})
self.data(data)
self.end(ET.Comment)



#pylint: disable=too-few-public-methods
class XmlParserWrapper:
"""This thin wrapper guards the parsing process. It manually finds the TEI
element and copies everything before and afterwards *verbatim*. This is due
to the inability of the ElementTree parser to handle multiple "root
elements", for instance comments before or after the root node or '<!'
declarations.
"""
def __init__(self, file_name):
with open(file_name, encoding='utf-8') as file:
content = file.read()
if not any(u in content for u in ('utf-8', 'utf8', 'UTF8', 'UTF-8')):
raise ValueError("XML file is not encoded in UTF-8. Please recode "
"the file or extend this parser and XML writer.")
tei_start = content.find('<TEI')
if tei_start < 0:
raise ValueError("Couldn't find string `<TEI` in the XML file. Please extend this parser.")
self.before_root = content[:tei_start]
content = content[tei_start:]
tei_end = content.find('</TEI>')
if tei_end < 0:
raise ValueError("Couldn't find `</TEI>` in the input file, please extend the parser.")
tei_end += len('</TEI>')
self.after_root = content[tei_end:]
content = content[:tei_end]
parser = ET.XMLParser(target = CommentedTreeBuilder())
parser.feed(content)
self.root = parser.close()

def write(self, file_name):
"""Write the XML element tree to a file, with hopefully a very similar
formatting as before."""
tree = ET.ElementTree(self.root)
in_mem = io.BytesIO()
tree.write(in_mem, encoding="UTF-8")
in_mem.seek(0)
with open(file_name, 'wb') as file:
file.write(self.before_root.encode('UTF-8'))
file.write(in_mem.read())
file.write(self.after_root.encode('UTF-8'))
if not self.after_root.endswith('\n'):
file.write(b'\n')

def get_spacing(element):
"""This function returns a tuple with the mnemonics (linebreak, indentation)
of this element. If a the node is indented with four spaces, the tuple will
be ('\\n', ' ') and if there's no whitespace between the nodes, it'll be
('', '')."""
spaces = re.search('^\\n*( +|\\t+)', element.text)
return (('\n' if element.text.startswith('\n') else ''),
(spaces.groups()[0] if spaces else ''))

def get_editor():
"""Detect an editor to use. Try to use $EDITOR or probe for a bunch of
more common ones."""
editor = None
if 'EDITOR' in os.environ:
editor = os.environ['EDITOR']
if not editor:
for name in ('vim', 'vi', 'nvim', 'emacs', 'nano', 'edit', 'notepad',
'gedit', 'pluma', 'kate'): # a few more popular ones
if shutil.which(name):
editor = name
break
if not editor:
print(("Failed to detect an editor. Please set the EDITOR environment "
"variable to a program in your PATH."))
sys.exit(1)
else:
return editor


def add_changelog_entry(revision_desc, edition, date, username, author=None):
"""Try to detect an text editor, open it and add the written content to a
new change tag within the supplied revision_desc."""
editor = get_editor()
fn = 'changelog.tmp'
with open(fn, 'w', encoding='UTF-8') as f:
f.write("""\n
# Please enter your change notes as you would enter them in a </change> tag in
# the revisionDesc tag of a TEI header. Empty lines will be ignored and lines
# starting with a hash `#` will be ignored.
# Valid formatting include either plain text or lists like
# `<list><item>blah</item></list>`. Please note that you need to take care of
# escaping yourself.
""")
ret = os.system('%s %s' % (editor, fn))
if ret:
print("Error while starting", editor)
sys.exit(2)
with open(fn, encoding='UTF-8') as f:
data = '\n'.join(l.rstrip() for l in f
if l.strip() and not l.lstrip().startswith('#')) + '\n'
os.remove(fn)
change = ET.Element('change')
change.attrib['when'] = date
change.attrib['who'] = username
change.attrib['n'] = edition
if author:
author_node = ET.Element('name')
author_node.text = author
change.append(author)
lbreak, spaces = get_spacing(revision_desc)
try:
child = ET.fromstring(data)
change.text = lbreak + spaces
change.append(child)
except ET.ParseError: # treat it as plain text
change.text = data + spaces
change.tail = lbreak + spaces
revision_desc.insert(0, change)

def update_date(root, date):
"""Find publicationStmt/date, update it."""
try:
publication = next(tei_iter(root, 'publicationStmt'))
except StopIteration:
print("File lacks a publicationStmt node, possibly malformed.")
sys.exit(3)
try:
date_node = next(tei_iter(publication, 'date'))
except StopIteration:
return
date_node.attrib['when'] = date
date_node.text = datetime.datetime.strptime(date, '%Y-%m-%d').strftime('%b %d, %Y')

def update_extent(root):
try:
extent = next(tei_iter(root, 'extent'))
except StopIteration:
print("Could not find extent tag, possibly malformed file.")
sys.exit(3)
extent.text = '%i headwords' % sum(1 for _ in tei_iter(root, 'entry'))


def update_copyright(root):
"""Find a stanza containing "(c) 2014-2017 xyz" or "© 2020 foo" to update
the year."""
try:
availability = next(tei_iter(root, 'availability'))
except StopIteration:
print("Unable to find availability tag, possibly malformed TEI file.")
sys.exit(3)
copyright = re.compile(r'(©|\([cC]\))\s*([0-9]{4})(?:-)([0-9]{4})')
for tag in availability.iter():
match = copyright.search(tag.text)
if not match:
continue
start, end = match.span()
year = match.groups()[1]
if match.groups()[2]:
year = match.groups()[2]
tag.text = tag.text[:start] + tag.text[start:end].replace(year,
datetime.datetime.now().strftime('%Y')) + \
tag.text[end:]
return # only update first copyright stanza, hopefully most current one

def parse_args():
"""Custom argument parser, parsing command lines like './foo E=bar baz`
(optional values are key-value pairs)."""
def usage(msg):
print(msg)
print("Usage: %s [E=<EDITION>] <FILE>" % sys.argv[0])
print("\nThis script assists in changing a TEI header for release. It does the following:")
print("- update the edition")
print("- updates the release date")
print("- adds a new changelog entry")
print("- update the year in the availability / copyright section, if appropriate")
print("\nExample: %s E=1.5.2 lat-deu.tei" % sys.argv[0])
sys.exit(0)
if len(sys.argv) < 2 or (sys.argv[1] in ('-h', '--help')):
usage("Error, at least two arguments expected.")
optional = dict(a.split('=') for a in sys.argv[1:] if '=' in a)
mandatory = list(a for a in sys.argv[1:] if '=' not in a)

known_keys = ['E']
invalid = tuple(k for k in optional if k not in known_keys)
if invalid:
usage("Illegal arguments: %s" % repr(invalid))
if len(mandatory) != 1:
usage("Error, exactly one input file required.")
return (optional, mandatory)


def main():
# register TEI name space without prefix to dump the *same* XML file
ET.register_namespace('', 'http://www.tei-c.org/ns/1.0')
# ToDo: date, username, author, edition
opts, mandatory = parse_args()
isodate = datetime.datetime.today().strftime('%Y-%m-%d')
username = 'humenda'
tree = XmlParserWrapper(mandatory[0])
try:
revision_desc = next(tei_iter(tree.root, 'revisionDesc'))
except StopIteration:
print("Error, could not find revisionDesc.")
sys.exit(20)
add_changelog_entry(revision_desc, opts['E'], isodate, username)
update_date(tree.root, isodate)
update_copyright(tree.root)
update_extent(tree.root)
tree.write(mandatory[0])

main()
2 changes: 1 addition & 1 deletion xsl/inc/teientry2txt.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@
it's visually something distinct in most dictionaries -->
<xsl:if test="tei:colloc">
<xsl:text> (</xsl:text>
<xsl:value-of select="."/>
<xsl:value-of select="tei:colloc"/>
<xsl:text>)</xsl:text>
</xsl:if>

Expand Down

0 comments on commit 0a125a0

Please sign in to comment.