/
verse_stats.py
executable file
·195 lines (162 loc) · 7.18 KB
/
verse_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python3
"""
Extracts per-verse information from the edition and writes it to a CSV file.
This script creates a CSV file with a row for each 'line' of the edition. A 'line' is, essentially,
anything that may have an apparatus: A verse, a part of an antilabial verse, a stage direction,
a speaker name etc. The CSV file contains the number of variants and witnesses (from the text view),
the number of relevant paralipomena (from the genetic bargraph) and a few data directly extracted from
the TEI file. See the source code of the class Verse for details.
"""
from __future__ import annotations
import csv
import gzip
import json
import sys
from argparse import ArgumentParser
from collections import defaultdict
from dataclasses import dataclass, fields, asdict
from os import fspath
from pathlib import Path
from typing import Optional, Iterable
from urllib.request import urlopen
from lxml import etree
from tqdm import tqdm
# Prefix -> namespace URI map handed to every xpath() call in this module.
# Only the 'tei' prefix is used by the XPath expressions visible in this file.
# NOTE(review): the usual XHTML namespace URI ends in "/xhtml", not "/html" --
# confirm whether the 'xh' value is intentional before relying on that prefix.
_ns = {'tei': 'http://www.tei-c.org/ns/1.0',
'xh': 'http://www.w3.org/1999/html'}
@dataclass
class Verse:
    """
    One 'line' of the edition; every field becomes one column of the CSV file.

    The field order matters: the CSV header is derived from it, and
    VerseStats.lines() passes the first three fields positionally.
    """

    # line id (source: @n). 1-12111 for verses, something like before_1178_b for paratext.
    n: str

    # number of variants for this line.
    variants: int

    # number of witnesses that have this line.
    witnesses: int

    # number of witnesses with paralipomena that are related to this line (only for verses!)
    paralipomena: int

    # ditto, but uncertain relationship
    paralipomena_uncertain: int

    # speaker of the line, if inside a speech act (tei:sp/tei:speaker)
    speaker: Optional[str]

    # local name of the TEI element representing the line (e.g., l or stage)
    element: str

    # True iff it's main text
    is_text: bool

    # innermost section number (e.g., 2.3.1 for Faust II, 3rd act, first scene)
    section: str

    # if line is inside <lg>, n of the respective lg's first verse
    lg: str

    # plain text contents of the line
    text: str
class VerseStats:
    """
    Collects per-line statistics from one build of the edition.

    The edition is either a local build directory or a base URL. Three resources
    below that root are read by load(): the HTML text view, the genetic bargraph
    JSON and the TEI XML file.
    """

    DEFAULT_URL = "http://faustedition.net/"

    # class-level default; load() sets it to True on the instance when it is done
    loaded = False

    # resource locations, relative to the edition root
    html_location = 'print/faust.all.html'
    bargraph_location = 'data/genetic_bar_graph.json'
    xml_location = 'downloads/faust.xml'

    def __init__(self, edition: Optional[str]):
        """
        Decide where the edition is read from; nothing is loaded yet.

        Parameters
        ----------
        edition: path of a local build directory or base URL of an edition.
            If None, ``build/www`` next to the parent directory of the running
            script is used when it exists, DEFAULT_URL otherwise.

        Sets ``self.edition`` (a Path for local builds, a str ending in ``/`` for
        URLs) and ``self.from_web``.
        """
        if edition is None:
            # try to find build dir
            build_dir = Path(sys.argv[0]).resolve().parent.parent / "build/www"
            if build_dir.is_dir():
                self.edition = build_dir
                self.from_web = False
            else:
                self.edition = self.DEFAULT_URL
                self.from_web = True
        elif Path(edition).is_dir():
            self.edition = Path(edition)
            self.from_web = False
        else:
            # resource locations are appended by plain string concatenation,
            # so the base URL has to end with a slash
            self.edition = edition if edition.endswith('/') else edition + '/'
            self.from_web = True

    @staticmethod
    def _remove_keeping_tail(el) -> None:
        """
        Remove the lxml element *el* from its parent but keep its tail text.

        lxml's ``parent.remove(el)`` drops ``el.tail`` together with the element,
        i.e. the text that follows the element inside its parent. The tail is
        therefore moved to the previous sibling's tail or, if there is no
        previous sibling, to the parent's text before the element is removed.
        """
        parent = el.getparent()
        if parent is None:
            return
        if el.tail:
            previous = el.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + el.tail
            else:
                parent.text = (parent.text or '') + el.tail
        parent.remove(el)

    def load(self):
        """
        Parse the HTML text view, the TEI file and the bargraph JSON.

        Afterwards ``self.html``, ``self.tei``, ``self.bargraph`` and
        ``self.html_lines`` are available and ``self.loaded`` is True.
        """
        if self.from_web:
            self.html = etree.parse(self.edition + self.html_location)
            self.tei = etree.parse(self.edition + self.xml_location)
            with urlopen(self.edition + self.bargraph_location) as response:
                self.bargraph = parse_bargraph_info(json.load(response))
        else:
            self.html = etree.parse(fspath(self.edition / self.html_location))
            self.tei = etree.parse(fspath(self.edition / self.xml_location))
            # read as UTF-8 explicitly instead of depending on the locale encoding
            with (self.edition / self.bargraph_location).open(encoding='utf-8') as f:
                self.bargraph = parse_bargraph_info(json.load(f))

        # remove critical apparatus from TEI version, but keep the text that
        # follows each note so the plain text of a line stays complete
        for note in self.tei.xpath('//tei:note[@type="textcrit"]', namespaces=_ns):
            self._remove_keeping_tail(note)

        # build HTML cache for speedup: data-n -> first element carrying it
        html_lines: dict[str, etree._Element] = {}
        for el in self.html.xpath('//*[@data-n]', namespaces=_ns):
            html_lines.setdefault(el.get('data-n'), el)
        self.html_lines = html_lines
        self.loaded = True

    def lines(self) -> 'Iterable[Verse]':
        """
        Yield one Verse for every TEI element with an @n attribute except tei:div.

        Calls load() first if that has not happened yet.

        Raises
        ------
        KeyError: if the HTML has no element whose data-n matches the line.
        TypeError: if that HTML element lacks data-variants or data-varcount.
        """
        if not self.loaded:
            self.load()
        for el_t in self.tei.xpath('//*[@n][not(self::tei:div)]', namespaces=_ns):
            n = el_t.get('n')
            # antilabial n's are contracted in html: a trailing i, m or f that does
            # not directly follow an underscore is stripped for the HTML lookup.
            # The length check keeps one-character or empty ids from raising IndexError.
            has_part_suffix = len(n) > 1 and n[-1] in 'imf' and n[-2] != '_'
            n_h = n[:-1] if has_part_suffix else n
            el_h = self.html_lines[n_h]
            # NOTE(review): the bargraph is looked up with the uncontracted n, and
            # is_text is derived from it as well -- confirm that this is intended
            # for antilabial parts.
            bargraph_entry = self.bargraph[n]
            speaker = normalize_space(
                ''.join(el_t.xpath('ancestor::tei:sp//tei:speaker//text()', namespaces=_ns)))
            yield Verse(
                n,
                int(el_h.get('data-variants')),
                int(el_h.get('data-varcount')),
                paralipomena=len(bargraph_entry['paralipomena']),
                paralipomena_uncertain=len(bargraph_entry['paralipomena_uncertain']),
                speaker=speaker,
                element=el_t.tag.split('}')[-1],
                text=normalize_space(''.join(el_t.xpath('.//text()', namespaces=_ns))),
                is_text=n.isnumeric() or n.startswith('ttf_'),
                lg=first(el_t.xpath('ancestor::tei:lg[1]/tei:l[@n][1]/@n', namespaces=_ns)),
                section=first(el_t.xpath('ancestor::tei:div[1]/@n', namespaces=_ns)),
            )
def first(it: Iterable, default=None):
    """Return the first item of *it*, or *default* if *it* yields nothing."""
    return next(iter(it), default)
def normalize_space(s: str, ignore_missing=True):
    """
    Collapse every run of whitespace in *s* into a single blank and strip both ends.

    If *s* is None, None is returned as long as *ignore_missing* is true.
    """
    if s is None and ignore_missing:
        return None
    words = s.split()
    return " ".join(words)
def parse_bargraph_info(data) -> dict[str, dict[str, set[str]]]:
    """
    Reorders the already parsed bargraph json by verse.

    Parameters
    ----------
    data: the parsed bargraph json: an iterable of documents, each with a
        'sigil' and a list of 'intervals'; every interval has a 'type' and an
        inclusive 'start' .. 'end' range of verse numbers

    Returns
    -------
    Dictionary verse no (as string) -> type -> set of sigils. Both levels are
    defaultdicts, so looking up an unknown verse or type yields an empty
    container (and adds it to the dictionary) instead of raising KeyError.
    """
    verses = defaultdict(lambda: defaultdict(set))
    for doc in data:
        sigil = doc['sigil']
        for interval in doc['intervals']:
            kind = interval['type']
            # 'end' is inclusive, hence the + 1
            for n in range(interval['start'], interval['end'] + 1):
                # keys are strings so they can be looked up by the @n attribute values
                verses[str(n)][kind].add(sigil)
    return verses
def getargparser():
    """Build the command line parser; the module docstring is used as its description."""
    edition_help = 'URL or path to the edition. If missing, try to find the build dir and fall back to the released edition.'
    output_help = 'output file (csv or csv.gz). if missing, write to stdout.'

    parser = ArgumentParser(description=__doc__)
    # optional positional: where to read the edition from
    parser.add_argument('edition', nargs='?', help=edition_help)
    # optional target file, converted to a Path
    parser.add_argument('-o', '--output', type=Path, help=output_help)
    return parser
def main():
    """
    Command line entry point: load the edition and write one CSV row per line.

    The CSV goes to the file given with -o/--output (gzip-compressed if its name
    has a .gz suffix) or to stdout if no output file was given.
    """
    options = getargparser().parse_args()
    vs = VerseStats(options.edition)
    # status message goes to stderr so it never ends up inside CSV written to stdout
    print(f'Loading from {vs.edition} ...', file=sys.stderr)
    vs.load()

    if options.output:
        opener = gzip.open if '.gz' in options.output.suffixes else open
        # newline='' lets the csv module control line endings; UTF-8 makes the
        # output independent of the locale encoding
        output_file = opener(options.output, 'wt', encoding='utf-8', newline='')
    else:
        output_file = sys.stdout

    try:
        writer = csv.DictWriter(output_file, [field.name for field in fields(Verse)])
        writer.writeheader()
        # total is a fixed value handed to the progress bar, not a computed line count
        for verse in tqdm(vs.lines(), total=15200, desc='Analyzing'):
            writer.writerow(asdict(verse))
    finally:
        # never close stdout, only files opened here
        if output_file is not sys.stdout:
            output_file.close()


if __name__ == '__main__':
    main()