-
Notifications
You must be signed in to change notification settings - Fork 0
/
verse_stats.py
108 lines (84 loc) · 3.07 KB
/
verse_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from __future__ import annotations
import csv
import json
import sys
from collections import defaultdict
from dataclasses import dataclass, fields, asdict
from typing import Optional, Iterable
from lxml import etree
from pathlib import Path
from os import fspath
from lxml.etree import tostring
from tqdm import tqdm
# Namespace prefix -> URI map handed to the XPath calls in this module.
# NOTE(review): the 'xh' URI below is 'http://www.w3.org/1999/html', while the
# usual XHTML namespace is 'http://www.w3.org/1999/xhtml' -- confirm which one
# is intended before relying on the 'xh' prefix.
_ns = dict(
    tei='http://www.tei-c.org/ns/1.0',
    xh='http://www.w3.org/1999/html',
)
def first(it: Iterable, default=None):
    """Return the first item produced by *it*, or *default* if it is empty.

    When *it* is an iterator, exactly one item is consumed from it.
    """
    return next(iter(it), default)
def normalize_space(s: str, ignore_missing=True):
    """Collapse each whitespace run in *s* to one space and trim both ends.

    With ``ignore_missing`` true (the default) a ``None`` input is passed
    through as ``None``; otherwise ``None`` fails on the ``split`` call.
    """
    if s is None and ignore_missing:
        return None
    words = s.split()
    return " ".join(words)
def read_bargraph_info(fn: Path) -> dict[str, dict[str, set[str]]]:
    """
    Reads and reorders the bargraph json.

    The file is expected to hold a list of documents, each with a ``sigil``
    and a list of ``intervals``; every interval has a ``type`` and integer
    ``start`` / ``end`` bounds, both inclusive.

    Parameters
    ----------
    fn: Path to the bargraph json file

    Returns
    -------
    Dictionary verse no (as a string) -> type -> set of sigils.  The result
    is a nested ``defaultdict``: indexing an unknown verse or type yields an
    empty set instead of raising ``KeyError``.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding when reading it.
    with fn.open(encoding='utf-8') as f:
        data = json.load(f)
    verses = defaultdict(lambda: defaultdict(set))
    for doc in data:
        sigil = doc['sigil']
        for interval in doc['intervals']:
            kind = interval['type']
            # 'end' is inclusive, hence the + 1; keys are strings so they
            # compare equal to attribute values read from XML/HTML.
            for n in range(interval['start'], interval['end'] + 1):
                verses[str(n)][kind].add(sigil)
    return verses
@dataclass
class Verse:
    """
    One row of the per-line statistics.

    The field order is significant: it is the positional constructor order
    and the order in which ``fields(Verse)`` lists the columns.
    """
    # identifier of the line (value of its ``n`` attribute)
    n: str
    # integer value of the line's ``data-variants`` attribute
    variants: int
    # integer value of the line's ``data-varcount`` attribute
    witnesses: int
    # number of sigils recorded for this line under the kind 'paralipomena'
    paralipomena: int
    # number of sigils recorded under the kind 'paralipomena_uncertain'
    paralipomena_uncertain: int
    # whitespace-normalized speaker label, or None when there is none
    speaker: Optional[str]
    # local (namespace-free) tag name of the TEI element
    element: str
    # True when ``n`` is numeric or starts with 'ttf_'
    is_text: bool
    # whitespace-normalized text content of the line
    text: str
class VerseStats:
    """
    Collects per-line statistics from a build directory.

    Three build artifacts are loaded on construction:

    * ``www/print/faust.all.html`` -- its elements carrying a
      ``data-varcount`` attribute are the lines that get reported,
    * ``prepared/textTranscript/faust.xml`` -- the TEI document in which each
      line is looked up by its ``n`` attribute,
    * ``www/data/genetic_bar_graph.json`` -- read with ``read_bargraph_info``.
    """

    def __init__(self, build_dir: Path | str):
        """
        Parameters
        ----------
        build_dir: Directory containing the three files named in the class
            docstring (paths are resolved relative to it).
        """
        self.build_dir = Path(build_dir)
        self.html = etree.parse(fspath(self.build_dir / 'www/print/faust.all.html'))
        self.tei = etree.parse(fspath(self.build_dir / 'prepared/textTranscript/faust.xml'))
        self.bargraph = read_bargraph_info(self.build_dir / 'www/data/genetic_bar_graph.json')

    def _tei_by_n(self) -> dict:
        """Map every ``n`` attribute value to its first TEI element in document order."""
        index = {}
        for el in self.tei.xpath('//*[@n]'):
            # setdefault keeps the first occurrence, matching what
            # ``xpath('//*[@n="..."]')[0]`` would return.
            index.setdefault(el.get('n'), el)
        return index

    def lines(self) -> Iterable[Verse]:
        """
        Yield one ``Verse`` per HTML element that has a ``data-varcount``
        attribute, in document order of the HTML file.

        Raises
        ------
        IndexError: if the TEI document has no element whose ``n`` attribute
            equals the HTML element's ``data-n``.
        """
        # Index the TEI once instead of running a whole-document XPath search
        # per line; this also copes with ``n`` values containing quotes.
        tei_by_n = self._tei_by_n()
        for el_h in self.html.xpath('//*[@data-varcount]', namespaces=_ns):
            n = el_h.get('data-n')
            el_t = tei_by_n.get(n)
            # Explicit None test: an lxml element without children is falsy.
            if el_t is None:
                raise IndexError(f'no TEI element with n={n!r}')
            variants = int(el_h.get('data-variants'))
            witnesses = int(el_h.get('data-varcount'))
            # First text node of a <speaker> inside the enclosing <sp>, if any.
            speaker = normalize_space(
                first(el_t.xpath('ancestor::tei:sp//tei:speaker/text()', namespaces=_ns), default=None))
            # .get() so that merely reading never inserts empty entries into
            # the (defaultdict) bargraph mapping.
            sigils_by_kind = self.bargraph.get(n, {})
            yield Verse(
                n, variants, witnesses,
                paralipomena=len(sigils_by_kind.get('paralipomena', ())),
                paralipomena_uncertain=len(sigils_by_kind.get('paralipomena_uncertain', ())),
                speaker=speaker,
                # strip the '{namespace}' prefix lxml puts in front of the tag
                element=el_t.tag.split('}')[-1],
                # text of the line without the content of any <note>
                text=normalize_space(''.join(el_t.xpath('.//text()[not(ancestor::tei:note)]', namespaces=_ns))),
                is_text=n.isnumeric() or n.startswith('ttf_'),
            )
def main():
    """Print the statistics of all lines found under ``../build`` as CSV on stdout.

    The header row and column order come from the fields of ``Verse``.
    """
    stats = VerseStats('../build')
    columns = [field.name for field in fields(Verse)]
    out = csv.DictWriter(sys.stdout, columns)
    out.writeheader()
    out.writerows(asdict(verse) for verse in stats.lines())
# Script entry point: run the CSV export only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()