From f9d3d28d67460e8d16eb21d8e0613f9c68fcb9b4 Mon Sep 17 00:00:00 2001 From: Thorsten Vitt Date: Mon, 18 Feb 2019 10:48:20 +0100 Subject: [PATCH] Basic configuration setup --- logging.yaml | 2 +- src/macrogen/__init__.py | 4 +- src/macrogen/bibliography.py | 117 +++++++++++++++ src/macrogen/config.py | 270 ++++++++++++++++++++++++++++++++++ src/macrogen/convert_table.py | 6 + src/macrogen/datings.py | 184 +++++------------------ src/macrogen/etc/default.yaml | 24 +++ src/macrogen/graph.py | 34 ++--- src/macrogen/main.py | 5 +- src/macrogen/report.py | 12 +- src/macrogen/uris.py | 13 +- src/macrogen/visualize.py | 29 +++- tests/test_config.py | 22 +++ tests/test_main.py | 2 +- 14 files changed, 537 insertions(+), 187 deletions(-) create mode 100644 src/macrogen/bibliography.py create mode 100644 src/macrogen/config.py create mode 100644 src/macrogen/etc/default.yaml create mode 100644 tests/test_config.py diff --git a/logging.yaml b/logging.yaml index ee9adc8..0c71cea 100644 --- a/logging.yaml +++ b/logging.yaml @@ -13,7 +13,7 @@ handlers: root: handlers: - console - level: INFO + level: WARNING loggers: graph: diff --git a/src/macrogen/__init__.py b/src/macrogen/__init__.py index 3970e29..666cf4a 100644 --- a/src/macrogen/__init__.py +++ b/src/macrogen/__init__.py @@ -1,2 +1,4 @@ -from .graph import macrogenesis_graphs, BiblSource +from .graph import macrogenesis_graphs +from macrogen.bibliography import BiblSource from .uris import Reference, Witness, AmbiguousRef, Inscription +from .visualize import simplify_graph, write_dot, render_file, render_all \ No newline at end of file diff --git a/src/macrogen/bibliography.py b/src/macrogen/bibliography.py new file mode 100644 index 0000000..900a804 --- /dev/null +++ b/src/macrogen/bibliography.py @@ -0,0 +1,117 @@ +import csv +from collections import namedtuple, defaultdict +from typing import Dict, Union, IO + +import requests +from lxml import etree + +from .config import config +from . import faust + +BibEntry = namedtuple('BibEntry', ['uri', 'citation', 'reference', 'weight']) + + +def _parse_bibliography(bibxml: Union[str, IO]) -> Dict[str, BibEntry]: + """Parses the bibliography file at url. Returns a dictionary mapping an URI to a corresponding bibliography entry.""" + db: Dict[str, BibEntry] = {} + scores = config.bibscores + et = etree.parse(bibxml) + for bib in et.xpath('//f:bib', namespaces=faust.namespaces): + uri = bib.get('uri') + citation = bib.find('f:citation', namespaces=faust.namespaces).text + reference = bib.find('f:reference', namespaces=faust.namespaces).text + db[uri] = BibEntry(uri, citation, reference, scores[uri]) + return db + + +_bib_labels = { + 'faust://self': 'Faustedition', + 'faust://model/inscription': 'Inskription von', + 'faust://orphan/adoption': 'Datierungsinhalt für', + 'faust://heuristic': 'künstliche Datierung' +} + + +class BiblSource: + """ + A bibliographic source in a macrogenesis XML file. + """ + + def __init__(self, uri, detail=''): + """ + Creates a bibliographic source. + Args: + uri: should be a faust://bibliography/ URI or one of the special values + detail: detail string like pages + """ + self.uri = uri + if detail is None: + detail = '' + self.detail = detail + self.weight = config.bibliography[uri].weight if uri in config.bibliography else 1 + + def __eq__(self, other): + if isinstance(other, BiblSource): + return self.uri == other.uri and self.detail == other.detail + else: + return super().__eq__(other) + + def __hash__(self): + return hash(self.uri) ^ hash(self.detail) + + def __str__(self): + result = self.citation + if self.detail is not None: + result += '\n' + self.detail + return result + + @property + def filename(self): + """ + A string representation of the source (w/o detail) that is usable as part of a filename. + """ + if self.uri.startswith('faust://bibliography'): + return self.uri.replace('faust://bibliography/', '') + else: + return self.uri.replace('faust://', '').replace('/', '-') + + @property + def citation(self): + """ + String representation of only the citation, w/o detail. + + Example: + Bohnenkamp 19994 + """ + if self.uri in config.bibliography: + return config.bibliography[self.uri].citation + elif self.uri in _bib_labels: + return _bib_labels[self.uri] + else: + return self.filename + + @property + def long_citation(self): + if self.uri in config.bibliography: + return config.bibliography[self.uri].reference + else: + return self.citation + + +def read_scores(scorefile): + """ + Parses the bibliography score file. + + Returns: + Map uri -> score + + """ + scores = defaultdict(lambda: 1) + logger = config.getLogger(__name__) + r = csv.reader(scorefile, delimiter='\t') + for row in r: + try: + scores[row[0]] = int(row[1]) + except ValueError as e: + logger.warning('Skipping scorefile row %s: %s', row, e) + return scores diff --git a/src/macrogen/config.py b/src/macrogen/config.py new file mode 100644 index 0000000..5d9887b --- /dev/null +++ b/src/macrogen/config.py @@ -0,0 +1,270 @@ +""" +Configuration etc. + + +Configuration and data files: + +- logging.yaml +- styles.yaml +- reference-normalisation.csv +- bibscores.tsv +- sigils.json +- paralipomena.json +- genetic_bar_graph.json + +Additionally: + +- macrogenesis data +- output directory + +Optional: + +- graph file(s) to read from + +Additional stuff to configure: + +- Render / Render graphs up to ... +- algorithm / threshold +""" +import json +from io import BytesIO, StringIO +from os.path import expanduser, expandvars +from pathlib import Path +from typing import Optional, IO, Callable, Any, Tuple, Mapping +from urllib.parse import urlparse +from .bibliography import _parse_bibliography + +import pkg_resources +import requests +from lxml import etree +from ruamel.yaml import YAML + +import logging + +logger = logging.getLogger(__name__ + '.preliminary') + + +class CachedFile: + """Loads data from an URL, optionally caching it.""" + + def __init__(self, file_or_url: str, cache_dir: Optional[Path] = None): + """ + Creates a cacheing file loader. + + Args: + file_or_url: the url or path to the file to load + cache_dir: if present, a directory where to cache the file. If absent, don’t cache. + """ + url = urlparse(file_or_url) + + if url.scheme: + path = Path(url.path) + self.url = file_or_url + self.url_parsed = url + self.path = None + self.is_url = True + else: + path = Path(file_or_url) + self.url = None + self.path = path + self.is_url = False + + if self.is_url and cache_dir is not None: + self.path = cache_dir.joinpath(path.name) + + def open(self, offline=False, mode="rt") -> IO: + """ + Opens the file or URL. + + Args: + offline: Never access the internet. + mode: file mode for `open` + + Returns: + open IO, either to the cached file or to the remotely fetched content + """ + if self.is_url and not offline: + # fetch remote to cache + logger.debug('fetching %s', self.url) + response = requests.get(self.url) + + if self.path: + # dump to cache and serve from cache file + logger.debug('saving as %s', self.path) + if "b" in mode: + with self.path.open("wb") as cache_file: + cache_file.write(response.content) + else: + with self.path.open("wt", encoding='utf-8') as cache_file: + cache_file.write(response.text) + else: + if "b" in mode: + return BytesIO(response.content) + else: + return StringIO(response.text) + + return self.path.open(mode=mode, encoding='utf-8-sig') + + +class LazyConfigLoader: + """ + Descriptor that lazily loads stuff from configured paths. + """ + + def __init__(self, name: str, parser: Optional[Callable[[IO], Any]] = None, + fallback_resource=Optional[Tuple[str, str]]): + self.name = name + self.parser = parser + self.resource = fallback_resource + + def __get__(self, instance, owner): + if not hasattr(instance, '_data'): + instance._data = {} + if self.name not in instance.data: + self.load_data(instance) + return instance._data[self.name] + + def load_data(self, instance): + source = instance.config.get(self.name, None) + if source: + logger.info('Loading %s from %s', self.name, source) + cache = Path(instance.config.get('cache', '.cache')) + offline = instance.config.get('offline', False) + cached_file = CachedFile(source, cache) + with cached_file.open(offline) as file: + self.parse_data(file, instance) + elif self.resource: + logger.debug('Loading %s from internal configuration %s', self.name, self.resource) + with pkg_resources.resources_stream(*self.resource) as file: + self.parse_data(file, instance) + else: + raise ValueError( + f"Cannot access property {self.name}: Neither configured source nor fallback resource available") + + def parse_data(self, file, instance): + instance.data[self.name] = self.parser(file) if callable(self.parser) else file.read() + + +_yaml = YAML(typ='rt') +_cfg = 'macrogen' + + +class _Proxy: + + def __init__(self, constructor, *args, **kwargs): + self._constructor = constructor + self._args = args + self._kwargs = args + self._target = None + + def _init_proxy(self): + if self._target is None: + self._target = self._constructor(*self._args, **self._kwargs) + + def __getattr__(self, item): + self._init_proxy() + return getattr(self._target, item) + + def __setattr__(self, key, value): + self._init_proxy() + return setattr(self._target, key, value) + + def __delattr__(self, item): + self._init_proxy() + return delattr(self._target, item) + +class _Accessor: + + def __init__(self, accessor_function: Callable[[Any], Any]): + self._accessor = accessor_function + + def __getattr__(self, item): + return self._accessor(item) + + +class Configuration: + """ + Ready to use configuration data for the application. + + Data that is coming from files can be loaded lazily. + """ + + logging = LazyConfigLoader('logging', _yaml.load, (_cfg, 'etc/logging.yaml')) + styles = LazyConfigLoader('styles', _yaml.load, (_cfg, 'etc/styles.yaml')) + # reference-normalization.csv + # bibscores.tsv + sigils = LazyConfigLoader('sigils', json.load) + paralipomena = LazyConfigLoader('paralipomena', json.load) + genetic_bar_graph = LazyConfigLoader('bargraph', json.load) + bibliography = LazyConfigLoader('bibliography', _parse_bibliography) + + def __init__(self, config_override=None): + if config_override is None: + config_override = {} + self.config_override = config_override + + def get_path(key): + return Path(self.key) + + self.path = _Accessor(get_path) + + @property + def config_override(self): + return self._config_override + + @config_override.setter + def _set_config_override(self, value): + if hasattr(self, 'config'): + logger.warning('Configuration has already been loaded. Some override values may not have any effect.') + self._apply_override(value) + self._config_override = value + + def _apply_override(self, override=None): + if override is None: + override = self.config_override + for key, value in self.config_override: + if value is not None: + self.config[key] = value + + def __getattr__(self, item): + if item == 'config': + self._load_config() + return self.__dict__['config'] + if item in self.config: + return self.config[item] + raise AttributeError(f'No configuration item {item}') + + def _load_config(self): + # First, load the default config + logger.debug("Loading default configuration") + with pkg_resources.resource_stream(_cfg, 'etc/default.yaml') as f: + config: Mapping = _yaml.load(f) + # now work through all config files configured in the default config + # if they exist + if 'config_files' in config: + for fn in config['config_files']: + p = Path(expanduser(expandvars(fn))) + if p.exists(): + logger.info('Loading configuration file %s', p) + with p.open() as f: + config.update(_yaml.load(f)) + # now update using command line options etc. + self.config.update(self._config_override) + + # finally, let’s configure logging + self._init_logging() + + def _init_logging(self): + global logger + from logging.config import dictConfig + dictConfig(self.logging) + logger = logging.getLogger(__name__) + + def getLogger(self, name): + return _Proxy(logging.getLogger, name) + + def relative_path(self, absolute_path): + return Path(absolute_path).relative_to(self.path.data) + + +config = Configuration() diff --git a/src/macrogen/convert_table.py b/src/macrogen/convert_table.py index 81fb1a2..04b6265 100644 --- a/src/macrogen/convert_table.py +++ b/src/macrogen/convert_table.py @@ -8,6 +8,12 @@ from lxml.builder import ElementMaker import re +""" +This script can be used to convert an excel table in a special format to macrogenesis XML files. + +TODO move out of macrogenesis package? +""" + F = ElementMaker(namespace=faust.namespaces['f'], nsmap={None: faust.namespaces['f']}) def make_group(sigils, kind='', source='', comment='', notes=''): diff --git a/src/macrogen/datings.py b/src/macrogen/datings.py index 82fd929..24ffb05 100644 --- a/src/macrogen/datings.py +++ b/src/macrogen/datings.py @@ -1,27 +1,23 @@ """ Functions to parse the XML datings and build a graph out of them """ -import csv -import datetime from abc import ABCMeta, abstractmethod -from collections import namedtuple, defaultdict -from typing import List, Tuple, Optional, Any, Dict, Generator +from pathlib import Path +from time import strptime +from typing import List, Tuple, Optional, Any, Generator import networkx as nx -import requests +from datetime import date, timedelta from lxml import etree from more_itertools import pairwise -from . import faust -from .faust_logging import logging +from .bibliography import BiblSource from .uris import Witness, Reference +from .config import config -logger = logging.getLogger(__name__) +logger = config.getLogger(__name__) -# When there is only one notbefore/notafter border -- how much should we adjust -HALF_INTERVAL_CORRECTION = datetime.timedelta(365/2) - -def parse_datestr(datestr: str) -> datetime.date: +def parse_datestr(datestr: str) -> date: """ Parses a date str like 1799-01-01 to a date. @@ -31,7 +27,7 @@ def parse_datestr(datestr: str) -> datetime.date: if datestr is None: return None - dt = datetime.datetime.strptime(datestr, '%Y-%m-%d') + dt = strptime(datestr, '%Y-%m-%d') if dt is not None: return dt.date() else: @@ -48,118 +44,6 @@ def __init__(self, msg, element: Optional[etree._Element] = None): super().__init__(msg) -# Entry in the bibliography -BibEntry = namedtuple('BibEntry', ['uri', 'citation', 'reference', 'weight']) - - -def _parse_bibliography(url: str) -> Dict[str, BibEntry]: - """Parses the bibliography file at url. Returns a dictionary mapping an URI to a corresponding bibliography entry.""" - db: Dict[str, BibEntry] = {} - scores = _read_scores() - et = etree.fromstring(requests.get(url).content) - for bib in et.xpath('//f:bib', namespaces=faust.namespaces): - uri = bib.get('uri') - citation = bib.find('f:citation', namespaces=faust.namespaces).text - reference = bib.find('f:reference', namespaces=faust.namespaces).text - db[uri] = BibEntry(uri, citation, reference, scores[uri]) - return db - - -def _read_scores(): - """ - Parses the bibliography score file. - - Returns: - Map uri -> score - - """ - scores = defaultdict(lambda: 1) - try: - with open('bibscores.tsv', encoding='utf-8') as scorefile: - r = csv.reader(scorefile, delimiter='\t') - for row in r: - try: - scores[row[0]] = int(row[1]) - except ValueError as e: - logger.warning('Skipping row %s: %s', row, e) - except FileNotFoundError as e: - logger.warning('Could not read score file: %s. Will use default score', e) - return scores - - -bibliography = _parse_bibliography('https://raw.githubusercontent.com/faustedition/faust-gen-html/master/xslt/bibliography.xml') - -_bib_labels = { - 'faust://self': 'Faustedition', - 'faust://model/inscription': 'Inskription von', - 'faust://orphan/adoption': 'Datierungsinhalt für', - 'faust://heuristic': 'künstliche Datierung' -} - -class BiblSource: - """ - A bibliographic source in a macrogenesis XML file. - """ - - def __init__(self, uri, detail=''): - """ - Creates a bibliographic source. - Args: - uri: should be a faust://bibliography/ URI or one of the special values - detail: detail string like pages - """ - self.uri = uri - if detail is None: - detail = '' - self.detail = detail - self.weight = bibliography[uri].weight if uri in bibliography else 1 - - def __eq__(self, other): - if isinstance(other, BiblSource): - return self.uri == other.uri and self.detail == other.detail - else: - return super().__eq__(other) - - def __hash__(self): - return hash(self.uri) ^ hash(self.detail) - - def __str__(self): - result = self.citation - if self.detail is not None: - result += '\n' + self.detail - return result - - @property - def filename(self): - """ - A string representation of the source (w/o detail) that is usable as part of a filename. - """ - if self.uri.startswith('faust://bibliography'): - return self.uri.replace('faust://bibliography/', '') - else: - return self.uri.replace('faust://', '').replace('/', '-') - - @property - def citation(self): - """ - String representation of only the citation, w/o detail. - - Example: - Bohnenkamp 19994 - """ - if self.uri in bibliography: - return bibliography[self.uri].citation - elif self.uri in _bib_labels: - return _bib_labels[self.uri] - else: - return self.filename - - @property - def long_citation(self): - if self.uri in bibliography: - return bibliography[self.uri].reference - else: - return self.citation class _AbstractDating(metaclass=ABCMeta): @@ -176,11 +60,11 @@ def __init__(self, el: etree._Element): Args: el: The basic assertion element. This will usually be or . """ - self.items: List[Reference] = [Witness.get(uri) for uri in el.xpath('f:item/@uri', namespaces=faust.namespaces)] + self.items: List[Reference] = [Witness.get(uri) for uri in el.xpath('f:item/@uri', namespaces=config.namespaces)] self.sources = tuple(BiblSource(source.get('uri'), source.text) - for source in el.xpath('f:source', namespaces=faust.namespaces)) - self.comments = tuple(comment.text for comment in el.xpath('f:comment', namespaces=faust.namespaces)) - self.xmlsource: Tuple[str, int] = (faust.relative_path(el.getroottree().docinfo.URL), el.sourceline) + for source in el.xpath('f:source', namespaces=config.namespaces)) + self.comments = tuple(comment.text for comment in el.xpath('f:comment', namespaces=config.namespaces)) + self.xmlsource: Tuple[str, int] = (config.relative_path(el.getroottree().docinfo.URL), el.sourceline) self.ignore = el.get('ignore', 'no') == 'yes' @abstractmethod @@ -236,7 +120,7 @@ def __init__(self, el: etree._Element): raise InvalidDatingError('Backwards dating (%s), this would have caused a conflict' % self, el) @property - def start_attr(self) -> Tuple[str, datetime.date]: + def start_attr(self) -> Tuple[str, date]: """ The attribute representing the start of the interval. @@ -246,7 +130,7 @@ def start_attr(self) -> Tuple[str, datetime.date]: return _firstattr(self, 'from_', 'when', 'not_before') @property - def end_attr(self) -> Tuple[str, datetime.date]: + def end_attr(self) -> Tuple[str, date]: """ The attribute representing the end of the interval. @@ -256,22 +140,22 @@ def end_attr(self) -> Tuple[str, datetime.date]: return _firstattr(self, 'to', 'when', 'not_after') @property - def start(self) -> Optional[datetime.date]: + def start(self) -> Optional[date]: """The start date, regardless of the detailed semantics""" return self.start_attr[1] @property - def end(self) -> Optional[datetime.date]: + def end(self) -> Optional[date]: """The end date, regardless of the detailed semantics""" return self.end_attr[1] @property - def date_before(self) -> Optional[datetime.date]: - return self.start - datetime.timedelta(days=1) if self.start is not None else None + def date_before(self) -> Optional[date]: + return self.start - timedelta(days=1) if self.start is not None else None @property - def date_after(self) -> Optional[datetime.date]: - return self.end + datetime.timedelta(days=1) if self.end is not None else None + def date_after(self) -> Optional[date]: + return self.end + timedelta(days=1) if self.end is not None else None def __str__(self): if self.when is not None: @@ -287,14 +171,17 @@ def add_to_graph(self, G: nx.MultiDiGraph): for item in self.items: for source in self.sources: if self.start is not None: - G.add_edge(self.date_before, item, kind=self.start_attr[0], source=source, dating=self, xml=self.xmlsource, ignore=self.ignore, comments=self.comments) + G.add_edge(self.date_before, item, kind=self.start_attr[0], source=source, dating=self, + xml=self.xmlsource, ignore=self.ignore, comments=self.comments) if self.end is None and not self.ignore: - G.add_edge(item, self.date_before + HALF_INTERVAL_CORRECTION, kind='not_after', source=BiblSource('faust://heuristic'), xml=self.xmlsource) + G.add_edge(item, self.date_before + timedelta(config.half_interval_correction), kind='not_after', + source=BiblSource('faust://heuristic'), xml=self.xmlsource) if self.end is not None: - G.add_edge(item, self.date_after, kind=self.end_attr[0], source=source, dating=self, xml=self.xmlsource, ignore=self.ignore, comments=self.comments) + G.add_edge(item, self.date_after, kind=self.end_attr[0], source=source, dating=self, + xml=self.xmlsource, ignore=self.ignore, comments=self.comments) if self.start is None and not self.ignore: - G.add_edge(self.date_after - HALF_INTERVAL_CORRECTION, item, kind='not_before', source=BiblSource('faust://heuristic'), xml=self.xmlsource) - + G.add_edge(self.date_after - timedelta(config.half_interval_correction), item, kind='not_before', + source=BiblSource('faust://heuristic'), xml=self.xmlsource) class RelativeDating(_AbstractDating): @@ -334,9 +221,9 @@ def _parse_file(filename: str) -> Generator[_AbstractDating, None, None]: """ tree = etree.parse(filename) - for element in tree.xpath('//f:relation', namespaces=faust.namespaces): + for element in tree.xpath('//f:relation', namespaces=config.namespaces): yield RelativeDating(element) - for element in tree.xpath('//f:date', namespaces=faust.namespaces): + for element in tree.xpath('//f:date', namespaces=config.namespaces): try: yield AbsoluteDating(element) except InvalidDatingError as e: @@ -350,7 +237,7 @@ def _parse_files() -> Generator[_AbstractDating, None, None]: """ - for file in faust.macrogenesis_files(): + for file in Path(config.data, 'macrogenesis').rglob('**/*.xml'): yield from _parse_file(file) @@ -360,7 +247,7 @@ def add_timeline_edges(graph): Afterwards, each date node in the graph will have an edge to the next date represented in the graph. """ - date_nodes = sorted(node for node in graph.nodes if isinstance(node, datetime.date)) + date_nodes = sorted(node for node in graph.nodes if isinstance(node, date)) for earlier, later in pairwise(date_nodes): if earlier != later and (earlier, later) not in graph.edges: graph.add_edge(earlier, later, kind='timeline') @@ -382,10 +269,11 @@ def simplify_timeline(graph: nx.MultiDiGraph): 1798-01-01 -> 1798-02-01 `-----> H.x -----^ """ - date_nodes = sorted(node for node in graph.nodes if isinstance(node, datetime.date)) + date_nodes = sorted(node for node in graph.nodes if isinstance(node, date)) prev = None for node in date_nodes: - if prev is not None and graph.in_degree(node) == graph.out_degree(node) == 1 and isinstance(graph.successors(node)[0], datetime.date): + if prev is not None and graph.in_degree(node) == graph.out_degree(node) == 1 and isinstance( + graph.successors(node)[0], date): graph.remove_node(node) else: if prev is not None: diff --git a/src/macrogen/etc/default.yaml b/src/macrogen/etc/default.yaml new file mode 100644 index 0000000..60123ca --- /dev/null +++ b/src/macrogen/etc/default.yaml @@ -0,0 +1,24 @@ +## Default configuration for the macrogenesis software + +# configuration and data files, either paths or URIs +data: data/xml # XML Data, this is a folder +logging: # YAML file with logging configuration +styles: # YAML file with styling for the graphviz graphs +references: # CSV file featuring manual normalizations for references +bibscores: # TSV file (FIXME) with scores by source +sigils: # JSON file mapping URIs to sigils +paralipomena: # JSON file mapping paralipomena to sources +genetic_bar_graph: # JSON file +report_dir: target/macrogen # where to save reports and graphs + +bibliography: https://raw.githubusercontent.com/faustedition/faust-gen-html/master/xslt/bibliography.xml # The bibliography + +## Limits +half_interval_correction: 182.5 # if we only have a start or end date, the other limit is max. this many days away + +## Other data +namespaces: + f: http://www.faustedition.net/ns + tei: http://www.tei-c.org/ns/1.0 + + diff --git a/src/macrogen/graph.py b/src/macrogen/graph.py index aaa51d1..418b920 100644 --- a/src/macrogen/graph.py +++ b/src/macrogen/graph.py @@ -5,20 +5,20 @@ import csv from collections import defaultdict, Counter +from dataclasses import dataclass from datetime import date, timedelta from pathlib import Path -from typing import List, Callable, Any, Dict, Tuple, Union, Hashable, Set +from typing import List, Callable, Any, Dict, Tuple, Union -import dateutil import networkx as nx -from dataclasses import dataclass -from .datings import base_graph, BiblSource, parse_datestr -from .faust_logging import logging +from .bibliography import BiblSource +from .datings import base_graph, parse_datestr from .igraph_wrapper import to_igraph, nx_edges from .uris import Reference, Inscription, Witness, AmbiguousRef +from .config import config -logger = logging.getLogger(__name__) +logger = config.getLogger(__name__) EARLIEST = date(1749, 8, 28) LATEST = date.today() @@ -409,32 +409,32 @@ def macrogenesis_graphs() -> MacrogenesisInfo: all_conflicting_edges.extend(selfloops) logger.info('Building DAG from remaining data') - dag = working.copy() - dag.remove_edges_from(all_conflicting_edges) + result_graph = working.copy() + result_graph.remove_edges_from(all_conflicting_edges) - if not nx.is_directed_acyclic_graph(dag): + if not nx.is_directed_acyclic_graph(result_graph): logger.error('After removing %d conflicting edges, the graph is still not a DAG!', len(all_conflicting_edges)) - cycles = list(nx.simple_cycles(dag)) + cycles = list(nx.simple_cycles(result_graph)) logger.error('It contains %d simple cycles', len(cycles)) else: - logging.info('Double-checking removed edges ...') + logger.info('Double-checking removed edges ...') for u, v, k, attr in sorted(all_conflicting_edges, key=lambda edge: edge[3].get('weight', 1), reverse=True): - dag.add_edge(u, v, **attr) - if nx.is_directed_acyclic_graph(dag): + result_graph.add_edge(u, v, **attr) + if nx.is_directed_acyclic_graph(result_graph): all_conflicting_edges.remove((u, v, k, attr)) - logging.info('Added edge %s -> %s (%d) back without introducing a cycle.', u, v, attr.get('weight', 1)) + logger.info('Added edge %s -> %s (%d) back without introducing a cycle.', u, v, attr.get('weight', 1)) else: - dag.remove_edge(u, v) + result_graph.remove_edge(u, v) logger.info('Marking %d conflicting edges for deletion', len(all_conflicting_edges)) mark_edges_to_delete(base, all_conflicting_edges) logger.info('Removed %d of the original %d edges', len(all_conflicting_edges), len(working.edges)) - closure = nx.transitive_closure(dag) + closure = nx.transitive_closure(result_graph) add_inscription_links(base) - return MacrogenesisInfo(base, working, dag, closure, conflicts) + return MacrogenesisInfo(base, working, result_graph, closure, conflicts) def cleanup_graph(A: nx.MultiDiGraph) -> nx.MultiDiGraph: diff --git a/src/macrogen/main.py b/src/macrogen/main.py index 8386d5d..f8d3339 100755 --- a/src/macrogen/main.py +++ b/src/macrogen/main.py @@ -1,14 +1,13 @@ #!/usr/bin/env python3 -from .faust_logging import logging - import sys +from macrogen.config import config from . import graph from . import report from .visualize import render_all -logger = logging.getLogger('main') +logger = config.getLogger('main') def main(argv=sys.argv): diff --git a/src/macrogen/report.py b/src/macrogen/report.py index cf18fa2..86d2f9e 100644 --- a/src/macrogen/report.py +++ b/src/macrogen/report.py @@ -1,4 +1,5 @@ import json +from collections import defaultdict, Counter from datetime import date, datetime from itertools import chain, repeat, groupby from operator import itemgetter @@ -9,24 +10,20 @@ from lxml.etree import Comment from more_itertools import pairwise -from .faust_logging import logging - import csv -from collections.__init__ import defaultdict, Counter from html import escape from pathlib import Path from typing import Iterable, List, Dict, Mapping, Tuple, Sequence, Union, Generator, Any, Optional import networkx as nx -from . import faust -from .datings import BiblSource +from .config import config +from .bibliography import BiblSource from .graph import MacrogenesisInfo, pathlink, EARLIEST, LATEST, DAY from .uris import Reference, Witness, Inscription, UnknownRef, AmbiguousRef from .visualize import write_dot, simplify_graph -logger = logging.getLogger(__name__) -target = Path(faust.config.get('macrogenesis', 'output-dir')) +logger = config.getLogger(__name__) RELATION_LABELS = {'not_before': 'nicht vor', 'not_after': 'nicht nach', @@ -242,6 +239,7 @@ def write_html(filename: Path, content: str, head: str = None, breadcrumbs: List def report_components(graphs: MacrogenesisInfo): + target = config.path.report_dir logger.info('Writing component overview to %s', target) target.mkdir(parents=True, exist_ok=True) report = f"""

{len(graphs.conflicts)} stark zusammenhängende Komponenten

diff --git a/src/macrogen/uris.py b/src/macrogen/uris.py index d050096..3230578 100644 --- a/src/macrogen/uris.py +++ b/src/macrogen/uris.py @@ -4,8 +4,6 @@ """ from pathlib import Path -from .faust_logging import logging - import codecs import csv import json @@ -21,9 +19,9 @@ import requests from lxml import etree -from . import faust +from .config import config -logger = logging.getLogger(__name__) +logger = config.getLogger(__name__) def call_recorder(function=None, argument_picker=None): @@ -408,13 +406,14 @@ def filename(self): def _collect_wits(): items = defaultdict(list) # type: Dict[Union[Witness, Inscription, UnknownRef], List[Tuple[str, int]]] - for macrogenetic_file in faust.macrogenesis_files(): + macrogenesis_files = list(config.path.data.join('macrogenesis').rglob('**/*.xml')) + for macrogenetic_file in macrogenesis_files: tree = etree.parse(macrogenetic_file) # type: etree._ElementTree - for element in tree.xpath('//f:item', namespaces=faust.namespaces): # type: etree._Element + for element in tree.xpath('//f:item', namespaces=config.namespaces): # type: etree._Element uri = element.get('uri') wit = Witness.get(uri, allow_duplicate=True) items[wit].append((macrogenetic_file.split('macrogenesis/')[-1], element.sourceline)) - logger.info('Collected %d references in %d macrogenesis files', len(items), len(faust.macrogenesis_files())) + logger.info('Collected %d references in %d macrogenesis files', len(items), len(macrogenesis_files)) return items diff --git a/src/macrogen/visualize.py b/src/macrogen/visualize.py index 500da9c..3e73e21 100644 --- a/src/macrogen/visualize.py +++ b/src/macrogen/visualize.py @@ -1,3 +1,4 @@ +from collections import Sequence from datetime import date from multiprocessing.pool import Pool from pathlib import Path @@ -8,7 +9,8 @@ from pygraphviz import AGraph from tqdm import tqdm -from .datings import BiblSource, add_timeline_edges +from .datings import add_timeline_edges +from macrogen import BiblSource from .faust_logging import logging from .graph import pathlink from .uris import Reference @@ -53,16 +55,39 @@ def _simplify_attrs(attrs): attrs[key + '_detail'] = value.detail elif value is None: del attrs[key] + elif isinstance(value, Sequence) and not isinstance(value, str): + attrs[key] = " ".join(item.uri if hasattr(item, 'uri') else str(item) for item in value) elif type(value) not in {str, int, float, bool}: attrs[key] = str(value) def _load_style(filename): + """ + Loads a YAML Style file for :func:`write_doc`. + :param filename: Path to a YAML file with style directions + :return: dictionary with style directions + """ with open(filename, encoding='utf-8') as f: return yaml.load(f) -def write_dot(graph: nx.MultiDiGraph, target='base_graph.dot', style=_load_style('styles.yaml'), highlight=None, record='auto', edge_labels=True): +def write_dot(graph: nx.MultiDiGraph, target='base_graph.dot', style=_load_style('styles.yaml'), + highlight=None, record='auto', edge_labels=True): + """ + Writes a properly styled graphviz file for the given graph. + + Args: + graph: the subgraph to draw + target: dot file that should be written, may be a Path + style (dict): rules for styling the graph + highlight: if a node, highlight that in the graph. If a tuple of nodes, highlight the shortest path(s) from the + first to the second node + record: record in the queue for `render_all`. If ``"auto"`` dependent on graph size + edge_labels (bool): Should we paint edge labels? + + Returns: + None. + """ logger.info('Writing %s ...', target) target_path = Path(target) target_path.parent.mkdir(exist_ok=True, parents=True) diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..79d9214 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,22 @@ +import pytest + +from macrogen.config import CachedFile + + +@pytest.fixture(scope='session') +def cache_dir(tmp_path_factory): + return tmp_path_factory.mktemp('cache') + + +def test_cf_url(cache_dir): + cf = CachedFile('http://faustedition.net/data/paralipomena.js', cache_dir) + assert cf.is_url + assert cf.path.name == 'paralipomena.js' + + +def test_cf_open(cache_dir): + cf = CachedFile('http://faustedition.net/data/paralipomena.js', cache_dir) + with cf.open() as content: + text = content.read() + assert text + assert cf.path.exists() diff --git a/tests/test_main.py b/tests/test_main.py index 9111fba..fdf7f85 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,7 +1,7 @@ import networkx as nx import pytest -from macrogen.datings import BiblSource +from macrogen import BiblSource from macrogen.graph import collapse_edges from macrogen.uris import Witness