From a626137e96b358c46f16657996bd0f44f666187f Mon Sep 17 00:00:00 2001
From: MaxDall
Date: Sun, 10 Dec 2023 14:24:48 +0100
Subject: [PATCH 1/3] makes article properties overridable by parsers

---
Notes (review; text in this area is not part of the commit message or the diff):
  * base_parser.py: `Attribute.__init__` gains `overwrite: bool = False`. When it
    is set, `validate` is forced to False. The new `overwrite_attribute`
    decorator registers an `Attribute` with priority=None, validate=False,
    overwrite=True. `unvalidated` now also excludes attributes with `overwrite`.
  * base_parser.py: the `try/except TypeError as err: raise err` added to
    `_register.wrapper` only re-raises; it is removed again in PATCH 3/3.
  * article.py: `plaintext` and `lang` switch from `@property` to
    `@cached_attribute`. The docstring added to `lang` has an empty `Returns:`
    section.
  * caching.py (new): `_CachedAttribute.__get__` returns the descriptor itself
    for class access; for instance access it calls the wrapped method and stores
    the result on the instance via `object.__setattr__` under `self.__name__`
    (that name is copied onto the descriptor by `functools.update_wrapper` in
    `cached_attribute`).
  * caching.py: the comment `# This was implemented in order to` is cut off
    mid-sentence in the recorded commit.

 src/fundus/parser/base_parser.py | 16 +++++++++----
 src/fundus/scraping/article.py   | 10 ++++++--
 src/fundus/utils/caching.py      | 39 ++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 6 deletions(-)
 create mode 100644 src/fundus/utils/caching.py

diff --git a/src/fundus/parser/base_parser.py b/src/fundus/parser/base_parser.py
index d753431c..dde77e9c 100644
--- a/src/fundus/parser/base_parser.py
+++ b/src/fundus/parser/base_parser.py
@@ -76,8 +76,9 @@ def __repr__(self):
 
 
 class Attribute(RegisteredFunction):
-    def __init__(self, func: Callable[[object], Any], priority: Optional[int], validate: bool):
-        self.validate = validate
+    def __init__(self, func: Callable[[object], Any], priority: Optional[int], validate: bool, overwrite: bool = False):
+        self.validate = validate if not overwrite else False
+        self.overwrite = overwrite
         super(Attribute, self).__init__(func=func, priority=priority)
 
 
@@ -88,7 +89,10 @@ def __init__(self, func: Callable[[object], Any], priority: Optional[int]):
 
 def _register(cls, factory: Type[RegisteredFunction], **kwargs):
     def wrapper(func):
-        return functools.update_wrapper(factory(func, **kwargs), func)
+        try:
+            return functools.update_wrapper(factory(func, **kwargs), func)
+        except TypeError as err:
+            raise err
 
     # _register was called with parenthesis
     if cls is None:
@@ -102,6 +106,10 @@ def attribute(cls=None, /, *, priority: Optional[int] = None, validate: bool = T
     return _register(cls, factory=Attribute, priority=priority, validate=validate)
 
 
+def overwrite_attribute(cls):
+    return _register(cls, factory=Attribute, priority=None, validate=False, overwrite=True)
+
+
 def function(cls=None, /, *, priority: Optional[int] = None):
     return _register(cls, factory=Function, priority=priority)
 
@@ -137,7 +145,7 @@ def validated(self) -> "AttributeCollection":
 
     @property
     def unvalidated(self) -> "AttributeCollection":
-        return AttributeCollection(*[attr for attr in self.functions if not attr.validate])
+        return AttributeCollection(*[attr for attr in self.functions if not attr.validate and not attr.overwrite])
 
 
 class FunctionCollection(RegisteredFunctionCollection[Function]):
diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/article.py
index e8e6a262..e302a4de 100644
--- a/src/fundus/scraping/article.py
+++ b/src/fundus/scraping/article.py
@@ -11,6 +11,7 @@
 from fundus.logging.logger import basic_logger
 from fundus.parser import ArticleBody
 from fundus.scraping.html import HTML
+from fundus.utils.caching import cached_attribute
 
 
 @dataclass(frozen=True)
@@ -41,12 +42,17 @@ def from_extracted(cls, html: HTML, extracted: Dict[str, Any], exception: Option
 
         return article
 
-    @property
+    @cached_attribute
     def plaintext(self) -> Optional[str]:
         return str(self.body) if self.body else None
 
-    @property
+    @cached_attribute
     def lang(self) -> Optional[str]:
+        """
+        computes used language
+        Returns:
+
+        """
         language: Optional[str] = None
 
         if self.plaintext:
diff --git a/src/fundus/utils/caching.py b/src/fundus/utils/caching.py
new file mode 100644
index 00000000..93071aa8
--- /dev/null
+++ b/src/fundus/utils/caching.py
@@ -0,0 +1,39 @@
+import functools
+
+
+class _CachedAttribute(object):
+    """Computes attribute value and caches it in the instance.
+    From https://stackoverflow.com/questions/7388258/replace-property-for-perfomance-gain?noredirect=1&lq=1
+    Tweaked a bit to be used with a wrapper.
+    """
+
+    def __init__(self, method):
+        self.method = method
+
+    def __get__(self, inst, cls):
+        if inst is None:
+            return self
+        result = self.method(inst)
+        object.__setattr__(inst, self.__name__, result)  # type: ignore[attr-defined]
+        return result
+
+
+# This was implemented in order to
+def cached_attribute(attribute):
+    """Decorate attributes to be cached.
+
+    This works like `cached_property`, but instead of `property` or `cached_property`, the decorated attribute
+    can be overwritten.
+
+    Args:
+        attribute: The attribute to decorate.
+
+    Returns:
+        A wrapped _CachedAttribute instance.
+
+    """
+
+    def wrapper(func):
+        return functools.update_wrapper(_CachedAttribute(func), func)
+
+    return wrapper(attribute)

From 1e506e16aa14f1cd7f15b182a7e9c6a99a8c5e07 Mon Sep 17 00:00:00 2001
From: MaxDall
Date: Sun, 10 Dec 2023 14:32:56 +0100
Subject: [PATCH 2/3] expose `overwrite_attribute` in base_parser.py

---
Notes (review; text in this area is not part of the commit message or the diff):
  * The subject names base_parser.py, but the only file changed here is
    src/fundus/parser/__init__.py: `overwrite_attribute` is added to the
    import from `.base_parser` and to `__all__`.

 src/fundus/parser/__init__.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/fundus/parser/__init__.py b/src/fundus/parser/__init__.py
index 2e8622df..2a619c78 100644
--- a/src/fundus/parser/__init__.py
+++ b/src/fundus/parser/__init__.py
@@ -1,4 +1,10 @@
-from .base_parser import BaseParser, ParserProxy, attribute, function
+from .base_parser import (
+    BaseParser,
+    ParserProxy,
+    attribute,
+    function,
+    overwrite_attribute,
+)
 from .data import ArticleBody
 
-__all__ = ["ParserProxy", "BaseParser", "attribute", "function", "ArticleBody"]
+__all__ = ["ParserProxy", "BaseParser", "attribute", "function", "overwrite_attribute", "ArticleBody"]

From 85a2d68b6a0ec06f1e22e3382a32158c81a5db09 Mon Sep 17 00:00:00 2001
From: MaxDall
Date: Thu, 11 Jan 2024 13:38:22 +0100
Subject: [PATCH 3/3] remove leftover try:except

---
Notes (review; text in this area is not part of the commit message or the diff):
  * Removes the `try/except TypeError as err: raise err` block that PATCH 1/3
    added to `_register.wrapper`, leaving the single
    `return functools.update_wrapper(...)` statement.

 src/fundus/parser/base_parser.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/fundus/parser/base_parser.py b/src/fundus/parser/base_parser.py
index dde77e9c..2724d496 100644
--- a/src/fundus/parser/base_parser.py
+++ b/src/fundus/parser/base_parser.py
@@ -89,10 +89,7 @@ def __init__(self, func: Callable[[object], Any], priority: Optional[int]):
 
 def _register(cls, factory: Type[RegisteredFunction], **kwargs):
     def wrapper(func):
-        try:
-            return functools.update_wrapper(factory(func, **kwargs), func)
-        except TypeError as err:
-            raise err
+        return functools.update_wrapper(factory(func, **kwargs), func)
 
     # _register was called with parenthesis
     if cls is None: