From 2e7de8eb93e86baad3b04ffe257744693a0acc3e Mon Sep 17 00:00:00 2001 From: Tyler Barrus Date: Thu, 1 Nov 2018 19:58:53 -0400 Subject: [PATCH 1/4] use slots for context classes --- goose3/configuration.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/goose3/configuration.py b/goose3/configuration.py index 23c73ef..3dddaf9 100644 --- a/goose3/configuration.py +++ b/goose3/configuration.py @@ -34,7 +34,8 @@ class ArticleContextPattern(object): - + ''' + ''' def __init__(self, attr=None, value=None, tag=None, domain=None): if (not attr and not value) and not tag: raise Exception("`attr` and `value` must be set or `tag` must be set") @@ -60,6 +61,8 @@ def __repr__(self): class PublishDatePattern(object): + __slots__ = ['attr', 'value', 'content', 'subcontent', 'tag', 'domain'] + def __init__(self, attr=None, value=None, content=None, subcontent=None, tag=None, domain=None): if (not attr and not value) and not tag: @@ -94,6 +97,8 @@ def __repr__(self): class AuthorPattern(object): + __slots__ = ['attr', 'value', 'content', 'tag', 'subpattern'] + def __init__(self, *, attr=None, value=None, content=None, tag=None, subpattern=None): if (not attr and not value) and not tag: raise Exception("`attr` and `value` must be set or `tag` must be set") @@ -170,7 +175,8 @@ def known_context_patterns(self): @known_context_patterns.setter def known_context_patterns(self, val): - ''' val must be a dictionary or list of dictionaries + ''' val must be an ArticleContextPattern, a dictionary, or list of \ + dictionaries e.g., {'attr': 'class', 'value': 'my-article-class'} or [{'attr': 'class', 'value': 'my-article-class'}, {'attr': 'id', 'value': 'my-article-id'}] From f4944c4388352c74002800cbffab8ed351a6872d Mon Sep 17 00:00:00 2001 From: Tyler Barrus Date: Sat, 3 Nov 2018 12:05:22 -0400 Subject: [PATCH 2/4] add docstrings to configuration helper classes --- goose3/configuration.py | 57 ++++++++++++++++++++++++++++++++++------- 1 file changed, 48 
insertions(+), 9 deletions(-) diff --git a/goose3/configuration.py b/goose3/configuration.py index 3dddaf9..33c3c25 100644 --- a/goose3/configuration.py +++ b/goose3/configuration.py @@ -34,9 +34,21 @@ class ArticleContextPattern(object): + ''' Help ensure correctly generated article context patterns + + Args: + attr (str): The attribute type: class, id, etc + value (str): The value of the attribute + tag (str): The type of tag, such as `article` that contains the \ + main article body + domain (str): The domain to which this pattern pertains (optional) + Note: + Must provide, at a minimum, (attr and value) or (tag) ''' - ''' - def __init__(self, attr=None, value=None, tag=None, domain=None): + + __slots__ = ['attr', 'value', 'tag', 'domain'] + + def __init__(self, *, attr=None, value=None, tag=None, domain=None): if (not attr and not value) and not tag: raise Exception("`attr` and `value` must be set or `tag` must be set") self.attr = attr @@ -60,10 +72,25 @@ def __repr__(self): class PublishDatePattern(object): + ''' Ensure correctly formed publish date patterns; to be used in conjunction + with the configuration `known_publish_date_tags` property + + Args: + attr (str): The attribute type: class, id, etc + value (str): The value of the attribute + content (str): The name of another attribute (of the element) that \ + contains the value + subcontent (str): The name of a json object key (optional) + tag (str): The type of tag, such as `time` that contains the \ + publish date + domain (str): The domain to which this pattern pertains (optional) + Note: + Must provide, at a minimum, (attr and value) or (tag) + ''' __slots__ = ['attr', 'value', 'content', 'subcontent', 'tag', 'domain'] - def __init__(self, attr=None, value=None, content=None, subcontent=None, + def __init__(self, *, attr=None, value=None, content=None, subcontent=None, tag=None, domain=None): if (not attr and not value) and not tag: raise Exception("`attr` and `value` must be set or `tag` must be 
set") @@ -96,6 +123,18 @@ def __repr__(self): class AuthorPattern(object): + ''' Ensures that the author patterns are correctly formed for use with the + `known_author_patterns` of configuration + + Args: + attr (str): The attribute type: class, id, etc + value (str): The value of the attribute + content (str): The name of another attribute (of the element) that \ + contains the value + tag (str): The type of tag, such as `author` that contains the \ + author information + subpattern (str): A subpattern for elements within the main attribute + ''' __slots__ = ['attr', 'value', 'content', 'tag', 'subpattern'] @@ -276,7 +315,7 @@ def known_author_patterns(self, val): ''' def create_pat_from_dict(val): - '''Helper function used to create an PublishDatePattern from a dictionary + '''Helper function used to create an AuthorPattern from a dictionary ''' if "tag" in val: pat = AuthorPattern(tag=val["tag"]) @@ -293,15 +332,15 @@ def create_pat_from_dict(val): if isinstance(val, list): self._known_author_patterns = [ - x if isinstance(x, PublishDatePattern) else create_pat_from_dict(x) - for x in val - ] + self.known_author_patterns - elif isinstance(val, PublishDatePattern): + x if isinstance(x, AuthorPattern) else create_pat_from_dict(x) + for x in val + ] + self.known_author_patterns + elif isinstance(val, AuthorPattern): self._known_author_patterns.insert(0, val) elif isinstance(val, dict): self._known_author_patterns.insert(0, create_pat_from_dict(val)) else: - raise Exception("Unknown type: {}. Use a AuthorPattern.".format(type(val))) + raise Exception("Unknown type: {}. 
Use an AuthorPattern.".format(type(val))) @property def strict(self): From 4c1c1a5f1e1dce0a07e5003fd30bd6b1d38d3e75 Mon Sep 17 00:00:00 2001 From: Tyler Barrus Date: Sat, 3 Nov 2018 12:06:08 -0400 Subject: [PATCH 3/4] fix flake8 issue with \ --- goose3/extractors/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose3/extractors/schema.py b/goose3/extractors/schema.py index ed6b3d6..6791876 100644 --- a/goose3/extractors/schema.py +++ b/goose3/extractors/schema.py @@ -37,7 +37,7 @@ class SchemaExtractor(BaseExtractor): def extract(self): node = self.article.doc metas = self.parser.getElementsByTag(node, 'script', attr='type', - value='application/ld\+json') + value='application/ld\\+json') for meta in metas: try: content = json.loads(meta.text_content()) From 2dd3998b25be16932c802d487e25b1a43b3b9e93 Mon Sep 17 00:00:00 2001 From: Tyler Barrus Date: Sat, 3 Nov 2018 12:25:24 -0400 Subject: [PATCH 4/4] update docs --- docs/source/code.rst | 9 +++++++++ docs/source/quickstart.rst | 25 ++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/docs/source/code.rst b/docs/source/code.rst index 45f4b2b..2325a16 100644 --- a/docs/source/code.rst +++ b/docs/source/code.rst @@ -20,6 +20,15 @@ Configuration options to change how and what goose3 extracts and parses. .. autoclass:: goose3.Configuration :members: +Configuration Helper Classes +------------------------------------------------------------------------------- + +.. autoclass:: goose3.configuration.ArticleContextPattern + +.. autoclass:: goose3.configuration.AuthorPattern + +.. autoclass:: goose3.configuration.PublishDatePattern + .. 
_articledocs: diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 31a3d6d..0d5caef 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -103,7 +103,7 @@ one would like to change: with Goose(config) as g: pass -Or if there are few changes: +Or if there are only a few changes: :: from goose3 import Goose @@ -124,6 +124,29 @@ created: g.config.browser_user_agent = 'Mozilla 5.0' +Configuration Helper Classes +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +For some of the more complex configuration options, there are classes available to +help ensure that the correct values are provided. One does not need to use the +provided classes, but it does make things a bit simpler. + +:: + + from goose3 import Goose + from goose3.configuration import Configuration, ArticleContextPattern, PublishDatePattern, AuthorPattern + + config = Configuration() + + # we know of a particular article location in the site we are pulling from + config.known_context_patterns = ArticleContextPattern(attr="id", value="my-site-article") + + # publish date + config.known_publish_date_tags = PublishDatePattern(attr="id", value="pubdate", content="content") + + # author + config.known_author_patterns = AuthorPattern(attr="id", value="writer", content="content") + + Reading Results +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++