Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Slots for configuration helper classes #96

Merged
merged 4 commits into from Nov 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/source/code.rst
Expand Up @@ -20,6 +20,15 @@ Configuration options to change how and what goose3 extracts and parses.
.. autoclass:: goose3.Configuration
:members:

Configuration Helper Classes
-------------------------------------------------------------------------------

.. autoclass:: goose3.configuration.ArticleContextPattern

.. autoclass:: goose3.configuration.AuthorPattern

.. autoclass:: goose3.configuration.PublishDatePattern


.. _articledocs:

Expand Down
25 changes: 24 additions & 1 deletion docs/source/quickstart.rst
Expand Up @@ -103,7 +103,7 @@ one would like to change:
with Goose(config) as g:
pass

Or if there are few changes:
Or if there are only a few changes:
::

from goose3 import Goose
Expand All @@ -124,6 +124,29 @@ created:
g.config.browser_user_agent = 'Mozilla 5.0'


Configuration Helper Classes
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
For some of the more complex configuration options, there are classes available to
help ensure that the correct values are provided. One does not need to use the
provided classes, but it does make things a bit simpler.

::

from goose3 import Goose
from goose3.configuration import Configuration, ArticleContextPattern, PublishDatePattern, AuthorPattern

config = Configuration()

# we know of a particular article location in the site we are pulling from
config.known_context_patterns = ArticleContextPattern(attr="id", value="my-site-article")

# publish date
config.known_publish_date_tags = PublishDatePattern(attr="id", value="pubdate", content="content")

# author
config.known_author_patterns = AuthorPattern(attr="id", value="writer", content="content")


Reading Results
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Expand Down
65 changes: 55 additions & 10 deletions goose3/configuration.py
Expand Up @@ -34,8 +34,21 @@


class ArticleContextPattern(object):
''' Help ensure correctly generated article context patterns

def __init__(self, attr=None, value=None, tag=None, domain=None):
Args:
attr (str): The attribute type: class, id, etc
value (str): The value of the attribute
tag (str): The type of tag, such as `article` that contains the \
main article body
domain (str): The domain to which this pattern pertains (optional)
Note:
Must provide, at a minimum, (attr and value) or (tag)
'''

__slots__ = ['attr', 'value', 'tag', 'domain']

def __init__(self, *, attr=None, value=None, tag=None, domain=None):
if (not attr and not value) and not tag:
raise Exception("`attr` and `value` must be set or `tag` must be set")
self.attr = attr
Expand All @@ -59,8 +72,25 @@ def __repr__(self):


class PublishDatePattern(object):

def __init__(self, attr=None, value=None, content=None, subcontent=None,
''' Ensure correctly formed publish date patterns; to be used in conjunction
with the configuration `known_publish_date_tags` property

Args:
attr (str): The attribute type: class, id, etc
value (str): The value of the attribute
content (str): The name of another attribute (of the element) that \
contains the value
subcontent (str): The name of a json object key (optional)
tag (str): The type of tag, such as `time` that contains the \
publish date
domain (str): The domain to which this pattern pertains (optional)
Note:
Must provide, at a minimum, (attr and value) or (tag)
'''

__slots__ = ['attr', 'value', 'content', 'subcontent', 'tag', 'domain']

def __init__(self, *, attr=None, value=None, content=None, subcontent=None,
tag=None, domain=None):
if (not attr and not value) and not tag:
raise Exception("`attr` and `value` must be set or `tag` must be set")
Expand Down Expand Up @@ -93,6 +123,20 @@ def __repr__(self):


class AuthorPattern(object):
''' Ensures that the author patterns are correctly formed for use with the
`known_author_patterns` property of the configuration

Args:
attr (str): The attribute type: class, id, etc
value (str): The value of the attribute
content (str): The name of another attribute (of the element) that \
contains the value
tag (str): The type of tag, such as `author` that contains the \
author information
subpattern (str): A subpattern for elements within the main attribute
'''

__slots__ = ['attr', 'value', 'content', 'tag', 'subpattern']

def __init__(self, *, attr=None, value=None, content=None, tag=None, subpattern=None):
if (not attr and not value) and not tag:
Expand Down Expand Up @@ -170,7 +214,8 @@ def known_context_patterns(self):

@known_context_patterns.setter
def known_context_patterns(self, val):
''' val must be a dictionary or list of dictionaries
''' val must be an ArticleContextPattern, a dictionary, or list of \
dictionaries
e.g., {'attr': 'class', 'value': 'my-article-class'}
or [{'attr': 'class', 'value': 'my-article-class'},
{'attr': 'id', 'value': 'my-article-id'}]
Expand Down Expand Up @@ -270,7 +315,7 @@ def known_author_patterns(self, val):
'''

def create_pat_from_dict(val):
'''Helper function used to create an PublishDatePattern from a dictionary
'''Helper function used to create an AuthorPattern from a dictionary
'''
if "tag" in val:
pat = AuthorPattern(tag=val["tag"])
Expand All @@ -287,15 +332,15 @@ def create_pat_from_dict(val):

if isinstance(val, list):
self._known_author_patterns = [
x if isinstance(x, PublishDatePattern) else create_pat_from_dict(x)
for x in val
] + self.known_author_patterns
elif isinstance(val, PublishDatePattern):
x if isinstance(x, AuthorPattern) else create_pat_from_dict(x)
for x in val
] + self.known_author_patterns
elif isinstance(val, AuthorPattern):
self._known_author_patterns.insert(0, val)
elif isinstance(val, dict):
self._known_author_patterns.insert(0, create_pat_from_dict(val))
else:
raise Exception("Unknown type: {}. Use a AuthorPattern.".format(type(val)))
raise Exception("Unknown type: {}. Use an AuthorPattern.".format(type(val)))

@property
def strict(self):
Expand Down
2 changes: 1 addition & 1 deletion goose3/extractors/schema.py
Expand Up @@ -37,7 +37,7 @@ class SchemaExtractor(BaseExtractor):
def extract(self):
node = self.article.doc
metas = self.parser.getElementsByTag(node, 'script', attr='type',
value='application/ld\+json')
value='application/ld\\+json')
for meta in metas:
try:
content = json.loads(meta.text_content())
Expand Down