Skip to content

Commit

Permalink
Add --page-requisites-level
Browse files Browse the repository at this point in the history
Closes #151
  • Loading branch information
chfoo committed Jan 3, 2015
1 parent 4b8d0de commit f6fa0bf
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 5 deletions.
6 changes: 6 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ What's New
==========

* Fixed ``--page-requisites`` exceeding ``--level``.
* Fixed infinite page requisite recursion when using ``--span-hosts-allow page-requisites``.
* Added ``--page-requisites-level``. The default max recursion depth on page requisites is now 5.

Database Schema:

* URL ``inline`` column is now an integer.


0.1004.1 (2015-01-03)
Expand Down
1 change: 1 addition & 0 deletions wpull/app_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ def test_app_args(self):
'--bind-address', '127.0.0.1',
'--html-parser', 'html5lib',
'--link-extractors', 'html',
'--page-requisites-level', '5',
])
with cd_tempdir():
builder = Builder(args, unit_test=True)
Expand Down
7 changes: 5 additions & 2 deletions wpull/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,8 +480,11 @@ def _build_url_filters(self):
if args.tries:
filters.append(TriesFilter(args.tries))

if args.level and args.recursive:
filters.append(LevelFilter(args.level))
if args.level and args.recursive or args.page_requisites_level:
filters.append(
LevelFilter(args.level,
inline_max_depth=args.page_requisites_level)
)

if args.accept_regex or args.reject_regex:
filters.append(RegexFilter(args.accept_regex, args.reject_regex))
Expand Down
2 changes: 1 addition & 1 deletion wpull/coprocessor/phantomjs.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ def _new_url_record(self, url_info):
self._url_item.url_record.top_url,
None, # status_code
self._url_item.url_info.url, # referrer
True, # inline
1, # inline
None, # link_type
None, # post_data
None # filename
Expand Down
9 changes: 7 additions & 2 deletions wpull/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,13 @@ class URLRecord(_URLRecordType):
is typically the URL supplied at the start of the program.
status_code (int): The HTTP status code.
referrer (str): The parent URL that linked to this URL.
inline (bool): Whether this URL was an embedded object (such as an
inline (int): Whether this URL was an embedded object (such as an
image or a stylesheet) of the parent URL.
The value represents the recursive depth of the object. For
example, an iframe is depth 1 and the images in the iframe
are depth 2.
link_type (str): Describes the document type. Values are:
* ``html``: HTML document
Expand Down Expand Up @@ -190,7 +195,7 @@ def add_inline_url_infos(self, url_infos, link_type=None,
_logger.debug(__('Adding inline URLs {0}', inline_urls))
self._url_table.add_many(
inline_urls,
inline=True,
inline=self._url_record.inline or 0 + 1,
level=self._url_record.level + 1,
referrer=self._url_record.url,
top_url=self._url_record.top_url or self._url_record.url,
Expand Down
7 changes: 7 additions & 0 deletions wpull/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -1037,6 +1037,13 @@ def _add_recursive_args(self):
action='store_true',
help=_('download objects embedded in pages')
)
group.add_argument(
'--page-requisites-level',
metavar='NUMBER',
type=self.int_0_inf,
default=5,
help=_('limit page-requisites recursion depth to NUMBER')
)
# self.add_argument(
# '--strict-comments',
# action='store_true',
Expand Down

0 comments on commit f6fa0bf

Please sign in to comment.