builder: Use PickleStream to keep sorted URLInfos.
Closes #262
chfoo committed Apr 19, 2015
1 parent ecd0a90 commit 41571d5
Showing 3 changed files with 59 additions and 27 deletions.
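Background for the diff below: the builder previously collected the parsed input URLInfos in a shelve database keyed by URL, and shelve (a dbm-backed mapping) does not preserve insertion order, which matches the "Input URLs are not fetched in order" regression fixed here. The commit replaces it with wpull.util.GzipPickleStream over a single temporary file: URLInfos are pickled sequentially in 'wb' mode and replayed in the same order with iter_load(). The following is a minimal, simplified sketch of that pickle-stream idea, written for illustration only; it is not wpull's actual GzipPickleStream implementation:

    # Illustration only: a simplified stand-in for wpull's GzipPickleStream,
    # showing why a sequential pickle stream preserves input order while a
    # shelve/dbm mapping does not.
    import gzip
    import pickle
    import tempfile


    class PickleStreamSketch(object):
        '''Append-only pickle stream over a gzip-compressed file (sketch).'''

        def __init__(self, file, mode='rb'):
            # GzipFile wraps the file object and leaves it open on close().
            self._file = gzip.GzipFile(fileobj=file, mode=mode)

        def dump(self, obj):
            # Append one pickled object to the stream.
            pickle.dump(obj, self._file)

        def iter_load(self):
            # Yield objects in the order they were dumped, until EOF.
            while True:
                try:
                    yield pickle.load(self._file)
                except EOFError:
                    return

        def close(self):
            self._file.close()


    if __name__ == '__main__':
        with tempfile.NamedTemporaryFile(suffix='.pickle') as temp_file:
            writer = PickleStreamSketch(temp_file, mode='wb')

            for url in ('http://example.com/a', 'http://example.com/b'):
                writer.dump(url)

            writer.close()
            temp_file.seek(0)

            reader = PickleStreamSketch(temp_file, mode='rb')
            print(list(reader.iter_load()))  # order preserved: /a then /b
            reader.close()

In the diff below, the wpull.util.reset_file_offset context manager appears to serve the same purpose as the explicit seek(0) in this sketch, keeping the temporary file positioned at its start between the write and the later reads.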
1 change: 1 addition & 0 deletions doc/changelog.rst
@@ -12,6 +12,7 @@ Unreleased
 
 * Fixed: Connecting to sites with IPv4 & IPv6 support resulted in errors when IPv6 was not supported by the local network. Connections now use Happy Eyeballs Algorithm for IPv4 & IPv6 dual-stack support.
 * Fixed: SQLAlchemy error with PyPy and SQLAlchemy 1.0.
+* Fixed: Input URLs are not fetched in order. Regression since 1.1.
 * Changed: ``--prefer-family=none`` is now default.
 * Added: ``none`` as a choice to ``--prefer-family``.
 
83 changes: 57 additions & 26 deletions wpull/builder.py
@@ -8,7 +8,6 @@
 import itertools
 import logging
 import os.path
-import shelve
 import socket
 import ssl
 import sys
@@ -76,7 +75,7 @@
     BackwardFilenameFilter, ParentFilter,
     FollowFTPFilter)
 from wpull.urlrewrite import URLRewriter
-from wpull.util import ASCIIStreamWriter
+from wpull.util import ASCIIStreamWriter, GzipPickleStream
 from wpull.waiter import LinearWaiter
 from wpull.wrapper import CookieJarWrapper
 from wpull.writer import (NullWriter, OverwriteFileWriter,
@@ -161,10 +160,9 @@ def __init__(self, args, unit_test=False):
             'WebProcessorInstances': WebProcessorInstances,
             'YoutubeDlCoprocessor': YoutubeDlCoprocessor,
         })
-        self._input_urls_temp_dir = tempfile.TemporaryDirectory(
-            prefix='tmp-wpull', dir=os.getcwd())
-        self._input_urls_db = shelve.open(
-            os.path.join(self._input_urls_temp_dir.name, 'input_urls.db'))
+        self._input_urls_temp_file = tempfile.NamedTemporaryFile(
+            prefix='tmp-wpull-input_urls', dir=os.getcwd(),
+            suffix='.pickle')
         self._ca_certs_file = None
         self._file_log_handler = None
         self._console_log_handler = None
@@ -200,15 +198,32 @@ def build(self):
         resource_monitor = self._build_resource_monitor()
 
         self._build_demux_document_scraper()
-        for url_info in self._build_input_urls():
-            self._input_urls_db[url_info.url] = url_info
+
+        with wpull.util.reset_file_offset(self._input_urls_temp_file):
+            input_urls_pickle_stream = GzipPickleStream(
+                file=self._input_urls_temp_file, mode='wb'
+            )
+
+            for url_info in self._build_input_urls():
+                input_urls_pickle_stream.dump(url_info)
+
+            input_urls_pickle_stream.close()
+            del input_urls_pickle_stream
 
         statistics = self._factory.new('Statistics')
         statistics.quota = self._args.quota
 
         if self._args.quota:
-            for url_info in self._input_urls_db.values():
-                statistics.required_urls_db[url_info.url] = True
+            with wpull.util.reset_file_offset(self._input_urls_temp_file):
+                input_urls_pickle_stream = GzipPickleStream(
+                    file=self._input_urls_temp_file, mode='rb'
+                )
+
+                for url_info in input_urls_pickle_stream.iter_load():
+                    statistics.required_urls_db[url_info.url] = True
+
+                input_urls_pickle_stream.close()
+                del input_urls_pickle_stream
 
         url_table = self._build_url_table()
         processor = self._build_processor()
@@ -231,19 +246,24 @@ def build(self):
         self._warn_unsafe_options()
         self._warn_silly_options()
 
-        batch = []
+        with wpull.util.reset_file_offset(self._input_urls_temp_file):
+            batch = []
+            input_urls_pickle_stream = GzipPickleStream(
+                file=self._input_urls_temp_file, mode='rb'
+            )
+
+            for url_info in input_urls_pickle_stream.iter_load():
+                batch.append({'url': url_info.url})
+                if len(batch) > 1000:
+                    url_table.add_many(batch)
+                    batch = []
 
-        for url_info in self._input_urls_db.values():
-            batch.append({'url': url_info.url})
-            if len(batch) > 1000:
-                url_table.add_many(batch)
-                batch = []
+            url_table.add_many(batch)
 
-        url_table.add_many(batch)
+            input_urls_pickle_stream.close()
+            del input_urls_pickle_stream
 
-        self._input_urls_db.close()
-        self._input_urls_temp_dir.cleanup()
-        self._input_urls_temp_dir = None
+        self._input_urls_temp_file.close()
 
         return self._factory['Application']
 
@@ -554,15 +574,26 @@ def _build_url_filters(self):
             RecursiveFilter(
                 enabled=args.recursive, page_requisites=args.page_requisites
             ),
-            SpanHostsFilter(
-                (url_info for url_info in self._input_urls_db.values()),
-                enabled=args.span_hosts,
-                page_requisites='page-requisites' in args.span_hosts_allow,
-                linked_pages='linked-pages' in args.span_hosts_allow,
-            ),
             FollowFTPFilter(follow=args.follow_ftp),
         ]
 
+        with wpull.util.reset_file_offset(self._input_urls_temp_file):
+            input_urls_pickle_stream = GzipPickleStream(
+                file=self._input_urls_temp_file, mode='rb'
+            )
+
+            filters.append(
+                SpanHostsFilter(
+                    input_urls_pickle_stream.iter_load(),
+                    enabled=args.span_hosts,
+                    page_requisites='page-requisites' in args.span_hosts_allow,
+                    linked_pages='linked-pages' in args.span_hosts_allow,
+                )
+            )
+
+            input_urls_pickle_stream.close()
+            del input_urls_pickle_stream
+
         if args.no_parent:
             filters.append(ParentFilter())
 
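Each reader of the temporary file in builder.py above repeats the same steps: wrap the access in wpull.util.reset_file_offset, open a GzipPickleStream in 'rb' mode, iterate with iter_load(), then close and drop the stream. A hypothetical helper, not part of this commit, could bundle that pattern into a context manager; the sketch below only reuses the calls shown in the diff and assumes they behave as they are used there:

    # Hypothetical convenience wrapper, not part of this commit: bundles the
    # read pattern repeated in build() and _build_url_filters() above.
    import contextlib

    import wpull.util
    from wpull.util import GzipPickleStream


    @contextlib.contextmanager
    def iter_input_urls(input_urls_temp_file):
        with wpull.util.reset_file_offset(input_urls_temp_file):
            input_urls_pickle_stream = GzipPickleStream(
                file=input_urls_temp_file, mode='rb'
            )

            try:
                # Yields URLInfos back in the order they were dumped.
                yield input_urls_pickle_stream.iter_load()
            finally:
                input_urls_pickle_stream.close()


    # Usage sketch, mirroring the quota loop in build():
    #
    #     with iter_input_urls(self._input_urls_temp_file) as url_infos:
    #         for url_info in url_infos:
    #             statistics.required_urls_db[url_info.url] = True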
2 changes: 1 addition & 1 deletion wpull/urlfilter.py
@@ -225,7 +225,7 @@ def __init__(self, input_url_infos, enabled=False,
         self._page_requisites = page_requisites
         self._linked_pages = linked_pages
         self._base_urls = frozenset(
-            [url_info.hostname for url_info in input_url_infos]
+            url_info.hostname for url_info in input_url_infos
         )
 
     def test(self, url_info, url_table_record):
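The one-line change above drops the intermediate list inside frozenset(): the generator expression is consumed once to collect the base hostnames, which works equally well for the lazy iter_load() iterable now passed in from builder.py. A tiny illustration, not wpull code:

    # frozenset() accepts a generator directly; no intermediate list needed.
    hostnames = (host for host in ('example.com', 'example.com', 'example.org'))
    base_urls = frozenset(hostnames)
    assert base_urls == frozenset(['example.com', 'example.org'])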
