Skip to content

Commit

Permalink
recorder.warc: Don't append to existing sequential WARC files.
Browse files Browse the repository at this point in the history
Stop appending to existing sequential WARC files and then checking the
file size to force the sequence number to increment. Instead, first find
the lowest sequence number whose file does not yet exist.

Re #253
  • Loading branch information
chfoo committed Apr 1, 2015
1 parent 6335f27 commit ba8181e
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 10 deletions.
2 changes: 2 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ Unreleased
==========

* Fixed: ``--regex-type`` to accept ``pcre`` instead of ``posix``. Regular expressions always use Python's regex library. Posix regex is not supported.
* Fixed: when using ``--warc-max-size`` and ``--warc-append``, it wrote to existing sequential WARC files unnecessarily.
* Changed: when using ``--warc-max-size`` and ``--warc-append``, the next sequential WARC file is created to avoid appending to corrupt files.
* Added: Open Graph and Twitter Card element links extraction.


Expand Down
36 changes: 26 additions & 10 deletions wpull/recorder/warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,30 @@ def __init__(self, filename, params=None):
self._start_new_cdx_file()

def _start_new_warc_file(self, meta=False):
    '''Create and set as current WARC file.

    When a maximum size is configured, the recorder is appending, and
    the file is not the meta file, sequence numbers whose file already
    exists on disk are skipped so that an existing file is never
    reused. In every other case the generated filename is used as is.

    The file is truncated unless the recorder is appending, and a new
    warcinfo record is then populated and written.

    Args:
        meta (bool): Forwarded to the filename generator; also disables
            the skip-existing search.
    '''
    skip_existing = (
        self._params.max_size and not meta and self._params.appending
    )

    if skip_existing:
        # Advance the sequence number until the generated name is unused.
        self._warc_filename = self._generate_warc_filename()

        while os.path.exists(self._warc_filename):
            _logger.debug(__('Skip {0}', self._warc_filename))
            self._sequence_num += 1
            self._warc_filename = self._generate_warc_filename()
    else:
        self._warc_filename = self._generate_warc_filename(meta=meta)

    _logger.debug(__('WARC file at {0}', self._warc_filename))

    if not self._params.appending:
        wpull.util.truncate_file(self._warc_filename)

    # Every file started here gets its own warcinfo record.
    self._warcinfo_record = WARCRecord()
    self._populate_warcinfo(self._params.extra_fields)
    self.write_record(self._warcinfo_record)

def _generate_warc_filename(self, meta=False):
'''Return a suitable WARC filename.'''
if self._params.max_size is None:
sequence_name = ''
elif meta:
Expand All @@ -111,20 +135,12 @@ def _start_new_warc_file(self, meta=False):
else:
extension = 'warc'

self._warc_filename = '{0}{1}.{2}'.format(
return '{0}{1}.{2}'.format(
self._prefix_filename, sequence_name, extension
)

_logger.debug(__('WARC file at {0}', self._warc_filename))

if not self._params.appending:
wpull.util.truncate_file(self._warc_filename)

self._warcinfo_record = WARCRecord()
self._populate_warcinfo(self._params.extra_fields)
self.write_record(self._warcinfo_record)

def _start_new_cdx_file(self):
'''Create and set current CDX file.'''
self._cdx_filename = '{0}.cdx'.format(self._prefix_filename)

if not self._params.appending:
Expand Down
49 changes: 49 additions & 0 deletions wpull/recorder/warc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,3 +522,52 @@ def test_warc_move_max_size(self):
self.assertTrue(os.path.exists('./blah/asdf-00001.warc'))
self.assertTrue(os.path.exists('./blah/asdf-meta.warc'))
self.assertTrue(os.path.exists('./blah/' + cdx_filename))

def test_warc_max_size_and_append(self):
    '''Existing sequential WARC files must be left alone when appending.

    Two empty files are placed at sequence numbers 0 and 1 before the
    recorder starts. After one request/response session with
    ``max_size=1`` and ``appending=True``, those files must still be
    empty, while sequence files 2 and 3 and the meta file must exist
    and contain data.
    '''
    file_prefix = 'asdf'
    preexisting = ('asdf-00000.warc', 'asdf-00001.warc')
    expected_new = ('asdf-00002.warc', 'asdf-00003.warc', 'asdf-meta.warc')

    # Occupy the first two sequence numbers with empty placeholder files.
    for placeholder in preexisting:
        with open(placeholder, 'w'):
            pass

    params = WARCRecorderParams(
        compress=False,
        max_size=1,
        appending=True
    )
    warc_recorder = WARCRecorder(file_prefix, params=params)

    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)

    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'BLAH')

    # Drive a full request/response exchange through the recorder.
    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)

    warc_recorder.close()

    for filename in preexisting + expected_new:
        self.assertTrue(os.path.exists(filename))

    # The placeholders were skipped, so nothing was written to them.
    for filename in preexisting:
        self.assertEqual(0, os.path.getsize(filename))

    for filename in expected_new:
        self.assertNotEqual(0, os.path.getsize(filename))

0 comments on commit ba8181e

Please sign in to comment.