Skip to content

Commit

Permalink
recorder.warc: Raise OSError if journal files are found.
Browse files Browse the repository at this point in the history
Closes #253
  • Loading branch information
chfoo committed Apr 2, 2015
1 parent 3b15fcb commit 76cf49c
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 0 deletions.
1 change: 1 addition & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Unreleased
* Fixed: ``--regex-type`` to accept ``pcre`` instead of ``posix``. Regular expressions always use Python's regex library. Posix regex is not supported.
* Fixed: when using ``--warc-max-size`` and ``--warc-append``, it wrote to existing sequential WARC files unnecessarily.
* Changed: when using ``--warc-max-size`` and ``--warc-append``, the next sequential WARC file is created to avoid appending to corrupt files.
* Changed: WARC file writing to use journal files and refuse to start the program if any journals exist. This avoids corrupting files through naive use of ``--warc-append`` and allows for future automated recovery.
* Added: Open Graph and Twitter Card element links extraction.


Expand Down
12 changes: 12 additions & 0 deletions wpull/recorder/warc.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import glob
from tempfile import NamedTemporaryFile
import contextlib
import gettext
Expand Down Expand Up @@ -90,6 +91,8 @@ def __init__(self, filename, params=None):
self._warc_filename = None
self._cdx_filename = None

self._check_journals_and_maybe_raise()

if params.log:
self._setup_log()

Expand All @@ -98,6 +101,13 @@ def __init__(self, filename, params=None):
if self._params.cdx:
self._start_new_cdx_file()

def _check_journals_and_maybe_raise(self):
    '''Raise an error if any journal file exists for this recorder.

    A journal file is any path that begins with ``self._prefix_filename``
    and ends with ``-wpullinc``.

    Raises:
        OSError: At least one journal file was found. The message names
            the first match returned by the glob.
    '''
    # Escape the prefix so that glob metacharacters in a user-supplied
    # filename (``*``, ``?``, ``[``) are matched literally. Without this,
    # a prefix such as ``site[1]`` is treated as a character class and
    # the journal files that actually exist on disk are never found.
    pattern = glob.escape(self._prefix_filename) + '*-wpullinc'
    journals = glob.glob(pattern)

    if journals:
        raise OSError('WARC file {} is incomplete.'.format(journals[0]))

def _start_new_warc_file(self, meta=False):
'''Create and set as current WARC file.'''
if self._params.max_size and not meta and self._params.appending:
Expand Down Expand Up @@ -253,6 +263,8 @@ def write_record(self, record):
else:
open_func = open

# Use getsize to get actual file size. Avoid tell() because it may
# not be the raw file position.
if os.path.exists(self._warc_filename):
before_offset = os.path.getsize(self._warc_filename)
else:
Expand Down
15 changes: 15 additions & 0 deletions wpull/recorder/warc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,21 @@ def __iter__(self):

self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))

def test_warc_recorder_journal_raise_error(self):
    '''Constructing a recorder must fail when a journal file exists.'''
    warc_prefix = 'asdf'
    warc_filename = 'asdf.warc'
    journal_filename = warc_filename + '-wpullinc'

    # Create an empty journal file next to where the WARC would go.
    with open(journal_filename, 'w'):
        pass

    with self.assertRaises(OSError):
        recorder_params = WARCRecorderParams(compress=False)
        WARCRecorder(warc_prefix, params=recorder_params)

def test_cdx_dedup(self):
url_table = URLTable()
warc_recorder = WARCRecorder(
Expand Down

0 comments on commit 76cf49c

Please sign in to comment.