Skip to content

Commit

Permalink
recorder/warc_test: Validate WARCs with Warcat.
Browse files Browse the repository at this point in the history
  • Loading branch information
chfoo committed Jan 5, 2015
1 parent 71d26aa commit a65df06
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ install:
- if [[ $TRAVIS_PYTHON_VERSION != 'pypy3' ]]; then
pip install git+https://github.com/bastibe/lunatic-python.git@f3f68d2c8638c6f423912264aeb750f5f7eb14ee#egg=lunatic-python;
fi
- pip install coverage python-coveralls
- pip install coverage python-coveralls warcat


# command to run tests
Expand Down
43 changes: 43 additions & 0 deletions wpull/recorder/warc_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import logging
import os.path
import subprocess
import sys
import re

from wpull.body import Body
from wpull.database.sqltable import URLTable
Expand All @@ -15,6 +18,28 @@


class TestWARC(BaseRecorderTest):

def validate_warc(self, filename, ignore_minor_error=False):
    """Validate a WARC file by running ``warcat verify`` in a subprocess.

    The tool is run as ``<current interpreter> -m warcat verify <filename>``
    and its stderr and stdout are captured and combined (stderr first).

    Args:
        filename: Path of the WARC file to verify.
        ignore_minor_error: If false, any non-zero exit status is a
            failure. If true, a non-zero exit status is tolerated only
            when the output contains ``VerifyProblem:`` reports and none
            of them matches the "major" pattern below.

    Raises:
        Exception: If validation fails. The message contains the combined
            output of the tool.
    """
    proc = subprocess.Popen(
        [sys.executable, '-m', 'warcat', 'verify', filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    # communicate() drains both pipes and waits for the process to exit,
    # so returncode is set afterwards.
    stdout_data, stderr_data = proc.communicate()

    output = (stderr_data + stdout_data).decode('utf8', 'replace')

    if not proc.returncode:
        return

    if not ignore_minor_error:
        raise Exception('Validation failed {}'.format(output))

    # A report line ending in " True)" is presumably a VerifyProblem whose
    # "major" flag is set (TODO confirm against warcat); any "...Error:"
    # line indicates some other exception was reported.
    has_major_problem = re.search(
        r'(VerifyProblem:.+ True\))|(.+Error:)', output
    )

    # If the tool failed without reporting any VerifyProblem at all (for
    # example, the warcat module could not be run), the failure was not a
    # minor verification finding and must not be silently ignored.
    # NOTE(review): assumes minor problems are also printed with a
    # "VerifyProblem:" prefix - confirm against warcat's output.
    has_problem_report = 'VerifyProblem:' in output

    if has_major_problem or not has_problem_report:
        raise Exception('Validation failed\n{}'.format(output))

def test_warc_recorder(self):
file_prefix = 'asdf'
warc_filename = 'asdf.warc'
Expand Down Expand Up @@ -124,6 +149,8 @@ def test_warc_recorder(self):

self.assertIn(b'KITTEH DOGE', data)

self.validate_warc(warc_filename)

def test_warc_recorder_ftp(self):
file_prefix = 'asdf'
warc_filename = 'asdf.warc'
Expand Down Expand Up @@ -190,6 +217,18 @@ def test_warc_recorder_ftp(self):
self.assertIn(b'> GIMMEH example.txt', warc_file_content)
self.assertIn(b'< 200 OK, no need to yell.', warc_file_content)

# Ignore the minor "Concurrent Record ID not seen yet" problem
self.validate_warc(warc_filename, ignore_minor_error=True)

with open(warc_filename, 'r+b') as in_file:
# Intentionally modify the contents
in_file.seek(355)
in_file.write(b'f')

with self.assertRaises(Exception):
# Sanity check that it actually raises an error on a bad digest
self.validate_warc(warc_filename, ignore_minor_error=True)

def test_warc_recorder_max_size(self):
file_prefix = 'asdf'
cdx_filename = 'asdf.cdx'
Expand Down Expand Up @@ -274,6 +313,10 @@ def test_warc_recorder_max_size(self):

self.assertIn(b'FINISHED', meta_file_content)

self.validate_warc('asdf-00000.warc')
self.validate_warc('asdf-00001.warc')
self.validate_warc('asdf-meta.warc')

def test_warc_recorder_rollback(self):
warc_filename = 'asdf.warc'

Expand Down

0 comments on commit a65df06

Please sign in to comment.