Skip to content

Commit

Permalink
extract: --skip-errors ignores corrupted chunks (w/ log message)
Browse files Browse the repository at this point in the history
  • Loading branch information
enkore committed Apr 11, 2016
1 parent 378140a commit 09b21b1
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 8 deletions.
37 changes: 31 additions & 6 deletions borg/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
parse_timestamp, to_localtime, format_time, format_timedelta, \
Manifest, Statistics, decode_dict, make_path_safe, StableDict, int_to_bigint, bigint_to_int, \
ProgressIndicatorPercent, ChunkIteratorFileWrapper, remove_surrogates, log_multi, DASHES, \
PathPrefixPattern, FnmatchPattern, open_item, file_status, format_file_size, consume
PathPrefixPattern, FnmatchPattern, open_item, file_status, format_file_size, consume, IntegrityError
from .repository import Repository
from .platform import acl_get, acl_set
from .chunker import Chunker
Expand Down Expand Up @@ -304,7 +304,7 @@ def add_file_chunks(chunks):
return stats

def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sparse=False,
hardlink_masters=None, original_path=None):
hardlink_masters=None, original_path=None, skip_integrity_errors=False):
"""
Extract archive item.
Expand All @@ -315,15 +315,18 @@ def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sp
:param sparse: write sparse files (chunk-granularity, independent of the original being sparse)
:param hardlink_masters: maps paths to (chunks, link_target) for extracting subtrees with hardlinks correctly
:param original_path: b'path' key as stored in archive
:param skip_integrity_errors: skip over corrupted chunks instead of raising IntegrityError (ignored for
dry_run and stdout)
"""

if dry_run or stdout:
if b'chunks' in item:
for data in self.pipeline.fetch_many([c[0] for c in item[b'chunks']], is_preloaded=True):
if stdout:
sys.stdout.buffer.write(data)
if stdout:
sys.stdout.buffer.flush()
return
return True

original_path = original_path or item[b'path']
dest = self.cwd
Expand Down Expand Up @@ -353,16 +356,36 @@ def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sp
os.unlink(path)
if not hardlink_masters:
os.link(source, path)
return
return True
item[b'chunks'], link_target = hardlink_masters[item[b'source']]
if link_target:
# Hard link was extracted previously, just link
os.link(link_target, path)
return
return True
# Extract chunks, since the item which had the chunks was not extracted
with open(path, 'wb') as fd:
ids = [c[0] for c in item[b'chunks']]
for data in self.pipeline.fetch_many(ids, is_preloaded=True):
chunk_index = -1
chunk_iterator = self.pipeline.fetch_many(ids, is_preloaded=True)
skipped_errors = False
while True:
try:
chunk_index += 1
data = next(chunk_iterator)
except StopIteration:
break
except IntegrityError as ie:
if not skip_integrity_errors:
raise
chunk_id, size, _ = item[b'chunks'][chunk_index]
chunk_id = hexlify(chunk_id).decode('ascii')
logger.warning('%s: chunk %s: %s', remove_surrogates(item[b'path']), chunk_id, ie)
fd.seek(size, 1)
skipped_errors = True
# restart chunk data generator
ids = [c[0] for c in item[b'chunks'][chunk_index + 1:]]
chunk_iterator = self.pipeline.fetch_many(ids, is_preloaded=True)
continue
if sparse and self.zeros.startswith(data):
# all-zero chunk: create a hole in a sparse file
fd.seek(len(data), 1)
Expand All @@ -375,6 +398,7 @@ def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sp
if hardlink_masters:
# Update master entry with extracted file path, so that following hardlinks don't extract twice.
hardlink_masters[item.get(b'source') or original_path] = (None, path)
return not skipped_errors
elif stat.S_ISDIR(mode):
if not os.path.exists(path):
os.makedirs(path)
Expand All @@ -401,6 +425,7 @@ def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sp
self.restore_attrs(path, item)
else:
raise Exception('Unknown archive item type %r' % item[b'mode'])
return True

def restore_attrs(self, path, item, symlink=False, fd=None):
xattrs = item.get(b'xattrs', {})
Expand Down
11 changes: 9 additions & 2 deletions borg/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,7 @@ def do_extract(self, args, repository, manifest, key, archive):

output_list = args.output_list
dry_run = args.dry_run
skip_errors = args.skip_errors
stdout = args.stdout
sparse = args.sparse
strip_components = args.strip_components
Expand Down Expand Up @@ -420,8 +421,10 @@ def item_is_hardlink_master(item):
dirs.append(item)
archive.extract_item(item, restore_attrs=False)
else:
archive.extract_item(item, stdout=stdout, sparse=sparse, hardlink_masters=hardlink_masters,
original_path=orig_path)
if not archive.extract_item(item, stdout=stdout, sparse=sparse,
hardlink_masters=hardlink_masters, original_path=orig_path,
skip_integrity_errors=skip_errors):
self.exit_code = EXIT_WARNING
except OSError as e:
self.print_warning('%s: %s', remove_surrogates(orig_path), e)

Expand Down Expand Up @@ -1359,6 +1362,10 @@ def build_parser(self, args=None, prog=None):
subparser.add_argument('--sparse', dest='sparse',
action='store_true', default=False,
help='create holes in output sparse file from all-zero chunks')
subparser.add_argument('--skip-errors', dest='skip_errors',
action='store_true', default=False,
help='skip corrupted chunks with a log message (exit 1) instead of aborting (no effect '
'for --dry-run and --stdout)')
subparser.add_argument('location', metavar='ARCHIVE',
type=location_validator(archive=True),
help='archive to extract')
Expand Down
16 changes: 16 additions & 0 deletions borg/testsuite/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,22 @@ def test_overwrite(self):
with changedir('output'):
self.cmd('extract', self.repository_location + '::test', exit_code=1)

def test_extract_ignore_error(self):
    """Extraction with --skip-errors warns about a corrupted chunk but keeps going.

    Creates a file large enough to span several small chunks, flips bytes
    inside one repository segment, then checks that extraction emits a
    per-chunk warning, still produces a file of the full original size,
    and that both extract and check report failure via exit code 1.
    """
    # Two distinct byte runs so the tiny chunker params split the file
    # into multiple chunks (280 + 280 = 560 bytes total).
    source_file = os.path.join(self.input_path, 'file1')
    with open(source_file, 'wb') as f:
        f.write(b'a' * 280)
        f.write(b'b' * 280)
    self.cmd('init', self.repository_location)
    self.cmd('create', '--chunker-params', '7,9,8,128', self.repository_location + '::test', 'input')
    # Corrupt the newest segment file in the repository data store.
    segment_dir = os.path.join(self.tmpdir, 'repository', 'data', '0')
    newest_segment = sorted(os.listdir(segment_dir), reverse=True)[0]
    with open(os.path.join(segment_dir, newest_segment), 'r+b') as f:
        f.seek(100)
        f.write(b'XXXX')
    with changedir('output'):
        output = self.cmd('extract', '--skip-errors', self.repository_location + '::test', exit_code=1)
    # A warning naming the file and the bad chunk must have been logged.
    assert 'input/file1: chunk' in output
    # The skipped chunk leaves a hole, but the file size is preserved.
    assert os.stat('input/file1').st_size == 560
    self.cmd('check', self.repository_location, exit_code=1)

def test_rename(self):
self.create_regular_file('file1', size=1024 * 80)
self.create_regular_file('dir2/file2', size=1024 * 80)
Expand Down

0 comments on commit 09b21b1

Please sign in to comment.