From 9d5c1c9f203a207da8e5b515259850c17a3893bb Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Thu, 27 Oct 2022 19:03:44 +0200 Subject: [PATCH 01/12] Prevent duplicate or overlapping blocks (DIS-1134) --- dissect/evidence/asdf/asdf.py | 361 ++++++++++++++++++-------- dissect/evidence/asdf/streams.py | 61 +++-- dissect/evidence/tools/asdf/dd.py | 53 ++-- dissect/evidence/tools/asdf/verify.py | 46 ++-- tests/test_asdf.py | 146 +++++++++++ 5 files changed, 487 insertions(+), 180 deletions(-) create mode 100644 tests/test_asdf.py diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index ec8e0d1..eb781ba 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -1,22 +1,26 @@ # Acquire Snapshot Data Format -from __future__ import print_function +from __future__ import annotations -import io import gzip -import uuid +import io import shutil import tarfile +import uuid from bisect import bisect_right -from zlib import crc32 from collections import defaultdict +from typing import BinaryIO, Callable, Iterator, Optional, Tuple from dissect import cstruct from dissect.util import ts from dissect.util.stream import AlignedStream, RangeStream -from dissect.evidence.exceptions import InvalidSnapshot, UnsupportedVersion, InvalidBlock -from dissect.evidence.asdf.streams import HashedStream, CompressedStream, SubStreamBase +from dissect.evidence.asdf.streams import CompressedStream, Crc32Stream, HashedStream +from dissect.evidence.exceptions import ( + InvalidBlock, + InvalidSnapshot, + UnsupportedVersion, +) VERSION = 1 DEFAULT_BLOCK_SIZE = 4096 @@ -40,7 +44,6 @@ flag BLOCK_FLAG : uint8 { CRC32 = 0x01, COMPRESS = 0x02, - SHADOW = 0x04, }; struct header { @@ -58,18 +61,18 @@ BLOCK_FLAG flags; // Block flags uint8 idx; // Stream index, some reserved values have special meaning char reserved[2]; // Reserved - uint64 offset; // Absolute disk offset - uint64 size; // Size of block (on disk, not in file) + uint64 offset; // Absolute offset of block in stream + uint64 size; // Size of block in stream }; -struct table_entry = { +struct table_entry { BLOCK_FLAG flags; // Block flags uint8 idx; // Stream index, some reserved values have special meaning char reserved[2]; // Reserved - uint64 offset; // Absolute disk offset - uint64 size; // Size of block (on disk, not in file) - uint64 file_offset; // Offset in file to this block - uint64 file_size; // Size of block in this file + uint64 offset; // Absolute offset of block in stream + uint64 size; // Size of block in stream + uint64 file_offset; // Absolute offset of block in file + uint64 file_size; // Size of block in file }; struct footer { @@ -90,24 +93,23 @@ class AsdfWriter(io.RawIOBase): - Maximum source disk size is ~16EiB - Maximum number of disks is 254 - There's no cleverness here. Just writing blocks. We don't sort/"defrag" - or prevent dupes on purpose. This is to make the process of writing - these files as "lightweight" as possible. The decision to offload all - heavy lifting to the readers is because writers are usually low power - clients, whereas the readers are usually high power servers. - Some things are currently hardcoded (like SHA256), although they may become variable in the future. Args: fh: File-like object to write to. guid: Unique identifier. Used to link images to writers. - block_size: The block size to use for storing data. + compress: Write gzip compressed file. block_crc: Flag to store a CRC32 after each block. 
- block_compress: Flag to compress blocks using zlib. """ - def __init__(self, fh, guid=None, compress=False, block_crc=True): + def __init__( + self, + fh: BinaryIO, + guid: uuid.UUID = None, + compress: bool = False, + block_crc: bool = True, + ): self._fh = fh self.fh = self._fh @@ -119,9 +121,10 @@ def __init__(self, fh, guid=None, compress=False, block_crc=True): # Options self.block_crc = block_crc - self.block_compress = False # Hard code this for now + self.block_compress = False # Disabled for now - self._table = [] + self._table = defaultdict(list) + self._table_lookup = defaultdict(list) self._table_offset = 0 self._meta_buf = io.BytesIO() @@ -129,7 +132,14 @@ def __init__(self, fh, guid=None, compress=False, block_crc=True): self._write_header() - def add_metadata(self, path, fh, size=None): + def add_metadata(self, path: str, fh: BinaryIO, size: Optional[int] = None) -> None: + """Add a file to the metadata. + + Args: + path: The path in the metadata tar to write to. + fh: The file-like object to write. + size: Optional size to write. + """ info = self._meta_tar.tarinfo() info.name = path info.uname = "root" @@ -144,13 +154,23 @@ def add_metadata(self, path, fh, size=None): fh.seek(0) self._meta_tar.addfile(info, fh) - def copy_bytes(self, source, offset, num_bytes, idx=0, base=0): + def add_bytes(self, data: bytes, idx: int = 0, base: int = 0) -> None: + """Copy some bytes into this snapshot. + + Args: + data: The bytes to copy. + idx: The stream index. + base: The base offset. + """ + self._write_block(io.BytesIO(data), 0, len(data), idx=idx, base=base) + + def copy_bytes(self, source: BinaryIO, offset: int, num_bytes: int, idx: int = 0, base: int = 0) -> None: """Copy some bytes from the source file-like object into this snapshot. Often the source will be a volume on a disk, which is usually represented - as a relative stream. If this is the case, use the `base` argument to + as a relative stream. If this is the case, use the ``base`` argument to indicate what the byte offset of the source is, relative to the start - of the disk. The `offset` argument is always the offset in the + of the disk. The ``offset`` argument is always the offset in the source, so that is not affected. Args: @@ -158,16 +178,24 @@ def copy_bytes(self, source, offset, num_bytes, idx=0, base=0): offset: The byte offset into the source to start copying bytes from. num_bytes: The amount of bytes to copy. idx: The stream index, if copying from multiple disks. - base: The base offset, if the source is a relative stream from a disk. + base: The base offset, if the source is a relative stream from e.g. a disk. """ self._write_block(source, offset, num_bytes, idx=idx, base=base) - def copy_block(self, source, offset, num_blocks, block_size=None, idx=0, base=0): + def copy_block( + self, + source: BinaryIO, + offset: int, + num_blocks: int, + block_size: Optional[int] = None, + idx: int = 0, + base: int = 0, + ) -> None: """Copy some blocks in the given block size into this snapshot. If no block size is given, the ASDF native block size is used. This is really just a convenience method that does the block multiplication - before calling `copy_bytes`. + before calling ``copy_bytes``. Args: source: The source file-like object to copy the blocks from. @@ -175,12 +203,19 @@ def copy_block(self, source, offset, num_blocks, block_size=None, idx=0, base=0) num_blocks: The amount of blocks to copy. block_size: The size of each block. idx: The stream index, if copying from multiple disks. 
- base: The base offset, if the source is a relative stream from a disk. + base: The base offset, if the source is a relative stream from e.g. a disk. """ block_size = block_size or DEFAULT_BLOCK_SIZE return self.copy_bytes(source, offset, num_blocks * block_size, idx, base) - def copy_runlist(self, source, runlist, runlist_block_size, idx=0, base=0): + def copy_runlist( + self, + source: BinaryIO, + runlist: list[Tuple[Optional[int], int]], + runlist_block_size: int, + idx: int = 0, + base: int = 0, + ) -> None: """Copy a runlist of blocks in the given block size into this snapshot. A runlist must be a list of tuples, where: @@ -194,7 +229,7 @@ def copy_runlist(self, source, runlist, runlist_block_size, idx=0, base=0): runlist: The runlist that describes the blocks. runlist_block_size: The size of each block. idx: The stream index, if copying from multiple disks. - base: The base offset, if the source is a relative stream from a disk. + base: The base offset, if the source is a relative stream from e.g. a disk. """ for run_offset, run_length in runlist: # If run_offset is None, it's a sparse run @@ -204,7 +239,7 @@ def copy_runlist(self, source, runlist, runlist_block_size, idx=0, base=0): # Save a function call by directly calling copy_bytes instead of copy_block. self.copy_bytes(source, run_offset * runlist_block_size, run_length * runlist_block_size, idx, base) - def close(self): + def close(self) -> None: """Close the ASDF file. Writes the block table and footer, then closes the destination file-like object. @@ -216,7 +251,7 @@ def close(self): self._write_footer() self.fh.close() - def _write_header(self): + def _write_header(self) -> None: """Write the ASDF header to the destination file-like object.""" header = c_asdf.header( magic=FILE_MAGIC, @@ -227,7 +262,7 @@ def _write_header(self): ) header.write(self.fh) - def _write_block(self, source, offset, size, idx=0, base=0): + def _write_block(self, source: BinaryIO, offset: int, size: int, idx: int = 0, base: int = 0) -> None: """Write an ASDF block to the destination file-like object. Args: @@ -237,6 +272,20 @@ def _write_block(self, source, offset, size, idx=0, base=0): idx: The stream index, if copying from multiple disks. base: The base offset, if the source is a relative stream from a disk. 
""" + absolute_offset = base + offset + + lookup_table = self._table_lookup[idx] + entry_table = self._table[idx] + + table_idx, absolute_offset, size = _table_fit( + absolute_offset, size, entry_table, lookup_table, lambda e: (e[2], e[3]) + ) + + if table_idx is None: + return + + offset = absolute_offset - base + # Setup the block flags and block writer flags = 0 outfh = self.fh @@ -248,7 +297,6 @@ def _write_block(self, source, offset, size, idx=0, base=0): flags |= c_asdf.BLOCK_FLAG.COMPRESS block_offset = self.fh.tell() # Block header location - absolute_offset = base + offset block = c_asdf.block( magic=BLOCK_MAGIC, flags=flags, @@ -265,30 +313,34 @@ def _write_block(self, source, offset, size, idx=0, base=0): outfh.finalize() data_size = self.fh.tell() - data_offset - self._table.append((flags, idx, absolute_offset, size, block_offset, data_size)) - def _write_meta(self): + lookup_table.insert(table_idx, absolute_offset) + entry_table.insert(table_idx, (flags, idx, absolute_offset, size, block_offset, data_size)) + + def _write_meta(self) -> None: + """Write the metadata tar to the destination file-like object.""" self._meta_tar.close() size = self._meta_buf.tell() self._meta_buf.seek(0) self.copy_bytes(self._meta_buf, 0, size, idx=IDX_METADATA) - def _write_table(self): + def _write_table(self) -> None: """Write the ASDF block table to the destination file-like object.""" self._table_offset = self.fh.tell() - for flags, idx, offset, size, file_offset, file_size in self._table: - table_entry = c_asdf.table_entry( - flags=flags, - idx=idx, - offset=offset, - size=size, - file_offset=file_offset, - file_size=file_size, - ) - table_entry.write(self.fh) - - def _write_footer(self): + for stream_table in self._table.values(): + for flags, idx, offset, size, file_offset, file_size in stream_table: + table_entry = c_asdf.table_entry( + flags=flags, + idx=idx, + offset=offset, + size=size, + file_offset=file_offset, + file_size=file_size, + ) + table_entry.write(self.fh) + + def _write_footer(self) -> None: """Write the ASDF footer to the destination file-like object.""" footer = c_asdf.footer( magic=FOOTER_MAGIC, @@ -305,7 +357,7 @@ class AsdfSnapshot: fh: File-like object to read the ASDF file from. 
""" - def __init__(self, fh): + def __init__(self, fh: BinaryIO): self.fh = fh self.header = c_asdf.header(fh) if self.header.magic != FILE_MAGIC: @@ -314,32 +366,63 @@ def __init__(self, fh): if self.header.version > VERSION: raise UnsupportedVersion("higher version") + self.timestamp = ts.from_unix(self.header.timestamp) + self.guid = uuid.UUID(bytes_le=self.header.guid) + + self.table: dict[list[Tuple[int, int, int, int]]] = defaultdict(list) + self._table_lookup: dict[list[int]] = defaultdict(list) + self.fh.seek(-len(c_asdf.footer), io.SEEK_END) - footer_offset = self.fh.tell() + self.footer_offset = self.fh.tell() self.footer = c_asdf.footer(self.fh) if self.footer.magic != FOOTER_MAGIC: raise InvalidSnapshot("invalid footer magic") - self.timestamp = ts.from_unix(self.header.timestamp) - self.guid = uuid.UUID(bytes_le=self.header.guid) - self.table = defaultdict(list) - self._table_lookup = defaultdict(list) + self.metadata = Metadata(self) + def _parse_block_table(self): + """Parse the block table, getting rid of overlapping blocks.""" table_offset = self.footer.table_offset - table_size = (footer_offset - table_offset) // len(c_asdf.table_entry) + table_size = self.footer_offset - table_offset + table_count = table_size // len(c_asdf.table_entry) self.fh.seek(table_offset) - for _ in range(table_size): - entry = c_asdf.table_entry(self.fh) - stream_idx = entry.idx - lookup_idx = bisect_right(self._table_lookup[stream_idx], entry.offset) - self._table_lookup[stream_idx].insert(lookup_idx, entry.offset) - self.table[stream_idx].insert(lookup_idx, entry) + table_data = io.BytesIO(self.fh.read(table_size)) - self.metadata = Metadata(self) + for _ in range(table_count): + entry = c_asdf.table_entry(table_data) + self._table_insert(entry.idx, entry.offset, entry.size, entry.file_offset) + + + def _table_insert(self, idx: int, offset: int, size: int, file_offset: int): + stream_idx = idx + entry_data_offset = file_offset + len(c_asdf.block) + + lookup_table = self._table_lookup[stream_idx] + entry_table = self.table[stream_idx] - def contains(self, idx): + table_idx, entry_offset, entry_size = _table_fit( + offset, size, entry_table, lookup_table, lambda e: (e[0], e[1]) + ) + + if table_idx is None: + return + + entry_data_offset += entry_offset - offset + + lookup_table.insert(table_idx, entry_offset) + entry_table.insert( + table_idx, + ( + entry_offset, + entry_size, + file_offset, + entry_data_offset, + ), + ) + + def contains(self, idx: int) -> bool: """Check whether this file contains the given stream index. Args: @@ -347,7 +430,7 @@ def contains(self, idx): """ return idx in self.table - def open(self, idx): + def open(self, idx: int) -> AsdfStream: """Open a specific stream in the file. Args: @@ -357,12 +440,12 @@ def open(self, idx): raise IndexError(f"invalid stream idx: {idx}") return AsdfStream(self, idx) - def streams(self): + def streams(self) -> AsdfStream: """Iterate over all streams in the file.""" for i in sorted(self.table.keys()): yield self.open(i) - def disks(self): + def disks(self) -> AsdfStream: """Iterate over all non-reserved streams in the file.""" for i in sorted(self.table.keys()): if i in RESERVED_IDX: @@ -371,32 +454,43 @@ def disks(self): class Metadata: - def __init__(self, asdf): + """ASDF metadata reader. + + Thin wrapper around ``tarfile``. + + Args: + asdf: The :class:`AsdfSnapshot` to open the metadata of. 
+ """ + + def __init__(self, asdf: AsdfSnapshot): self.tar = None if IDX_METADATA in asdf.table: self.tar = tarfile.open(fileobj=asdf.open(IDX_METADATA), mode="r") - def names(self): + def names(self) -> list[str]: + """Return all metadata file entries.""" return self.tar.getnames() if self.tar else [] - def members(self): + def members(self) -> list[tarfile.TarInfo]: + """Return all metadata :class:`tarfile.TarInfo` entries.""" return self.tar.getmembers() if self.tar else [] - def open(self, path): + def open(self, path: str) -> BinaryIO: + """Open a metadata entry and return a binary file-like object.""" if self.tar: return self.tar.extractfile(path) raise KeyError(f"filename '{path}' not found") class AsdfStream(AlignedStream): - """Asdf stream from a snapshot. + """ASDF stream from a snapshot. Args: - asdf: AsdfFile parent object. - idx: Stream index in the AsdfFile. + asdf: :class:`AsdfSnapshot` parent object. + idx: Stream index in the :class:`AsdfSnapshot`. """ - def __init__(self, asdf, idx): + def __init__(self, asdf: AsdfSnapshot, idx: int): self.fh = asdf.fh self.asdf = asdf self.idx = idx @@ -405,10 +499,10 @@ def __init__(self, asdf, idx): # We don't actually know the size of the source disk # Doesn't really matter though, just take the last run offset + size - size = self.table[-1].offset + self.table[-1].size + size = self.table[-1][0] + self.table[-1][1] super().__init__(size) - def _read(self, offset, length): + def _read(self, offset: int, length: int): r = [] size = self.size @@ -416,19 +510,20 @@ def _read(self, offset, length): runlist_len = len(self.table) while length > 0 and run_idx < len(self.table): + run_start, run_size, run_file_offset, run_data_offset = self.table[run_idx] + run_end = run_start + run_size - run = self.table[run_idx] - next_run = self.table[run_idx + 1] if run_idx + 1 < runlist_len else None - - run_start = run.offset - run_end = run_start + run.size + if run_idx + 1 < runlist_len: + next_run_start, _, _, _ = self.table[run_idx + 1] + else: + next_run_start = None if run_idx < 0: # Missing first block - if not next_run: + if next_run_start is None: break - sparse_remaining = next_run.offset - offset + sparse_remaining = next_run_start - offset read_count = min(size - offset, min(sparse_remaining, length)) r.append(SPARSE_BYTES * (read_count // len(SPARSE_BYTES))) @@ -437,7 +532,11 @@ def _read(self, offset, length): run_idx += 1 elif run_end <= offset: # Start outside of run bounds - sparse_size = next_run.offset - run_end + if next_run_start is None: + # No next run to sparse read to + break + + sparse_size = next_run_start - run_end sparse_pos = offset - run_end sparse_remaining = sparse_size - sparse_pos @@ -451,19 +550,23 @@ def _read(self, offset, length): sparse_remaining = run_start - offset read_count = min(size - offset, min(sparse_remaining, length)) r.append(SPARSE_BYTES * (read_count // len(SPARSE_BYTES))) + + # Don't proceed to next run, next loop iteration we'll be within the current run else: + # We're in a run with data run_pos = offset - run_start - run_remaining = run.size - run_pos + run_remaining = run_size - run_pos read_count = min(size - offset, min(run_remaining, length)) - self.fh.seek(run.file_offset) - block_header = c_asdf.block(self.fh) - if block_header.magic != BLOCK_MAGIC: + self.fh.seek(run_file_offset) + if self.fh.read(4) != BLOCK_MAGIC: raise InvalidBlock("invalid block magic") - self.fh.seek(run_pos, io.SEEK_CUR) + # Skip over block header + self.fh.seek(run_data_offset) r.append(self.fh.read(read_count)) + 
# Proceed to next run run_idx += 1 offset += read_count @@ -472,26 +575,56 @@ def _read(self, offset, length): return b"".join(r) -class Crc32Stream(SubStreamBase): - """Compute a CRC32 over all written data. +def _table_fit(entry_offset: int, entry_size: int, entry_table: list, lookup_table: list, getentry: Callable): + """Calculate where to insert an entry with the given offset and size into the entry table. - This assumes that all data is written as a continuous stream. + Moves or shrinks the entry to prevent block overlap, and remove any overlapping blocks. Args: - fh: The file-like object to wrap. + entry_offset: The entry offset to calculate the insert for. + entry_size: The entry size to calculate the insert for. + entry_table: The entry table to insert into or remove entries from. + lookup_table: The lookup table for the entry_table. + getentry: A callable to return the ``(offset, size)`` tuple from an entry. + + Returns: + A tuple of the table index to insert into, an adjusted entry offset and an adjusted entry size. """ - - def __init__(self, fh): - super().__init__(fh) - self.crc = 0 - - def write(self, b): - self.crc = crc32(b, self.crc) & 0xFFFFFFFF - return self.fh.write(b) - - def digest(self): - return self.crc - - def finalize(self): - c_asdf.uint32.write(self.fh, self.digest()) - self.fh.finalize() + entry_end = entry_offset + entry_size + + prev_end = None + next_start = None + next_end = None + + table_idx = bisect_right(lookup_table, entry_offset) + if table_idx > 0: + prev_start, prev_size = getentry(entry_table[table_idx - 1]) + prev_end = prev_start + prev_size + if table_idx < len(lookup_table): + next_start, next_size = getentry(entry_table[table_idx]) + next_end = next_start + next_size + + if prev_end and prev_end >= entry_end: + # This block is fully contained in the previous block + return None, None, None + + if prev_end and prev_end > entry_offset: + # The start of this block overlaps with the previous, so shrink this block + entry_offset = prev_end + + # We may completely overlap one or more next entries + while next_end and next_end <= entry_end: + lookup_table.pop(table_idx) + entry_table.pop(table_idx) + + if table_idx < len(lookup_table): + next_start, next_size = getentry(entry_table[table_idx]) + next_end = next_start + next_size + else: + next_start, next_end = None, None + + if next_start and next_start < entry_end < next_end: + # The next block overlaps with this block, so shrink this block + entry_end = next_start + + return table_idx, entry_offset, entry_end - entry_offset diff --git a/dissect/evidence/asdf/streams.py b/dissect/evidence/asdf/streams.py index bab033e..4bd1fb2 100644 --- a/dissect/evidence/asdf/streams.py +++ b/dissect/evidence/asdf/streams.py @@ -1,6 +1,8 @@ import hashlib import io -from zlib import compressobj +import struct +from typing import BinaryIO +from zlib import compressobj, crc32 class SubStreamBase(io.RawIOBase): @@ -12,23 +14,50 @@ class SubStreamBase(io.RawIOBase): fh: The file-like object to wrap. 
""" - def __init__(self, fh): + def __init__(self, fh: BinaryIO): self.fh = fh - def write(self, b): + def write(self, b: bytes) -> int: return self.fh.write(b) - def tell(self): + def tell(self) -> int: return self.fh.tell() - def seek(self, pos, whence=io.SEEK_CUR): + def seek(self, pos: int, whence: int = io.SEEK_CUR) -> int: return self.fh.seek(pos, whence) - def close(self): + def close(self) -> None: super().close() - def finalize(self): + def finalize(self) -> None: self.fh.flush() + if hasattr(self.fh, "finalize"): + self.fh.finalize() + + +class Crc32Stream(SubStreamBase): + """Compute a CRC32 over all written data. + + This assumes that all data is written as a continuous stream. + + Args: + fh: The file-like object to wrap. + """ + + def __init__(self, fh: BinaryIO): + super().__init__(fh) + self.crc = 0 + + def write(self, b: bytes) -> int: + self.crc = crc32(b, self.crc) & 0xFFFFFFFF + return self.fh.write(b) + + def digest(self) -> bytes: + return struct.pack(" None: + self.fh.write(self.digest()) + super().finalize() class HashedStream(SubStreamBase): @@ -41,21 +70,21 @@ class HashedStream(SubStreamBase): alg: The hashing algorithm to use. Must be supported by hashlib. """ - def __init__(self, fh, alg="sha256"): + def __init__(self, fh: BinaryIO, alg: str = "sha256"): super().__init__(fh) self.ctx = hashlib.new(alg) - def write(self, b): + def write(self, b: bytes) -> int: self.ctx.update(b) return self.fh.write(b) - def digest(self): + def digest(self) -> bytes: return self.ctx.digest() - def hexdigest(self): + def hexdigest(self) -> str: return self.ctx.hexdigest() - def close(self): + def close(self) -> None: super().close() self.fh.close() @@ -69,13 +98,13 @@ class CompressedStream(SubStreamBase): fh: The file-like object to wrap. 
""" - def __init__(self, fh): + def __init__(self, fh: BinaryIO): super().__init__(fh) self.cobj = compressobj() - def write(self, b): + def write(self, b: bytes) -> int: return self.fh.write(self.cobj.compress(b)) - def finalize(self): + def finalize(self) -> None: self.fh.write(self.cobj.flush()) - self.fh.finalize() + super().finalize() diff --git a/dissect/evidence/tools/asdf/dd.py b/dissect/evidence/tools/asdf/dd.py index 15d7750..6c668d4 100644 --- a/dissect/evidence/tools/asdf/dd.py +++ b/dissect/evidence/tools/asdf/dd.py @@ -1,6 +1,7 @@ +import argparse import io import sys -import argparse +from typing import BinaryIO from dissect.evidence.asdf import asdf @@ -13,44 +14,45 @@ class Progress: - def __init__(self, size): + def __init__(self, size: int): self.size = size if HAS_TQDM: self.t = tqdm(total=size, unit="B", unit_scale=True) - def update(self, offset): + def update(self, offset: int) -> None: if HAS_TQDM: self.t.update(offset - self.t.n) else: sys.stderr.write(f"\r{offset / float(self.size) * 100:0.2f}%") sys.stderr.flush() - def close(self): + def close(self) -> None: if HAS_TQDM: self.t.close() -def copystream(fhin, fhout, length): +def copy_stream(fhin: BinaryIO, fhout: BinaryIO, length: int) -> None: n, remain = divmod(length, io.DEFAULT_BUFFER_SIZE) for _ in range(n): fhout.write(fhin.read(io.DEFAULT_BUFFER_SIZE)) fhout.write(fhin.read(remain)) +def fill_zero(fhout: BinaryIO, length: int) -> None: + n, remain = divmod(length, io.DEFAULT_BUFFER_SIZE) + for _ in range(n): + fhout.write(b"\x00" * io.DEFAULT_BUFFER_SIZE) + fhout.write(b"\x00" * remain) + + def main(): parser = argparse.ArgumentParser(description="Utility to dump ASDF streams") parser.add_argument("file", metavar="ASDF", help="ASDF file to dd") parser.add_argument("-w", "--writer", default="-", help="file to write to, default is stdout") parser.add_argument("-s", "--stream", type=int, default=0, help="stream index to dump (0-255)") - parser.add_argument( - "--fast", action="store_true", default=False, help="dump fast, fill sparse with null bytes instead" - ) parser.add_argument("--no-tqdm", action="store_true", default=False, help="disable tqdm progress bar") args = parser.parse_args() - if args.fast and args.writer == "-": - parser.exit("--fast is not supported when writing to stdout") - if args.no_tqdm: global HAS_TQDM HAS_TQDM = False @@ -60,7 +62,7 @@ def main(): if args.stream > 255 or not snapshot.contains(args.stream): parser.print_help() - print() + print(file=sys.stderr) valid_keys = ", ".join(str(i) for i in snapshot.table.keys()) parser.exit(f"invalid stream index, must be one of {valid_keys}") @@ -74,22 +76,17 @@ def main(): fhout = open(args.writer, "wb") try: - if args.fast: - for entry in stream.table: - stream.fh.seek(entry.file_offset + 24) - fhout.seek(entry.offset) - copystream(stream.fh, fhout, entry.size) - - progress.update(entry.offset) - else: - offset = 0 - while True: - buf = stream.read(io.DEFAULT_BUFFER_SIZE * 8) - if not buf: - break - fhout.write(buf) - progress.update(offset) - offset += len(buf) + prev_offset = 0 + for offset, size, _, data_offset in stream.table: + stream.fh.seek(data_offset) + if fhout.seekable(): + fhout.seek(offset) + else: + fill_zero(fhout, offset - prev_offset) + copy_stream(stream.fh, fhout, size) + + progress.update(offset) + prev_offset = offset except BrokenPipeError: pass finally: diff --git a/dissect/evidence/tools/asdf/verify.py b/dissect/evidence/tools/asdf/verify.py index 43c442d..4d4a2a4 100644 --- a/dissect/evidence/tools/asdf/verify.py +++ 
b/dissect/evidence/tools/asdf/verify.py @@ -1,49 +1,50 @@ from __future__ import print_function + +import argparse +import hashlib import io import sys -import hashlib -import argparse import traceback -from zlib import crc32 from contextlib import contextmanager +from typing import BinaryIO, Iterator +from zlib import crc32 from dissect.util.stream import RangeStream from dissect.evidence.asdf import asdf -def iterfileobj(src): +def iter_fileobj(src: BinaryIO) -> Iterator[bytes]: buf = src.read(io.DEFAULT_BUFFER_SIZE) while buf: yield buf buf = src.read(io.DEFAULT_BUFFER_SIZE) -def hashfileobj(src, alg="sha256"): +def hash_fileobj(src: BinaryIO, alg: str = "sha256") -> bytes: ctx = hashlib.new(alg) - for buf in iterfileobj(src): + for buf in iter_fileobj(src): ctx.update(buf) return ctx.digest() -def crc32filobj(src): +def crc32_filobj(src: BinaryIO) -> int: crc = 0 - for buf in iterfileobj(src): + for buf in iter_fileobj(src): crc = crc32(buf, crc) & 0xFFFFFFFF return crc @contextmanager -def status(line): +def status(line: str, verbose: bool = False) -> Iterator: TEMPLATE = "[{}] {:<50}{}" try: print(TEMPLATE.format("*", line, ""), end="\n") sys.stdout.flush() yield - # print(TEMPLATE.format('*', line, '[OK]'), end='\n') except Exception as e: - traceback.print_exc() - # print(TEMPLATE.format('!', line, '[ERR]'), end='\n') + if verbose: + traceback.print_exc() print(f"[!] {e}") @@ -60,29 +61,30 @@ def main(): footer = None footer_offset = 0 - with status("Checking header"): + with status("Checking header", args.verbose): header = asdf.c_asdf.header(fh) - if header.magic != asdf.MAGIC: + if header.magic != asdf.FILE_MAGIC: raise Exception("invalid header magic") - with status("Checking footer"): + with status("Checking footer", args.verbose): fh.seek(-len(asdf.c_asdf.footer), io.SEEK_END) footer_offset = fh.tell() footer = asdf.c_asdf.footer(fh) if footer.magic != asdf.FOOTER_MAGIC: - raise Exception("invalid footer magic") + footer = None + raise Exception("invalid footer magic, please run asdf-repair") - if not args.skip_hash: - with status("Checking file hash"): + if not args.skip_hash and footer: + with status("Checking file hash", args.verbose): hashstream = RangeStream(fh, 0, footer_offset) - res = hashfileobj(hashstream) + res = hash_fileobj(hashstream) if res != footer.sha256: raise Exception("file hash doesn't match") else: print("[!] Skipping file hash") - if not args.skip_blocks: - with status("Checking blocks"): + if not args.skip_blocks and footer: + with status("Checking blocks", args.verbose): table_size = (footer_offset - footer.table_offset) // len(asdf.c_asdf.table_entry) fh.seek(footer.table_offset) table = asdf.c_asdf.table_entry[table_size](fh) @@ -100,7 +102,7 @@ def main(): target_crc = asdf.c_asdf.uint32(fh) block_fh = RangeStream(fh, data_offset, entry.file_size - 4) - crc = crc32filobj(block_fh) + crc = crc32_filobj(block_fh) if crc != target_crc: print(f"[!] Block {i} crc32 doesn't match. 
Expected 0x{target_crc:x}, got 0x{crc:x}") diff --git a/tests/test_asdf.py b/tests/test_asdf.py new file mode 100644 index 0000000..076c998 --- /dev/null +++ b/tests/test_asdf.py @@ -0,0 +1,146 @@ +from io import BytesIO + +import pytest + +from dissect.evidence.asdf.asdf import AsdfSnapshot, AsdfWriter + + +def noop(): + pass + + +def test_asdf(): + fh = BytesIO() + fh.close = noop # Prevent clearing the buffer, we need it + + writer = AsdfWriter(fh) + + writer.add_bytes(b"\x00" * 0x1000, idx=0, base=0) + writer.add_bytes(b"\x02" * 0x1000, idx=0, base=0x4000) + writer.add_bytes(b"\x04" * 0x1000, idx=0, base=0x8000) + writer.add_bytes(b"\x06" * 0x1000, idx=0, base=0x10000) + writer.add_bytes(b"\xff" * 0x1000, idx=0, base=0x14000) + + writer.add_bytes(b"\x08" * 0x1000, idx=1, base=0x2000) + writer.add_bytes(b"\x10" * 0x1000, idx=1, base=0x5000) + writer.add_bytes(b"\x12" * 0x1000, idx=1, base=0x8000) + writer.add_bytes(b"\x14" * 0x1000, idx=1, base=0xB000) + writer.add_bytes(b"\xff" * 0x1000, idx=1, base=0xE000) + + writer.close() + fh.seek(0) + + reader = AsdfSnapshot(fh) + stream_0 = reader.open(0) + assert [(run_start, run_size) for run_start, run_size, _, _ in stream_0.table] == [ + (0, 0x1000), + (0x4000, 0x1000), + (0x8000, 0x1000), + (0x10000, 0x1000), + (0x14000, 0x1000), + ] + + assert stream_0.read(0x1000) == b"\x00" * 0x1000 + assert stream_0.read(0x1000) == (b"\xa5\xdf" * (0x1000 // 2)) + assert stream_0.read(0x4000) == (b"\xa5\xdf" * (0x2000 // 2)) + (b"\x02" * 0x1000) + (b"\xa5\xdf" * (0x1000 // 2)) + + stream_0.seek(0) + assert stream_0.read() == b"".join( + [ + (b"\x00" * 0x1000), + (b"\xa5\xdf" * (0x3000 // 2)), + (b"\x02" * 0x1000), + (b"\xa5\xdf" * (0x3000 // 2)), + (b"\x04" * 0x1000), + (b"\xa5\xdf" * (0x7000 // 2)), + (b"\x06" * 0x1000), + (b"\xa5\xdf" * (0x3000 // 2)), + (b"\xff" * 0x1000), + ] + ) + + stream_1 = reader.open(1) + assert stream_1.read(0x4000) == (b"\xa5\xdf" * (0x2000 // 2)) + (b"\x08" * 0x1000) + (b"\xa5\xdf" * (0x1000 // 2)) + + +def test_asdf_overlap(): + fh = BytesIO() + fh.close = noop # Prevent clearing the buffer, we need it + + writer = AsdfWriter(fh) + + writer.add_bytes(b"\x01" * 100, base=0) + writer.add_bytes(b"\x02" * 100, base=200) + assert writer._table_lookup[0] == [0, 200] + + writer.add_bytes(b"\x03" * 100, base=50) + assert writer._table_lookup[0] == [0, 100, 200] + + writer.add_bytes(b"\x04" * 150, base=100) + assert writer._table_lookup[0] == [0, 100, 150, 200] + + writer.add_bytes(b"\x05" * 50, base=25) + assert writer._table_lookup[0] == [0, 100, 150, 200] + + writer.close() + fh.seek(0) + + reader = AsdfSnapshot(fh) + stream = reader.open(0) + + assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ + (0, 100), + (100, 50), + (150, 50), + (200, 100), + ] + assert stream.read() == (b"\x01" * 100) + (b"\x03" * 50) + (b"\x04" * 50) + (b"\x02" * 100) + + +def test_asdf_overlap_all(): + fh = BytesIO() + fh.close = noop # Prevent clearing the buffer, we need it + + writer = AsdfWriter(fh) + + writer.add_bytes(b"\x01" * 100, base=0) + writer.add_bytes(b"\x02" * 100, base=200) + writer.add_bytes(b"\x03" * 100, base=50) + writer.add_bytes(b"\x04" * 150, base=100) + assert writer._table_lookup[0] == [0, 100, 150, 200] + writer.add_bytes(b"\x06" * 400, base=0) + assert writer._table_lookup[0] == [0, 100] + + writer.close() + fh.seek(0) + + reader = AsdfSnapshot(fh) + stream = reader.open(0) + + assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ + (0, 100), + (100, 300), + ] + assert 
stream.read() == (b"\x01" * 100) + (b"\x06" * 300) + + +def test_asdf_metadata(): + fh = BytesIO() + fh.close = noop # Prevent clearing the buffer, we need it + + writer = AsdfWriter(fh) + + writer.add_metadata("file", BytesIO(b"content")) + writer.add_metadata("dir/file", BytesIO(b"content here too")) + + writer.close() + fh.seek(0) + + reader = AsdfSnapshot(fh) + + assert reader.metadata.names() == ["file", "dir/file"] + assert reader.metadata.open("file").read() == b"content" + assert reader.metadata.open("dir/file").read() == b"content here too" + + with pytest.raises(KeyError): + reader.metadata.open("nonexistent") From 40c7f272dc0cc914858a5cdd2f430e9d32dd6b20 Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Thu, 27 Oct 2022 19:08:23 +0200 Subject: [PATCH 02/12] Fix tests --- dissect/evidence/asdf/asdf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index eb781ba..3b11071 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -379,6 +379,8 @@ def __init__(self, fh: BinaryIO): if self.footer.magic != FOOTER_MAGIC: raise InvalidSnapshot("invalid footer magic") + self._parse_block_table() + self.metadata = Metadata(self) def _parse_block_table(self): @@ -394,7 +396,6 @@ def _parse_block_table(self): entry = c_asdf.table_entry(table_data) self._table_insert(entry.idx, entry.offset, entry.size, entry.file_offset) - def _table_insert(self, idx: int, offset: int, size: int, file_offset: int): stream_idx = idx entry_data_offset = file_offset + len(c_asdf.block) From a76d44e4d1b50182e504f97af225003f8e65aa2e Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Thu, 27 Oct 2022 19:10:18 +0200 Subject: [PATCH 03/12] Fix linting --- dissect/evidence/asdf/asdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 3b11071..ab8250d 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -9,7 +9,7 @@ import uuid from bisect import bisect_right from collections import defaultdict -from typing import BinaryIO, Callable, Iterator, Optional, Tuple +from typing import BinaryIO, Callable, Optional, Tuple from dissect import cstruct from dissect.util import ts From 69e3c1284b117b11984ee4afa36862150e799d09 Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Fri, 28 Oct 2022 17:28:50 +0200 Subject: [PATCH 04/12] Add unit tests for streams --- tests/test_asdf.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/test_asdf.py b/tests/test_asdf.py index 076c998..e3ec692 100644 --- a/tests/test_asdf.py +++ b/tests/test_asdf.py @@ -1,8 +1,10 @@ +import zlib from io import BytesIO import pytest from dissect.evidence.asdf.asdf import AsdfSnapshot, AsdfWriter +from dissect.evidence.asdf.streams import CompressedStream, Crc32Stream, HashedStream def noop(): @@ -144,3 +146,41 @@ def test_asdf_metadata(): with pytest.raises(KeyError): reader.metadata.open("nonexistent") + + +def test_asdf_stream_crc32(): + fh = BytesIO() + stream = Crc32Stream(fh) + stream.write(b"srt was here") + stream.finalize() + + assert fh.getvalue() == b"srt was here\x2f\x0e\x60\xa4" + + +def test_asdf_stream_compressed(): + fh = BytesIO() + stream = CompressedStream(fh) + stream.write(b"srt was here" * 100) + stream.finalize() + + assert zlib.decompress(fh.getvalue()) 
== b"srt was here" * 100 + + +def test_asdf_stream_hashed(): + fh = BytesIO() + stream = HashedStream(fh) + stream.write(b"srt was here") + assert stream.hexdigest() == "cd7bd850d261f8fa39a41d0963b42dae5f303615db19ac79e5044586d0825b7b" + + +def test_asdf_stream_combined(): + fh = BytesIO() + stream = Crc32Stream(fh) + stream = CompressedStream(stream) + stream = HashedStream(stream) + + stream.write(b"srt was here" * 100) + stream.finalize() + + assert stream.hexdigest() == "ba40ab3ee826d6aa0f085dfccbb72d8feefa72548015c4456c1fd741d0266a94" + assert fh.getvalue() == bytes.fromhex("789c2b2e2a51284f2c56c8482d4a2d1e658fb247d9a3ec41cc06004445c530665f35fc") From 4d45d5366737638e08c1849bce0e385b63833783 Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Tue, 1 Nov 2022 16:47:58 +0100 Subject: [PATCH 05/12] Fix some docstrings and rename a method --- dissect/evidence/asdf/asdf.py | 8 +++++--- tests/test_asdf.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index ab8250d..3372f6b 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -132,8 +132,8 @@ def __init__( self._write_header() - def add_metadata(self, path: str, fh: BinaryIO, size: Optional[int] = None) -> None: - """Add a file to the metadata. + def add_metadata_file(self, path: str, fh: BinaryIO, size: Optional[int] = None) -> None: + """Add a file to the metadata stream. Args: path: The path in the metadata tar to write to. @@ -155,7 +155,9 @@ def add_metadata(self, path: str, fh: BinaryIO, size: Optional[int] = None) -> N self._meta_tar.addfile(info, fh) def add_bytes(self, data: bytes, idx: int = 0, base: int = 0) -> None: - """Copy some bytes into this snapshot. + """Add some bytes into this snapshot. + + Convenience method for adding some bytes at a specific offset. Args: data: The bytes to copy. 
diff --git a/tests/test_asdf.py b/tests/test_asdf.py index e3ec692..568d3e3 100644 --- a/tests/test_asdf.py +++ b/tests/test_asdf.py @@ -132,8 +132,8 @@ def test_asdf_metadata(): writer = AsdfWriter(fh) - writer.add_metadata("file", BytesIO(b"content")) - writer.add_metadata("dir/file", BytesIO(b"content here too")) + writer.add_metadata_file("file", BytesIO(b"content")) + writer.add_metadata_file("dir/file", BytesIO(b"content here too")) writer.close() fh.seek(0) From cf09e2c5b833868995e7cdbcf3817beb67af6f35 Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Tue, 1 Nov 2022 16:48:55 +0100 Subject: [PATCH 06/12] Add missing return types --- dissect/evidence/asdf/asdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 3372f6b..835d3a3 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -385,7 +385,7 @@ def __init__(self, fh: BinaryIO): self.metadata = Metadata(self) - def _parse_block_table(self): + def _parse_block_table(self) -> None: """Parse the block table, getting rid of overlapping blocks.""" table_offset = self.footer.table_offset table_size = self.footer_offset - table_offset @@ -398,7 +398,7 @@ def _parse_block_table(self): entry = c_asdf.table_entry(table_data) self._table_insert(entry.idx, entry.offset, entry.size, entry.file_offset) - def _table_insert(self, idx: int, offset: int, size: int, file_offset: int): + def _table_insert(self, idx: int, offset: int, size: int, file_offset: int) -> None: stream_idx = idx entry_data_offset = file_offset + len(c_asdf.block) From 2b563fe5896ded6d0f0275e1fbbb5a09f85687a8 Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Mon, 7 Nov 2022 10:24:52 +0100 Subject: [PATCH 07/12] Add more type hints --- dissect/evidence/asdf/asdf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 835d3a3..dddaac0 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -505,7 +505,7 @@ def __init__(self, asdf: AsdfSnapshot, idx: int): size = self.table[-1][0] + self.table[-1][1] super().__init__(size) - def _read(self, offset: int, length: int): + def _read(self, offset: int, length: int) -> bytes: r = [] size = self.size @@ -578,7 +578,9 @@ def _read(self, offset: int, length: int): return b"".join(r) -def _table_fit(entry_offset: int, entry_size: int, entry_table: list, lookup_table: list, getentry: Callable): +def _table_fit( + entry_offset: int, entry_size: int, entry_table: list, lookup_table: list, getentry: Callable +) -> tuple[int, int, int]: """Calculate where to insert an entry with the given offset and size into the entry table. Moves or shrinks the entry to prevent block overlap, and remove any overlapping blocks. 
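The heart of this series is the _table_fit logic above. Below is a self-contained
sketch of the overlap rules it applies, reduced to plain offset/size lists. The
names fit, starts and sizes are illustrative; the real function works on entry
tuples through a getentry callable and leaves the final insert to its callers.
The zero-size guard at the end is the edge case the next patch adds:

    from bisect import bisect_right

    def fit(offset: int, size: int, starts: list[int], sizes: list[int]):
        """Simplified model of the _table_fit overlap rules."""
        end = offset + size
        i = bisect_right(starts, offset)

        if i and starts[i - 1] + sizes[i - 1] >= end:
            return None  # fully contained in the previous block, drop it

        if i and starts[i - 1] + sizes[i - 1] > offset:
            offset = starts[i - 1] + sizes[i - 1]  # clip the head on the previous block

        while i < len(starts) and starts[i] + sizes[i] <= end:
            del starts[i], sizes[i]  # completely overlapped next blocks are removed

        if i < len(starts) and starts[i] < end:
            end = starts[i]  # clip the tail on the next block

        if offset >= end:
            return None  # clipped down to nothing

        starts.insert(i, offset)
        sizes.insert(i, end - offset)
        return i, offset, end - offset

Mirroring test_asdf_overlap: with starts = [0, 200] and sizes = [100, 100],
fit(50, 100, starts, sizes) clips the new block to [100, 150) and leaves
starts == [0, 100, 200], matching the writer's lookup table in that test.
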
From be68242f2f80f19b16dc9dc7d099467e8d89bbc7 Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Mon, 7 Nov 2022 17:17:44 +0100 Subject: [PATCH 08/12] Add more tests and an edge case --- dissect/evidence/asdf/asdf.py | 4 +++ tests/test_asdf.py | 51 +++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index dddaac0..0676812 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -632,4 +632,8 @@ def _table_fit( # The next block overlaps with this block, so shrink this block entry_end = next_start + if entry_offset >= entry_end: + # Shouldn't be possible to go beyond the end, but we may end up with a 0 sized block + return None, None, None + return table_idx, entry_offset, entry_end - entry_offset diff --git a/tests/test_asdf.py b/tests/test_asdf.py index 568d3e3..fef6f20 100644 --- a/tests/test_asdf.py +++ b/tests/test_asdf.py @@ -126,6 +126,57 @@ def test_asdf_overlap_all(): assert stream.read() == (b"\x01" * 100) + (b"\x06" * 300) +def test_asdf_overlap_contiguous(): + fh = BytesIO() + fh.close = noop # Prevent clearing the buffer, we need it + + writer = AsdfWriter(fh) + + writer.add_bytes(b"\x01" * 100, base=0) + writer.add_bytes(b"\x02" * 100, base=100) + assert writer._table_lookup[0] == [0, 100] + + writer.add_bytes(b"\x03" * 75, base=50) + assert writer._table_lookup[0] == [0, 100] + + writer.close() + fh.seek(0) + + reader = AsdfSnapshot(fh) + stream = reader.open(0) + + assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ + (0, 100), + (100, 100), + ] + assert stream.read() == (b"\x01" * 100) + (b"\x02" * 100) + + +def test_asdf_overlap_seek(): + fh = BytesIO() + fh.close = noop # Prevent clearing the buffer, we need it + + writer = AsdfWriter(fh) + + writer.add_bytes(b"\x00" * 100, base=0) + writer.add_bytes(b"\x00" * 100, base=200) + writer.add_bytes(bytes(range(200)), base=50) + assert writer._table_lookup[0] == [0, 100, 200] + + writer.close() + fh.seek(0) + + reader = AsdfSnapshot(fh) + stream = reader.open(0) + + assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ + (0, 100), + (100, 100), + (200, 100), + ] + assert stream.read() == (b"\x00" * 100) + bytes(range(50, 150)) + (b"\x00" * 100) + + def test_asdf_metadata(): fh = BytesIO() fh.close = noop # Prevent clearing the buffer, we need it From aeb1a890d311f7543fd4f16f26167078257e605a Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Thu, 10 Nov 2022 15:18:34 +0100 Subject: [PATCH 09/12] Additional fixes --- dissect/evidence/asdf/asdf.py | 10 +++++----- tests/test_asdf.py | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 0676812..1ffed05 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -13,7 +13,7 @@ from dissect import cstruct from dissect.util import ts -from dissect.util.stream import AlignedStream, RangeStream +from dissect.util.stream import AlignedStream from dissect.evidence.asdf.streams import CompressedStream, Crc32Stream, HashedStream from dissect.evidence.exceptions import ( @@ -309,8 +309,8 @@ def _write_block(self, source: BinaryIO, offset: int, size: int, idx: int = 0, b block.write(self.fh) data_offset = self.fh.tell() # Block data location - source_stream = RangeStream(source, offset, size) - 
shutil.copyfileobj(source_stream, outfh) + source.seek(offset) + shutil.copyfileobj(source, outfh, size) # This writes any remaining data or footer for each block writer outfh.finalize() @@ -512,7 +512,7 @@ def _read(self, offset: int, length: int) -> bytes: run_idx = bisect_right(self._table_lookup, offset) - 1 runlist_len = len(self.table) - while length > 0 and run_idx < len(self.table): + while length > 0 and run_idx < runlist_len: run_start, run_size, run_file_offset, run_data_offset = self.table[run_idx] run_end = run_start + run_size @@ -566,7 +566,7 @@ def _read(self, offset: int, length: int) -> bytes: raise InvalidBlock("invalid block magic") # Skip over block header - self.fh.seek(run_data_offset) + self.fh.seek(run_data_offset + run_pos) r.append(self.fh.read(read_count)) # Proceed to next run diff --git a/tests/test_asdf.py b/tests/test_asdf.py index fef6f20..dc6918d 100644 --- a/tests/test_asdf.py +++ b/tests/test_asdf.py @@ -177,6 +177,25 @@ def test_asdf_overlap_seek(): assert stream.read() == (b"\x00" * 100) + bytes(range(50, 150)) + (b"\x00" * 100) +def test_asdf_mid_run(): + fh = BytesIO() + fh.close = noop # Prevent clearing the buffer, we need it + + writer = AsdfWriter(fh) + + writer.add_bytes(bytes([v & 0xFF for v in range(4096)]), base=0) + + writer.close() + fh.seek(0) + + reader = AsdfSnapshot(fh) + stream = reader.open(0) + stream.align = 512 + + stream.seek(1100) + assert stream.read(512) == bytes([v & 0xFF for v in range(1100, 1100 + 512)]) + + def test_asdf_metadata(): fh = BytesIO() fh.close = noop # Prevent clearing the buffer, we need it From 009bfa2c969219a25052942f3f72cbf158b38550 Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Thu, 10 Nov 2022 15:24:04 +0100 Subject: [PATCH 10/12] Additional small changes --- dissect/evidence/asdf/asdf.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 1ffed05..4d0c220 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -9,7 +9,7 @@ import uuid from bisect import bisect_right from collections import defaultdict -from typing import BinaryIO, Callable, Optional, Tuple +from typing import BinaryIO, Callable, Optional from dissect import cstruct from dissect.util import ts @@ -22,6 +22,8 @@ UnsupportedVersion, ) +SnapshotTableEntry = tuple[int, int, int, int] + VERSION = 1 DEFAULT_BLOCK_SIZE = 4096 MAX_BLOCK_TABLE_SIZE = 2**32 @@ -213,7 +215,7 @@ def copy_block( def copy_runlist( self, source: BinaryIO, - runlist: list[Tuple[Optional[int], int]], + runlist: list[tuple[Optional[int], int]], runlist_block_size: int, idx: int = 0, base: int = 0, @@ -371,30 +373,28 @@ def __init__(self, fh: BinaryIO): self.timestamp = ts.from_unix(self.header.timestamp) self.guid = uuid.UUID(bytes_le=self.header.guid) - self.table: dict[list[Tuple[int, int, int, int]]] = defaultdict(list) + self.table: dict[list[SnapshotTableEntry]] = defaultdict(list) self._table_lookup: dict[list[int]] = defaultdict(list) - self.fh.seek(-len(c_asdf.footer), io.SEEK_END) - self.footer_offset = self.fh.tell() + footer_offset = self.fh.seek(-len(c_asdf.footer), io.SEEK_END) self.footer = c_asdf.footer(self.fh) if self.footer.magic != FOOTER_MAGIC: raise InvalidSnapshot("invalid footer magic") - self._parse_block_table() + self._parse_block_table( + self.footer.table_offset, + (footer_offset - self.footer.table_offset) // len(c_asdf.table_entry), + ) self.metadata = 
Metadata(self) - def _parse_block_table(self) -> None: + def _parse_block_table(self, offset: int, count: int) -> None: """Parse the block table, getting rid of overlapping blocks.""" - table_offset = self.footer.table_offset - table_size = self.footer_offset - table_offset - table_count = table_size // len(c_asdf.table_entry) - - self.fh.seek(table_offset) - table_data = io.BytesIO(self.fh.read(table_size)) + self.fh.seek(offset) + table_data = io.BytesIO(self.fh.read(count * len(c_asdf.table_entry))) - for _ in range(table_count): + for _ in range(count): entry = c_asdf.table_entry(table_data) self._table_insert(entry.idx, entry.offset, entry.size, entry.file_offset) From 79bc53e9c4718da4b5d2d87a3fbd08d256b344c3 Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Mon, 14 Nov 2022 13:16:12 +0100 Subject: [PATCH 11/12] Fixes after review --- dissect/evidence/asdf/asdf.py | 12 +-- tests/conftest.py | 14 +++ tests/test_asdf.py | 177 +++++++++++++--------------------- 3 files changed, 89 insertions(+), 114 deletions(-) diff --git a/dissect/evidence/asdf/asdf.py b/dissect/evidence/asdf/asdf.py index 4d0c220..3a80a8d 100644 --- a/dissect/evidence/asdf/asdf.py +++ b/dissect/evidence/asdf/asdf.py @@ -506,7 +506,7 @@ def __init__(self, asdf: AsdfSnapshot, idx: int): super().__init__(size) def _read(self, offset: int, length: int) -> bytes: - r = [] + result = [] size = self.size run_idx = bisect_right(self._table_lookup, offset) - 1 @@ -529,7 +529,7 @@ def _read(self, offset: int, length: int) -> bytes: sparse_remaining = next_run_start - offset read_count = min(size - offset, min(sparse_remaining, length)) - r.append(SPARSE_BYTES * (read_count // len(SPARSE_BYTES))) + result.append(SPARSE_BYTES * (read_count // len(SPARSE_BYTES))) # Proceed to next run run_idx += 1 @@ -544,7 +544,7 @@ def _read(self, offset: int, length: int) -> bytes: sparse_remaining = sparse_size - sparse_pos read_count = min(size - offset, min(sparse_remaining, length)) - r.append(SPARSE_BYTES * (read_count // len(SPARSE_BYTES))) + result.append(SPARSE_BYTES * (read_count // len(SPARSE_BYTES))) # Proceed to next run run_idx += 1 @@ -552,7 +552,7 @@ def _read(self, offset: int, length: int) -> bytes: # Previous run consumed, and next run is far away sparse_remaining = run_start - offset read_count = min(size - offset, min(sparse_remaining, length)) - r.append(SPARSE_BYTES * (read_count // len(SPARSE_BYTES))) + result.append(SPARSE_BYTES * (read_count // len(SPARSE_BYTES))) # Don't proceed to next run, next loop iteration we'll be within the current run else: @@ -567,7 +567,7 @@ def _read(self, offset: int, length: int) -> bytes: # Skip over block header self.fh.seek(run_data_offset + run_pos) - r.append(self.fh.read(read_count)) + result.append(self.fh.read(read_count)) # Proceed to next run run_idx += 1 @@ -575,7 +575,7 @@ def _read(self, offset: int, length: int) -> bytes: offset += read_count length -= read_count - return b"".join(r) + return b"".join(result) def _table_fit( diff --git a/tests/conftest.py b/tests/conftest.py index 88228b3..b9ecbb2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,10 @@ import os +from io import BytesIO + import pytest +from dissect.evidence.asdf import AsdfWriter + def open_data(name): return open(os.path.join(os.path.dirname(__file__), name), "rb") @@ -24,3 +28,13 @@ def ad1_data_compressed(): @pytest.fixture def ewf_data(): return open_data("data/ewf.E01") + + +@pytest.fixture +def asdf_writer(): + def noop(): + pass + + fh = BytesIO() 
+ fh.close = noop # Prevent clearing the buffer, we need it + yield AsdfWriter(fh) diff --git a/tests/test_asdf.py b/tests/test_asdf.py index dc6918d..4ec0773 100644 --- a/tests/test_asdf.py +++ b/tests/test_asdf.py @@ -7,32 +7,23 @@ from dissect.evidence.asdf.streams import CompressedStream, Crc32Stream, HashedStream -def noop(): - pass - - -def test_asdf(): - fh = BytesIO() - fh.close = noop # Prevent clearing the buffer, we need it - - writer = AsdfWriter(fh) - - writer.add_bytes(b"\x00" * 0x1000, idx=0, base=0) - writer.add_bytes(b"\x02" * 0x1000, idx=0, base=0x4000) - writer.add_bytes(b"\x04" * 0x1000, idx=0, base=0x8000) - writer.add_bytes(b"\x06" * 0x1000, idx=0, base=0x10000) - writer.add_bytes(b"\xff" * 0x1000, idx=0, base=0x14000) - - writer.add_bytes(b"\x08" * 0x1000, idx=1, base=0x2000) - writer.add_bytes(b"\x10" * 0x1000, idx=1, base=0x5000) - writer.add_bytes(b"\x12" * 0x1000, idx=1, base=0x8000) - writer.add_bytes(b"\x14" * 0x1000, idx=1, base=0xB000) - writer.add_bytes(b"\xff" * 0x1000, idx=1, base=0xE000) - - writer.close() - fh.seek(0) - - reader = AsdfSnapshot(fh) +def test_asdf(asdf_writer: AsdfWriter): + asdf_writer.add_bytes(b"\x00" * 0x1000, idx=0, base=0) + asdf_writer.add_bytes(b"\x02" * 0x1000, idx=0, base=0x4000) + asdf_writer.add_bytes(b"\x04" * 0x1000, idx=0, base=0x8000) + asdf_writer.add_bytes(b"\x06" * 0x1000, idx=0, base=0x10000) + asdf_writer.add_bytes(b"\xff" * 0x1000, idx=0, base=0x14000) + + asdf_writer.add_bytes(b"\x08" * 0x1000, idx=1, base=0x2000) + asdf_writer.add_bytes(b"\x10" * 0x1000, idx=1, base=0x5000) + asdf_writer.add_bytes(b"\x12" * 0x1000, idx=1, base=0x8000) + asdf_writer.add_bytes(b"\x14" * 0x1000, idx=1, base=0xB000) + asdf_writer.add_bytes(b"\xff" * 0x1000, idx=1, base=0xE000) + + asdf_writer.close() + asdf_writer._fh.seek(0) + + reader = AsdfSnapshot(asdf_writer._fh) stream_0 = reader.open(0) assert [(run_start, run_size) for run_start, run_size, _, _ in stream_0.table] == [ (0, 0x1000), @@ -65,29 +56,24 @@ def test_asdf(): assert stream_1.read(0x4000) == (b"\xa5\xdf" * (0x2000 // 2)) + (b"\x08" * 0x1000) + (b"\xa5\xdf" * (0x1000 // 2)) -def test_asdf_overlap(): - fh = BytesIO() - fh.close = noop # Prevent clearing the buffer, we need it - - writer = AsdfWriter(fh) - - writer.add_bytes(b"\x01" * 100, base=0) - writer.add_bytes(b"\x02" * 100, base=200) - assert writer._table_lookup[0] == [0, 200] +def test_asdf_overlap(asdf_writer: AsdfWriter): + asdf_writer.add_bytes(b"\x01" * 100, base=0) + asdf_writer.add_bytes(b"\x02" * 100, base=200) + assert asdf_writer._table_lookup[0] == [0, 200] - writer.add_bytes(b"\x03" * 100, base=50) - assert writer._table_lookup[0] == [0, 100, 200] + asdf_writer.add_bytes(b"\x03" * 100, base=50) + assert asdf_writer._table_lookup[0] == [0, 100, 200] - writer.add_bytes(b"\x04" * 150, base=100) - assert writer._table_lookup[0] == [0, 100, 150, 200] + asdf_writer.add_bytes(b"\x04" * 150, base=100) + assert asdf_writer._table_lookup[0] == [0, 100, 150, 200] - writer.add_bytes(b"\x05" * 50, base=25) - assert writer._table_lookup[0] == [0, 100, 150, 200] + asdf_writer.add_bytes(b"\x05" * 50, base=25) + assert asdf_writer._table_lookup[0] == [0, 100, 150, 200] - writer.close() - fh.seek(0) + asdf_writer.close() + asdf_writer._fh.seek(0) - reader = AsdfSnapshot(fh) + reader = AsdfSnapshot(asdf_writer._fh) stream = reader.open(0) assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ @@ -99,24 +85,19 @@ def test_asdf_overlap(): assert stream.read() == (b"\x01" * 100) + (b"\x03" * 50) + 
(b"\x04" * 50) + (b"\x02" * 100) -def test_asdf_overlap_all(): - fh = BytesIO() - fh.close = noop # Prevent clearing the buffer, we need it - - writer = AsdfWriter(fh) - - writer.add_bytes(b"\x01" * 100, base=0) - writer.add_bytes(b"\x02" * 100, base=200) - writer.add_bytes(b"\x03" * 100, base=50) - writer.add_bytes(b"\x04" * 150, base=100) - assert writer._table_lookup[0] == [0, 100, 150, 200] - writer.add_bytes(b"\x06" * 400, base=0) - assert writer._table_lookup[0] == [0, 100] +def test_asdf_overlap_all(asdf_writer: AsdfWriter): + asdf_writer.add_bytes(b"\x01" * 100, base=0) + asdf_writer.add_bytes(b"\x02" * 100, base=200) + asdf_writer.add_bytes(b"\x03" * 100, base=50) + asdf_writer.add_bytes(b"\x04" * 150, base=100) + assert asdf_writer._table_lookup[0] == [0, 100, 150, 200] + asdf_writer.add_bytes(b"\x06" * 400, base=0) + assert asdf_writer._table_lookup[0] == [0, 100] - writer.close() - fh.seek(0) + asdf_writer.close() + asdf_writer._fh.seek(0) - reader = AsdfSnapshot(fh) + reader = AsdfSnapshot(asdf_writer._fh) stream = reader.open(0) assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ @@ -126,23 +107,18 @@ def test_asdf_overlap_all(): assert stream.read() == (b"\x01" * 100) + (b"\x06" * 300) -def test_asdf_overlap_contiguous(): - fh = BytesIO() - fh.close = noop # Prevent clearing the buffer, we need it - - writer = AsdfWriter(fh) - - writer.add_bytes(b"\x01" * 100, base=0) - writer.add_bytes(b"\x02" * 100, base=100) - assert writer._table_lookup[0] == [0, 100] +def test_asdf_overlap_contiguous(asdf_writer: AsdfWriter): + asdf_writer.add_bytes(b"\x01" * 100, base=0) + asdf_writer.add_bytes(b"\x02" * 100, base=100) + assert asdf_writer._table_lookup[0] == [0, 100] - writer.add_bytes(b"\x03" * 75, base=50) - assert writer._table_lookup[0] == [0, 100] + asdf_writer.add_bytes(b"\x03" * 75, base=50) + assert asdf_writer._table_lookup[0] == [0, 100] - writer.close() - fh.seek(0) + asdf_writer.close() + asdf_writer._fh.seek(0) - reader = AsdfSnapshot(fh) + reader = AsdfSnapshot(asdf_writer._fh) stream = reader.open(0) assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ @@ -152,21 +128,16 @@ def test_asdf_overlap_contiguous(): assert stream.read() == (b"\x01" * 100) + (b"\x02" * 100) -def test_asdf_overlap_seek(): - fh = BytesIO() - fh.close = noop # Prevent clearing the buffer, we need it - - writer = AsdfWriter(fh) - - writer.add_bytes(b"\x00" * 100, base=0) - writer.add_bytes(b"\x00" * 100, base=200) - writer.add_bytes(bytes(range(200)), base=50) - assert writer._table_lookup[0] == [0, 100, 200] +def test_asdf_overlap_seek(asdf_writer: AsdfWriter): + asdf_writer.add_bytes(b"\x00" * 100, base=0) + asdf_writer.add_bytes(b"\x00" * 100, base=200) + asdf_writer.add_bytes(bytes(range(200)), base=50) + assert asdf_writer._table_lookup[0] == [0, 100, 200] - writer.close() - fh.seek(0) + asdf_writer.close() + asdf_writer._fh.seek(0) - reader = AsdfSnapshot(fh) + reader = AsdfSnapshot(asdf_writer._fh) stream = reader.open(0) assert [(run_start, run_size) for run_start, run_size, _, _ in stream.table] == [ @@ -177,18 +148,13 @@ def test_asdf_overlap_seek(): assert stream.read() == (b"\x00" * 100) + bytes(range(50, 150)) + (b"\x00" * 100) -def test_asdf_mid_run(): - fh = BytesIO() - fh.close = noop # Prevent clearing the buffer, we need it - - writer = AsdfWriter(fh) - - writer.add_bytes(bytes([v & 0xFF for v in range(4096)]), base=0) +def test_asdf_mid_run(asdf_writer: AsdfWriter): + asdf_writer.add_bytes(bytes([v & 0xFF for v in 
range(4096)]), base=0) - writer.close() - fh.seek(0) + asdf_writer.close() + asdf_writer._fh.seek(0) - reader = AsdfSnapshot(fh) + reader = AsdfSnapshot(asdf_writer._fh) stream = reader.open(0) stream.align = 512 @@ -196,19 +162,14 @@ def test_asdf_mid_run(): assert stream.read(512) == bytes([v & 0xFF for v in range(1100, 1100 + 512)]) -def test_asdf_metadata(): - fh = BytesIO() - fh.close = noop # Prevent clearing the buffer, we need it - - writer = AsdfWriter(fh) - - writer.add_metadata_file("file", BytesIO(b"content")) - writer.add_metadata_file("dir/file", BytesIO(b"content here too")) +def test_asdf_metadata(asdf_writer: AsdfWriter): + asdf_writer.add_metadata_file("file", BytesIO(b"content")) + asdf_writer.add_metadata_file("dir/file", BytesIO(b"content here too")) - writer.close() - fh.seek(0) + asdf_writer.close() + asdf_writer._fh.seek(0) - reader = AsdfSnapshot(fh) + reader = AsdfSnapshot(asdf_writer._fh) assert reader.metadata.names() == ["file", "dir/file"] assert reader.metadata.open("file").read() == b"content" From f8d4e9469fcd7f28f21bd95ce4485a1b73d9e62c Mon Sep 17 00:00:00 2001 From: Schamper <1254028+Schamper@users.noreply.github.com> Date: Mon, 14 Nov 2022 13:16:26 +0100 Subject: [PATCH 12/12] Add AsdfWriter to __init__.py --- dissect/evidence/asdf/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dissect/evidence/asdf/__init__.py b/dissect/evidence/asdf/__init__.py index e79ac16..43fc6d8 100644 --- a/dissect/evidence/asdf/__init__.py +++ b/dissect/evidence/asdf/__init__.py @@ -1,7 +1,8 @@ -from dissect.evidence.asdf.asdf import AsdfStream, FILE_MAGIC, AsdfSnapshot +from dissect.evidence.asdf.asdf import FILE_MAGIC, AsdfSnapshot, AsdfStream, AsdfWriter __all__ = [ + "FILE_MAGIC", "AsdfSnapshot", "AsdfStream", - "FILE_MAGIC", + "AsdfWriter", ]
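
To round the series off, a minimal read-side sketch using only what the patches
above export and exercise (the input file name is hypothetical):

    from dissect.evidence.asdf import AsdfSnapshot

    with open("evidence.asdf", "rb") as fh:  # hypothetical input file
        snapshot = AsdfSnapshot(fh)
        print(snapshot.guid, snapshot.timestamp)

        for name in snapshot.metadata.names():
            print("metadata entry:", name)

        # disks() yields only the non-reserved streams
        for disk in snapshot.disks():
            # Post-series table entries are (offset, size, file_offset, data_offset)
            for offset, size, _file_offset, _data_offset in disk.table:
                disk.seek(offset)
                data = disk.read(size)

Reads that land between runs come back as the 0xa5 0xdf sparse filler pattern
rather than raising, as the read assertions in the tests above demonstrate.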