Skip to content

Commit

Permalink
support decompressing fixed huffman blocks
Browse files Browse the repository at this point in the history
Change-Id: I5a30394b46e113595336e89c954e03d3acb120fe
  • Loading branch information
garymm committed Mar 10, 2024
1 parent cdd7b97 commit 1b7d011
Show file tree
Hide file tree
Showing 7 changed files with 258 additions and 24 deletions.
24 changes: 22 additions & 2 deletions huffman/src/bit_span.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ namespace starflate::huffman {
/// A non-owning span of bits. Allows for iteration over the individual bits.
class bit_span : public std::ranges::view_interface<bit_span>
{

public:
// TODO: make private
const std::byte* data_{nullptr};
std::size_t bit_size_{};
std::uint8_t bit_offset_{}; // always less than CHAR_BIT

public:
/// An iterator over the bits in a bit_span.
class iterator : public detail::iterator_interface<iterator>
{
Expand Down Expand Up @@ -142,6 +143,25 @@ class bit_span : public std::ranges::view_interface<bit_span>

constexpr auto pop_16() -> std::uint16_t { return pop<std::uint16_t>(); }

/// Extracts the first n bits of this span and drops them from the view.
///
/// The first bit read lands in bit 0 (the least significant bit) of the
/// result, the second in bit 1, and so on.
///
/// @param n Number of bits to pop; at most 16.
/// @returns The popped bits packed into the low n bits of the result.
///
/// @pre this contains at least n bits.
///
constexpr auto pop_n(std::uint8_t n) -> std::uint16_t
{
  assert(n <= 16);
  assert(n <= bit_size_);

  std::uint16_t value{};
  auto cursor = begin();
  std::uint8_t shift{};
  while (shift < n) {
    if (static_cast<bool>(*cursor)) {
      value = static_cast<std::uint16_t>(value | (std::uint16_t{1} << shift));
    }
    cursor += 1;
    ++shift;
  }

  // consume() moves the start of the view and invalidates `cursor`, so it
  // has to run only after every bit has been read.
  consume(n);
  return value;
}

/// Consumes the given number of bits. Advances the start of the view.
///
/// @pre n <= std::ranges::size(*this)
Expand Down
48 changes: 38 additions & 10 deletions huffman/src/decode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,58 @@ namespace starflate::huffman {
/// @tparam Symbol The type of the symbols in the code table.
/// @tparam Extent The extent of the code table.
/// @tparam O The type of the output iterator.
template <symbol Symbol, std::size_t Extent, std::output_iterator<Symbol> O>
constexpr auto
decode(const table<Symbol, Extent>& code_table, bit_span bits, O output) -> O
{
  // Decode one symbol at a time until the input is exhausted or the
  // remaining bits no longer form a code from the table.
  while (!bits.empty()) {
    const auto result = decode_one(code_table, bits);
    if (result.encoded_size == 0) {
      // decode_one reports a size of 0 when it could not match a code.
      break;
    }
    *output = result.symbol;
    ++output;
    // decode_one takes the span by value, so the bits it matched still have
    // to be removed here.
    bits.consume(result.encoded_size);
  }
  return output;
}

Check warning on line 38 in huffman/src/decode.hpp

View check run for this annotation

Codecov / codecov/patch

huffman/src/decode.hpp#L28-L38

Added lines #L28 - L38 were not covered by tests

/// The outcome of decoding a single symbol from a bit stream.
///
/// @tparam Symbol The type of the symbols in the code table.
template <symbol Symbol>
struct decode_result
{
/// The symbol that was decoded.
Symbol symbol;
/// How many bits the symbol's code occupied. Callers in this file treat a
/// value of 0 as "nothing could be decoded".
std::uint8_t encoded_size;
};

/// Decodes a single symbol from the front of \p bits using \p code_table.
///
/// Bits are read one at a time and appended to a candidate code until
/// \p code_table reports a match. \p bits is taken by value, so the caller's
/// span is not advanced; the caller is expected to consume
/// `encoded_size` bits itself.
///
/// @param code_table The code table to use for decoding.
/// @param bits The bit stream to decode.
///
/// @returns The decoded symbol and how many bits its code was. If the bits
///   run out, or the table lookup signals `code_table.end()`, before a code
///   is matched, a value-initialized symbol with an encoded size of 0 is
///   returned.
/// @tparam Symbol The type of the symbols in the code table.
/// @tparam Extent The extent of the code table.
template <symbol Symbol, std::size_t Extent>
constexpr auto
decode_one(const table<Symbol, Extent>& code_table, bit_span bits)
    -> decode_result<Symbol>
{
  std::uint8_t bits_read{};
  code current_code{};
  auto code_table_pos = code_table.begin();
  for (auto bit : bits) {
    current_code << bit;
    bits_read++;
    auto found = code_table.find(current_code, code_table_pos);
    if (found) {
      return {(*found)->symbol, bits_read};
    }
    // On a miss, find() hands back a table position through error(); end()
    // is treated as "give up", anything else as where to resume the search.
    if (found.error() == code_table.end()) {
      break;
    }
    code_table_pos = found.error();
  }
  return {Symbol{}, 0};
}

} // namespace starflate::huffman
12 changes: 12 additions & 0 deletions huffman/test/bit_span_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ auto main() -> int
if (std::cmp_less(n, initial_bits.size())) {
expect(nth_bit(n) == bits[0]);
}
if (n == 0) {
expect(initial_bits.byte_data() == bits.byte_data());
}
} else {
expect(aborts([&] { bits.consume(n); }));
}
Expand Down Expand Up @@ -172,6 +175,15 @@ auto main() -> int
expect(eq(got_8, expected_8));

expect(aborts([&] { span.pop_8(); }));

span = huffman::bit_span{data};
const std::uint16_t got_5{span.pop_n(5)};
constexpr std::uint16_t expected_5{0b01010};
expect(eq(got_5, expected_5));

const std::uint16_t got_3{span.pop_n(3)};
constexpr std::uint16_t expected_3{0b101};
expect(eq(got_3, expected_3));
// NOLINTEND(readability-magic-numbers)
};
}
144 changes: 135 additions & 9 deletions src/decompress.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "decompress.hpp"

#include <cstdint>
#include <iostream>
#include <iterator>
#include <utility>

Expand Down Expand Up @@ -31,6 +32,125 @@ auto read_header(huffman::bit_span& compressed_bits)
return BlockHeader{final, type};
}

// RFC 1951 section 3.2.6: static (fixed) literal/length table
//
// literal/length   bitsize   code
// ==============   =======   =========================
//   0 - 143           8        0011'0000 -   1011'1111
// 144 - 255           9      1'1001'0000 - 1'1111'1111
// 256 - 279           7         000'0000 -    001'0111
// 280 - 287           8        1100'0000 -   1100'0111

// Number of symbols in the fixed literal/length alphabet (0 through 287).
constexpr std::size_t fixed_len_table_size = 288;

// Fixed literal/length table. Each entry pairs an inclusive symbol range
// with the bit size of its codes, mirroring the rows of the table above.
constexpr auto fixed_len_table = // clang-format off
huffman::table<std::uint16_t, fixed_len_table_size>{
huffman::symbol_bitsize,
{{{ 0, 143}, 8},
{{144, 255}, 9},
{{256, 279}, 7},
{{280, 287}, 8}}};
// clang-format on

// Number of symbols in the fixed distance alphabet (0 through 31).
constexpr std::size_t fixed_dist_table_size = 32;

// Fixed distance table: a single symbol range, 0 - 31, whose codes all have
// a bit size of 5.
constexpr auto fixed_dist_table = huffman::table<
std::uint16_t,
fixed_dist_table_size>{huffman::symbol_bitsize, {{{0, 31}, 5}}};

// RFC 1951 section 3.2.5: Compressed blocks (length and distance codes)
//
// Length table. Entry i is {extra_bits, base} for literal/length symbol
// 257 + i, so the 28 entries cover symbols 257 through 284. The match length
// is base plus the value of the extra bits. Symbol 285 has no entry here; it
// is handled separately by the decoder.
constexpr auto length_infos = std::array<LengthInfo, 28>{
{{0, 3}, {0, 4}, {0, 5}, {0, 6}, {0, 7}, {0, 8}, {0, 9},
{0, 10}, {1, 11}, {1, 13}, {1, 15}, {1, 17}, {2, 19}, {2, 23},
{2, 27}, {2, 31}, {3, 35}, {3, 43}, {3, 51}, {3, 59}, {4, 67},
{4, 83}, {4, 99}, {4, 115}, {5, 131}, {5, 163}, {5, 195}, {5, 227}}};

// Distance table, indexed directly by distance code (0 through 29). Each
// entry is {extra_bits, base}; the back-reference distance is base plus the
// value of the extra bits. The LengthInfo struct is reused for these pairs.
constexpr auto distance_infos = std::array<LengthInfo, 30>{
{{0, 1}, {0, 2}, {0, 3}, {0, 4}, {1, 5},
{1, 7}, {2, 9}, {2, 13}, {3, 17}, {3, 25},
{4, 33}, {4, 49}, {5, 65}, {5, 97}, {6, 129},
{6, 193}, {7, 257}, {7, 385}, {8, 513}, {8, 769},
{9, 1025}, {9, 1537}, {10, 2049}, {10, 3073}, {11, 4097},
{11, 6145}, {12, 8193}, {12, 12289}, {13, 16385}, {13, 24577}}};

/// Decompresses the body of one Huffman-coded block.
///
/// Repeatedly decodes a literal/length symbol from \p src_bits:
///  - a literal (below the end-of-block symbol) is appended to \p dst;
///  - the end-of-block symbol stops decoding;
///  - a length symbol is followed by its extra bits, a distance code and the
///    distance's extra bits, and copies `len` bytes starting `distance`
///    bytes behind the current write position.
///
/// @param src_bits Compressed bits, positioned at the first symbol of the
///   block. Every bit that is decoded is consumed from this span.
/// @param dst Output buffer; bytes are written at index \p dst_written.
/// @param dst_written Number of bytes of \p dst already filled. Advanced by
///   the number of bytes this call writes.
/// @param len_table Huffman table for literal/length symbols.
/// @param dist_table Huffman table for distance codes.
///
/// @returns `Success` once the end-of-block symbol is decoded. Otherwise:
///   `InvalidLitOrLen` if no literal/length symbol could be decoded or it is
///   out of range; `InvalidDistance` if no distance code could be decoded,
///   it is out of range, or it points before the start of \p dst;
///   `SrcTooSmall` if the input ends inside a symbol's extra bits;
///   `DstTooSmall` if \p dst cannot hold the output.
auto decompress_block_huffman(
    huffman::bit_span& src_bits,
    std::span<std::byte> dst,
    std::ptrdiff_t& dst_written,
    const huffman::table<std::uint16_t, fixed_len_table_size>& len_table,
    const huffman::table<std::uint16_t, fixed_dist_table_size>& dist_table)
    -> DecompressStatus
{
  // pop_n() requires the requested bits to be present, so truncated input
  // has to be detected before each call rather than tripping its assert.
  const auto has_bits = [&src_bits](std::uint8_t n) {
    return std::cmp_greater_equal(std::ranges::size(src_bits), n);
  };

  while (true) {
    const auto lit_or_len_decoded = huffman::decode_one(len_table, src_bits);
    if (not lit_or_len_decoded.encoded_size) {
      return DecompressStatus::InvalidLitOrLen;
    }
    const std::uint16_t lit_or_len = lit_or_len_decoded.symbol;
    src_bits.consume(lit_or_len_decoded.encoded_size);

    if (lit_or_len < detail::lit_or_len_end_of_block) {
      // Literal byte: make sure there is room before writing it.
      if (static_cast<std::size_t>(dst_written) >= dst.size()) {
        return DecompressStatus::DstTooSmall;
      }
      dst[static_cast<std::size_t>(dst_written++)] =
          static_cast<std::byte>(lit_or_len);
      continue;
    }
    if (lit_or_len == detail::lit_or_len_end_of_block) {
      break;
    }
    if (lit_or_len > detail::lit_or_len_max) {
      return DecompressStatus::InvalidLitOrLen;
    }

    // Length symbol: work out how many bytes the back-reference copies.
    std::uint16_t len{};
    if (lit_or_len == detail::lit_or_len_max) {
      // The largest symbol maps to a fixed length and has no table entry.
      len = detail::lit_or_len_max_decoded;
    } else {
      const auto len_idx = static_cast<std::size_t>(
          lit_or_len - detail::lit_or_len_end_of_block - 1);
      // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
      const auto& len_info = detail::length_infos[len_idx];
      if (not has_bits(len_info.extra_bits)) {
        return DecompressStatus::SrcTooSmall;
      }
      const auto extra_len = src_bits.pop_n(len_info.extra_bits);
      len = static_cast<std::uint16_t>(len_info.base + extra_len);
    }

    // Distance code: how far back the copy starts.
    const auto dist_decoded = huffman::decode_one(dist_table, src_bits);
    if (not dist_decoded.encoded_size) {
      return DecompressStatus::InvalidDistance;
    }
    const auto dist_code = dist_decoded.symbol;
    src_bits.consume(dist_decoded.encoded_size);
    if (dist_code >= detail::distance_infos.size()) {
      // The table can decode codes that have no entry in distance_infos.
      return DecompressStatus::InvalidDistance;
    }
    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
    const auto& dist_info = detail::distance_infos[dist_code];
    if (not has_bits(dist_info.extra_bits)) {
      return DecompressStatus::SrcTooSmall;
    }
    const auto distance = static_cast<std::uint16_t>(
        dist_info.base + src_bits.pop_n(dist_info.extra_bits));

    if (distance > dst_written) {
      // The reference would start before the beginning of the output.
      return DecompressStatus::InvalidDistance;
    }
    if (dst.size() - static_cast<std::size_t>(dst_written) < len) {
      return DecompressStatus::DstTooSmall;
    }
    starflate::detail::copy_n(
        dst.begin() + (dst_written - distance), len, dst.begin() + dst_written);
    dst_written += len;
  }
  return DecompressStatus::Success;
}

/// Copies n bytes from src to dst, repeating the source data when the two
/// ranges overlap (dst starts fewer than n bytes after src).
///
/// @param src Start of the bytes to copy from.
/// @param n Number of bytes to write at dst.
/// @param dst Where to write.
///
/// @pre dst is positioned strictly after src. If it is not, nothing is
///   copied (and the assertion fires in builds with assertions enabled).
void copy_n(
    std::span<const std::byte>::iterator src,
    std::uint16_t n,
    std::span<std::byte>::iterator dst)
{
  // With a gap of zero or less every chunk below would be empty (or
  // negative), the remaining count would never shrink, and the loop would
  // not terminate.
  assert(dst - src > 0 and "dst must be strictly after src");
  if (dst - src <= 0) {
    return;
  }

  std::ptrdiff_t n_signed{n};
  while (n_signed > 0) {
    // src is deliberately never advanced: after each chunk the bytes
    // between src and dst are whole repetitions of the original pattern, so
    // the next chunk can copy from the start again, and the usable source
    // grows with every pass.
    const auto n_to_copy = std::min(n_signed, dst - src);
    std::copy_n(src, n_to_copy, dst);
    n_signed -= n_to_copy;
    dst += n_to_copy;
  }
}

} // namespace detail

auto decompress(std::span<const std::byte> src, std::span<std::byte> dst)
Expand All @@ -39,7 +159,8 @@ auto decompress(std::span<const std::byte> src, std::span<std::byte> dst)
using enum detail::BlockType;

huffman::bit_span src_bits{src};
// std::size_t dst_written{};
// will always be >= 0, but signed type to minimize conversions.
std::ptrdiff_t dst_written{};
for (bool was_final = false; not was_final;) {
const auto header = detail::read_header(src_bits);
if (not header) {
Expand All @@ -58,22 +179,27 @@ auto decompress(std::span<const std::byte> src, std::span<std::byte> dst)
return DecompressStatus::SrcTooSmall;
}

if (dst.size() < len) {
if (dst.size() - static_cast<std::size_t>(dst_written) < len) {
return DecompressStatus::DstTooSmall;
}

std::copy_n(src_bits.byte_data(), len, dst.begin());
std::copy_n(src_bits.byte_data(), len, dst.begin() + dst_written);
src_bits.consume(CHAR_BIT * len);
dst = dst.subspan(len);
// dst_written += len;
dst_written += len;
} else if (header->type == FixedHuffman) {
const auto block_status = detail::decompress_block_huffman(
src_bits,
dst,
dst_written,
detail::fixed_len_table,
detail::fixed_dist_table);
if (block_status != DecompressStatus::Success) {
return block_status;
}

Check warning on line 198 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L197-L198

Added lines #L197 - L198 were not covered by tests
} else {
// TODO: implement
return DecompressStatus::Error;

Check warning on line 201 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L201

Added line #L201 was not covered by tests
}
const auto distance =
std::distance(std::ranges::data(src), src_bits.byte_data());
assert(distance >= 0 and "distance must be positive");
src = src.subspan(static_cast<size_t>(distance));
}
return DecompressStatus::Success;
}
Expand Down
27 changes: 27 additions & 0 deletions src/decompress.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "huffman/huffman.hpp"

#include <array>
#include <cstddef>
#include <expected>
#include <ranges>
Expand All @@ -18,6 +19,8 @@ enum class DecompressStatus : std::uint8_t
NoCompressionLenMismatch,
DstTooSmall,
SrcTooSmall,
InvalidLitOrLen,
InvalidDistance,
};

namespace detail {
Expand All @@ -37,6 +40,30 @@ struct BlockHeader

auto read_header(huffman::bit_span& compressed_bits)
-> std::expected<BlockHeader, DecompressStatus>;

/// One row of a length or distance lookup table: a decoded value is `base`
/// plus the number read from `extra_bits` further input bits.
struct LengthInfo
{
/// How many extra bits follow the symbol in the input.
std::uint8_t extra_bits;
/// The value the symbol stands for when all of its extra bits are zero.
std::uint16_t base;
};

// NOTE(review): no definition named `fixed_table` is visible in
// decompress.cpp (the tables there are `fixed_len_table` and
// `fixed_dist_table`) — confirm whether this declaration is still needed.
extern const huffman::table<std::uint16_t, 288> fixed_table;
// Extra-bit count and base length for each length symbol; defined in
// decompress.cpp.
extern const std::array<LengthInfo, 28> length_infos;
// Literal/length symbol that marks the end of a block; smaller symbols are
// literal bytes.
constexpr auto lit_or_len_end_of_block = std::uint16_t{256};
// Largest valid literal/length symbol.
constexpr auto lit_or_len_max = std::uint16_t{285};
// Match length that symbol `lit_or_len_max` stands for.
constexpr auto lit_or_len_max_decoded = std::uint16_t{258};

/// Copies n bytes from src to dst, repeating the source data if necessary.
///
/// From the standard section 3.2.3:
/// "Note also that the referenced string may overlap the current
/// position; for example, if the last 2 bytes decoded have values
/// X and Y, a string reference with <length = 5, distance = 2>
/// adds X,Y,X,Y,X to the output stream."
void copy_n(
std::span<const std::byte>::iterator src,
std::uint16_t n,
std::span<std::byte>::iterator dst);
} // namespace detail

/// Decompresses the given source data into the destination buffer.
Expand Down
1 change: 1 addition & 0 deletions src/test/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ cc_test(
timeout = "short",
srcs = ["decompress_test.cpp"],
data = [
":starfleet.html",
":starfleet.html.dynamic",
":starfleet.html.fixed",
],
Expand Down
Loading

0 comments on commit 1b7d011

Please sign in to comment.