Skip to content

Commit

Permalink
Merge-in SharedDictionary feature (#916)
Browse files Browse the repository at this point in the history
Co-authored-by: Eugene Kliuchnikov <eustas@chromium.org>
  • Loading branch information
eustas and Eugene Kliuchnikov committed Aug 4, 2021
1 parent 630b508 commit 19d86fb
Show file tree
Hide file tree
Showing 37 changed files with 3,066 additions and 38 deletions.
515 changes: 515 additions & 0 deletions c/common/shared_dictionary.c

Large diffs are not rendered by default.

74 changes: 74 additions & 0 deletions c/common/shared_dictionary_internal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/* Copyright 2017 Google Inc. All Rights Reserved.
Distributed under MIT license.
See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
*/

/* (Transparent) Shared Dictionary definition. */

#ifndef BROTLI_COMMON_SHARED_DICTIONARY_INTERNAL_H_
#define BROTLI_COMMON_SHARED_DICTIONARY_INTERNAL_H_

#include "./dictionary.h"
#include <brotli/shared_dictionary.h>
#include "./transform.h"
#include <brotli/types.h>

#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif

/* Internal layout of a shared dictionary: a list of LZ77 prefix chunks, a set
of word-list / transform-list combinations that can be selected by context,
storage for custom word and transform lists, and the allocator callbacks kept
alongside them. */
struct BrotliSharedDictionaryStruct {
/* LZ77 prefixes (compound dictionary). */
/* Number of entries currently in use in prefix_size[] and prefix[]. */
uint32_t num_prefix; /* max SHARED_BROTLI_MAX_COMPOUND_DICTS */
/* Size in bytes of each prefix; parallel to prefix[]. */
size_t prefix_size[SHARED_BROTLI_MAX_COMPOUND_DICTS];
/* Pointer to the first byte of each prefix.
NOTE(review): ownership of the pointed-to bytes is not visible in this
header - confirm against shared_dictionary.c. */
const uint8_t* prefix[SHARED_BROTLI_MAX_COMPOUND_DICTS];

/* If set, the context map is used to select the word list and transform list
from 64 contexts. If not set, the context map is not used and only words[0]
and transforms[0] are to be used. */
BROTLI_BOOL context_based;

/* Maps a context id to an index into words[] / transforms[]. Only consulted
when context_based is set. */
uint8_t context_map[SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS];

/* Number of word_list + transform_list combinations. */
uint8_t num_dictionaries;

/* Word list of each combination. Must use num_dictionaries values. */
const BrotliDictionary* words[SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS];

/* Transform list of each combination. Must use num_dictionaries values. */
const BrotliTransforms* transforms[SHARED_BROTLI_NUM_DICTIONARY_CONTEXTS];

/* Number of custom word lists. May be 0 if only Brotli's built-in word list
is used. */
uint8_t num_word_lists;

/* Contents of the custom word lists. Must be NULL if num_word_lists is 0. */
BrotliDictionary* words_instances;

/* Number of custom transform lists. May be 0 if only Brotli's built-in
transform list is used. */
uint8_t num_transform_lists;

/* Contents of the custom transform lists. Must be NULL if num_transform_lists
is 0. */
BrotliTransforms* transforms_instances;

/* Concatenated prefix_suffix_maps of the custom transform lists. Must be NULL
if num_transform_lists is 0. */
uint16_t* prefix_suffix_maps;

/* Memory management: allocator / deallocator callbacks and the opaque pointer
stored with them. */
brotli_alloc_func alloc_func;
brotli_free_func free_func;
void* memory_manager_opaque;
};

/* After this header is included, the name BrotliSharedDictionary expands to
BrotliSharedDictionaryInternal, i.e. to the struct defined above.
NOTE(review): presumably this lets library code reach the fields through the
public type name - confirm against <brotli/shared_dictionary.h>. */
typedef struct BrotliSharedDictionaryStruct BrotliSharedDictionaryInternal;
#define BrotliSharedDictionary BrotliSharedDictionaryInternal

#if defined(__cplusplus) || defined(c_plusplus)
} /* extern "C" */
#endif

#endif /* BROTLI_COMMON_SHARED_DICTIONARY_INTERNAL_H_ */
194 changes: 184 additions & 10 deletions c/dec/decode.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "../common/context.h"
#include "../common/dictionary.h"
#include "../common/platform.h"
#include "../common/shared_dictionary_internal.h"
#include "../common/transform.h"
#include "../common/version.h"
#include "./bit_reader.h"
Expand Down Expand Up @@ -42,8 +43,8 @@ extern "C" {
/* We need the slack region for the following reasons:
- doing up to two 16-byte copies for fast backward copying
- inserting transformed dictionary word:
5 prefix + 24 base + 8 suffix */
static const uint32_t kRingBufferWriteAheadSlack = 42;
255 prefix + 32 base + 255 suffix */
static const uint32_t kRingBufferWriteAheadSlack = 542;

static const uint8_t kCodeLengthCodeOrder[BROTLI_CODE_LENGTH_CODES] = {
1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15,
Expand Down Expand Up @@ -1403,6 +1404,114 @@ static BrotliDecoderErrorCode BROTLI_NOINLINE CopyUncompressedBlockToOutput(
BROTLI_DCHECK(0); /* Unreachable */
}

/* Appends one raw dictionary chunk to the decoder's compound dictionary,
   allocating the bookkeeping structure on first use.

   Only the pointer is recorded - the bytes are not copied - so `data` has to
   remain valid for as long as the decoder may copy from it.

   state: decoder state; must still be in BROTLI_STATE_UNINITED.
   data:  first byte of the chunk.
   size:  chunk length in bytes.

   Returns BROTLI_TRUE on success. Returns BROTLI_FALSE when decoding has
   already started, when the bookkeeping structure cannot be allocated, when
   15 chunks are already attached, or when adding `size` would make the total
   size unrepresentable in the `int` fields used for offsets. */
static BROTLI_BOOL AttachCompoundDictionary(
    BrotliDecoderState* state, const uint8_t* data, size_t size) {
  /* Largest total the `int` offset fields can hold (UINT_MAX / 2, which is
     INT_MAX on the usual two's-complement targets). */
  const size_t max_total_size = (size_t)(~0u >> 1);
  BrotliDecoderCompoundDictionary* addon = state->compound_dictionary;
  if (state->state != BROTLI_STATE_UNINITED) return BROTLI_FALSE;
  if (!addon) {
    addon = (BrotliDecoderCompoundDictionary*)BROTLI_DECODER_ALLOC(
        state, sizeof(BrotliDecoderCompoundDictionary));
    if (!addon) return BROTLI_FALSE;
    addon->num_chunks = 0;
    addon->total_size = 0;
    addon->br_length = 0;
    addon->br_copied = 0;
    /* -1 marks the block map as "not built yet"; it is built lazily. */
    addon->block_bits = -1;
    addon->chunk_offsets[0] = 0;
    state->compound_dictionary = addon;
  }
  /* chunk_offsets[] needs one slot more than chunks[] (the end offset of the
     last chunk), hence the limit of 15 for arrays of 16. */
  if (addon->num_chunks == 15) return BROTLI_FALSE;
  /* Reject sizes that would be truncated by the cast below or would overflow
     the signed running total; either would corrupt all later offset math. */
  if (size > max_total_size - (size_t)addon->total_size) return BROTLI_FALSE;
  addon->chunks[addon->num_chunks] = data;
  addon->num_chunks++;
  addon->total_size += (int)size;
  addon->chunk_offsets[addon->num_chunks] = addon->total_size;
  return BROTLI_TRUE;
}

/* Lazily builds the 256-slot block map of the compound dictionary.

   The address space [0, total_size) is cut into equally sized power-of-two
   blocks; for the first address of every block the map stores the index of a
   chunk that is not after the chunk containing that address, so a lookup only
   has to walk forward from it. Does nothing if the map was already built
   (block_bits != -1). */
static void EnsureCoumpoundDictionaryInitialized(BrotliDecoderState* state) {
  BrotliDecoderCompoundDictionary* dict = state->compound_dictionary;
  int shift;
  int chunk = 0;
  int block_start;
  if (dict->block_bits != -1) return;
  /* Find the bit width of the largest address, never going below 8 so that
     the remaining high bits index at most 256 = (1 << 8) map slots. */
  shift = 8;
  while (((dict->total_size - 1) >> shift) != 0) ++shift;
  shift -= 8;
  dict->block_bits = shift;
  for (block_start = 0; block_start < dict->total_size;
       block_start += 1 << shift) {
    /* Advance while the current chunk ends strictly before this block. */
    while (dict->chunk_offsets[chunk + 1] < block_start) ++chunk;
    dict->block_map[block_start >> shift] = (uint8_t)chunk;
  }
}

/* Prepares a copy of `length` bytes starting at `address` in the compound
   dictionary (the concatenation of all attached chunks).

   On success the pending-copy cursor (br_index / br_offset / br_length /
   br_copied) is set up for CopyFromCompoundDictionary, the current distance
   code is pushed into the recent-distances ring buffer, and `length` is
   subtracted from the remaining meta-block length.

   Returns BROTLI_FALSE, leaving the decoder state untouched, when the
   requested range does not lie entirely inside the compound dictionary. */
static BROTLI_BOOL InitializeCompoundDictionaryCopy(BrotliDecoderState* s,
    int address, int length) {
  BrotliDecoderCompoundDictionary* addon = s->compound_dictionary;
  int index;
  /* Validate before any table is indexed with `address`. The comparison is
     written as a subtraction so that it cannot overflow `int`, which
     `address + length` could for very large dictionaries. */
  if (address < 0 || address >= addon->total_size) return BROTLI_FALSE;
  if (length > addon->total_size - address) return BROTLI_FALSE;
  EnsureCoumpoundDictionaryInitialized(s);
  /* The block map yields a chunk at or before the target; walk forward to
     the chunk that actually contains `address`. */
  index = addon->block_map[address >> addon->block_bits];
  while (address >= addon->chunk_offsets[index + 1]) index++;
  /* Update the recent distances cache. */
  s->dist_rb[s->dist_rb_idx & 3] = s->distance_code;
  ++s->dist_rb_idx;
  s->meta_block_remaining_len -= length;
  addon->br_index = index;
  addon->br_offset = address - addon->chunk_offsets[index];
  addon->br_length = length;
  addon->br_copied = 0;
  return BROTLI_TRUE;
}

/* Returns the combined size in bytes of all attached compound dictionary
   chunks, or 0 when no compound dictionary has been attached. */
static int GetCompoundDictionarySize(BrotliDecoderState* s) {
  if (!s->compound_dictionary) {
    return 0;
  }
  return s->compound_dictionary->total_size;
}

/* Continues the pending compound dictionary copy, writing into the ring
   buffer starting at `pos`.

   Copies piece by piece, where each piece is limited by the bytes still
   requested, the bytes left in the current chunk, and the room left before
   the end of the ring buffer. Stops when the request is complete or when
   `pos` reaches ringbuffer_size; in the latter case the cursor stays in the
   compound dictionary state so that a later call can resume.

   Returns the number of bytes written by this call. */
static int CopyFromCompoundDictionary(BrotliDecoderState* s, int pos) {
  BrotliDecoderCompoundDictionary* dict = s->compound_dictionary;
  const int start = pos;
  while (dict->br_copied != dict->br_length) {
    const int chunk_begin = dict->chunk_offsets[dict->br_index];
    const int chunk_end = dict->chunk_offsets[dict->br_index + 1];
    const int chunk_left = (chunk_end - chunk_begin) - dict->br_offset;
    const int room = s->ringbuffer_size - pos;
    int n = dict->br_length - dict->br_copied;
    if (chunk_left < n) n = chunk_left;
    if (room < n) n = room;
    memcpy(&s->ringbuffer[pos],
           dict->chunks[dict->br_index] + dict->br_offset, (size_t)n);
    pos += n;
    dict->br_copied += n;
    if (n == chunk_left) {
      /* Current chunk exhausted: continue at the start of the next one. */
      dict->br_index++;
      dict->br_offset = 0;
    } else {
      dict->br_offset += n;
    }
    if (pos == s->ringbuffer_size) break;
  }
  return pos - start;
}

/* Attaches a dictionary to a decoder that has not started decoding yet.

   The data is handed to BrotliSharedDictionaryAttach; every LZ77 prefix that
   call adds to the shared dictionary is then registered with the decoder's
   compound dictionary.

   Returns BROTLI_FALSE if decoding has already started, if the shared
   dictionary rejects the data, or if a prefix cannot be registered;
   BROTLI_TRUE otherwise.

   NOTE(review): a comment on this commit reports that the header declares the
   `data` parameter in array form while this definition uses a pointer -
   confirm against <brotli/decode.h> and align the two if so. */
BROTLI_BOOL BrotliDecoderAttachDictionary(BrotliDecoderState* state,
    BrotliSharedDictionaryType type, size_t data_size, const uint8_t* data) {
  /* Remember how many prefixes existed so only the new ones are registered. */
  const uint32_t first_new_prefix = state->dictionary->num_prefix;
  uint32_t k = first_new_prefix;
  if (state->state != BROTLI_STATE_UNINITED) return BROTLI_FALSE;
  if (!BrotliSharedDictionaryAttach(state->dictionary, type, data_size, data)) {
    return BROTLI_FALSE;
  }
  while (k < state->dictionary->num_prefix) {
    const uint8_t* chunk = state->dictionary->prefix[k];
    const size_t chunk_size = state->dictionary->prefix_size[k];
    if (!AttachCompoundDictionary(state, chunk, chunk_size)) {
      return BROTLI_FALSE;
    }
    ++k;
  }
  return BROTLI_TRUE;
}

/* Calculates the smallest feasible ring buffer.
If we know the data size is small, do not allocate more ring buffer
Expand Down Expand Up @@ -1737,6 +1846,7 @@ static BROTLI_INLINE BrotliDecoderErrorCode ProcessCommandsInternal(
int i = s->loop_counter;
BrotliDecoderErrorCode result = BROTLI_DECODER_SUCCESS;
BrotliBitReader* br = &s->br;
int compound_dictionary_size = GetCompoundDictionarySize(s);

if (!CheckInputAmount(safe, br, 28)) {
result = BROTLI_DECODER_NEEDS_MORE_INPUT;
Expand Down Expand Up @@ -1903,20 +2013,75 @@ static BROTLI_INLINE BrotliDecoderErrorCode ProcessCommandsInternal(
pos, s->distance_code, i, s->meta_block_remaining_len));
return BROTLI_FAILURE(BROTLI_DECODER_ERROR_FORMAT_DISTANCE);
}
if (i >= BROTLI_MIN_DICTIONARY_WORD_LENGTH &&
i <= BROTLI_MAX_DICTIONARY_WORD_LENGTH) {
int address = s->distance_code - s->max_distance - 1;
const BrotliDictionary* words = s->dictionary;
const BrotliTransforms* transforms = s->transforms;
int offset = (int)s->dictionary->offsets_by_length[i];
uint32_t shift = s->dictionary->size_bits_by_length[i];

if (s->distance_code - s->max_distance - 1 < compound_dictionary_size) {
int address = compound_dictionary_size -
(s->distance_code - s->max_distance);
if (!InitializeCompoundDictionaryCopy(s, address, i)) {
return BROTLI_FAILURE(BROTLI_DECODER_ERROR_COMPOUND_DICTIONARY);
}
pos += CopyFromCompoundDictionary(s, pos);
if (pos >= s->ringbuffer_size) {
s->state = BROTLI_STATE_COMMAND_POST_WRITE_1;
goto saveStateAndReturn;
}
} else if (i >= SHARED_BROTLI_MIN_DICTIONARY_WORD_LENGTH &&
i <= SHARED_BROTLI_MAX_DICTIONARY_WORD_LENGTH) {
uint8_t p1 = s->ringbuffer[(pos - 1) & s->ringbuffer_mask];
uint8_t p2 = s->ringbuffer[(pos - 2) & s->ringbuffer_mask];
uint8_t dict_id = s->dictionary->context_based ?
s->dictionary->context_map[BROTLI_CONTEXT(p1, p2, s->context_lookup)]
: 0;
const BrotliDictionary* words = s->dictionary->words[dict_id];
const BrotliTransforms* transforms = s->dictionary->transforms[dict_id];
int offset = (int)words->offsets_by_length[i];
uint32_t shift = words->size_bits_by_length[i];
int address =
s->distance_code - s->max_distance - 1 - compound_dictionary_size;
int mask = (int)BitMask(shift);
int word_idx = address & mask;
int transform_idx = address >> shift;
/* Compensate double distance-ring-buffer roll. */
s->dist_rb_idx += s->distance_context;
offset += word_idx * i;
/* If the distance is out of bound, select a next static dictionary if
there exist multiple. */
if ((transform_idx >= (int)transforms->num_transforms ||
words->size_bits_by_length[i] == 0) &&
s->dictionary->num_dictionaries > 1) {
uint8_t dict_id2;
int dist_remaining = address -
(int)(((1u << shift) & ~1u)) * (int)transforms->num_transforms;
for (dict_id2 = 0; dict_id2 < s->dictionary->num_dictionaries;
dict_id2++) {
const BrotliDictionary* words2 = s->dictionary->words[dict_id2];
if (dict_id2 != dict_id && words2->size_bits_by_length[i] != 0) {
const BrotliTransforms* transforms2 =
s->dictionary->transforms[dict_id2];
uint32_t shift2 = words2->size_bits_by_length[i];
int num = (int)((1u << shift2) & ~1u) *
(int)transforms2->num_transforms;
if (dist_remaining < num) {
dict_id = dict_id2;
words = words2;
transforms = transforms2;
address = dist_remaining;
shift = shift2;
mask = (int)BitMask(shift);
word_idx = address & mask;
transform_idx = address >> shift;
offset = (int)words->offsets_by_length[i] + word_idx * i;
break;
}
dist_remaining -= num;
}
}
}
if (BROTLI_PREDICT_FALSE(words->size_bits_by_length[i] == 0)) {
BROTLI_LOG(("Invalid backward reference. pos: %d distance: %d "
"len: %d bytes left: %d\n",
pos, s->distance_code, i, s->meta_block_remaining_len));
return BROTLI_FAILURE(BROTLI_DECODER_ERROR_FORMAT_DICTIONARY);
}
if (BROTLI_PREDICT_FALSE(!words->data)) {
return BROTLI_FAILURE(BROTLI_DECODER_ERROR_DICTIONARY_NOT_SET);
}
Expand All @@ -1933,6 +2098,10 @@ static BROTLI_INLINE BrotliDecoderErrorCode ProcessCommandsInternal(
BROTLI_LOG(("[ProcessCommandsInternal] dictionary word: [%.*s],"
" transform_idx = %d, transformed: [%.*s]\n",
i, word, transform_idx, len, &s->ringbuffer[pos]));
if (len == 0 && s->distance_code <= 120) {
BROTLI_LOG(("Invalid length-0 dictionary word after transform\n"));
return BROTLI_FAILURE(BROTLI_DECODER_ERROR_FORMAT_TRANSFORM);
}
}
pos += len;
s->meta_block_remaining_len -= len;
Expand Down Expand Up @@ -2483,6 +2652,11 @@ BrotliDecoderResult BrotliDecoderDecompressStream(
s->max_distance = s->max_backward_distance;
}
if (s->state == BROTLI_STATE_COMMAND_POST_WRITE_1) {
BrotliDecoderCompoundDictionary* addon = s->compound_dictionary;
if (addon && (addon->br_length != addon->br_copied)) {
s->pos += CopyFromCompoundDictionary(s, s->pos);
if (s->pos >= s->ringbuffer_size) continue;
}
if (s->meta_block_remaining_len == 0) {
/* Next metablock, if any. */
s->state = BROTLI_STATE_METABLOCK_DONE;
Expand Down
10 changes: 8 additions & 2 deletions c/dec/state.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <stdlib.h> /* free, malloc */

#include "../common/dictionary.h"
#include <brotli/types.h>
#include "./huffman.h"

Expand Down Expand Up @@ -81,8 +82,10 @@ BROTLI_BOOL BrotliDecoderStateInit(BrotliDecoderState* s,

s->mtf_upper_bound = 63;

s->dictionary = BrotliGetDictionary();
s->transforms = BrotliGetTransforms();
s->compound_dictionary = NULL;
s->dictionary =
BrotliSharedDictionaryCreateInstance(alloc_func, free_func, opaque);
if (!s->dictionary) return BROTLI_FALSE;

return BROTLI_TRUE;
}
Expand Down Expand Up @@ -129,6 +132,9 @@ void BrotliDecoderStateCleanupAfterMetablock(BrotliDecoderState* s) {
/* Final cleanup of a decoder state: runs the per-metablock cleanup, then
releases the compound dictionary bookkeeping, the shared dictionary instance,
the ring buffer and the block type trees. */
void BrotliDecoderStateCleanup(BrotliDecoderState* s) {
BrotliDecoderStateCleanupAfterMetablock(s);

/* Only the bookkeeping struct is passed to the free macro here; the chunk
pointers stored inside it are not released individually by this function. */
BROTLI_DECODER_FREE(s, s->compound_dictionary);
/* The shared dictionary was created in BrotliDecoderStateInit; destroy it and
clear the pointer. */
BrotliSharedDictionaryDestroyInstance(s->dictionary);
s->dictionary = NULL;
BROTLI_DECODER_FREE(s, s->ringbuffer);
BROTLI_DECODER_FREE(s, s->block_type_trees);
}
Expand Down
19 changes: 17 additions & 2 deletions c/dec/state.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "../common/constants.h"
#include "../common/dictionary.h"
#include "../common/platform.h"
#include <brotli/shared_dictionary.h>
#include "../common/transform.h"
#include <brotli/types.h>
#include "./bit_reader.h"
Expand Down Expand Up @@ -189,6 +190,20 @@ typedef enum {
BROTLI_STATE_READ_BLOCK_LENGTH_SUFFIX
} BrotliRunningReadBlockLengthState;

/* BrotliDecoderState addon, used for Compound Dictionary functionality.
Describes the attached raw dictionary chunks as one concatenated address space
and holds the cursor of a copy that is in progress. */
typedef struct BrotliDecoderCompoundDictionary {
/* Number of attached chunks; the decoder accepts at most 15. */
int num_chunks;
/* Sum of the sizes of all attached chunks. */
int total_size;
/* Cursor of the copy in progress: current chunk index and the offset inside
that chunk. */
int br_index;
int br_offset;
/* Requested length of the copy in progress and the number of bytes copied so
far; the copy is finished when the two are equal. */
int br_length;
int br_copied;
/* Start pointer of every chunk. The bytes are referenced, not copied. */
const uint8_t* chunks[16];
/* chunk_offsets[i] is the start address of chunk i in the concatenated
space; chunk_offsets[num_chunks] equals total_size. */
int chunk_offsets[16];
/* log2 of the block size used by block_map; -1 until the map is built. */
int block_bits;
/* block_map[address >> block_bits] is a chunk index from which the search for
the chunk containing `address` walks forward. */
uint8_t block_map[256];
} BrotliDecoderCompoundDictionary;

typedef struct BrotliMetablockHeaderArena {
BrotliRunningTreeGroupState substate_tree_group;
BrotliRunningContextMapState substate_context_map;
Expand Down Expand Up @@ -327,8 +342,8 @@ struct BrotliDecoderStateStruct {
uint8_t* context_map;
uint8_t* context_modes;

const BrotliDictionary* dictionary;
const BrotliTransforms* transforms;
BrotliSharedDictionary* dictionary;
BrotliDecoderCompoundDictionary* compound_dictionary;

uint32_t trivial_literal_contexts[8]; /* 256 bits */

Expand Down
Loading

1 comment on commit 19d86fb

@markmi
Copy link

@markmi markmi commented on 19d86fb Nov 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When BrotliDecoderAttachDictionary and BrotliEncoderPrepareDictionary were added in the commit above, a mismatch was introduced between their declarations in the .h files (VLA-style array parameters) and their definitions in the .c files (plain pointers), and it has not been fixed. It might be a good idea to check for this kind of mismatch, if there is a reasonable way to do so. (Similar mismatches have been introduced in the past and fixed later.)

See: #893 for an example that was applied: 0a3944c .

Looks like #929 ( 27dd726 ) is a pull request for the specifics here.

Please sign in to comment.