diff --git a/blake3/chunk_state.go b/blake3/chunk_state.go index 49e7786..a77a124 100644 --- a/blake3/chunk_state.go +++ b/blake3/chunk_state.go @@ -34,7 +34,7 @@ func (cs *chunkState) length() int { return (BLAKE3_BLOCK_LEN * int(cs.blocks_compressed)) + (int(cs.buf_len)) } -func (cs *chunkState) fill_buf(input []byte) int { +func (cs *chunkState) fillBuf(input []byte) int { input_len := len(input) var take int = BLAKE3_BLOCK_LEN - int(cs.buf_len) @@ -49,7 +49,7 @@ func (cs *chunkState) fill_buf(input []byte) int { return take } -func (cs *chunkState) maybe_start_flag() uint8 { +func (cs *chunkState) maybeStartFlag() uint8 { if cs.blocks_compressed == 0 { return CHUNK_START } else { @@ -61,7 +61,7 @@ func (cs *chunkState) update(input []byte) { input_len := len(input) if cs.buf_len > 0 { - var take int = cs.fill_buf(input) + var take int = cs.fillBuf(input) input = input[take:] input_len -= take @@ -72,7 +72,7 @@ func (cs *chunkState) update(input []byte) { cs.buf, BLAKE3_BLOCK_LEN, cs.chunk_counter, - cs.flags | cs.maybe_start_flag(), + cs.flags | cs.maybeStartFlag(), ) cs.blocks_compressed += 1 @@ -89,7 +89,7 @@ func (cs *chunkState) update(input []byte) { tmp, BLAKE3_BLOCK_LEN, cs.chunk_counter, - cs.flags | cs.maybe_start_flag(), + cs.flags | cs.maybeStartFlag(), ) cs.blocks_compressed += 1 @@ -98,13 +98,13 @@ func (cs *chunkState) update(input []byte) { input_len -= BLAKE3_BLOCK_LEN } - var take int = cs.fill_buf(input) + var take int = cs.fillBuf(input) input = input[take:] input_len -= take } func (cs *chunkState) output() output { - var block_flags uint8 = cs.flags | cs.maybe_start_flag() | CHUNK_END + var block_flags uint8 = cs.flags | cs.maybeStartFlag() | CHUNK_END return newOutput( cs.cv, cs.buf, @@ -113,191 +113,3 @@ func (cs *chunkState) output() output { block_flags, ) } - -func parent_output(block [BLAKE3_BLOCK_LEN]byte, key [8]uint32, flags uint8) output { - return newOutput(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT) -} - -func left_len(content_len int) int { - var full_chunks int = (content_len - 1) / BLAKE3_CHUNK_LEN - return int(round_down_to_power_of_2(uint64(full_chunks))) * BLAKE3_CHUNK_LEN -} - -func compress_chunks_parallel( - input []byte, - key [8]uint32, - chunk_counter uint64, - flags uint8, -) (out []byte, array_len int) { - input_len := len(input) - - var chunks_array [MAX_SIMD_DEGREE][]byte - var input_position int = 0 - var chunks_array_len int = 0 - - for (input_len - input_position) >= BLAKE3_CHUNK_LEN { - chunks_array[chunks_array_len] = make([]byte, BLAKE3_CHUNK_LEN) - copy(chunks_array[chunks_array_len], input[input_position:]) - - input_position += BLAKE3_CHUNK_LEN - chunks_array_len += 1 - } - - buf := blake3_hash_many( - chunks_array[:], - chunks_array_len, - BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, - key, - chunk_counter, - true, - flags, - CHUNK_START, - CHUNK_END, - ) - - out = append(out, buf...) - - if input_len > input_position { - var counter uint64 = chunk_counter + uint64(chunks_array_len) - chunk_state := newChunkState(key, flags) - - chunk_state.chunk_counter = counter - chunk_state.update(input[input_position:]) - - tmp := [32]byte{} - - output := chunk_state.output() - output.chaining_value(&tmp); - - out = append(out, tmp[:]...) 
- - return out, chunks_array_len + 1 - } else { - return out, chunks_array_len - } -} - -func compress_parents_parallel( - child_chaining_values []byte, - num_chaining_values int, - key [8]uint32, - flags uint8, -) (out []byte, array_len int) { - var parents_array [MAX_SIMD_DEGREE_OR_2][]byte - var parents_array_len int = 0 - - for (num_chaining_values - (2 * parents_array_len)) >= 2 { - parents_array[parents_array_len] = make([]byte, 2 * BLAKE3_OUT_LEN) - copy(parents_array[parents_array_len], child_chaining_values[2 * BLAKE3_OUT_LEN * parents_array_len:]) - parents_array_len += 1 - } - - buf := blake3_hash_many( - parents_array[:], - parents_array_len, - 1, - key, - 0, - false, - flags | PARENT, - 0, - 0, - ) - - out = append(out, buf...) - - if num_chaining_values > 2 * parents_array_len { - out = append(out, child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN:2 * parents_array_len * BLAKE3_OUT_LEN+BLAKE3_OUT_LEN]...) - return out, parents_array_len + 1 - } else { - return out, parents_array_len - } -} - -func blake3_compress_subtree_wide( - input []byte, - key [8]uint32, - chunk_counter uint64, - flags uint8, -) (out []byte, array_len int) { - input_len := len(input) - - if input_len <= BLAKE3_CHUNK_LEN { - return compress_chunks_parallel( - input, - key, - chunk_counter, - flags, - ) - } - - var left_input_len int = left_len(input_len) - var right_input_len int = input_len - left_input_len - var right_input []byte = input[left_input_len:] - var right_chunk_counter uint64 = chunk_counter + uint64(left_input_len / BLAKE3_CHUNK_LEN) - - var cv_array [2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]byte - var degree int = 1 - if left_input_len > BLAKE3_CHUNK_LEN && degree == 1 { - degree = 2 - } - - left_cvs, left_n := blake3_compress_subtree_wide( - input[:left_input_len], - key, - chunk_counter, - flags, - ) - right_cvs, right_n := blake3_compress_subtree_wide( - right_input[:right_input_len], - key, - right_chunk_counter, - flags, - ) - - copy(cv_array[:], left_cvs) - copy(cv_array[degree * BLAKE3_OUT_LEN:], right_cvs) - - if left_n == 1 { - out = cv_array[:2 * BLAKE3_OUT_LEN] - return out, 2 - } - - var num_chaining_values int = left_n + right_n - return compress_parents_parallel( - cv_array[:], - num_chaining_values, - key, - flags, - ) -} - -func compress_subtree_to_parent_node( - input []byte, - key [8]uint32, - chunk_counter uint64, - flags uint8, -) (out [2 * BLAKE3_OUT_LEN]byte) { - var cv_array [MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]byte - cv_array_tmp, num_cvs := blake3_compress_subtree_wide( - input, - key, - chunk_counter, - flags, - ) - copy(cv_array[:], cv_array_tmp) - - var out_array [MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]byte - var out_array_tmp []byte - - for num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2 { - out_array_tmp, num_cvs = compress_parents_parallel(cv_array[:], num_cvs, key, flags) - copy(out_array[:], out_array_tmp) - copy(cv_array[:], out_array[:num_cvs * BLAKE3_OUT_LEN]) - } - - copy(out[:], cv_array[:2 * BLAKE3_OUT_LEN]) - - return -} - diff --git a/blake3/compress.go b/blake3/compress.go new file mode 100644 index 0000000..c702d0e --- /dev/null +++ b/blake3/compress.go @@ -0,0 +1,179 @@ +package blake3 + +func parent_output(block [BLAKE3_BLOCK_LEN]byte, key [8]uint32, flags uint8) output { + return newOutput(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT) +} + +func left_len(content_len int) int { + var full_chunks int = (content_len - 1) / BLAKE3_CHUNK_LEN + return int(round_down_to_power_of_2(uint64(full_chunks))) * BLAKE3_CHUNK_LEN +} + +func 
compress_chunks_parallel( + input []byte, + key [8]uint32, + chunk_counter uint64, + flags uint8, +) (out []byte, array_len int) { + input_len := len(input) + + var chunks_array [1][]byte + var input_position int = 0 + var chunks_array_len int = 0 + + for (input_len - input_position) >= BLAKE3_CHUNK_LEN { + chunks_array[chunks_array_len] = make([]byte, BLAKE3_CHUNK_LEN) + copy(chunks_array[chunks_array_len], input[input_position:]) + + input_position += BLAKE3_CHUNK_LEN + chunks_array_len += 1 + } + + buf := blake3_hash_many( + chunks_array[:], + chunks_array_len, + BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, + key, + chunk_counter, + true, + flags, + CHUNK_START, + CHUNK_END, + ) + + out = append(out, buf...) + + if input_len > input_position { + var counter uint64 = chunk_counter + uint64(chunks_array_len) + chunk_state := newChunkState(key, flags) + + chunk_state.chunk_counter = counter + chunk_state.update(input[input_position:]) + + tmp := [32]byte{} + + output := chunk_state.output() + output.chainingValue(&tmp) + + out = append(out, tmp[:]...) + + return out, chunks_array_len + 1 + } else { + return out, chunks_array_len + } +} + +func compress_parents_parallel( + child_chaining_values []byte, + num_chaining_values int, + key [8]uint32, + flags uint8, +) (out []byte, array_len int) { + var parents_array [2][]byte + var parents_array_len int = 0 + + for (num_chaining_values - (2 * parents_array_len)) >= 2 { + parents_array[parents_array_len] = make([]byte, 2 * BLAKE3_OUT_LEN) + copy(parents_array[parents_array_len], child_chaining_values[2 * BLAKE3_OUT_LEN * parents_array_len:]) + parents_array_len += 1 + } + + buf := blake3_hash_many( + parents_array[:], + parents_array_len, + 1, + key, + 0, + false, + flags | PARENT, + 0, + 0, + ) + + out = append(out, buf...) + + if num_chaining_values > 2 * parents_array_len { + out = append(out, child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN:2 * parents_array_len * BLAKE3_OUT_LEN+BLAKE3_OUT_LEN]...) 
+ return out, parents_array_len + 1 + } else { + return out, parents_array_len + } +} + +func compress_subtree_wide( + input []byte, + key [8]uint32, + chunk_counter uint64, + flags uint8, +) (out []byte, array_len int) { + input_len := len(input) + + if input_len <= BLAKE3_CHUNK_LEN { + return compress_chunks_parallel( + input, + key, + chunk_counter, + flags, + ) + } + + var left_input_len int = left_len(input_len) + var right_input_len int = input_len - left_input_len + var right_input []byte = input[left_input_len:] + var right_chunk_counter uint64 = chunk_counter + uint64(left_input_len / BLAKE3_CHUNK_LEN) + + var cv_array [2 * 2 * BLAKE3_OUT_LEN]byte + var degree int = 1 + if left_input_len > BLAKE3_CHUNK_LEN { + degree = 2 + } + + left_cvs, left_n := compress_subtree_wide( + input[:left_input_len], + key, + chunk_counter, + flags, + ) + right_cvs, right_n := compress_subtree_wide( + right_input[:right_input_len], + key, + right_chunk_counter, + flags, + ) + + copy(cv_array[:], left_cvs) + copy(cv_array[degree * BLAKE3_OUT_LEN:], right_cvs) + + if left_n == 1 { + out = cv_array[:2 * BLAKE3_OUT_LEN] + return out, 2 + } + + var num_chaining_values int = left_n + right_n + return compress_parents_parallel( + cv_array[:], + num_chaining_values, + key, + flags, + ) +} + +func compress_subtree_to_parent_node( + input []byte, + key [8]uint32, + chunk_counter uint64, + flags uint8, +) (out [2 * BLAKE3_OUT_LEN]byte) { + var cv_array [2 * BLAKE3_OUT_LEN]byte + cvs, _ := compress_subtree_wide( + input, + key, + chunk_counter, + flags, + ) + + copy(cv_array[:], cvs) + copy(out[:], cv_array[:2 * BLAKE3_OUT_LEN]) + + return +} diff --git a/blake3/digest.go b/blake3/digest.go index e8884d8..7650127 100644 --- a/blake3/digest.go +++ b/blake3/digest.go @@ -67,8 +67,8 @@ func (d *digest) Write(p []byte) (nn int, err error) { output := d.chunk.output() var chunk_cv [32]byte - output.chaining_value(&chunk_cv) - d.push_cv(chunk_cv, d.chunk.chunk_counter) + output.chainingValue(&chunk_cv) + d.pushCV(chunk_cv, d.chunk.chunk_counter) d.chunk.reset(d.key, d.chunk.chunk_counter + 1) } else { return @@ -93,8 +93,8 @@ func (d *digest) Write(p []byte) (nn int, err error) { output := chunk_state.output() var cv [BLAKE3_OUT_LEN]byte - output.chaining_value(&cv) - d.push_cv(cv, chunk_state.chunk_counter) + output.chainingValue(&cv) + d.pushCV(cv, chunk_state.chunk_counter) } else { var cv_pair [2 * BLAKE3_OUT_LEN]byte cv_pair = compress_subtree_to_parent_node( @@ -105,10 +105,10 @@ func (d *digest) Write(p []byte) (nn int, err error) { ) copy(tmp_cv[:], cv_pair[:]) - d.push_cv(tmp_cv, d.chunk.chunk_counter) + d.pushCV(tmp_cv, d.chunk.chunk_counter) copy(tmp_cv[:], cv_pair[BLAKE3_OUT_LEN:]) - d.push_cv(tmp_cv, d.chunk.chunk_counter + (subtree_chunks / 2)) + d.pushCV(tmp_cv, d.chunk.chunk_counter + (subtree_chunks / 2)) } d.chunk.chunk_counter += subtree_chunks @@ -118,7 +118,7 @@ func (d *digest) Write(p []byte) (nn int, err error) { if input_len > 0 { d.chunk.update(input_bytes[:input_len]) - d.merge_cv_stack(d.chunk.chunk_counter) + d.mergeCvStack(d.chunk.chunk_counter) } return @@ -135,12 +135,12 @@ func (d *digest) Sum(in []byte) []byte { func (d *digest) checkSum() (out []byte) { out = make([]byte, d.hs) - d.finalize_seek(0, out) + d.finalizeSeek(0, out) return } -func (d *digest) finalize_seek(seek uint64, out []byte) { +func (d *digest) finalizeSeek(seek uint64, out []byte) { out_len := len(out) if out_len == 0 { @@ -149,7 +149,7 @@ func (d *digest) finalize_seek(seek uint64, out []byte) { if d.cv_stack_len 
== 0 { output := d.chunk.output() - output.root_bytes(seek, out, out_len) + output.rootBytes(seek, out, out_len) return } @@ -177,16 +177,16 @@ func (d *digest) finalize_seek(seek uint64, out []byte) { copy(parent_block[:], d.cv_stack[cvs_remaining * 32 : cvs_remaining * 32 + 32]) - outbuf.chaining_value(&tmp) + outbuf.chainingValue(&tmp) copy(parent_block[32:], tmp[:]) outbuf = parent_output(parent_block, d.key, d.chunk.flags) } - outbuf.root_bytes(seek, out, out_len) + outbuf.rootBytes(seek, out, out_len) } -func (d *digest) merge_cv_stack(total_len uint64) { +func (d *digest) mergeCvStack(total_len uint64) { post_merge_stack_len := int(popcnt(total_len)) var parent_node [BLAKE3_BLOCK_LEN]byte @@ -195,7 +195,7 @@ func (d *digest) merge_cv_stack(total_len uint64) { output := parent_output(parent_node, d.key, d.chunk.flags) var cv [32]byte - output.chaining_value(&cv) + output.chainingValue(&cv) copy(d.cv_stack[(d.cv_stack_len - 2) * BLAKE3_OUT_LEN:(d.cv_stack_len - 2) * BLAKE3_OUT_LEN+BLAKE3_BLOCK_LEN], cv[:]) @@ -203,11 +203,11 @@ func (d *digest) merge_cv_stack(total_len uint64) { } } -func (d *digest) push_cv( +func (d *digest) pushCV( new_cv [BLAKE3_OUT_LEN]byte, chunk_counter uint64, ) { - d.merge_cv_stack(chunk_counter) + d.mergeCvStack(chunk_counter) copy(d.cv_stack[d.cv_stack_len * BLAKE3_OUT_LEN:], new_cv[:BLAKE3_OUT_LEN]) d.cv_stack_len += 1 } diff --git a/blake3/output.go b/blake3/output.go index 9f77c59..4a8127e 100644 --- a/blake3/output.go +++ b/blake3/output.go @@ -26,7 +26,7 @@ func newOutput( return ret } -func (o *output) chaining_value(cv *[32]byte) { +func (o *output) chainingValue(cv *[32]byte) { var cv_words [8]uint32 copy(cv_words[:], o.input_cv[:]) @@ -42,7 +42,7 @@ func (o *output) chaining_value(cv *[32]byte) { copy(cv[:], buf) } -func (o *output) root_bytes(seek uint64, out []byte, out_len int) { +func (o *output) rootBytes(seek uint64, out []byte, out_len int) { var output_block_counter uint64 = seek / 64; var offset_within_block int = int(seek % 64) var wide_buf [64]byte diff --git a/blake3/portable.go b/blake3/portable.go index 2cebeab..0192ceb 100644 --- a/blake3/portable.go +++ b/blake3/portable.go @@ -1,12 +1,12 @@ package blake3 func compress_pre( - state *[16]uint32, - cv [8]uint32, - block [BLAKE3_BLOCK_LEN]byte, + state *[16]uint32, + cv [8]uint32, + block [BLAKE3_BLOCK_LEN]byte, block_len uint8, - counter uint64, - flags uint8, + counter uint64, + flags uint8, ) { var block_words [16]uint32 block_words[0] = getu32(block[4 * 0:]) @@ -53,11 +53,11 @@ func compress_pre( } func blake3_compress_in_place( - cv *[8]uint32, - block [BLAKE3_BLOCK_LEN]byte, + cv *[8]uint32, + block [BLAKE3_BLOCK_LEN]byte, block_len uint8, - counter uint64, - flags uint8, + counter uint64, + flags uint8, ) { var state [16]uint32 compress_pre(&state, *cv, block, block_len, counter, flags) @@ -73,12 +73,12 @@ func blake3_compress_in_place( } func blake3_compress_xof( - cv [8]uint32, - block [BLAKE3_BLOCK_LEN]byte, + cv [8]uint32, + block [BLAKE3_BLOCK_LEN]byte, block_len uint8, - counter uint64, - flags uint8, - out *[64]byte, + counter uint64, + flags uint8, + out *[64]byte, ) { var state [16]uint32 compress_pre(&state, cv, block, block_len, counter, flags) @@ -102,13 +102,13 @@ func blake3_compress_xof( } func hash_one( - input []byte, - blocks int, - key [8]uint32, - counter uint64, - flags uint8, + input []byte, + blocks int, + key [8]uint32, + counter uint64, + flags uint8, flags_start uint8, - flags_end uint8, + flags_end uint8, ) (out [BLAKE3_OUT_LEN]byte) { var cv 
[8]uint32
 	copy(cv[:], key[:])
@@ -145,15 +145,15 @@ func hash_one(
 }
 
 func blake3_hash_many(
-	inputs [][]byte, 
-	num_inputs int, 
-	blocks int, 
-	key [8]uint32, 
-	counter uint64, 
+	inputs [][]byte,
+	num_inputs int,
+	blocks int,
+	key [8]uint32,
+	counter uint64,
 	increment_counter bool,
-	flags uint8, 
-	flags_start uint8, 
-	flags_end uint8, 
+	flags uint8,
+	flags_start uint8,
+	flags_end uint8,
 ) (out []byte) {
 	var block [BLAKE3_OUT_LEN]byte
diff --git a/blake3/sbox.go b/blake3/sbox.go
index f0619bb..e370044 100644
--- a/blake3/sbox.go
+++ b/blake3/sbox.go
@@ -5,8 +5,6 @@ const BLAKE3_OUT_LEN = 32
 const BLAKE3_BLOCK_LEN = 64
 const BLAKE3_CHUNK_LEN = 1024
 const BLAKE3_MAX_DEPTH = 54
-const MAX_SIMD_DEGREE = 1
-const MAX_SIMD_DEGREE_OR_2 = 2
 
 const (
 	CHUNK_START = 1 << 0
diff --git a/siphash/siphash.go b/siphash/siphash.go
index 7e9405c..bba15fa 100644
--- a/siphash/siphash.go
+++ b/siphash/siphash.go
@@ -10,13 +10,13 @@ func New(k []byte) hash.Hash {
 	return h
}
 
-// return 8 bytes
+// New64 returns a new hash.Hash computing the 64-bit SipHash checksum.
 func New64(k []byte) hash.Hash {
 	h, _ := NewWithCDroundsAndHashSize(k, 0, 0, HashSize64)
 	return h
}

-// New alias, return 16 bytes
+// New128 returns a new hash.Hash computing the 128-bit SipHash checksum.
 func New128(k []byte) hash.Hash {
 	h, _ := NewWithCDroundsAndHashSize(k, 0, 0, HashSize128)
 	return h
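
Reviewer note (illustrative, not part of the patch): the split rule in `left_len` decides how `compress_subtree_wide` divides its input, and it is easy to sanity-check in isolation. Below is a minimal, self-contained sketch of the same rule; `chunkLen` stands in for `BLAKE3_CHUNK_LEN`, and `roundDownToPowerOf2` is a stand-in for this package's `round_down_to_power_of_2`, which is not shown in this diff, so its assumed behavior (largest power of two <= x) is a guess from how `left_len` uses it.

```go
package main

import (
	"fmt"
	"math/bits"
)

// chunkLen stands in for BLAKE3_CHUNK_LEN (1024).
const chunkLen = 1024

// roundDownToPowerOf2 models the package's round_down_to_power_of_2
// helper (assumed: largest power of two <= x, for x >= 1).
func roundDownToPowerOf2(x uint64) uint64 {
	return uint64(1) << (63 - bits.LeadingZeros64(x))
}

// leftLen mirrors left_len in compress.go: for an input longer than one
// chunk, the left subtree takes the largest power-of-two number of full
// chunks that still leaves at least one byte for the right subtree.
func leftLen(contentLen int) int {
	fullChunks := (contentLen - 1) / chunkLen
	return int(roundDownToPowerOf2(uint64(fullChunks))) * chunkLen
}

func main() {
	for _, n := range []int{1025, 2048, 3072, 5000} {
		fmt.Printf("input %4d bytes -> left %4d, right %4d\n",
			n, leftLen(n), n-leftLen(n))
	}
}
```

The power-of-two left split keeps every left subtree a complete binary tree, which is the property the chaining-value stack in digest.go relies on.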
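A second sketch, for the digest side: `pushCV` / `mergeCvStack` keep one chaining value per pending subtree, and the stack depth is governed by the popcount of the chunk counter. The toy model below uses strings as stand-in CVs and parenthesizing as a stand-in for parent compression; only the merge discipline is taken from digest.go.

```go
package main

import (
	"fmt"
	"math/bits"
)

// A toy model of digest.pushCV / mergeCvStack. Merging is deliberately
// lazy: pushCV first folds the stack down to popcnt(chunkCounter)
// entries, then pushes, so completed subtrees are only joined once a
// later chunk proves they are full.
func main() {
	var stack []string
	pushCV := func(cv string, chunkCounter uint64) {
		// mergeCvStack: fold the top two entries into a parent until
		// only popcnt(chunkCounter) entries remain.
		for len(stack) > bits.OnesCount64(chunkCounter) {
			l := len(stack)
			parent := "(" + stack[l-2] + "+" + stack[l-1] + ")"
			stack = append(stack[:l-2], parent)
		}
		stack = append(stack, cv)
	}

	for c := uint64(0); c < 6; c++ {
		pushCV(fmt.Sprintf("c%d", c), c)
		fmt.Printf("after chunk %d: %v\n", c+1, stack)
	}
}
```

The laziness is why the stack is only fully collapsed in `finalizeSeek`: the last parent compression must be the one that produces root output, so it cannot be performed eagerly during `Write`.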