diff --git a/pack/daggen/directory.go b/pack/daggen/directory.go
index 4b9c09899..744bd91a0 100644
--- a/pack/daggen/directory.go
+++ b/pack/daggen/directory.go
@@ -3,6 +3,7 @@ package daggen
 import (
 	"bytes"
 	"context"
+	"sort"
 
 	"github.com/cockroachdb/errors"
 	"github.com/data-preservation-programs/singularity/model"
@@ -356,18 +357,39 @@ func UnmarshalToBlocks(in []byte) ([]blocks.Block, error) {
 		return nil, errors.WithStack(err)
 	}
 
+	// Iterate Reals and Additional in CID-sorted order so the resulting
+	// CAR layout is deterministic across runs. Go map iteration is
+	// randomized; without a sort, the same DAG produces a different piece
+	// CID on every regeneration even though the root CID is stable.
 	blks := make([]blocks.Block, 0, len(data.Reals)+len(data.Additional))
-	for c, d := range data.Reals {
-		blk, _ := blocks.NewBlockWithCid(d, c)
+	for _, c := range sortedCids(data.Reals) {
+		blk, _ := blocks.NewBlockWithCid(data.Reals[c], c)
 		blks = append(blks, blk)
 	}
-	for c, d := range data.Additional {
-		blk, _ := blocks.NewBlockWithCid(d, c)
+	for _, c := range sortedCids(data.Additional) {
+		blk, _ := blocks.NewBlockWithCid(data.Additional[c], c)
 		blks = append(blks, blk)
 	}
 	return blks, nil
 }
 
+// sortedCids returns the keys of a cid→bytes map in ascending byte-wise
+// order of their binary CID representation. Used to make CAR layouts
+// deterministic.
+func sortedCids(m map[cid.Cid][]byte) []cid.Cid {
+	keys := make([]cid.Cid, 0, len(m))
+	for c := range m {
+		keys = append(keys, c)
+	}
+	// KeyString returns the binary CID as a string, avoiding the []byte
+	// copy that Bytes() would make on every comparison. Go compares strings
+	// byte-wise, so the order equals bytes.Compare on the Bytes() values.
+	sort.Slice(keys, func(i, j int) bool {
+		return keys[i].KeyString() < keys[j].KeyString()
+	})
+	return keys
+}
+
 // UnmarshalBinary deserializes binary data into the current DirectoryData object.
 // This method:
 // 1. Creates a new blockstore and DAG service.
diff --git a/pack/daggen/directory_test.go b/pack/daggen/directory_test.go
index 4613a13c0..a968da358 100644
--- a/pack/daggen/directory_test.go
+++ b/pack/daggen/directory_test.go
@@ -216,3 +216,46 @@ func TestResolveDirectoryTree(t *testing.T) {
 	require.Equal(t, "name", node.Links()[0].Name)
 	require.Equal(t, "test", node.Links()[1].Name)
 }
+
+// TestUnmarshalToBlocksDeterministic verifies that UnmarshalToBlocks returns
+// blocks in a stable, content-derived order across calls. Without this, Go's
+// randomized map iteration leaks into the CAR layout, producing a different
+// piece CID on every regeneration even when the underlying DAG is identical.
+func TestUnmarshalToBlocksDeterministic(t *testing.T) {
+	ctx := context.Background()
+
+	// Build a DirectoryData with many blocks. We write into the additional
+	// map directly because AddFile creates dummy nodes that get filtered
+	// out by UnmarshalToBlocks.
+	dirData := NewDirectoryData()
+	const blockCount = 50
+	for i := 0; i < blockCount; i++ {
+		c := cid.NewCidV1(cid.Raw, util.Hash([]byte(strconv.Itoa(i))))
+		dirData.additional[c] = []byte("block-data-" + strconv.Itoa(i))
+	}
+
+	marshaled, err := dirData.MarshalBinary(ctx)
+	require.NoError(t, err)
+
+	// Unmarshal multiple times and compare the block CID sequences. Each
+	// call must produce the same ordering or the resulting CAR file will
+	// be byte-different and the piece CID will drift.
+	first, err := UnmarshalToBlocks(marshaled)
+	require.NoError(t, err)
+	require.GreaterOrEqual(t, len(first), blockCount, "expected at least %d blocks", blockCount)
+
+	firstCids := make([]string, len(first))
+	for i, blk := range first {
+		firstCids[i] = blk.Cid().String()
+	}
+
+	for run := 0; run < 20; run++ {
+		blks, err := UnmarshalToBlocks(marshaled)
+		require.NoError(t, err)
+		require.Len(t, blks, len(first))
+		for i, blk := range blks {
+			require.Equal(t, firstCids[i], blk.Cid().String(),
+				"block order must be deterministic across UnmarshalToBlocks calls (run %d, position %d)", run, i)
+		}
+	}
+}