Skip to content

Commit

Permalink
chunked: improve function to merge chunks
Browse files Browse the repository at this point in the history
Improve the function that combines neighboring chunks. Instead of using
the number of parts, which also includes local files, use only the
number of chunks that must be retrieved from the network.

In addition, introduce a gap threshold below which adjacent chunks are
merged automatically, so that we further reduce the number of requested
ranges.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
  • Loading branch information
giuseppe committed Feb 28, 2024
1 parent a083950 commit 631f5e8
Showing 1 changed file with 53 additions and 31 deletions.
84 changes: 53 additions & 31 deletions pkg/chunked/storage_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (

const (
maxNumberMissingChunks = 1024
autoMergePartsThreshold = 128 // if the gap between two ranges is below this threshold, automatically merge them.
newFileFlags = (unix.O_CREAT | unix.O_TRUNC | unix.O_EXCL | unix.O_WRONLY)
containersOverrideXattr = "user.containers.override_stat"
bigDataKey = "zstd-chunked-manifest"
Expand Down Expand Up @@ -1184,18 +1185,8 @@ func mergeMissingChunks(missingParts []missingPart, target int) []missingPart {
prev := missingParts[i-1].SourceChunk.Offset + missingParts[i-1].SourceChunk.Length
return int(missingParts[i].SourceChunk.Offset - prev)
}
getCost := func(missingParts []missingPart, i int) int {
cost := getGap(missingParts, i)
if missingParts[i-1].OriginFile != nil {
cost += int(missingParts[i-1].SourceChunk.Length)
}
if missingParts[i].OriginFile != nil {
cost += int(missingParts[i].SourceChunk.Length)
}
return cost
}

// simple case: merge chunks from the same file.
// simple case: merge chunks from the same file. Useful to reduce the number of parts to work with later.
newMissingParts := missingParts[0:1]
prevIndex := 0
for i := 1; i < len(missingParts); i++ {
Expand All @@ -1215,28 +1206,52 @@ func mergeMissingChunks(missingParts []missingPart, target int) []missingPart {
}
missingParts = newMissingParts

if len(missingParts) <= target {
return missingParts
}

// this implementation doesn't account for duplicates, so it could merge
// more than necessary to reach the specified target. Since target itself
// is a heuristic value, it doesn't matter.
costs := make([]int, len(missingParts)-1)
for i := 1; i < len(missingParts); i++ {
costs[i-1] = getCost(missingParts, i)
}
sort.Ints(costs)

toShrink := len(missingParts) - target
if toShrink >= len(costs) {
toShrink = len(costs) - 1
type gap struct {
from int
to int
cost uint64
}
var requestGaps []gap
lastOffset := int(-1)
numberSourceChunks := 0
for i, c := range missingParts {
if c.OriginFile != nil || c.Hole {
// it does not require a network request
continue
}
numberSourceChunks++
if lastOffset >= 0 {
prevEnd := missingParts[lastOffset].SourceChunk.Offset + missingParts[lastOffset].SourceChunk.Length
cost := c.SourceChunk.Offset - prevEnd
g := gap{
from: lastOffset,
to: i,
cost: cost,
}
requestGaps = append(requestGaps, g)
}
lastOffset = i
}
sort.Slice(requestGaps, func(i, j int) bool {
return requestGaps[i].cost < requestGaps[j].cost
})
toMergeMap := make([]bool, len(missingParts))
remainingToMerge := numberSourceChunks - target
totalCost := uint64(0)
for _, g := range requestGaps {
if remainingToMerge < 0 && g.cost > autoMergePartsThreshold {
continue
}
for i := g.from + 1; i <= g.to; i++ {
toMergeMap[i] = true
}
remainingToMerge--
totalCost += g.cost
}
targetValue := costs[toShrink]

newMissingParts = missingParts[0:1]
for i := 1; i < len(missingParts); i++ {
if getCost(missingParts, i) > targetValue {
if !toMergeMap[i] {
newMissingParts = append(newMissingParts, missingParts[i])
} else {
gap := getGap(missingParts, i)
Expand Down Expand Up @@ -1268,6 +1283,7 @@ func (c *chunkedDiffer) retrieveMissingFiles(stream ImageSourceSeekable, dest st
}
}

missingParts = mergeMissingChunks(missingParts, maxNumberMissingChunks)
calculateChunksToRequest()

// There are some missing files. Prepare a multirange request for the missing chunks.
Expand All @@ -1281,7 +1297,14 @@ func (c *chunkedDiffer) retrieveMissingFiles(stream ImageSourceSeekable, dest st
}

if _, ok := err.(ErrBadRequest); ok {
requested := len(missingParts)
requested := 0
for _, c := range missingParts {
if c.OriginFile != nil || c.Hole {
// it does not require a network request
continue
}
requested++
}
// If the server cannot handle at least 64 chunks in a single request, just give up.
if requested < 64 {
return err
Expand Down Expand Up @@ -1999,7 +2022,6 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff
}
// There are some missing files. Prepare a multirange request for the missing chunks.
if len(missingParts) > 0 {
missingParts = mergeMissingChunks(missingParts, maxNumberMissingChunks)
if err := c.retrieveMissingFiles(stream, dest, dirfd, missingParts, options); err != nil {
return output, err
}
Expand Down

0 comments on commit 631f5e8

Please sign in to comment.