Skip to content

Commit

Permalink
Optimize access for scanning trees
Browse files Browse the repository at this point in the history
Right now, if the user is using partial clone, our call to `git ls-tree`
against HEAD is expensive because `git ls-tree` needs to download each
blob, which it does incrementally instead of all at once.

If we're scanning the tree from HEAD, then we can avoid the expense of
doing this by running `git ls-files` with a pattern that matches only
LFS files, which makes the operation much cheaper, since we avoid
needing to download blobs for many of those objects.  We can format the
data such that it matches the pattern we expect for `git ls-tree` so
that we can avoid modifying most of the calls and continue to let things
function in the same way.  Do so, but limit our changes to Git 2.42.0
and newer, since the `objecttype` argument is new in that version.
  • Loading branch information
bk2204 committed Apr 16, 2024
1 parent cdec9f2 commit beae114
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 3 deletions.
2 changes: 1 addition & 1 deletion commands/command_checkout.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func checkoutCommand(cmd *cobra.Command, args []string) {

chgitscanner.Filter = filepathfilter.New(rootedPaths(args), nil, filepathfilter.GitIgnore)

if err := chgitscanner.ScanTree(ref.Sha, nil); err != nil {
if err := chgitscanner.ScanLFSFiles(ref.Sha, nil); err != nil {
ExitWithError(err)
}

Expand Down
2 changes: 1 addition & 1 deletion commands/command_pull.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ func pull(filter *filepathfilter.Filter) {
}()

processQueue := time.Now()
if err := gitscanner.ScanTree(ref.Sha, nil); err != nil {
if err := gitscanner.ScanLFSFiles(ref.Sha, nil); err != nil {
singleCheckout.Close()
ExitWithError(err)
}
Expand Down
12 changes: 12 additions & 0 deletions git/git.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,18 @@ func LsTree(ref string) (*subprocess.BufferedCmd, error) {
)
}

// LsFilesLFS invokes `git ls-files` through gitNoLFSBuffered and returns the
// resulting buffered command. The listing is taken from the index
// (`--cached`) and is limited by the pathspec `:(top,attr:filter=lfs)`, i.e.
// to paths whose `filter` attribute is set to `lfs`.
//
// Each entry is formatted as "<mode> <type> <object name> <size>\t<path>";
// this is intended to match the layout produced for `git ls-tree` so the same
// parsing can be reused (NOTE(review): confirm against LsTree's arguments).
func LsFilesLFS() (*subprocess.BufferedCmd, error) {
// This requires Git 2.42.0 for `--format` with `objecttype`.
return gitNoLFSBuffered(
"ls-files",
"--cached",
"--full-name",
"-z",
"--format=%(objectmode) %(objecttype) %(objectname) %(objectsize)\t%(path)",
":(top,attr:filter=lfs)",
)
}

func ResolveRef(ref string) (*Ref, error) {
outp, err := gitNoLFSSimple("rev-parse", ref, "--symbolic-full-name", ref)
if err != nil {
Expand Down
16 changes: 16 additions & 0 deletions lfs/gitscanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,22 @@ func (s *GitScanner) ScanTree(ref string, cb GitScannerFoundPointer) error {
return err
}

// ScanLFSFiles scans the index or tree at ref, which is expected to point to
// HEAD, and reports a WrappedPointer for each LFS file found there. Unlike
// ScanRefs, every file is reported, even when several files in the tree share
// the same content.
//
// The callback to use is chosen by firstGitScannerCallback from cb and the
// scanner's own foundPointer; an error from that selection is returned as is.
func (s *GitScanner) ScanLFSFiles(ref string, cb GitScannerFoundPointer) error {
	found, err := firstGitScannerCallback(cb, s.foundPointer)
	if err != nil {
		return err
	}

	// The start time is captured here; the duration is traced once the scan
	// has returned, whatever its outcome.
	defer tracerx.PerformanceSince("ScanLFSFiles", time.Now())

	return runScanLFSFiles(found, ref, s.Filter, s.cfg.GitEnv(), s.cfg.OSEnv())
}

// ScanUnpushed scans history for all LFS pointers which have been added but not
// pushed to the named remote. remote can be left blank to mean 'any remote'.
func (s *GitScanner) ScanUnpushed(remote string, cb GitScannerFoundPointer) error {
Expand Down
51 changes: 50 additions & 1 deletion lfs/gitscanner_tree.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/git-lfs/git-lfs/v3/filepathfilter"
"github.com/git-lfs/git-lfs/v3/git"
"github.com/git-lfs/git-lfs/v3/git/gitattr"
"github.com/git-lfs/git-lfs/v3/subprocess"
"github.com/git-lfs/git-lfs/v3/tr"
)

Expand Down Expand Up @@ -38,6 +39,39 @@ func runScanTree(cb GitScannerFoundPointer, ref string, filter *filepathfilter.F
return nil
}

// runScanLFSFiles lists candidate blobs, reads them through catFileBatchTree,
// and hands every resulting pointer to cb.
//
// With Git 2.42.0 or newer the candidates come from lsFilesBlobs, which does
// not take ref; older versions fall back to lsTreeBlobs on ref. In both cases
// a blob is a candidate only if it is non-nil, smaller than blobSizeCutoff,
// and its filename is allowed by filter.
//
// An error from listing the blobs or from catFileBatchTree is returned. An
// error from waiting on the pointer results is instead delivered through
// cb(nil, err), and the function still returns nil.
func runScanLFSFiles(cb GitScannerFoundPointer, ref string, filter *filepathfilter.Filter, gitEnv, osEnv config.Environment) error {
	// Shared by both listing backends so the selection rule cannot drift.
	isCandidate := func(blob *git.TreeBlob) bool {
		if blob == nil || blob.Size >= blobSizeCutoff {
			return false
		}
		return filter.Allows(blob.Filename)
	}

	var (
		candidates *TreeBlobChannelWrapper
		err        error
	)
	// We don't use the nameMap approach here since that's imprecise when >1
	// file can be using the same content.
	switch {
	case git.IsGitVersionAtLeast("2.42.0"):
		candidates, err = lsFilesBlobs(isCandidate)
	default:
		candidates, err = lsTreeBlobs(ref, isCandidate)
	}
	if err != nil {
		return err
	}

	pointers, err := catFileBatchTree(candidates, gitEnv, osEnv)
	if err != nil {
		return err
	}

	for pointer := range pointers.Results {
		cb(pointer, nil)
	}

	if waitErr := pointers.Wait(); waitErr != nil {
		cb(nil, waitErr)
	}
	return nil
}

// catFileBatchTree() uses an ObjectDatabase from the
// github.com/git-lfs/gitobj/v2 package to get the contents of Git
// blob objects, given their SHA1s from git.TreeBlob structs, similar
Expand Down Expand Up @@ -98,7 +132,13 @@ func catFileBatchTree(treeblobs *TreeBlobChannelWrapper, gitEnv, osEnv config.En
// The returned channel will be sent these blobs which should be sent to catFileBatchTree
// for final check & conversion to Pointer
func lsTreeBlobs(ref string, predicate func(*git.TreeBlob) bool) (*TreeBlobChannelWrapper, error) {
cmd, err := git.LsTree(ref)
return lsBlobs(func() (*subprocess.BufferedCmd, error) {
return git.LsTree(ref)
}, predicate)
}

func lsBlobs(backend func() (*subprocess.BufferedCmd, error), predicate func(*git.TreeBlob) bool) (*TreeBlobChannelWrapper, error) {
cmd, err := backend()
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -128,6 +168,15 @@ func lsTreeBlobs(ref string, predicate func(*git.TreeBlob) bool) (*TreeBlobChann
return NewTreeBlobChannelWrapper(blobs, errchan), nil
}

// lsFilesBlobs finds candidate blobs which might be LFS files by running the
// command from git.LsFilesLFS through lsBlobs with the given predicate. It
// takes no ref. The returned channel wrapper should be passed to
// catFileBatchTree for the final check and conversion to Pointer.
func lsFilesBlobs(predicate func(*git.TreeBlob) bool) (*TreeBlobChannelWrapper, error) {
	return lsBlobs(git.LsFilesLFS, predicate)
}

func catFileBatchTreeForPointers(treeblobs *TreeBlobChannelWrapper, gitEnv, osEnv config.Environment) (map[string]*WrappedPointer, *filepathfilter.Filter, error) {
pscanner, err := NewPointerScanner(gitEnv, osEnv)
if err != nil {
Expand Down

0 comments on commit beae114

Please sign in to comment.