From beae1146bda7e9fa712c3c73dacbcf9ed5067fd9 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Wed, 3 Apr 2024 19:28:30 +0000 Subject: [PATCH] Optimize access for scanning trees Right now, if the user is using partial clone, our call to `git ls-tree` against HEAD is expensive because `git ls-tree` needs to download each blob, which it does incrementally instead of all at once. If we're scanning the tree from HEAD, then we can avoid the expense of doing this by running `git ls-files` with a pattern that matches only LFS files, which makes the operation much cheaper, since we avoid needing to download blobs for many of those objects. We can format the data such that it matches the pattern we expect for `git ls-tree` so that we can avoid modifying most of the calls and continue to let things function in the same way. Do so, but limit our changes to Git 2.42.0 and newer, since the `objecttype` argument is new in that version. --- commands/command_checkout.go | 2 +- commands/command_pull.go | 2 +- git/git.go | 12 +++++++++ lfs/gitscanner.go | 16 +++++++++++ lfs/gitscanner_tree.go | 51 +++++++++++++++++++++++++++++++++++- 5 files changed, 80 insertions(+), 3 deletions(-) diff --git a/commands/command_checkout.go b/commands/command_checkout.go index 6a5e136ecc..6bf9534ceb 100644 --- a/commands/command_checkout.go +++ b/commands/command_checkout.go @@ -73,7 +73,7 @@ func checkoutCommand(cmd *cobra.Command, args []string) { chgitscanner.Filter = filepathfilter.New(rootedPaths(args), nil, filepathfilter.GitIgnore) - if err := chgitscanner.ScanTree(ref.Sha, nil); err != nil { + if err := chgitscanner.ScanLFSFiles(ref.Sha, nil); err != nil { ExitWithError(err) } diff --git a/commands/command_pull.go b/commands/command_pull.go index bbe1f53905..cbfdc0e7fb 100644 --- a/commands/command_pull.go +++ b/commands/command_pull.go @@ -87,7 +87,7 @@ func pull(filter *filepathfilter.Filter) { }() processQueue := time.Now() - if err := gitscanner.ScanTree(ref.Sha, nil); err != nil { + if err := gitscanner.ScanLFSFiles(ref.Sha, nil); err != nil { singleCheckout.Close() ExitWithError(err) } diff --git a/git/git.go b/git/git.go index 9afe71a811..74bdf48df2 100644 --- a/git/git.go +++ b/git/git.go @@ -316,6 +316,18 @@ func LsTree(ref string) (*subprocess.BufferedCmd, error) { ) } +func LsFilesLFS() (*subprocess.BufferedCmd, error) { + // This requires Git 2.42.0 for `--format` with `objecttype`. + return gitNoLFSBuffered( + "ls-files", + "--cached", + "--full-name", + "-z", + "--format=%(objectmode) %(objecttype) %(objectname) %(objectsize)\t%(path)", + ":(top,attr:filter=lfs)", + ) +} + func ResolveRef(ref string) (*Ref, error) { outp, err := gitNoLFSSimple("rev-parse", ref, "--symbolic-full-name", ref) if err != nil { diff --git a/lfs/gitscanner.go b/lfs/gitscanner.go index 764aa0f1ae..6259e3c3f0 100644 --- a/lfs/gitscanner.go +++ b/lfs/gitscanner.go @@ -219,6 +219,22 @@ func (s *GitScanner) ScanTree(ref string, cb GitScannerFoundPointer) error { return err } +// ScanLFSFiles takes a ref, which points to HEAD, and returns WrappedPointer +// objects in the index or tree at that ref. Differs from ScanRefs in that +// multiple files in the tree with the same content are all reported. +func (s *GitScanner) ScanLFSFiles(ref string, cb GitScannerFoundPointer) error { + callback, err := firstGitScannerCallback(cb, s.foundPointer) + if err != nil { + return err + } + + start := time.Now() + err = runScanLFSFiles(callback, ref, s.Filter, s.cfg.GitEnv(), s.cfg.OSEnv()) + tracerx.PerformanceSince("ScanLFSFiles", start) + + return err +} + // ScanUnpushed scans history for all LFS pointers which have been added but not // pushed to the named remote. remote can be left blank to mean 'any remote'. func (s *GitScanner) ScanUnpushed(remote string, cb GitScannerFoundPointer) error { diff --git a/lfs/gitscanner_tree.go b/lfs/gitscanner_tree.go index 247f14b5e9..1fdf50c5a5 100644 --- a/lfs/gitscanner_tree.go +++ b/lfs/gitscanner_tree.go @@ -10,6 +10,7 @@ import ( "github.com/git-lfs/git-lfs/v3/filepathfilter" "github.com/git-lfs/git-lfs/v3/git" "github.com/git-lfs/git-lfs/v3/git/gitattr" + "github.com/git-lfs/git-lfs/v3/subprocess" "github.com/git-lfs/git-lfs/v3/tr" ) @@ -38,6 +39,39 @@ func runScanTree(cb GitScannerFoundPointer, ref string, filter *filepathfilter.F return nil } +func runScanLFSFiles(cb GitScannerFoundPointer, ref string, filter *filepathfilter.Filter, gitEnv, osEnv config.Environment) error { + var treeShas *TreeBlobChannelWrapper + var err error + if git.IsGitVersionAtLeast("2.42.0") { + treeShas, err = lsFilesBlobs(func(t *git.TreeBlob) bool { + return t != nil && t.Size < blobSizeCutoff && filter.Allows(t.Filename) + }) + } else { + treeShas, err = lsTreeBlobs(ref, func(t *git.TreeBlob) bool { + return t != nil && t.Size < blobSizeCutoff && filter.Allows(t.Filename) + }) + } + // We don't use the nameMap approach here since that's imprecise when >1 file + // can be using the same content + if err != nil { + return err + } + + pcw, err := catFileBatchTree(treeShas, gitEnv, osEnv) + if err != nil { + return err + } + + for p := range pcw.Results { + cb(p, nil) + } + + if err := pcw.Wait(); err != nil { + cb(nil, err) + } + return nil +} + // catFileBatchTree() uses an ObjectDatabase from the // github.com/git-lfs/gitobj/v2 package to get the contents of Git // blob objects, given their SHA1s from git.TreeBlob structs, similar @@ -98,7 +132,13 @@ func catFileBatchTree(treeblobs *TreeBlobChannelWrapper, gitEnv, osEnv config.En // The returned channel will be sent these blobs which should be sent to catFileBatchTree // for final check & conversion to Pointer func lsTreeBlobs(ref string, predicate func(*git.TreeBlob) bool) (*TreeBlobChannelWrapper, error) { - cmd, err := git.LsTree(ref) + return lsBlobs(func() (*subprocess.BufferedCmd, error) { + return git.LsTree(ref) + }, predicate) +} + +func lsBlobs(backend func() (*subprocess.BufferedCmd, error), predicate func(*git.TreeBlob) bool) (*TreeBlobChannelWrapper, error) { + cmd, err := backend() if err != nil { return nil, err } @@ -128,6 +168,15 @@ func lsTreeBlobs(ref string, predicate func(*git.TreeBlob) bool) (*TreeBlobChann return NewTreeBlobChannelWrapper(blobs, errchan), nil } +// Use ls-files at ref to find a list of candidate tree blobs which might be lfs files +// The returned channel will be sent these blobs which should be sent to catFileBatchTree +// for final check & conversion to Pointer +func lsFilesBlobs(predicate func(*git.TreeBlob) bool) (*TreeBlobChannelWrapper, error) { + return lsBlobs(func() (*subprocess.BufferedCmd, error) { + return git.LsFilesLFS() + }, predicate) +} + func catFileBatchTreeForPointers(treeblobs *TreeBlobChannelWrapper, gitEnv, osEnv config.Environment) (map[string]*WrappedPointer, *filepathfilter.Filter, error) { pscanner, err := NewPointerScanner(gitEnv, osEnv) if err != nil {