From 1fafeb0df6d0eb31d468f86c29c462ceeb0cb842 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Tue, 23 Feb 2021 20:26:39 +0000 Subject: [PATCH 1/3] Extract out the common cat-file batch calls Signed-off-by: Andrew Thornton --- modules/git/command_batch.go | 42 ++++++++++++++++++++++ modules/git/commit_info_nogogit.go | 25 ++----------- modules/git/pipeline/lfs_nogogit.go | 23 ++---------- modules/git/repo_language_stats_nogogit.go | 27 ++------------ 4 files changed, 48 insertions(+), 69 deletions(-) create mode 100644 modules/git/command_batch.go diff --git a/modules/git/command_batch.go b/modules/git/command_batch.go new file mode 100644 index 000000000000..68c3d114e50a --- /dev/null +++ b/modules/git/command_batch.go @@ -0,0 +1,42 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package git + +import ( + "bufio" + "io" + "strings" +) + +// CatFileBatch opens git cat-file --batch in the provided repo and returns a stdin pipe, a stdout reader and cancel function +func CatFileBatch(repoPath string) (*io.PipeWriter, *bufio.Reader, func()) { + // Next feed the commits in order into cat-file --batch, followed by their trees and sub trees as necessary. + // so let's create a batch stdin and stdout + batchStdinReader, batchStdinWriter := io.Pipe() + batchStdoutReader, batchStdoutWriter := io.Pipe() + cancel := func() { + _ = batchStdinReader.Close() + _ = batchStdinWriter.Close() + _ = batchStdoutReader.Close() + _ = batchStdoutWriter.Close() + } + + go func() { + stderr := strings.Builder{} + err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(repoPath, batchStdoutWriter, &stderr, batchStdinReader) + if err != nil { + _ = batchStdoutWriter.CloseWithError(ConcatenateError(err, (&stderr).String())) + _ = batchStdinReader.CloseWithError(ConcatenateError(err, (&stderr).String())) + } else { + _ = batchStdoutWriter.Close() + _ = batchStdinReader.Close() + } + }() + + // For simplicities sake we'll us a buffered reader to read from the cat-file --batch + batchReader := bufio.NewReader(batchStdoutReader) + + return batchStdinWriter, batchReader, cancel +} diff --git a/modules/git/commit_info_nogogit.go b/modules/git/commit_info_nogogit.go index ac0c7cff5dca..b844468c8c41 100644 --- a/modules/git/commit_info_nogogit.go +++ b/modules/git/commit_info_nogogit.go @@ -141,29 +141,8 @@ func GetLastCommitForPaths(commit *Commit, treePath string, paths []string) ([]* } }() - // We feed the commits in order into cat-file --batch, followed by their trees and sub trees as necessary. - // so let's create a batch stdin and stdout - batchStdinReader, batchStdinWriter := io.Pipe() - batchStdoutReader, batchStdoutWriter := io.Pipe() - defer func() { - _ = batchStdinReader.Close() - _ = batchStdinWriter.Close() - _ = batchStdoutReader.Close() - _ = batchStdoutWriter.Close() - }() - - go func() { - stderr := strings.Builder{} - err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(commit.repo.Path, batchStdoutWriter, &stderr, batchStdinReader) - if err != nil { - _ = revListWriter.CloseWithError(ConcatenateError(err, (&stderr).String())) - } else { - _ = revListWriter.Close() - } - }() - - // For simplicities sake we'll us a buffered reader - batchReader := bufio.NewReader(batchStdoutReader) + batchStdinWriter, batchReader, cancel := CatFileBatch(commit.repo.Path) + defer cancel() mapsize := 4096 if len(paths) > mapsize { diff --git a/modules/git/pipeline/lfs_nogogit.go b/modules/git/pipeline/lfs_nogogit.go index 30d33e27e037..f6faa3a48aa1 100644 --- a/modules/git/pipeline/lfs_nogogit.go +++ b/modules/git/pipeline/lfs_nogogit.go @@ -64,27 +64,8 @@ func FindLFSFile(repo *git.Repository, hash git.SHA1) ([]*LFSResult, error) { // Next feed the commits in order into cat-file --batch, followed by their trees and sub trees as necessary. // so let's create a batch stdin and stdout - batchStdinReader, batchStdinWriter := io.Pipe() - batchStdoutReader, batchStdoutWriter := io.Pipe() - defer func() { - _ = batchStdinReader.Close() - _ = batchStdinWriter.Close() - _ = batchStdoutReader.Close() - _ = batchStdoutWriter.Close() - }() - - go func() { - stderr := strings.Builder{} - err := git.NewCommand("cat-file", "--batch").RunInDirFullPipeline(repo.Path, batchStdoutWriter, &stderr, batchStdinReader) - if err != nil { - _ = revListWriter.CloseWithError(git.ConcatenateError(err, (&stderr).String())) - } else { - _ = revListWriter.Close() - } - }() - - // For simplicities sake we'll us a buffered reader to read from the cat-file --batch - batchReader := bufio.NewReader(batchStdoutReader) + batchStdinWriter, batchReader, cancel := git.CatFileBatch(repo.Path) + defer cancel() // We'll use a scanner for the revList because it's simpler than a bufio.Reader scan := bufio.NewScanner(revListReader) diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go index 4c6f07f0fba5..a929d7953b54 100644 --- a/modules/git/repo_language_stats_nogogit.go +++ b/modules/git/repo_language_stats_nogogit.go @@ -11,7 +11,6 @@ import ( "bytes" "io" "math" - "strings" "code.gitea.io/gitea/modules/analyze" @@ -22,30 +21,8 @@ import ( func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { // We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary. // so let's create a batch stdin and stdout - - batchStdinReader, batchStdinWriter := io.Pipe() - batchStdoutReader, batchStdoutWriter := io.Pipe() - defer func() { - _ = batchStdinReader.Close() - _ = batchStdinWriter.Close() - _ = batchStdoutReader.Close() - _ = batchStdoutWriter.Close() - }() - - go func() { - stderr := strings.Builder{} - err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(repo.Path, batchStdoutWriter, &stderr, batchStdinReader) - if err != nil { - _ = batchStdoutWriter.CloseWithError(ConcatenateError(err, (&stderr).String())) - _ = batchStdinReader.CloseWithError(ConcatenateError(err, (&stderr).String())) - } else { - _ = batchStdoutWriter.Close() - _ = batchStdinReader.Close() - } - }() - - // For simplicities sake we'll us a buffered reader - batchReader := bufio.NewReader(batchStdoutReader) + batchStdinWriter, batchReader, cancel := CatFileBatch(repo.Path) + defer cancel() writeID := func(id string) error { _, err := batchStdinWriter.Write([]byte(id)) From 2ce696708363c0c745c567e9621cda9840c57a01 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Tue, 23 Feb 2021 20:56:46 +0000 Subject: [PATCH 2/3] Move bleve and elastic indexers to use a common cat-file --batch when indexing Signed-off-by: Andrew Thornton --- modules/indexer/code/bleve.go | 30 ++++++++++++++++---- modules/indexer/code/elastic_search.go | 38 +++++++++++++++++++------- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/modules/indexer/code/bleve.go b/modules/indexer/code/bleve.go index 1ebc74c43aa5..573ea8b88cbc 100644 --- a/modules/indexer/code/bleve.go +++ b/modules/indexer/code/bleve.go @@ -5,7 +5,10 @@ package code import ( + "bufio" "fmt" + "io" + "io/ioutil" "os" "strconv" "strings" @@ -173,7 +176,7 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) { return indexer, created, err } -func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error { +func (b *BleveIndexer) addUpdate(batchWriter *io.PipeWriter, batchReader *bufio.Reader, commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error { // Ignore vendored files in code search if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) { return nil @@ -196,8 +199,16 @@ func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *mode return b.addDelete(update.Filename, repo, batch) } - fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha). - RunInDirBytes(repo.RepoPath()) + if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil { + return err + } + + _, _, size, err := git.ReadBatchLine(batchReader) + if err != nil { + return err + } + + fileContents, err := ioutil.ReadAll(io.LimitReader(batchReader, size)) if err != nil { return err } else if !base.IsTextFile(fileContents) { @@ -254,10 +265,17 @@ func (b *BleveIndexer) Close() { // Index indexes the data func (b *BleveIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error { batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize) - for _, update := range changes.Updates { - if err := b.addUpdate(sha, update, repo, batch); err != nil { - return err + if len(changes.Updates) > 0 { + + batchWriter, batchReader, cancel := git.CatFileBatch(repo.RepoPath()) + defer cancel() + + for _, update := range changes.Updates { + if err := b.addUpdate(batchWriter, batchReader, sha, update, repo, batch); err != nil { + return err + } } + cancel() } for _, filename := range changes.RemovedFilenames { if err := b.addDelete(filename, repo, batch); err != nil { diff --git a/modules/indexer/code/elastic_search.go b/modules/indexer/code/elastic_search.go index c9d604b694b8..434a073d9eae 100644 --- a/modules/indexer/code/elastic_search.go +++ b/modules/indexer/code/elastic_search.go @@ -5,9 +5,12 @@ package code import ( + "bufio" "context" "encoding/json" "fmt" + "io" + "io/ioutil" "strconv" "strings" "time" @@ -172,7 +175,7 @@ func (b *ElasticSearchIndexer) init() (bool, error) { return exists, nil } -func (b *ElasticSearchIndexer) addUpdate(sha string, update fileUpdate, repo *models.Repository) ([]elastic.BulkableRequest, error) { +func (b *ElasticSearchIndexer) addUpdate(batchWriter *io.PipeWriter, batchReader *bufio.Reader, sha string, update fileUpdate, repo *models.Repository) ([]elastic.BulkableRequest, error) { // Ignore vendored files in code search if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) { return nil, nil @@ -195,8 +198,16 @@ func (b *ElasticSearchIndexer) addUpdate(sha string, update fileUpdate, repo *mo return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil } - fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha). - RunInDirBytes(repo.RepoPath()) + if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil { + return nil, err + } + + _, _, size, err := git.ReadBatchLine(batchReader) + if err != nil { + return nil, err + } + + fileContents, err := ioutil.ReadAll(io.LimitReader(batchReader, size)) if err != nil { return nil, err } else if !base.IsTextFile(fileContents) { @@ -230,14 +241,21 @@ func (b *ElasticSearchIndexer) addDelete(filename string, repo *models.Repositor // Index will save the index data func (b *ElasticSearchIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error { reqs := make([]elastic.BulkableRequest, 0) - for _, update := range changes.Updates { - updateReqs, err := b.addUpdate(sha, update, repo) - if err != nil { - return err - } - if len(updateReqs) > 0 { - reqs = append(reqs, updateReqs...) + if len(changes.Updates) > 0 { + + batchWriter, batchReader, cancel := git.CatFileBatch(repo.RepoPath()) + defer cancel() + + for _, update := range changes.Updates { + updateReqs, err := b.addUpdate(batchWriter, batchReader, sha, update, repo) + if err != nil { + return err + } + if len(updateReqs) > 0 { + reqs = append(reqs, updateReqs...) + } } + cancel() } for _, filename := range changes.RemovedFilenames { From ddc68e28e85a67a99e819d483e3819cb972fd436 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Tue, 23 Feb 2021 21:29:12 +0000 Subject: [PATCH 3/3] move catfilebatch to batch_reader and rename to batch_reader.go Signed-off-by: Andrew Thornton --- ...atch_reader_nogogit.go => batch_reader.go} | 35 +++++++++++++++- modules/git/command_batch.go | 42 ------------------- 2 files changed, 33 insertions(+), 44 deletions(-) rename modules/git/{batch_reader_nogogit.go => batch_reader.go} (83%) delete mode 100644 modules/git/command_batch.go diff --git a/modules/git/batch_reader_nogogit.go b/modules/git/batch_reader.go similarity index 83% rename from modules/git/batch_reader_nogogit.go rename to modules/git/batch_reader.go index 6a236e50027b..6014508b9300 100644 --- a/modules/git/batch_reader_nogogit.go +++ b/modules/git/batch_reader.go @@ -2,17 +2,48 @@ // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. -// +build !gogit - package git import ( "bufio" "bytes" + "io" "math" "strconv" + "strings" ) +// CatFileBatch opens git cat-file --batch in the provided repo and returns a stdin pipe, a stdout reader and cancel function +func CatFileBatch(repoPath string) (*io.PipeWriter, *bufio.Reader, func()) { + // Next feed the commits in order into cat-file --batch, followed by their trees and sub trees as necessary. + // so let's create a batch stdin and stdout + batchStdinReader, batchStdinWriter := io.Pipe() + batchStdoutReader, batchStdoutWriter := io.Pipe() + cancel := func() { + _ = batchStdinReader.Close() + _ = batchStdinWriter.Close() + _ = batchStdoutReader.Close() + _ = batchStdoutWriter.Close() + } + + go func() { + stderr := strings.Builder{} + err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(repoPath, batchStdoutWriter, &stderr, batchStdinReader) + if err != nil { + _ = batchStdoutWriter.CloseWithError(ConcatenateError(err, (&stderr).String())) + _ = batchStdinReader.CloseWithError(ConcatenateError(err, (&stderr).String())) + } else { + _ = batchStdoutWriter.Close() + _ = batchStdinReader.Close() + } + }() + + // For simplicities sake we'll us a buffered reader to read from the cat-file --batch + batchReader := bufio.NewReader(batchStdoutReader) + + return batchStdinWriter, batchReader, cancel +} + // ReadBatchLine reads the header line from cat-file --batch // We expect: // SP SP LF diff --git a/modules/git/command_batch.go b/modules/git/command_batch.go deleted file mode 100644 index 68c3d114e50a..000000000000 --- a/modules/git/command_batch.go +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2021 The Gitea Authors. All rights reserved. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package git - -import ( - "bufio" - "io" - "strings" -) - -// CatFileBatch opens git cat-file --batch in the provided repo and returns a stdin pipe, a stdout reader and cancel function -func CatFileBatch(repoPath string) (*io.PipeWriter, *bufio.Reader, func()) { - // Next feed the commits in order into cat-file --batch, followed by their trees and sub trees as necessary. - // so let's create a batch stdin and stdout - batchStdinReader, batchStdinWriter := io.Pipe() - batchStdoutReader, batchStdoutWriter := io.Pipe() - cancel := func() { - _ = batchStdinReader.Close() - _ = batchStdinWriter.Close() - _ = batchStdoutReader.Close() - _ = batchStdoutWriter.Close() - } - - go func() { - stderr := strings.Builder{} - err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(repoPath, batchStdoutWriter, &stderr, batchStdinReader) - if err != nil { - _ = batchStdoutWriter.CloseWithError(ConcatenateError(err, (&stderr).String())) - _ = batchStdinReader.CloseWithError(ConcatenateError(err, (&stderr).String())) - } else { - _ = batchStdoutWriter.Close() - _ = batchStdinReader.Close() - } - }() - - // For simplicities sake we'll us a buffered reader to read from the cat-file --batch - batchReader := bufio.NewReader(batchStdoutReader) - - return batchStdinWriter, batchReader, cancel -}