Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use cat-file --batch in GetLanguageStats #14685

Merged
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 103 additions & 30 deletions modules/git/repo_language_stats_nogogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
package git

import (
"bufio"
"bytes"
"io"
"io/ioutil"
"math"
"strings"

"code.gitea.io/gitea/modules/analyze"

Expand All @@ -18,16 +20,63 @@ import (

// GetLanguageStats calculates language stats for git repository at specified commit
func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
// FIXME: We can be more efficient here...
//
// We're expecting that we will be reading a lot of blobs and the trees
// Thus we should use a shared `cat-file --batch` to get all of this data
// And keep the buffers around with resets as necessary.
//
// It's more complicated so...
commit, err := repo.GetCommit(commitID)
// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary,
// so let's create a batch stdin and stdout.

batchStdinReader, batchStdinWriter := io.Pipe()
batchStdoutReader, batchStdoutWriter := io.Pipe()
defer func() {
_ = batchStdinReader.Close()
_ = batchStdinWriter.Close()
_ = batchStdoutReader.Close()
_ = batchStdoutWriter.Close()
}()

go func() {
stderr := strings.Builder{}
err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(repo.Path, batchStdoutWriter, &stderr, batchStdinReader)
if err != nil {
_ = batchStdoutWriter.CloseWithError(ConcatenateError(err, (&stderr).String()))
_ = batchStdinReader.CloseWithError(ConcatenateError(err, (&stderr).String()))
} else {
_ = batchStdoutWriter.Close()
_ = batchStdinReader.Close()
}
}()

// For simplicity's sake we'll use a buffered reader
batchReader := bufio.NewReader(batchStdoutReader)

writeID := func(id string) error {
_, err := batchStdinWriter.Write([]byte(id))
if err != nil {
return err
}
_, err = batchStdinWriter.Write([]byte{'\n'})
if err != nil {
return err
}
return nil
6543 marked this conversation as resolved.
Show resolved Hide resolved
}

if err := writeID(commitID); err != nil {
return nil, err
}
shaBytes, typ, size, err := ReadBatchLine(batchReader)
if typ != "commit" {
log("Unable to get commit for: %s. Err: %v", commitID, err)
return nil, ErrNotExist{commitID, ""}
}

sha, err := NewIDFromString(string(shaBytes))
if err != nil {
log("Unable to get commit for: %s", commitID)
log("Unable to get commit for: %s. Err: %v", commitID, err)
return nil, ErrNotExist{commitID, ""}
}

commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size))
if err != nil {
log("Unable to get commit for: %s. Err: %v", commitID, err)
return nil, err
}

Expand All @@ -38,17 +87,45 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
return nil, err
}

contentBuf := bytes.Buffer{}
var content []byte
sizes := make(map[string]int64)
for _, f := range entries {
contentBuf.Reset()
content = contentBuf.Bytes()
if f.Size() == 0 || enry.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) ||
enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) {
continue
}

// If content cannot be read or the file is too big, just do detection by filename
var content []byte

if f.Size() <= bigFileSize {
content, _ = readFile(f, fileSizeLimit)
if err := writeID(f.ID.String()); err != nil {
return nil, err
}
_, _, size, err := ReadBatchLine(batchReader)
if err != nil {
log("Error reading blob: %s Err: %v", f.ID.String(), err)
return nil, err
}

sizeToRead := size
discard := int64(0)
if size > fileSizeLimit {
sizeToRead = fileSizeLimit
discard = size - fileSizeLimit
}

_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
if err != nil {
return nil, err
}
content = contentBuf.Bytes()
err = discardFull(batchReader, discard)
if err != nil {
return nil, err
}
}
if enry.IsGenerated(f.Name(), content) {
continue
Expand Down Expand Up @@ -86,24 +163,20 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
return sizes, nil
}

func readFile(entry *TreeEntry, limit int64) ([]byte, error) {
// FIXME: We can probably be a little more efficient here... see above
r, err := entry.Blob().DataAsync()
if err != nil {
return nil, err
}
defer r.Close()

if limit <= 0 {
return ioutil.ReadAll(r)
func discardFull(rd *bufio.Reader, discard int64) error {
if discard > math.MaxInt32 {
n, err := rd.Discard(math.MaxInt32)
discard -= int64(n)
if err != nil {
return err
}
}

size := entry.Size()
if limit > 0 && size > limit {
size = limit
for discard > 0 {
n, err := rd.Discard(int(discard))
discard -= int64(n)
if err != nil {
return err
}
}
buf := bytes.NewBuffer(nil)
buf.Grow(int(size))
_, err = io.Copy(buf, io.LimitReader(r, limit))
return buf.Bytes(), err
return nil
}