Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support elastic search for code search #10273

Merged
merged 11 commits into from Aug 30, 2020
1 change: 1 addition & 0 deletions .drone.yml
Expand Up @@ -209,6 +209,7 @@ steps:
TAGS: bindata
TEST_LDAP: 1
USE_REPO_TEST_DIR: 1
TEST_INDEXER_CODE_ES_URL: "http://elastic:changeme@elasticsearch:9200"
depends_on:
- build

Expand Down
8 changes: 8 additions & 0 deletions custom/conf/app.example.ini
Expand Up @@ -428,7 +428,15 @@ STARTUP_TIMEOUT=30s

; repo indexer by default disabled, since it uses a lot of disk space
REPO_INDEXER_ENABLED = false
; Code search engine type, could be `bleve` or `elasticsearch`.
REPO_INDEXER_TYPE = bleve
; Index file used for code search.
REPO_INDEXER_PATH = indexers/repos.bleve
; Code indexer connection string, available when `REPO_INDEXER_TYPE` is elasticsearch. i.e. http://elastic:changeme@localhost:9200
REPO_INDEXER_CONN_STR =
; Code indexer name, available when `REPO_INDEXER_TYPE` is elasticsearch
REPO_INDEXER_NAME = gitea_codes

UPDATE_BUFFER_LEN = 20
MAX_FILE_SIZE = 1048576
; A comma separated list of glob patterns (see https://github.com/gobwas/glob) to include
Expand Down
4 changes: 4 additions & 0 deletions docs/content/doc/advanced/config-cheat-sheet.en-us.md
Expand Up @@ -270,7 +270,11 @@ relation to port exhaustion.
- `ISSUE_INDEXER_QUEUE_BATCH_NUMBER`: **20**: Batch queue number.

- `REPO_INDEXER_ENABLED`: **false**: Enables code search (uses a lot of disk space, about 6 times more than the repository size).
- `REPO_INDEXER_TYPE`: **bleve**: Code search engine type, could be `bleve` or `elasticsearch`.
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: Index file used for code search.
- `REPO_INDEXER_CONN_STR`: ****: Code indexer connection string, available when `REPO_INDEXER_TYPE` is elasticsearch. i.e. http://elastic:changeme@localhost:9200
- `REPO_INDEXER_NAME`: **gitea_codes**: Code indexer name, available when `REPO_INDEXER_TYPE` is elasticsearch

- `REPO_INDEXER_INCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **include** in the index. Use `**.txt` to match any files with .txt extension. An empty list means include all files.
- `REPO_INDEXER_EXCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **exclude** from the index. Files that match this list will not be indexed, even if they match in `REPO_INDEXER_INCLUDE`.
- `REPO_INDEXER_EXCLUDE_VENDORED`: **true**: Exclude vendored files from index.
Expand Down
6 changes: 5 additions & 1 deletion docs/content/doc/advanced/config-cheat-sheet.zh-cn.md
Expand Up @@ -98,8 +98,12 @@ menu:
- `ISSUE_INDEXER_QUEUE_CONN_STR`: **addrs=127.0.0.1:6379 db=0**: 当 `ISSUE_INDEXER_QUEUE_TYPE` 为 `redis` 时,保存Redis队列的连接字符串。
- `ISSUE_INDEXER_QUEUE_BATCH_NUMBER`: **20**: 队列处理中批量提交数量。

- `REPO_INDEXER_ENABLED`: **false**: 是否启用代码搜索(启用后会占用比较大的磁盘空间)。
- `REPO_INDEXER_ENABLED`: **false**: 是否启用代码搜索(启用后会占用比较大的磁盘空间,如果是bleve可能需要占用约6倍存储空间)。
- `REPO_INDEXER_TYPE`: **bleve**: 代码搜索引擎类型,可以为 `bleve` 或者 `elasticsearch`。
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: 用于代码搜索的索引文件路径。
- `REPO_INDEXER_CONN_STR`: ****: 代码搜索引擎连接字符串,当 `REPO_INDEXER_TYPE` 为 `elasticsearch` 时有效。例如: http://elastic:changeme@localhost:9200
- `REPO_INDEXER_NAME`: **gitea_codes**: 代码搜索引擎的名字,当 `REPO_INDEXER_TYPE` 为 `elasticsearch` 时有效。

- `UPDATE_BUFFER_LEN`: **20**: 代码索引请求的缓冲区长度。
- `MAX_FILE_SIZE`: **1048576**: 进行解析的源代码文件的最大长度,小于该值时才会索引。

Expand Down
132 changes: 51 additions & 81 deletions modules/indexer/code/bleve.go
Expand Up @@ -58,10 +58,10 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
})
}

// openIndexer open the index at the specified path, checking for metadata
// openBleveIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
func openBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
_, err := os.Stat(path)
if err != nil && os.IsNotExist(err) {
return nil, nil
Expand Down Expand Up @@ -104,54 +104,14 @@ func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}

func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
return nil
}
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
return err
}
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
return fmt.Errorf("Misformatted git cat-file output: %v", err)
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
return addDelete(update.Filename, repo, batch)
}

fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
RunInDirBytes(repo.RepoPath())
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}

id := filenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}

func addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
id := filenameIndexerID(repo.ID, filename)
return batch.Delete(id)
}

const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 5
)

// createRepoIndexer create a repo indexer if one does not already exist
func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
// createBleveIndexer create a bleve repo indexer if one does not already exist
func createBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
docMapping := bleve.NewDocumentMapping()
numericFieldMapping := bleve.NewNumericFieldMapping()
numericFieldMapping.IncludeInAll = false
Expand Down Expand Up @@ -199,18 +159,6 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
return indexer, nil
}

func filenameIndexerID(repoID int64, filename string) string {
return indexerID(repoID) + "_" + filename
}

func filenameOfIndexerID(indexerID string) string {
index := strings.IndexByte(indexerID, '_')
if index == -1 {
log.Error("Unexpected ID in repo indexer: %s", indexerID)
}
return indexerID[index+1:]
}

var (
_ Indexer = &BleveIndexer{}
)
Expand All @@ -230,18 +178,59 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
return indexer, created, err
}

func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
return nil
}

stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
return err
}
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
return fmt.Errorf("Misformatted git cat-file output: %v", err)
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
return b.addDelete(update.Filename, repo, batch)
}

fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
RunInDirBytes(repo.RepoPath())
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}

id := filenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}

func (b *BleveIndexer) addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
id := filenameIndexerID(repo.ID, filename)
return batch.Delete(id)
}

// init init the indexer
func (b *BleveIndexer) init() (bool, error) {
var err error
b.indexer, err = openIndexer(b.indexDir, repoIndexerLatestVersion)
b.indexer, err = openBleveIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
if b.indexer != nil {
return false, nil
}

b.indexer, err = createRepoIndexer(b.indexDir, repoIndexerLatestVersion)
b.indexer, err = createBleveIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
Expand All @@ -262,38 +251,19 @@ func (b *BleveIndexer) Close() {
}

// Index indexes the data
func (b *BleveIndexer) Index(repoID int64) error {
repo, err := models.GetRepositoryByID(repoID)
if err != nil {
return err
}

sha, err := getDefaultBranchSha(repo)
if err != nil {
return err
}
changes, err := getRepoChanges(repo, sha)
if err != nil {
return err
} else if changes == nil {
return nil
}

func (b *BleveIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error {
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
for _, update := range changes.Updates {
if err := addUpdate(sha, update, repo, batch); err != nil {
if err := b.addUpdate(sha, update, repo, batch); err != nil {
return err
}
}
for _, filename := range changes.RemovedFilenames {
if err := addDelete(filename, repo, batch); err != nil {
if err := b.addDelete(filename, repo, batch); err != nil {
return err
}
}
if err = batch.Flush(); err != nil {
return err
}
return repo.UpdateIndexerStatus(models.RepoIndexerTypeCode, sha)
return batch.Flush()
}

// Delete deletes indexes by ids
Expand Down
53 changes: 3 additions & 50 deletions modules/indexer/code/bleve_test.go
Expand Up @@ -6,21 +6,15 @@ package code

import (
"io/ioutil"
"path/filepath"
"testing"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"

"github.com/stretchr/testify/assert"
)

func TestMain(m *testing.M) {
models.MainTest(m, filepath.Join("..", "..", ".."))
}

func TestIndexAndSearch(t *testing.T) {
func TestBleveIndexAndSearch(t *testing.T) {
models.PrepareTestEnv(t)

dir, err := ioutil.TempDir("", "bleve.index")
Expand All @@ -31,56 +25,15 @@ func TestIndexAndSearch(t *testing.T) {
}
defer util.RemoveAll(dir)

setting.Indexer.RepoIndexerEnabled = true
idx, _, err := NewBleveIndexer(dir)
if err != nil {
assert.Fail(t, "Unable to create indexer Error: %v", err)
assert.Fail(t, "Unable to create bleve indexer Error: %v", err)
if idx != nil {
idx.Close()
}
return
}
defer idx.Close()

err = idx.Index(1)
assert.NoError(t, err)

var (
keywords = []struct {
Keyword string
IDs []int64
Langs int
}{
{
Keyword: "Description",
IDs: []int64{1},
Langs: 1,
},
{
Keyword: "repo1",
IDs: []int64{1},
Langs: 1,
},
{
Keyword: "non-exist",
IDs: []int64{},
Langs: 0,
},
}
)

for _, kw := range keywords {
total, res, langs, err := idx.Search(nil, "", kw.Keyword, 1, 10)
assert.NoError(t, err)
assert.EqualValues(t, len(kw.IDs), total)

assert.NotNil(t, langs)
assert.Len(t, langs, kw.Langs)

var ids = make([]int64, 0, len(res))
for _, hit := range res {
ids = append(ids, hit.RepoID)
}
assert.EqualValues(t, kw.IDs, ids)
}
testIndexer("beleve", t, idx)
}