Skip to content

Commit

Permalink
Support elastic search for code search (#10273)
Browse files Browse the repository at this point in the history
* Support elastic search for code search

* Finished elastic search implementation and add some tests

* Enable test on drone and added docs

* Add new fields to elastic search

* Fix bug

* remove unused changes

* Use indexer alias to keep the gitea indexer version

* Improve codes

* Some code improvements

* The real indexer name changed to xxx.v1

Co-authored-by: zeripath <art27@cantab.net>
  • Loading branch information
lunny and zeripath committed Aug 30, 2020
1 parent d257485 commit 9bc69ff
Show file tree
Hide file tree
Showing 14 changed files with 694 additions and 164 deletions.
1 change: 1 addition & 0 deletions .drone.yml
Expand Up @@ -209,6 +209,7 @@ steps:
TAGS: bindata
TEST_LDAP: 1
USE_REPO_TEST_DIR: 1
TEST_INDEXER_CODE_ES_URL: "http://elastic:changeme@elasticsearch:9200"
depends_on:
- build

Expand Down
8 changes: 8 additions & 0 deletions custom/conf/app.example.ini
Expand Up @@ -428,7 +428,15 @@ STARTUP_TIMEOUT=30s

; repo indexer by default disabled, since it uses a lot of disk space
REPO_INDEXER_ENABLED = false
; Code search engine type, could be `bleve` or `elasticsearch`.
REPO_INDEXER_TYPE = bleve
; Index file used for code search.
REPO_INDEXER_PATH = indexers/repos.bleve
; Code indexer connection string, available when `REPO_INDEXER_TYPE` is elasticsearch. i.e. http://elastic:changeme@localhost:9200
REPO_INDEXER_CONN_STR =
; Code indexer name, available when `REPO_INDEXER_TYPE` is elasticsearch
REPO_INDEXER_NAME = gitea_codes

UPDATE_BUFFER_LEN = 20
MAX_FILE_SIZE = 1048576
; A comma separated list of glob patterns (see https://github.com/gobwas/glob) to include
Expand Down
4 changes: 4 additions & 0 deletions docs/content/doc/advanced/config-cheat-sheet.en-us.md
Expand Up @@ -270,7 +270,11 @@ relation to port exhaustion.
- `ISSUE_INDEXER_QUEUE_BATCH_NUMBER`: **20**: Batch queue number.

- `REPO_INDEXER_ENABLED`: **false**: Enables code search (uses a lot of disk space, about 6 times more than the repository size).
- `REPO_INDEXER_TYPE`: **bleve**: Code search engine type, could be `bleve` or `elasticsearch`.
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: Index file used for code search.
- `REPO_INDEXER_CONN_STR`: ****: Code indexer connection string, available when `REPO_INDEXER_TYPE` is elasticsearch. i.e. http://elastic:changeme@localhost:9200
- `REPO_INDEXER_NAME`: **gitea_codes**: Code indexer name, available when `REPO_INDEXER_TYPE` is elasticsearch

- `REPO_INDEXER_INCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **include** in the index. Use `**.txt` to match any files with .txt extension. An empty list means include all files.
- `REPO_INDEXER_EXCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **exclude** from the index. Files that match this list will not be indexed, even if they match in `REPO_INDEXER_INCLUDE`.
- `REPO_INDEXER_EXCLUDE_VENDORED`: **true**: Exclude vendored files from index.
Expand Down
6 changes: 5 additions & 1 deletion docs/content/doc/advanced/config-cheat-sheet.zh-cn.md
Expand Up @@ -98,8 +98,12 @@ menu:
- `ISSUE_INDEXER_QUEUE_CONN_STR`: **addrs=127.0.0.1:6379 db=0**: 当 `ISSUE_INDEXER_QUEUE_TYPE``redis` 时,保存Redis队列的连接字符串。
- `ISSUE_INDEXER_QUEUE_BATCH_NUMBER`: **20**: 队列处理中批量提交数量。

- `REPO_INDEXER_ENABLED`: **false**: 是否启用代码搜索(启用后会占用比较大的磁盘空间)。
- `REPO_INDEXER_ENABLED`: **false**: 是否启用代码搜索(启用后会占用比较大的磁盘空间,如果是bleve可能需要占用约6倍存储空间)。
- `REPO_INDEXER_TYPE`: **bleve**: 代码搜索引擎类型,可以为 `bleve` 或者 `elasticsearch`
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: 用于代码搜索的索引文件路径。
- `REPO_INDEXER_CONN_STR`: ****: 代码搜索引擎连接字符串,当 `REPO_INDEXER_TYPE``elasticsearch` 时有效。例如: http://elastic:changeme@localhost:9200
- `REPO_INDEXER_NAME`: **gitea_codes**: 代码搜索引擎的名字,当 `REPO_INDEXER_TYPE``elasticsearch` 时有效。

- `UPDATE_BUFFER_LEN`: **20**: 代码索引请求的缓冲区长度。
- `MAX_FILE_SIZE`: **1048576**: 进行解析的源代码文件的最大长度,小于该值时才会索引。

Expand Down
132 changes: 51 additions & 81 deletions modules/indexer/code/bleve.go
Expand Up @@ -58,10 +58,10 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
})
}

// openIndexer open the index at the specified path, checking for metadata
// openBleveIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
func openBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
_, err := os.Stat(path)
if err != nil && os.IsNotExist(err) {
return nil, nil
Expand Down Expand Up @@ -104,54 +104,14 @@ func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}

func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
return nil
}
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
return err
}
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
return fmt.Errorf("Misformatted git cat-file output: %v", err)
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
return addDelete(update.Filename, repo, batch)
}

fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
RunInDirBytes(repo.RepoPath())
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}

id := filenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}

func addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
id := filenameIndexerID(repo.ID, filename)
return batch.Delete(id)
}

const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 5
)

// createRepoIndexer create a repo indexer if one does not already exist
func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
// createBleveIndexer create a bleve repo indexer if one does not already exist
func createBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
docMapping := bleve.NewDocumentMapping()
numericFieldMapping := bleve.NewNumericFieldMapping()
numericFieldMapping.IncludeInAll = false
Expand Down Expand Up @@ -199,18 +159,6 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
return indexer, nil
}

func filenameIndexerID(repoID int64, filename string) string {
return indexerID(repoID) + "_" + filename
}

func filenameOfIndexerID(indexerID string) string {
index := strings.IndexByte(indexerID, '_')
if index == -1 {
log.Error("Unexpected ID in repo indexer: %s", indexerID)
}
return indexerID[index+1:]
}

var (
_ Indexer = &BleveIndexer{}
)
Expand All @@ -230,18 +178,59 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
return indexer, created, err
}

func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
return nil
}

stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
return err
}
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
return fmt.Errorf("Misformatted git cat-file output: %v", err)
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
return b.addDelete(update.Filename, repo, batch)
}

fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
RunInDirBytes(repo.RepoPath())
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}

id := filenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}

func (b *BleveIndexer) addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
id := filenameIndexerID(repo.ID, filename)
return batch.Delete(id)
}

// init init the indexer
func (b *BleveIndexer) init() (bool, error) {
var err error
b.indexer, err = openIndexer(b.indexDir, repoIndexerLatestVersion)
b.indexer, err = openBleveIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
if b.indexer != nil {
return false, nil
}

b.indexer, err = createRepoIndexer(b.indexDir, repoIndexerLatestVersion)
b.indexer, err = createBleveIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
Expand All @@ -262,38 +251,19 @@ func (b *BleveIndexer) Close() {
}

// Index indexes the data
func (b *BleveIndexer) Index(repoID int64) error {
repo, err := models.GetRepositoryByID(repoID)
if err != nil {
return err
}

sha, err := getDefaultBranchSha(repo)
if err != nil {
return err
}
changes, err := getRepoChanges(repo, sha)
if err != nil {
return err
} else if changes == nil {
return nil
}

func (b *BleveIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error {
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
for _, update := range changes.Updates {
if err := addUpdate(sha, update, repo, batch); err != nil {
if err := b.addUpdate(sha, update, repo, batch); err != nil {
return err
}
}
for _, filename := range changes.RemovedFilenames {
if err := addDelete(filename, repo, batch); err != nil {
if err := b.addDelete(filename, repo, batch); err != nil {
return err
}
}
if err = batch.Flush(); err != nil {
return err
}
return repo.UpdateIndexerStatus(models.RepoIndexerTypeCode, sha)
return batch.Flush()
}

// Delete deletes indexes by ids
Expand Down
53 changes: 3 additions & 50 deletions modules/indexer/code/bleve_test.go
Expand Up @@ -6,21 +6,15 @@ package code

import (
"io/ioutil"
"path/filepath"
"testing"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"

"github.com/stretchr/testify/assert"
)

func TestMain(m *testing.M) {
models.MainTest(m, filepath.Join("..", "..", ".."))
}

func TestIndexAndSearch(t *testing.T) {
func TestBleveIndexAndSearch(t *testing.T) {
models.PrepareTestEnv(t)

dir, err := ioutil.TempDir("", "bleve.index")
Expand All @@ -31,56 +25,15 @@ func TestIndexAndSearch(t *testing.T) {
}
defer util.RemoveAll(dir)

setting.Indexer.RepoIndexerEnabled = true
idx, _, err := NewBleveIndexer(dir)
if err != nil {
assert.Fail(t, "Unable to create indexer Error: %v", err)
assert.Fail(t, "Unable to create bleve indexer Error: %v", err)
if idx != nil {
idx.Close()
}
return
}
defer idx.Close()

err = idx.Index(1)
assert.NoError(t, err)

var (
keywords = []struct {
Keyword string
IDs []int64
Langs int
}{
{
Keyword: "Description",
IDs: []int64{1},
Langs: 1,
},
{
Keyword: "repo1",
IDs: []int64{1},
Langs: 1,
},
{
Keyword: "non-exist",
IDs: []int64{},
Langs: 0,
},
}
)

for _, kw := range keywords {
total, res, langs, err := idx.Search(nil, "", kw.Keyword, 1, 10)
assert.NoError(t, err)
assert.EqualValues(t, len(kw.IDs), total)

assert.NotNil(t, langs)
assert.Len(t, langs, kw.Langs)

var ids = make([]int64, 0, len(res))
for _, hit := range res {
ids = append(ids, hit.RepoID)
}
assert.EqualValues(t, kw.IDs, ids)
}
testIndexer("beleve", t, idx)
}

0 comments on commit 9bc69ff

Please sign in to comment.