Add language aliases for broader support.
Reuse known-language stopwords for similar languages. If/when support for those languages is added, the aliases are ignored.

Ref: #2601
srfrog committed Sep 19, 2018
1 parent 66d9f94 commit 4636597
Showing 4 changed files with 45 additions and 23 deletions.
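The core of the change is in tok/fts.go: each entry in langToCode can now carry a comma-separated list of country codes, and initFullTextTokenizers registers a full-text tokenizer for every code in that list, so an aliased code such as "es-es" reuses the Spanish stemmer and stopword list. A minimal, self-contained sketch of that lookup (standing in for the real Dgraph packages, with only a couple of map entries):

```go
package main

import (
	"fmt"
	"strings"
)

// Simplified copy of the aliasing scheme from tok/fts.go: each language maps
// to a comma-separated list of country codes, the first one being canonical.
var langToCode = map[string]string{
	"spanish": "es,es-es,es-419",
	"french":  "fr,fr-ca",
}

// countryCodes mirrors the new helper in the diff: split the alias list.
func countryCodes(lang string) []string {
	codes, ok := langToCode[lang]
	if !ok {
		panic("Unsupported language: " + lang)
	}
	return strings.Split(codes, ",")
}

// countryCode keeps its old meaning: the canonical (first) code for a language.
func countryCode(lang string) string {
	return countryCodes(lang)[0]
}

func main() {
	// initFullTextTokenizers now loops over every alias, registering one
	// full-text tokenizer per code; all of them share the Spanish stemmer
	// and stopword list.
	for _, cc := range countryCodes("spanish") {
		fmt.Println(cc) // es, es-es, es-419
	}
	fmt.Println(countryCode("spanish")) // es
}
```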
2 changes: 1 addition & 1 deletion edgraph/server.go
@@ -356,7 +356,7 @@ func (s *Server) Mutate(ctx context.Context, mu *api.Mutation) (resp *api.Assign
resp.Context, err = query.ApplyMutations(ctx, m)
if !mu.CommitNow {
if err == y.ErrConflict {
err = status.Errorf(codes.FailedPrecondition, err.Error())
err = status.Error(codes.FailedPrecondition, err.Error())
}
return resp, err
}
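For context on this hunk: `status.Errorf` runs its message through printf-style formatting, so handing it `err.Error()` directly would mangle any `%` that happens to appear in the error text, whereas `status.Error` stores the message verbatim. A small sketch of the difference (the error value here is made up for illustration):

```go
package main

import (
	"errors"
	"fmt"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

func main() {
	// Hypothetical conflict error whose message happens to contain a '%'.
	err := errors.New("conflict at key a%dkey")

	// Errorf treats the message as a printf format string...
	bad := status.Errorf(codes.FailedPrecondition, err.Error())
	// ...while Error stores it verbatim.
	good := status.Error(codes.FailedPrecondition, err.Error())

	fmt.Println(bad.Error())  // ... desc = conflict at key a%!d(MISSING)key
	fmt.Println(good.Error()) // ... desc = conflict at key a%dkey
}
```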
4 changes: 3 additions & 1 deletion posting/index.go
@@ -16,6 +16,8 @@ import (
"time"

"golang.org/x/net/trace"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"

"github.com/dgraph-io/badger"

@@ -54,7 +56,7 @@ func indexTokens(attr, lang string, src types.Val) ([]string, error) {
if ok {
it = newTokenizer
} else {
return nil, x.Errorf("Tokenizer not available for language: %s", lang)
return nil, status.Errorf(codes.Internal, "Tokenizer not available for language: %s", lang)
}
}
if schemaType == types.StringID {
10 changes: 10 additions & 0 deletions posting/index_test.go
@@ -100,6 +100,16 @@ func TestIndexingInvalidLang(t *testing.T) {
require.Error(t, err)
}

func TestIndexingAliasedLang(t *testing.T) {
schema.ParseBytes([]byte("name:string @index(fulltext) @lang ."), 1)

// "es" has a full-text tokenizer and "es-es" is aliased to it, so both should index without error
_, err := indexTokens("name", "es", types.Val{types.StringID, []byte("error")})
require.NoError(t, err)
_, err = indexTokens("name", "es-es", types.Val{types.StringID, []byte("error")})
require.NoError(t, err)
}

func addMutation(t *testing.T, l *List, edge *intern.DirectedEdge, op uint32,
startTs uint64, commitTs uint64, index bool) {
if op == Del {
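For reference, the new test sits next to TestIndexingInvalidLang in posting/index_test.go and can be run on its own with `go test ./posting -run TestIndexingAliasedLang`.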
52 changes: 31 additions & 21 deletions tok/fts.go
@@ -8,6 +8,8 @@
package tok

import (
"strings"

"github.com/dgraph-io/dgraph/x"

"github.com/blevesearch/bleve/analysis/analyzer/custom"
@@ -48,10 +50,12 @@ func initFullTextTokenizers() {
continue
}

defineStemmer(lang)
defineStopWordsList(lang)
defineAnalyzer(lang)
registerTokenizer(&FullTextTokenizer{Lang: countryCode(lang)})
for _, cc := range countryCodes(lang) {
defineStemmer(cc, lang)
defineStopWordsList(cc, lang)
defineAnalyzer(cc, lang)
registerTokenizer(&FullTextTokenizer{Lang: cc})
}
}

for _, lang := range [...]string{"chinese", "japanese", "korean"} {
@@ -74,16 +78,16 @@ func defineNormalizer() {
x.Check(err)
}

func defineStemmer(lang string) {
_, err := bleveCache.DefineTokenFilter(stemmerName(countryCode(lang)), map[string]interface{}{
func defineStemmer(cc, lang string) {
_, err := bleveCache.DefineTokenFilter(stemmerName(cc), map[string]interface{}{
"type": stemmer.Name,
"lang": lang,
})
x.Check(err)
}

func defineStopWordsList(lang string) {
name := stopWordsListName(countryCode(lang))
func defineStopWordsList(cc, lang string) {
name := stopWordsListName(cc)
_, err := bleveCache.DefineTokenMap(name, map[string]interface{}{
"type": tokenmap.Name,
"tokens": stopwords[lang],
@@ -122,16 +126,15 @@ func defineDefaultFullTextAnalyzer() {
}

// full text search analyzer - does language-specific stop-words removal and stemming
func defineAnalyzer(lang string) {
ln := countryCode(lang)
_, err := bleveCache.DefineAnalyzer(FtsTokenizerName(ln), map[string]interface{}{
func defineAnalyzer(cc, lang string) {
_, err := bleveCache.DefineAnalyzer(FtsTokenizerName(cc), map[string]interface{}{
"type": custom.Name,
"tokenizer": unicode.Name,
"token_filters": []string{
lowercase.Name,
normalizerName,
stopWordsListName(ln),
stemmerName(ln),
stopWordsListName(cc),
stemmerName(cc),
},
})
x.Check(err)
@@ -166,18 +169,25 @@ func stopWordsListName(lang string) string {
}

func countryCode(lang string) string {
code, ok := langToCode[lang]
codes := countryCodes(lang)
return codes[0]
}

func countryCodes(lang string) []string {
codes, ok := langToCode[lang]
x.AssertTruef(ok, "Unsupported language: %s", lang)
return code
return strings.Split(codes, ",")
}

func init() {
// List based on https://godoc.org/golang.org/x/text/language#Tag
// It contains more languages than supported by Bleve, to enable seamless addition of new langs.
// Issue#2601: added aliasing of related languages to broaden support. When those langs are added
// the aliases won't matter.
langToCode = map[string]string{
"afrikaans": "af",
"amharic": "am",
"arabic": "ar",
"arabic": "ar,ar-001",
"modernstandardarabic": "ar-001",
"azerbaijani": "az",
"bulgarian": "bg",
@@ -187,17 +197,17 @@
"danish": "da",
"german": "de",
"greek": "el",
"english": "en",
"english": "en,en-us,en-gb",
"americanenglish": "en-us",
"britishenglish": "en-gb",
"spanish": "es",
"spanish": "es,es-es,es-419",
"europeanspanish": "es-es",
"latinamericanspanish": "es-419",
"estonian": "et",
"persian": "fa",
"finnish": "fi",
"filipino": "fil",
"french": "fr",
"french": "fr,fr-ca",
"canadianfrench": "fr-ca",
"gujarati": "gu",
"hebrew": "he",
@@ -229,7 +239,7 @@
"norwegian": "no",
"punjabi": "pa",
"polish": "pl",
"portuguese": "pt",
"portuguese": "pt,pt-br,pt-pt",
"brazilianportuguese": "pt-br",
"europeanportuguese": "pt-pt",
"romanian": "ro",
@@ -238,7 +248,7 @@
"slovak": "sk",
"slovenian": "sl",
"albanian": "sq",
"serbian": "sr",
"serbian": "sr,sr-latn",
"serbianlatin": "sr-latn",
"swedish": "sv",
"swahili": "sw",
