internal/fuzzy: improvements to the symbol scoring algorithm
Based on feedback in golang/go#60027, tweak the fuzzy symbol scoring
algorithm to much more strongly prefer sequential and exact matches.

Fixes golang/go#60027

Change-Id: I1c6d019065c4dff4adf2db9e94397a635e13d50f
Reviewed-on: https://go-review.googlesource.com/c/tools/+/493623
gopls-CI: kokoro <noreply+kokoro@google.com>
Run-TryBot: Robert Findley <rfindley@google.com>
Reviewed-by: Paul Jolly <paul@myitcv.org.uk>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Alan Donovan <adonovan@google.com>
findleyr committed May 9, 2023
1 parent 3449242 commit ddfa220
Showing 5 changed files with 65 additions and 26 deletions.
4 changes: 2 additions & 2 deletions gopls/internal/lsp/cmd/usage/workspace_symbol.hlp
@@ -9,5 +9,5 @@ Example:

workspace_symbol-flags:
-matcher=string
specifies the type of matcher: fuzzy, caseSensitive, or caseInsensitive.
The default is caseInsensitive.
specifies the type of matcher: fuzzy, fastfuzzy, casesensitive, or caseinsensitive.
The default is caseinsensitive.
7 changes: 4 additions & 3 deletions gopls/internal/lsp/cmd/workspace_symbol.go
@@ -8,6 +8,7 @@ import (
"context"
"flag"
"fmt"
"strings"

"golang.org/x/tools/gopls/internal/lsp/protocol"
"golang.org/x/tools/gopls/internal/lsp/source"
@@ -16,7 +17,7 @@ import (

// workspaceSymbol implements the workspace_symbol verb for gopls.
type workspaceSymbol struct {
Matcher string `flag:"matcher" help:"specifies the type of matcher: fuzzy, caseSensitive, or caseInsensitive.\nThe default is caseInsensitive."`
Matcher string `flag:"matcher" help:"specifies the type of matcher: fuzzy, fastfuzzy, casesensitive, or caseinsensitive.\nThe default is caseinsensitive."`

app *Application
}
@@ -46,10 +47,10 @@ func (r *workspaceSymbol) Run(ctx context.Context, args ...string) error {
if opts != nil {
opts(o)
}
switch r.Matcher {
switch strings.ToLower(r.Matcher) {
case "fuzzy":
o.SymbolMatcher = source.SymbolFuzzy
case "caseSensitive":
case "casesensitive":
o.SymbolMatcher = source.SymbolCaseSensitive
case "fastfuzzy":
o.SymbolMatcher = source.SymbolFastFuzzy
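For reference, the strings.ToLower call above means the -matcher flag value is now compared case-insensitively, so -matcher=caseSensitive and -matcher=casesensitive select the same matcher. A minimal sketch of the lookup follows; the resolveMatcher helper and the local symbolMatcher stand-in are made up for illustration (the real constants live in gopls/internal/lsp/source), and the fallback mirrors the documented default, caseinsensitive.

package main

import (
    "fmt"
    "strings"
)

// symbolMatcher stands in for source.SymbolMatcher in this sketch.
type symbolMatcher string

const (
    symbolFuzzy           symbolMatcher = "Fuzzy"
    symbolFastFuzzy       symbolMatcher = "FastFuzzy"
    symbolCaseSensitive   symbolMatcher = "CaseSensitive"
    symbolCaseInsensitive symbolMatcher = "CaseInsensitive"
)

// resolveMatcher mirrors the switch in (*workspaceSymbol).Run: the flag value
// is lowercased before matching, and unrecognized values fall back to the
// documented default, caseinsensitive.
func resolveMatcher(flag string) symbolMatcher {
    switch strings.ToLower(flag) {
    case "fuzzy":
        return symbolFuzzy
    case "casesensitive":
        return symbolCaseSensitive
    case "fastfuzzy":
        return symbolFastFuzzy
    default:
        return symbolCaseInsensitive
    }
}

func main() {
    fmt.Println(resolveMatcher("caseSensitive")) // CaseSensitive
    fmt.Println(resolveMatcher("CASESENSITIVE")) // CaseSensitive
    fmt.Println(resolveMatcher("unknown"))       // CaseInsensitive
}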
5 changes: 4 additions & 1 deletion gopls/internal/lsp/source/workspace_symbol.go
@@ -484,7 +484,10 @@ func matchFile(store *symbolStore, symbolizer symbolizer, matcher matcherFunc, r
// every field or method nesting level to access the field decreases
// the score by a factor of 1.0 - depth*depthFactor, up to a depth of
// 3.
depthFactor = 0.2
//
// Use a small constant here, as this exists mostly to break ties
// (e.g. given a type Foo and a field x.Foo, prefer Foo).
depthFactor = 0.01
)

startWord := true
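To put numbers on the depthFactor change above: the comment says each nesting level scales a symbol's score by 1.0 - depth*depthFactor, with depth capped at 3. A small sketch comparing the old constant (0.2) with the new one (0.01); the depthMultiplier helper is made up for illustration.

package main

import "fmt"

// depthMultiplier applies the formula from the comment above: the score is
// scaled by 1.0 - depth*depthFactor, with depth capped at 3.
func depthMultiplier(depth int, depthFactor float64) float64 {
    if depth > 3 {
        depth = 3
    }
    return 1.0 - float64(depth)*depthFactor
}

func main() {
    for _, factor := range []float64{0.2, 0.01} { // old constant, new constant
        for depth := 0; depth <= 3; depth++ {
            fmt.Printf("depthFactor=%.2f depth=%d multiplier=%.2f\n",
                factor, depth, depthMultiplier(depth, factor))
        }
    }
    // With 0.2, a symbol nested three levels deep kept only 40% of its score;
    // with 0.01 it keeps 97%, so depth now mostly breaks ties (type Foo vs.
    // field x.Foo) rather than dominating the ranking.
}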
44 changes: 26 additions & 18 deletions internal/fuzzy/symbol.go
@@ -26,9 +26,6 @@ import (
// symbol or identifiers, so doing this avoids allocating strings.
// - We can return the index of the right-most match, allowing us to trim
// irrelevant qualification.
//
// This implementation is experimental, serving as a reference fast algorithm
// to compare to the fuzzy algorithm implemented by Matcher.
type SymbolMatcher struct {
// Using buffers of length 256 is both a reasonable size for most qualified
// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
@@ -169,19 +166,29 @@ input:
// Score is the average score for each character.
//
// A character score is the multiple of:
// 1. 1.0 if the character starts a segment, .8 if the character start a
// mid-segment word, otherwise 0.6. This carries over to immediately
// following characters.
// 2. For the final character match, the multiplier from (1) is reduced to
// .8 if the next character in the input is a mid-segment word, or 0.6 if
// the next character in the input is not a word or segment start. This
// ensures that we favor whole-word or whole-segment matches over prefix
// matches.
// 3. 1.0 if the character is part of the last segment, otherwise
// 1.0-.2*<segments from the right>, with a max segment count of 3.
// 1. 1.0 if the character starts a segment or is preceded by a matching
// character, 0.9 if the character starts a mid-segment word, else 0.6.
//
// Note that characters preceded by a matching character get the max
// score of 1.0 so that sequential or exact matches are preferred, even
// if they don't start/end at a segment or word boundary. For example, a
// match for "func" in intfuncs should have a higher score than in
// ifunmatched.
//
// For the final character match, the multiplier from (1) is reduced to
// 0.9 if the next character in the input is a mid-segment word, or 0.6
// if the next character in the input is not a word or segment start.
// This ensures that we favor whole-word or whole-segment matches over
// prefix matches.
//
// 2. 1.0 if the character is part of the last segment, otherwise
// 1.0-0.1*<segments from the right>, with a max segment count of 3.
// Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
// early in a qualified symbol name) still scores higher than _f_o_o_
// (a completely split match).
//
// This is a very naive algorithm, but it is fast. There's lots of prior art
// here, and we should leverage it. For example, we could explicitly consider
// This is a naive algorithm, but it is fast. There's lots of prior art here
// that could be leveraged. For example, we could explicitly consider
// character distance, and exact matches of words or segments.
//
// Also note that this might not actually find the highest scoring match, as
@@ -192,10 +199,10 @@ input:
p = m.pattern[pi]

const (
segStreak = 1.0
wordStreak = 0.8
segStreak = 1.0 // start of segment or sequential match
wordStreak = 0.9 // start of word match
noStreak = 0.6
perSegment = 0.2 // we count at most 3 segments above
perSegment = 0.1 // we count at most 3 segments above
)

streakBonus := noStreak
@@ -228,6 +235,7 @@ input:
if finalChar {
break
}
streakBonus = segStreak // see above: sequential characters get the max score
} else {
streakBonus = noStreak
}
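To make the streak bonuses concrete, here is a deliberately simplified sketch of the per-character scoring described in the comment above. It is not the real implementation (it ignores word starts, segment depth, case folding, and right-most matching); it is just enough to show why a sequential match like "func" in intfuncs now outranks the split match in ifunmatched. The toyScore helper is made up for illustration.

package main

import (
    "fmt"
    "strings"
)

// toyScore matches the pattern greedily left to right and averages a
// per-character bonus: 1.0 when the character is sequential with the previous
// match or starts a segment, 0.6 otherwise. It returns 0 when the pattern
// does not match.
func toyScore(pattern, candidate string) float64 {
    total := 0.0
    ci := 0         // next byte to search from in candidate
    prevMatch := -2 // byte index of the previous matched character
    for _, p := range pattern {
        j := strings.IndexRune(candidate[ci:], p)
        if j < 0 {
            return 0
        }
        ci += j
        bonus := 0.6
        if ci == prevMatch+1 || ci == 0 || candidate[ci-1] == '.' || candidate[ci-1] == '/' {
            bonus = 1.0 // sequential match, or start of a segment
        }
        total += bonus
        prevMatch = ci
        ci++
    }
    return total / float64(len(pattern))
}

func main() {
    fmt.Printf("%.2f\n", toyScore("func", "intfuncs"))    // 0.90: f gets 0.6, then u, n, c are sequential
    fmt.Printf("%.2f\n", toyScore("func", "ifunmatched")) // 0.80: the final c is not sequential
}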
31 changes: 29 additions & 2 deletions internal/fuzzy/symbol_test.go
@@ -40,12 +40,12 @@ func TestSymbolRanking(t *testing.T) {
symbols := []string{
"this.is.better.than.most",
"test.foo.bar",
"atest",
"thebest",
"test.foo",
"test.foo",
"tTest",
"atest",
"testage",
"tTest",
"foo.test",
"test",
}
@@ -60,6 +60,33 @@
}
}

// Test that we strongly prefer exact matches.
//
// In golang/go#60027, we preferred "Runner" for the query "rune" over several
// results containing the word "rune" exactly. Following this observation,
// scoring was tweaked to more strongly emphasize sequential characters and
// exact matches.
func TestSymbolRanking_Issue60027(t *testing.T) {
matcher := NewSymbolMatcher("rune")

// symbols to match, in ascending order of ranking.
symbols := []string{
"Runner",
"singleRuneParam",
"Config.ifsRune",
"Parser.rune",
}
prev := 0.0
for _, sym := range symbols {
_, score := matcher.Match([]string{sym})
t.Logf("Match(%q) = %v", sym, score)
if score < prev {
t.Errorf("Match(%q) = _, %v, want > %v", sym, score, prev)
}
prev = score
}
}

func TestChunkedMatch(t *testing.T) {
matcher := NewSymbolMatcher("test")

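For anyone who wants to poke at the new ranking interactively, here is a minimal program mirroring TestSymbolRanking_Issue60027 above. Note that it has to live inside the golang.org/x/tools module, since internal/fuzzy is an internal package.

package main

import (
    "fmt"

    "golang.org/x/tools/internal/fuzzy"
)

func main() {
    matcher := fuzzy.NewSymbolMatcher("rune")

    // Candidates from the issue, in ascending order of expected score:
    // symbols containing "rune" exactly should now outrank "Runner".
    for _, sym := range []string{"Runner", "singleRuneParam", "Config.ifsRune", "Parser.rune"} {
        _, score := matcher.Match([]string{sym})
        fmt.Printf("%-16s %.3f\n", sym, score)
    }
}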
