Skip to content

Commit

Permalink
remove double results (fix #19)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed May 12, 2022
1 parent 08d8dcd commit 11eb7ec
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 10 deletions.
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
.idea
t
*.json
*.tsv
*.csv
*t.json
*t.tsv
*t.csv
prof
.vscode
200k-lines.txt
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## Unreleased

- Fix [#19]: duplication of results of fuzzy matching.

## [v0.2.1] - 2022-05-11 Wed

- Add [#23]: score details in results.
Expand Down
13 changes: 11 additions & 2 deletions ent/fuzzy/fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,18 @@ func (f *fuzzy) FindFuzzy(stem string) []string {

// find collects the canonical forms registered for every stem that
// f.trie.FuzzyMatches returns for the given stem and maxDist (presumably the
// maximum edit distance -- confirm against the trie implementation).
//
// Several stems can map to the same canonical form, so the result is
// deduplicated (issue #19). Canonical forms are returned in first-seen order,
// following the order of the matched stems; ranging over a map to build the
// result would make the order vary between calls. The returned slice is
// never nil.
func (f *fuzzy) find(stem string, maxDist int) []string {
	stems := f.trie.FuzzyMatches(stem, maxDist)
	seen := make(map[string]struct{}, len(stems))
	res := make([]string, 0, len(stems))
	for _, s := range stems {
		for _, c := range f.canonicals[s] {
			if _, ok := seen[c]; ok {
				// Already emitted via another stem.
				continue
			}
			seen[c] = struct{}{}
			res = append(res, c)
		}
	}
	return res
}
26 changes: 26 additions & 0 deletions gndiff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,32 @@ func TestScore(t *testing.T) {
assert.Equal("Abelia forrestii var. gracilenta (W.W.Sm.) Landrein", abelia.ReferenceRecords[0].Name)
}

// TestNoDuplicates is a regression test for issue #19: a source name that
// fuzzy-matches several similar reference names must list each reference
// record once, not several times.
//
// The fixture holds one misspelled source name and two reference names that
// differ only in rank (var. vs subsp.), so exactly two reference records are
// expected for the single match.
func TestNoDuplicates(t *testing.T) {
	assert := assert.New(t)
	cfg := config.New()
	ing := ingestio.New(cfg)

	src := filepath.Join(path, "issue-19-src.csv")
	recSrc, err := ing.Records(src)
	if !assert.NoError(err) {
		return
	}

	ref := filepath.Join(path, "issue-19-ref.csv")
	recRef, err := ing.Records(ref)
	if !assert.NoError(err) {
		return
	}

	gnd := gndiff.New(cfg)
	res, err := gnd.Compare(recSrc, recRef)
	// Assertions do not stop the test by themselves; bail out explicitly so
	// a failed comparison is reported as a failure instead of a panic on the
	// indexing below.
	if !assert.NoError(err) {
		return
	}
	if !assert.Len(res.Matches, len(recSrc)) || !assert.NotEmpty(res.Matches) {
		return
	}

	rrs := res.Matches[0].ReferenceRecords
	if !assert.Len(rrs, 2) {
		return
	}
	assert.Equal("Obione maritima (Alfredo) Pacino var. maritima", rrs[0].Name)
	assert.Equal("Obione maritima (Alfredo) Pacino subsp. maritima", rrs[1].Name)
}

func TestGNdiff(t *testing.T) {
cfg := config.New()
ing := ingestio.New(cfg)
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ require (
github.com/devopsfaith/bloomfilter v1.4.0
github.com/dgraph-io/badger/v2 v2.2007.4
github.com/dvirsky/levenshtein v0.0.0-20200624034316-59b26b61c3c8
github.com/gnames/gnames v1.0.0-RC1
github.com/gnames/gnfmt v0.2.0
github.com/gnames/gnames v1.0.0-RC2
github.com/gnames/gnfmt v0.3.0
github.com/gnames/gnlib v0.14.0
github.com/gnames/gnparser v1.6.5
github.com/gnames/gnsys v0.2.2
Expand Down
7 changes: 4 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,12 @@ github.com/georgysavva/scany v0.3.0/go.mod h1:q8QyrfXjmBk9iJD00igd4lbkAKEXAH/zIY
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/gin-contrib/sse v0.0.0-20170109093832-22d885f9ecc7/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s=
github.com/gin-gonic/gin v1.1.5-0.20170702092826-d459835d2b07/go.mod h1:7cKuhb5qV2ggCFctp2fJQ+ErvciLZrIeoOSOm6mUr7Y=
github.com/gnames/gnames v1.0.0-RC1 h1:lf/+5rBUczUYGDo9LBgwT9Xykkc0hdQqlvldlaKrHTo=
github.com/gnames/gnames v1.0.0-RC1/go.mod h1:RT99vAA83z/qxuVp9rH13YWtZk4hOIZ1N9fo5vWioYI=
github.com/gnames/gnames v1.0.0-RC2 h1:y6fc8ROHnOTN0zXPnkjvdCEhiBC0zvIFz67vEbUHCTk=
github.com/gnames/gnames v1.0.0-RC2/go.mod h1:RT99vAA83z/qxuVp9rH13YWtZk4hOIZ1N9fo5vWioYI=
github.com/gnames/gnfmt v0.1.0/go.mod h1:WG9c3CoiVrGc1SDsxLk7zjmv2B4UIzI00m4K5Khc/d0=
github.com/gnames/gnfmt v0.2.0 h1:CjE1HxdqyTwufua5wMCdILWnCsCfRiHe5G4TgxR8aAI=
github.com/gnames/gnfmt v0.2.0/go.mod h1:0Aog37s1ZNpmUwVQOf+lnx0SQq8r2EvfE/pLYGiJlJQ=
github.com/gnames/gnfmt v0.3.0 h1:MvHFDXhzeDLt2B3R76Dwg89YCSNPD4PNDTWhF5Anqy4=
github.com/gnames/gnfmt v0.3.0/go.mod h1:0Aog37s1ZNpmUwVQOf+lnx0SQq8r2EvfE/pLYGiJlJQ=
github.com/gnames/gnlib v0.0.3/go.mod h1:k7Wsdx6zTzyhbxH2JcoCVFvgOQzA4iwehRxa8eEOfF8=
github.com/gnames/gnlib v0.3.2/go.mod h1:IIo4lQ8hmW/pCmudLgwJ4KdoKWF6B4jFCdVcowG/evw=
github.com/gnames/gnlib v0.6.6/go.mod h1:DgK8NcrG2YhOS1Nx0cxt8+Gdwu1cw/abSZdhkQKjIH8=
Expand Down
3 changes: 3 additions & 0 deletions testdata/issue-19-ref.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
TaxonID,Family,ScientificName
1001,,Obione maritima (Alfredo) Pacino var. maritima
1002,,Obione maritima (Alfredo) Pacino subsp. maritima
2 changes: 2 additions & 0 deletions testdata/issue-19-src.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ScientificName
Obione maritima (Alfredo) Pacino var. maritimaa

0 comments on commit 11eb7ec

Please sign in to comment.