diff --git a/.gitignore b/.gitignore index 2122739..3d40f72 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,8 @@ .idea t -*.json -*.tsv -*.csv +*t.json +*t.tsv +*t.csv prof .vscode 200k-lines.txt diff --git a/ent/matcher/match.go b/ent/matcher/match.go index 6aba0b0..2be4382 100644 --- a/ent/matcher/match.go +++ b/ent/matcher/match.go @@ -1,6 +1,7 @@ package matcher import ( + "fmt" "strings" "github.com/gnames/gndiff/ent/record" @@ -14,6 +15,7 @@ func (m *matcher) Match(rec record.Record) ([]record.Record, error) { } res, err = m.MatchFuzzy(rec.Canonical.Simple, rec.Canonical.Stemmed) + fmt.Printf("FUZZY: %#v\n\n", res) if len(res) > 0 || err != nil { return res, err } diff --git a/gndiff_test.go b/gndiff_test.go index a1dbec7..bf414d5 100644 --- a/gndiff_test.go +++ b/gndiff_test.go @@ -1,6 +1,7 @@ package gndiff_test import ( + "fmt" "path/filepath" "testing" @@ -40,6 +41,35 @@ func TestScore(t *testing.T) { assert.Equal("Abelia forrestii var. gracilenta (W.W.Sm.) Landrein", abelia.ReferenceRecords[0].Name) } +// Issue #19: duplicated results for similar names +func TestNoDuplicates(t *testing.T) { + assert := assert.New(t) + cfg := config.New() + ing := ingestio.New(cfg) + + src := filepath.Join(path, "issue-19-src.csv") + recSrc, err := ing.Records(src) + assert.Nil(err) + + ref := filepath.Join(path, "issue-19-ref.csv") + recRef, err := ing.Records(ref) + assert.Nil(err) + + gnd := gndiff.New(cfg) + res, err := gnd.Compare(recSrc, recRef) + assert.Nil(err) + fmt.Printf("SRC: %#v\n\n", len(recSrc)) + for _, v := range res.Matches[0].ReferenceRecords { + fmt.Printf("RES: %#v\n\n", v) + } + assert.Equal(len(recSrc), len(res.Matches)) + + obione := res.Matches[0] + abelia := res.Matches[1] + bubo := res.Matches[2] + _, _, _ = obione, abelia, bubo +} + func TestGNdiff(t *testing.T) { cfg := config.New() ing := ingestio.New(cfg) diff --git a/go.mod b/go.mod index 2ecff7c..154b8a8 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/devopsfaith/bloomfilter v1.4.0 github.com/dgraph-io/badger/v2 v2.2007.4 github.com/dvirsky/levenshtein v0.0.0-20200624034316-59b26b61c3c8 - github.com/gnames/gnames v1.0.0-RC1 + github.com/gnames/gnames v1.0.0-RC2 github.com/gnames/gnfmt v0.2.0 github.com/gnames/gnlib v0.14.0 github.com/gnames/gnparser v1.6.5 diff --git a/go.sum b/go.sum index db02cf8..8d54db7 100644 --- a/go.sum +++ b/go.sum @@ -202,8 +202,8 @@ github.com/georgysavva/scany v0.3.0/go.mod h1:q8QyrfXjmBk9iJD00igd4lbkAKEXAH/zIY github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/gin-contrib/sse v0.0.0-20170109093832-22d885f9ecc7/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= github.com/gin-gonic/gin v1.1.5-0.20170702092826-d459835d2b07/go.mod h1:7cKuhb5qV2ggCFctp2fJQ+ErvciLZrIeoOSOm6mUr7Y= -github.com/gnames/gnames v1.0.0-RC1 h1:lf/+5rBUczUYGDo9LBgwT9Xykkc0hdQqlvldlaKrHTo= -github.com/gnames/gnames v1.0.0-RC1/go.mod h1:RT99vAA83z/qxuVp9rH13YWtZk4hOIZ1N9fo5vWioYI= +github.com/gnames/gnames v1.0.0-RC2 h1:y6fc8ROHnOTN0zXPnkjvdCEhiBC0zvIFz67vEbUHCTk= +github.com/gnames/gnames v1.0.0-RC2/go.mod h1:RT99vAA83z/qxuVp9rH13YWtZk4hOIZ1N9fo5vWioYI= github.com/gnames/gnfmt v0.1.0/go.mod h1:WG9c3CoiVrGc1SDsxLk7zjmv2B4UIzI00m4K5Khc/d0= github.com/gnames/gnfmt v0.2.0 h1:CjE1HxdqyTwufua5wMCdILWnCsCfRiHe5G4TgxR8aAI= github.com/gnames/gnfmt v0.2.0/go.mod h1:0Aog37s1ZNpmUwVQOf+lnx0SQq8r2EvfE/pLYGiJlJQ= diff --git a/testdata/issue-19-ref.csv b/testdata/issue-19-ref.csv new file mode 100644 index 0000000..1f36190 --- /dev/null +++ b/testdata/issue-19-ref.csv @@ -0,0 +1,3 @@ +TaxonID,Family,ScientificName +1001,,Obione maritima (Alfredo) Pacino var. maritima +1002,,Obione maritima (Alfredo) Pacino subsp. maritima \ No newline at end of file diff --git a/testdata/issue-19-src.csv b/testdata/issue-19-src.csv new file mode 100644 index 0000000..13954e7 --- /dev/null +++ b/testdata/issue-19-src.csv @@ -0,0 +1,4 @@ +ScientificName +Obione maritima (Alfredo) Pacino var. maritimaa +Obione maritima (Alfredo) Di'Stefano subsp. maritima +Quercus lamarkensis \ No newline at end of file