diff --git a/README.md b/README.md
index f973c18..56b0240 100644
--- a/README.md
+++ b/README.md
@@ -17,9 +17,9 @@ tok.MinWordSize = 3
 // set default stop words
 tok.DefaultStopWordFunc = IsGermanStopWord
-reuslt := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
+result := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
 
 // custom stop words
-reuslt2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
+result2 := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
 	if val == "fussball" {
 		return true
 	}
@@ -31,11 +31,20 @@ reuslt2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asom
 ```
 
 # Benchmark Results
-goos: darwin
+goos: linux
 goarch: amd64
-cpu: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-|Benchmark|runs|time/op|B/op|allocs/op|
-|---|---|---|---|---|
-BenchmarkURLTokenizerV2-12|2026138|605.3 ns/op|256 B/op|1 allocs/op
-BenchmarkURLTokenizerV2Fast-12|3609961|330.6 ns/op|256 B/op|1 allocs/op
-BenchmarkURLTokenizerV1-12|1766235|676.3 ns/op|272 B/op|2 allocs/op
\ No newline at end of file
+pkg: github.com/emetriq/gourltokenizer/tokenizer
+cpu: 11th Gen Intel(R) Core(TM) i5-11500H @ 2.90GHz
+| Benchmark                         | runs    | time/op     | B/op     | allocs/op   |
+|-----------------------------------|---------|-------------|----------|-------------|
+| BenchmarkEscapedURLTokenizerV3-12 | 1000000 | 1080 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV3-12        | 4751826 | 255.5 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkURLTokenizerV3Fast-12    | 6231590 | 191.6 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkEscapedURLTokenizerV2-12 | 1000000 | 1042 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV2-12        | 3813273 | 484.2 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkURLTokenizerV2Fast-12    | 5835351 | 199.6 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkEscapedURLTokenizerV1-12 | 1942860 | 1084 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV1-12        | 2495599 | 510.7 ns/op | 272 B/op | 2 allocs/op |
+| BenchmarkTokenizerV1-12           | 9431893 | 122.9 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkTokenizerV2-12           | 7669710 | 157.0 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkTokenizerV3-12           | 8120326 | 158.3 ns/op | 256 B/op | 1 allocs/op |
\ No newline at end of file
diff --git a/tokenizer/tokenizer.go b/tokenizer/tokenizer.go
index 8f92098..68c20ee 100644
--- a/tokenizer/tokenizer.go
+++ b/tokenizer/tokenizer.go
@@ -36,7 +36,8 @@ func stringContainsByteChar(s string, r byte) bool {
 }
 
 // TokenizeV3 splits URL to host and path parts and tokenize path and host part
-// all terms are returned in lower case
+// all terms are returned in lower case. If a word contains a digit, the whole
+// word is filtered out.
 func TokenizeV3(encodedURL string, stopwordfunc ...func(string) bool) []string {
 	encodedURLLower := strings.ToLower(encodedURL)
 	var result []string
@@ -72,8 +73,8 @@ func TokenizeFastV3(encodedURL string, stopwordfunc ...func(string) bool) []stri
 	return result
 }
 
-//TokenizeV2 splits URL to host and path parts and tokenize path and host part
-//all terms are returned in lower case
+// TokenizeV2 splits URL to host and path parts and tokenizes path and host part;
+// all terms are returned in lower case
 func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
 	encodedURLLower := strings.ToLower(encodedURL)
 	decodedURL, err := url.QueryUnescape(encodedURLLower)
@@ -91,8 +92,8 @@ func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
 	return result
 }
 
-//TokenizeFastV2 splits URL to host and path parts and tokenize path and host part without url decoding
-//all terms are returned in lower case
+// TokenizeFastV2 splits URL to host and path parts and tokenizes path and host part without URL decoding;
+// all terms are returned in lower case
 func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
 	urlLower := strings.ToLower(encodedURL)
 	result := tokenizeV2(urlLower)
@@ -102,8 +103,8 @@ func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []stri
 	return result
 }
 
-//TokenizeURL splits URL to host and path parts and tokenize path part
-//all terms are returned in lower case
+// TokenizeV1 splits URL to host and path parts and tokenizes the path part;
+// all terms are returned in lower case
 func TokenizeV1(url string, stopwordfunc ...func(string) bool) []string {
 	urlToParse := url
 	if !strings.HasPrefix(url, "http") && !strings.HasPrefix(url, "mailto") {
@@ -135,6 +136,7 @@ func tokenizeV3(str string) []string {
 	start := -1
 	dotCounter := 0
 	isDotCountMode := true
+	isContainingNumber := false
 	domainNameEndIndex := -1
 	domainNameStartIndex := startIndex
 	var b byte
@@ -149,12 +151,21 @@
 			start = idx
 		}
 		if idx == lastIndex && ((lastIndex-start+1) >= MinWordSize || isDotCountMode) {
-			result = append(result, str[start:strLen])
+			if !isContainingNumber {
+				result = append(result, str[start:strLen])
+			}
+			isContainingNumber = false
 		}
+	} else if b >= '0' && b <= '9' && !isDotCountMode {
+		isContainingNumber = true
 	} else if ((idx-start) >= MinWordSize || isDotCountMode) && start > -1 {
-		result = append(result, str[start:idx])
+		if !isContainingNumber {
+			result = append(result, str[start:idx])
+		}
+		isContainingNumber = false
 		start = -1
 	} else {
+		isContainingNumber = false
 		start = -1
 	}
 	if b == '/' && isDotCountMode {
diff --git a/tokenizer/tokenizer_test.go b/tokenizer/tokenizer_test.go
index 0c9d2c8..7c97c7e 100644
--- a/tokenizer/tokenizer_test.go
+++ b/tokenizer/tokenizer_test.go
@@ -11,25 +11,25 @@ func init() {
 }
 
 func Test_tokenizeCorrectPath(t *testing.T) {
 	path := "/some-thing/very/interesting?queryparam2=1&queryparam2=3"
-	result := tokenizeV2(path)
+	result := tokenizeV3(path)
 	assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
 }
 
 func Test_tokenizePathWithDashes(t *testing.T) {
 	path := "/some-thing/very/interesting"
-	result := tokenizeV2(path)
+	result := tokenizeV3(path)
 	assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
 }
 
 func Test_tokenizePathWithDashes2(t *testing.T) {
 	path := "/hsv-fussball"
-	result := tokenizeV2(path)
+	result := tokenizeV3(path)
 	assert.ElementsMatch(t, []string{"hsv", "fussball"}, result)
 }
 
 func Test_tokenizeEmptyString(t *testing.T) {
 	path := ""
-	result := tokenizeV2(path)
+	result := tokenizeV3(path)
 	assert.ElementsMatch(t, []string{}, result)
 }
 
@@ -47,36 +47,35 @@ func Test_filterStopWorlds(t *testing.T) {
 }
 
 func Test_URLTokenizer(t *testing.T) {
-	result := TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
+	result := TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
 	assert.ElementsMatch(t, []string{"path", "sport", "hsv", "fussball", "example.com"}, result)
 }
 
 func Test_URLTokenizerOneWord(t *testing.T) {
-	result := TokenizeV2("http://example.com/sport")
+	result := TokenizeV3("http://example.com/sport")
 	assert.ElementsMatch(t, []string{"example.com", "sport"}, result)
 }
 
 func Test_URLTokenizerOneWordMinSize(t *testing.T) {
-	result := TokenizeV2("http://www.test-page.de/aaa/bbb/bc/ccc")
+	result := TokenizeV3("http://www.test-page.de/aaa/bbb/bc/ccc")
 	assert.ElementsMatch(t, []string{"www.test-page.de", "aaa", "bbb", "ccc"}, result)
 }
 
 func Test_URLTokenizerWithScapedChars(t *testing.T) {
-	result := TokenizeV2("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
+	result := TokenizeV3("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
 	assert.ElementsMatch(t, []string{"emetriq", "com", "example.com"}, result)
 }
 
 func Test_URLTokenizerWithWrongEscapedChars(t *testing.T) {
-	result := TokenizeV2("http://example.com/%%ssomething/usefull")
+	result := TokenizeV3("http://example.com/%%ssomething/usefull")
 	assert.Equal(t, []string{"ssomething", "usefull", "example.com"}, result)
 }
 
 func Test_URLTokenizerWithWrongEscapedChars2(t *testing.T) {
 	DefaultStopWordFunc = IsGermanStopWord
-	result := TokenizeV2("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&_tf=Von %1$s")
+	result := TokenizeV3("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&_tf=Von %1$s")
 	assert.Equal(t, []string{
 		"vermischtes",
-		"article",
 		"marisa",
 		"burger",
 		"rosenheim",
@@ -116,6 +115,18 @@ func Test_URLWithoutHTTPButWithPath(t *testing.T) {
 	assert.ElementsMatch(t, []string{"sports", "www.ironsrc.com"}, result)
 }
 
+func Test_SkipWordsWithNumbers(t *testing.T) {
+	result := TokenizeV3("https://www.autoscout24.at/angebote/seat-altea-xl-reference-1-4-tfsi-motorschaden-benzin-grau-b82ebced-cb95-4f49-8038-5eb1c098e652")
+	// no 'ebced'
+	assert.ElementsMatch(t, []string{"angebote", "seat", "altea", "reference", "tfsi", "motorschaden", "benzin", "grau", "www.autoscout24.at"}, result)
+	assert.NotContains(t, result, "ebced")
+
+	result = TokenizeV3("https://www.coches.net/123nissan-interstar-25dci-120-pro-l2h2-3500-diesel-2009-en-barcelona-52386149-fuvivo.aspx1")
+	// no '123nissan', 'dci' and 'aspx1'
+	assert.ElementsMatch(t, []string{"interstar", "pro", "diesel", "barcelona", "fuvivo", "www.coches.net"}, result)
+	assert.NotContains(t, result, "dci")
+}
+
 func BenchmarkEscapedURLTokenizerV3(b *testing.B) {
 	for n := 0; n < b.N; n++ {
 		TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
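
For context beyond the patch itself, here is a minimal usage sketch of the V3 entry points this change touches. The import path is taken from the benchmark `pkg:` header and the `tok` alias from the README example; the `main` wrapper and the printing are illustrative only, and the expected tokens in the comments come from the updated tests (which compare with `assert.ElementsMatch`, so element order is not guaranteed).

```go
package main

import (
	"fmt"

	tok "github.com/emetriq/gourltokenizer/tokenizer"
)

func main() {
	// Words shorter than MinWordSize are dropped (value from the README example).
	tok.MinWordSize = 3

	// The host is kept as a single token, the path is split into words.
	// Per Test_URLTokenizer the result contains:
	// path, sport, hsv, fussball, example.com (order not guaranteed)
	fmt.Println(tok.TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1"))

	// New in this diff: a path word containing a digit is dropped as a whole,
	// so ID fragments like "b82ebced" never show up in the result
	// (see Test_SkipWordsWithNumbers).
	fmt.Println(tok.TokenizeV3("https://www.autoscout24.at/angebote/seat-altea-xl-reference-1-4-tfsi-motorschaden-benzin-grau-b82ebced-cb95-4f49-8038-5eb1c098e652"))

	// An optional stop word callback removes further terms; returning true
	// marks the word as a stop word (mirrors the README's custom stop word
	// example), so "fussball" is filtered out here.
	fmt.Println(tok.TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1",
		func(val string) bool { return val == "fussball" }))

	// TokenizeFastV3 skips URL decoding, trading correctness on escaped URLs
	// for speed (compare the V3 and V3Fast benchmark rows).
	fmt.Println(tok.TokenizeFastV3("http://example.com/path/sport/hsv-fussball?bla=1"))
}
```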
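The behavioral core of the change is the `isContainingNumber` flag in `tokenizeV3`: a single pass over the bytes opens a word on the first letter, "poisons" it on any digit, and emits it at the next separator only if it stayed digit-free and meets `MinWordSize`. Below is a simplified standalone sketch of that technique, not the library's internal function: `filterDigitWords` is a hypothetical name, host handling (`isDotCountMode`) and URL decoding are omitted, and lower-cased input is assumed.

```go
package filter

// filterDigitWords splits s on non-letter bytes and keeps only words that
// are at least minWordSize long and contain no ASCII digit - a simplified
// sketch of the single-pass filtering this diff adds to tokenizeV3.
func filterDigitWords(s string, minWordSize int) []string {
	result := []string{}
	start := -1       // start of the current letter run, -1 = no open word
	hasDigit := false // true once a digit poisons the current word
	for i := 0; i <= len(s); i++ {
		var isLetter, isDigit bool
		if i < len(s) {
			b := s[i]
			isLetter = b >= 'a' && b <= 'z'
			isDigit = b >= '0' && b <= '9'
		}
		switch {
		case isLetter:
			if start == -1 {
				start = i // open a new word
			}
		case isDigit:
			// Poison the run; like the diff, digits before a word
			// ("123nissan") also suppress the following letters.
			hasDigit = true
		default:
			// Separator or end of input: emit the word only if it is
			// long enough and digit-free, then reset the state.
			if start != -1 && !hasDigit && i-start >= minWordSize {
				result = append(result, s[start:i])
			}
			start, hasDigit = -1, false
		}
	}
	return result
}
```

Against the test inputs this reproduces the expected behavior, e.g. `filterDigitWords("seat-altea-1-4-tfsi-b82ebced", 3)` yields `[seat altea tfsi]`: the lone digits `1` and `4` are skipped without suppressing `tfsi`, while `b82ebced` is dropped entirely.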