Commit: skip words with numbers
Dennis Paul committed May 10, 2023
1 parent 9f95bd8 commit 89b982a
Showing 3 changed files with 61 additions and 29 deletions.
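In effect, the V3 tokenizer now drops every path word that contains a digit, while the host part (tokenized in dot-count mode) keeps its digits; see the updated tests below. A minimal usage sketch of the behavior after this commit, with the expected output inferred from those tests rather than stated by the diff:

```go
package main

import (
	"fmt"

	"github.com/emetriq/gourltokenizer/tokenizer"
)

func main() {
	// "article233484549" contains digits, so after this commit the whole
	// word is skipped; hosts keep their digits (e.g. www.autoscout24.at).
	tokens := tokenizer.TokenizeV3("https://www.morgenpost.de/vermischtes/article233484549/sport")
	fmt.Println(tokens) // expected roughly: [vermischtes sport www.morgenpost.de]
}
```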
27 changes: 18 additions & 9 deletions README.md
@@ -17,9 +17,9 @@ tok.MinWordSize = 3
// set default stop words
tok.DefaultStopWordFunc = IsGermanStopWord

-reuslt := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
+reuslt := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
// custom stop words
-reuslt2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
+reuslt2 := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
if val == "fussball" {
return true
}
@@ -31,11 +31,20 @@ reuslt2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asom
```
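In the second call, the optional callback acts as a custom stop-word filter: returning true for a term (here "fussball") appears to mark it as a stop word, so it should be absent from reuslt2; with no callback, DefaultStopWordFunc applies.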
# Benchmark Results

-goos: darwin
+goos: linux
goarch: amd64
-cpu: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-|Benchmark|runs|time/op|B/op|allocs/op|
-|---|---|---|---|---|
-BenchmarkURLTokenizerV2-12|2026138|605.3 ns/op|256 B/op|1 allocs/op
-BenchmarkURLTokenizerV2Fast-12|3609961|330.6 ns/op|256 B/op|1 allocs/op
-BenchmarkURLTokenizerV1-12|1766235|676.3 ns/op|272 B/op|2 allocs/op
+pkg: github.com/emetriq/gourltokenizer/tokenizer
+cpu: 11th Gen Intel(R) Core(TM) i5-11500H @ 2.90GHz
+| Benchmark | runs | time/op | B/op | allocs/op |
+|-----------------------------------|---------|-------------|----------|-------------|
+| BenchmarkEscapedURLTokenizerV3-12 | 1000000 | 1080 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV3-12        | 4751826 | 255.5 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkURLTokenizerV3Fast-12    | 6231590 | 191.6 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkEscapedURLTokenizerV2-12 | 1000000 | 1042 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV2-12        | 3813273 | 484.2 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkURLTokenizerV2Fast-12    | 5835351 | 199.6 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkEscapedURLTokenizerV1-12 | 1942860 | 1084 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV1-12        | 2495599 | 510.7 ns/op | 272 B/op | 2 allocs/op |
+| BenchmarkTokenizerV1-12           | 9431893 | 122.9 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkTokenizerV2-12           | 7669710 | 157.0 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkTokenizerV3-12           | 8120326 | 158.3 ns/op | 256 B/op | 1 allocs/op |
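Assuming the standard Go toolchain, a table like the one above can be regenerated with `go test -bench . -benchmem ./tokenizer`; absolute numbers depend on the machine, as the switch from the darwin/i7 to the linux/i5 run shows.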
30 changes: 21 additions & 9 deletions tokenizer/tokenizer.go
@@ -36,7 +36,8 @@ func stringContainsByteChar(s string, r byte) bool {
}

// TokenizeV3 splits URL to host and path parts and tokenize path and host part
-// all terms are returned in lower case
+// all terms are returned in lower case. If numbers are within a word, the complete
+// word is filtered out.
func TokenizeV3(encodedURL string, stopwordfunc ...func(string) bool) []string {
encodedURLLower := strings.ToLower(encodedURL)
var result []string
@@ -72,8 +73,8 @@ func TokenizeFastV3(encodedURL string, stopwordfunc ...func(string) bool) []stri
return result
}

-//TokenizeV2 splits URL to host and path parts and tokenize path and host part
-//all terms are returned in lower case
+// TokenizeV2 splits URL to host and path parts and tokenize path and host part
+// all terms are returned in lower case
func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
encodedURLLower := strings.ToLower(encodedURL)
decodedURL, err := url.QueryUnescape(encodedURLLower)
@@ -91,8 +92,8 @@ func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
return result
}

-//TokenizeFastV2 splits URL to host and path parts and tokenize path and host part without url decoding
-//all terms are returned in lower case
+// TokenizeFastV2 splits URL to host and path parts and tokenize path and host part without url decoding
+// all terms are returned in lower case
func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
urlLower := strings.ToLower(encodedURL)
result := tokenizeV2(urlLower)
@@ -102,8 +103,8 @@ func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []stri
return result
}

-//TokenizeURL splits URL to host and path parts and tokenize path part
-//all terms are returned in lower case
+// TokenizeURL splits URL to host and path parts and tokenize path part
+// all terms are returned in lower case
func TokenizeV1(url string, stopwordfunc ...func(string) bool) []string {
urlToParse := url
if !strings.HasPrefix(url, "http") && !strings.HasPrefix(url, "mailto") {
@@ -135,6 +136,7 @@ func tokenizeV3(str string) []string {
start := -1
dotCounter := 0
isDotCountMode := true
+isContainingNumber := false
domainNameEndIndex := -1
domainNameStartIndex := startIndex
var b byte
@@ -149,12 +151,22 @@
start = idx
}
if idx == lastIndex && ((lastIndex-start+1) >= MinWordSize || isDotCountMode) {
-result = append(result, str[start:strLen])
+if !isContainingNumber {
+result = append(result, str[start:strLen])
+}
+isContainingNumber = false
}
+} else if b >= '0' && b <= '9' && !isDotCountMode {
+isContainingNumber = true
} else if ((idx-start) >= MinWordSize || isDotCountMode) && start > -1 {
-result = append(result, str[start:idx])
+if !isContainingNumber {
+result = append(result, str[start:idx])
+}
+
+isContainingNumber = false
start = -1
} else {
+isContainingNumber = false
start = -1
}
if b == '/' && isDotCountMode {
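The core of the change is the isContainingNumber flag: a digit seen outside dot-count mode taints the current word, and tainted words are never appended. A minimal, self-contained sketch of that idea, assuming already lower-cased input; tokensWithoutNumbers is an illustrative name, not part of the package, and the real tokenizer additionally handles hosts, MinWordSize, and stop words:

```go
package main

import "fmt"

// tokensWithoutNumbers splits s on non-letter bytes and drops any word
// that contained a digit, mirroring the isContainingNumber flag added
// to tokenizeV3.
func tokensWithoutNumbers(s string) []string {
	result := []string{}
	start := -1
	hasDigit := false
	flush := func(end int) {
		if start >= 0 && !hasDigit {
			result = append(result, s[start:end])
		}
		start = -1
		hasDigit = false
	}
	for i := 0; i < len(s); i++ {
		b := s[i]
		switch {
		case b >= 'a' && b <= 'z':
			if start < 0 {
				start = i
			}
		case b >= '0' && b <= '9':
			hasDigit = true // taints the current word
		default:
			flush(i)
		}
	}
	flush(len(s))
	return result
}

func main() {
	fmt.Println(tokensWithoutNumbers("seat-altea-b82ebced-cb95"))
	// Output: [seat altea]
}
```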
33 changes: 22 additions & 11 deletions tokenizer/tokenizer_test.go
@@ -11,25 +11,25 @@ func init() {
}
func Test_tokenizeCorrectPath(t *testing.T) {
path := "/some-thing/very/interesting?queryparam2=1&queryparam2=3"
-result := tokenizeV2(path)
+result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
}

func Test_tokenizePathWithDashes(t *testing.T) {
path := "/some-thing/very/interesting"
-result := tokenizeV2(path)
+result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
}

func Test_tokenizePathWithDashes2(t *testing.T) {
path := "/hsv-fussball"
-result := tokenizeV2(path)
+result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"hsv", "fussball"}, result)
}

func Test_tokenizeEmptyString(t *testing.T) {
path := ""
-result := tokenizeV2(path)
+result := tokenizeV3(path)
assert.ElementsMatch(t, []string{}, result)
}

@@ -47,36 +47,35 @@ func Test_filterStopWorlds(t *testing.T) {
}

func Test_URLTokenizer(t *testing.T) {
result := TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
result := TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
assert.ElementsMatch(t, []string{"path", "sport", "hsv", "fussball", "example.com"}, result)
}

func Test_URLTokenizerOneWord(t *testing.T) {
result := TokenizeV2("http://example.com/sport")
result := TokenizeV3("http://example.com/sport")
assert.ElementsMatch(t, []string{"example.com", "sport"}, result)
}

func Test_URLTokenizerOneWordMinSize(t *testing.T) {

result := TokenizeV2("http://www.test-page.de/aaa/bbb/bc/ccc")
result := TokenizeV3("http://www.test-page.de/aaa/bbb/bc/ccc")
assert.ElementsMatch(t, []string{"www.test-page.de", "aaa", "bbb", "ccc"}, result)
}

func Test_URLTokenizerWithScapedChars(t *testing.T) {
result := TokenizeV2("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
result := TokenizeV3("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
assert.ElementsMatch(t, []string{"emetriq", "com", "example.com"}, result)
}

func Test_URLTokenizerWithWrongEscapedChars(t *testing.T) {
result := TokenizeV2("http://example.com/%%ssomething/usefull")
result := TokenizeV3("http://example.com/%%ssomething/usefull")
assert.Equal(t, []string{"ssomething", "usefull", "example.com"}, result)
}
func Test_URLTokenizerWithWrongEscapedChars2(t *testing.T) {
DefaultStopWordFunc = IsGermanStopWord
result := TokenizeV2("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&amp_tf=Von %1$s")
result := TokenizeV3("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&amp_tf=Von %1$s")
assert.Equal(t, []string{
"vermischtes",
"article",
"marisa",
"burger",
"rosenheim",
@@ -116,6 +115,18 @@ func Test_URLWithoutHTTPButWithPath(t *testing.T) {
assert.ElementsMatch(t, []string{"sports", "www.ironsrc.com"}, result)
}

+func Test_SkipWordsWithNumbers(t *testing.T) {
+result := TokenizeV3("https://www.autoscout24.at/angebote/seat-altea-xl-reference-1-4-tfsi-motorschaden-benzin-grau-b82ebced-cb95-4f49-8038-5eb1c098e652")
+// no 'ebced'
+assert.ElementsMatch(t, []string{"angebote", "seat", "altea", "reference", "tfsi", "motorschaden", "benzin", "grau", "www.autoscout24.at"}, result)
+assert.NotContains(t, result, "ebced")
+
+result = TokenizeV3("https://www.coches.net/123nissan-interstar-25dci-120-pro-l2h2-3500-diesel-2009-en-barcelona-52386149-fuvivo.aspx1")
+// no '123nissan', 'dci' and 'aspx1'
+assert.ElementsMatch(t, []string{"interstar", "pro", "diesel", "barcelona", "fuvivo", "www.coches.net"}, result)
+assert.NotContains(t, result, "dci")
+}

func BenchmarkEscapedURLTokenizerV3(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
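To exercise just the new behavior, the added test can be run in isolation with `go test -run Test_SkipWordsWithNumbers ./tokenizer` (standard Go tooling, not part of this diff).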
