Merge pull request #6 from dnnspaul/main
Optimized Tokenize function; numbers no longer act as word splitters
dnnspaul committed May 11, 2023
2 parents c64f984 + 89b982a commit c31f339
Showing 3 changed files with 220 additions and 32 deletions.
27 changes: 18 additions & 9 deletions README.md
@@ -17,9 +17,9 @@ tok.MinWordSize = 3
// set default stop words
tok.DefaultStopWordFunc = IsGermanStopWord

result := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
result := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
// custom stop words
result2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
result2 := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
if val == "fussball" {
return true
}
@@ -31,11 +31,20 @@ result2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asom
```
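
A complete program built on this package might look like the following sketch (the import alias `tok` and the sample URL are illustrative assumptions; `IsGermanStopWord` is assumed to be exported by the tokenizer package):

```go
package main

import (
	"fmt"

	tok "github.com/emetriq/gourltokenizer/tokenizer"
)

func main() {
	tok.MinWordSize = 3
	tok.DefaultStopWordFunc = tok.IsGermanStopWord

	// TokenizeV3 drops any word that contains a digit, e.g. "fussball2000"
	tokens := tok.TokenizeV3("https://example.com/sport/hsv-fussball2000")
	fmt.Println(tokens) // e.g. [sport hsv example.com]
}
```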
# Benchmark Results

goos: darwin
goos: linux
goarch: amd64
cpu: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
|Benchmark|runs|time/op|B/op|allocs/op|
|---|---|---|---|---|
BenchmarkURLTokenizerV2-12|2026138|605.3 ns/op|256 B/op|1 allocs/op
BenchmarkURLTokenizerV2Fast-12|3609961|330.6 ns/op|256 B/op|1 allocs/op
BenchmarkURLTokenizerV1-12|1766235|676.3 ns/op|272 B/op|2 allocs/op
pkg: github.com/emetriq/gourltokenizer/tokenizer
cpu: 11th Gen Intel(R) Core(TM) i5-11500H @ 2.90GHz
| Benchmark | runs | time/op | B/op | allocs/op |
|-----------------------------------|---------|-------------|----------|-------------|
| BenchmarkEscapedURLTokenizerV3-12 | 1000000 | 1080 ns/op | 496 B/op | 3 allocs/op |
| BenchmarkURLTokenizerV3-12 | 4751826 | 255.5 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkURLTokenizerV3Fast-12 | 6231590 | 191.6 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkEscapedURLTokenizerV2-12 | 1000000 | 1042 ns/op | 496 B/op | 3 allocs/op |
| BenchmarkURLTokenizerV2-12 | 3813273 | 484.2 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkURLTokenizerV2Fast-12 | 5835351 | 199.6 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkEscapedURLTokenizerV1-12 | 1942860 | 1084 ns/op | 496 B/op | 3 allocs/op |
| BenchmarkURLTokenizerV1-12 | 2495599 | 510.7 ns/op | 272 B/op | 2 allocs/op |
| BenchmarkTokenizerV1-12 | 9431893 | 122.9 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkTokenizerV2-12 | 7669710 | 157.0 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkTokenizerV3-12 | 8120326 | 158.3 ns/op | 256 B/op | 1 allocs/op |
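
The B/op and allocs/op columns come from the Go benchmark tooling's memory statistics; an invocation along the lines of `go test -bench=. -benchmem ./tokenizer` (assumed, not shown in this commit) reproduces tables like the one above.
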
144 changes: 138 additions & 6 deletions tokenizer/tokenizer.go
@@ -10,15 +10,71 @@ var MinWordSize = 3

var DefaultStopWordFunc = IsEnglishStopWord

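// isByteAllowed reports whether b may be part of the current token: 'a'-'z'
// always is; in dot count mode (the host part) every byte except '.' and '/' is.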
func isByteAllowed(b byte, isDotCountMode bool) bool {
if b >= 'a' && b <= 'z' {
return true
}
return isDotCountMode && b != '.' && b != '/'
}

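// isRuneAllowed is the rune counterpart of isByteAllowed.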
func isRuneAllowed(r rune, isDotCountMode bool) bool {
if r >= 'a' && r <= 'z' {
return true
}
return isDotCountMode && r != '.' && r != '/'
}

//TokenizeV2 splits URL to host and path parts and tokenize path and host part
//all terms are returned in lower case
// stringContainsByteChar is faster than using strings.Contains because we are
// only looking for a single char and can leave the loop after the first match
func stringContainsByteChar(s string, r byte) bool {
for i := 0; i < len(s); i++ {
if s[i] == r {
return true
}
}
return false
}
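// Note: strings.IndexByte(s, r) >= 0 would perform the same check via the
// standard library's optimized implementation.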

// TokenizeV3 splits the URL into host and path parts and tokenizes both;
// all terms are returned in lower case. If a word contains a digit, the
// complete word is filtered out.
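//
// A hypothetical example (output inferred from the tests below):
//
//	TokenizeV3("http://example.com/sport/hsv-fussball2000")
//	// -> ["sport", "hsv", "example.com"]; "fussball2000" contains a digit and is dropped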
func TokenizeV3(encodedURL string, stopwordfunc ...func(string) bool) []string {
encodedURLLower := strings.ToLower(encodedURL)
var result []string

// check if url needs unescaping
if stringContainsByteChar(encodedURLLower, '%') {
decodedURL, err := url.QueryUnescape(encodedURLLower)
if err != nil {
// invalid escape sequence: re-escape the lowered URL so that
// QueryUnescape succeeds on the round-tripped string
escapedEncodedURL := url.QueryEscape(encodedURLLower)
decodedURL, err = url.QueryUnescape(escapedEncodedURL)
}

if err != nil {
return []string{}
}

result = filterStopWords(tokenizeV3(decodedURL), stopwordfunc...)
} else {
result = filterStopWords(tokenizeV3(encodedURLLower), stopwordfunc...)
}

return result
}

// TokenizeFastV3 splits the URL into host and path parts and tokenizes both
// without URL decoding; all terms are returned in lower case. Prefer it when
// inputs are known to contain no percent-escapes.
func TokenizeFastV3(encodedURL string, stopwordfunc ...func(string) bool) []string {
urlLower := strings.ToLower(encodedURL)
result := tokenizeV3(urlLower)
if len(stopwordfunc) > 0 {
result = filterStopWords(result, stopwordfunc[0])
}
return result
}

// TokenizeV2 splits the URL into host and path parts and tokenizes both;
// all terms are returned in lower case.
func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
encodedURLLower := strings.ToLower(encodedURL)
decodedURL, err := url.QueryUnescape(encodedURLLower)
@@ -36,8 +92,8 @@ func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
return result
}

//TokenizeFastV2 splits URL to host and path parts and tokenize path and host part without url decoding
//all terms are returned in lower case
// TokenizeFastV2 splits the URL into host and path parts and tokenizes both
// without URL decoding; all terms are returned in lower case.
func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
urlLower := strings.ToLower(encodedURL)
result := tokenizeV2(urlLower)
@@ -47,8 +103,8 @@ func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []stri
return result
}

//TokenizeURL splits URL to host and path parts and tokenize path part
//all terms are returned in lower case
// TokenizeV1 splits the URL into host and path parts, tokenizes the path part,
// and appends the host; all terms are returned in lower case.
func TokenizeV1(url string, stopwordfunc ...func(string) bool) []string {
urlToParse := url
if !strings.HasPrefix(url, "http") && !strings.HasPrefix(url, "mailto") {
@@ -65,6 +121,82 @@ func TokenizeV1(url string, stopwordfunc ...func(string) bool) []string {
return append(result, host)
}

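// tokenizeV3 scans the already-lowered URL byte by byte. Before the first '/'
// it runs in "dot count mode", where every byte except '.' and '/' may belong
// to a host label; after it, only runs of 'a'-'z' of at least MinWordSize
// become tokens, and any word containing a digit is dropped entirely.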
func tokenizeV3(str string) []string {
// remove protocol
startIndex := strings.Index(str, "://")
if startIndex < 7 && startIndex > 0 && len(str) > startIndex+3 {
startIndex = startIndex + 3
} else {
startIndex = 0
}

strLen := len(str)
lastIndex := strLen - 1
result := make([]string, 0, strLen/MinWordSize)
start := -1
dotCounter := 0
isDotCountMode := true
isContainingNumber := false
domainNameEndIndex := -1
domainNameStartIndex := startIndex
var b byte
for idx := 0; idx < len(str); idx++ {
b = str[idx]
if idx < startIndex {
continue
}

if isByteAllowed(b, isDotCountMode) {
if start == -1 {
start = idx
}
if idx == lastIndex && ((lastIndex-start+1) >= MinWordSize || isDotCountMode) {
if !isContainingNumber {
result = append(result, str[start:strLen])
}
isContainingNumber = false
}
} else if b >= '0' && b <= '9' && !isDotCountMode {
isContainingNumber = true
} else if ((idx-start) >= MinWordSize || isDotCountMode) && start > -1 {
if !isContainingNumber {
result = append(result, str[start:idx])
}

isContainingNumber = false
start = -1
} else {
isContainingNumber = false
start = -1
}
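// the first '/' ends the host part; remember its end and the index of the
// last host token collected so far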
if b == '/' && isDotCountMode {
isDotCountMode = false
domainNameEndIndex = idx
dotCounter = len(result) - 1
}

if b == '?' { // skip query params
break
}
}

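// no path at all: the whole string is the host part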
if isDotCountMode {
dotCounter = len(result) - 1
domainNameEndIndex = len(str)
}

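// drop the last two host labels (domain name and TLD) as separate tokens
// and append the full host name instead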
if dotCounter > 0 && len(result) > 1 {
result = append(result[:(dotCounter-1)], result[dotCounter+1:]...)
if domainNameEndIndex-domainNameStartIndex > 3 { // if domain name is longer than 3 chars
for len(str) > domainNameStartIndex && str[domainNameStartIndex] == '.' {
domainNameStartIndex++
}
result = append(result, str[domainNameStartIndex:domainNameEndIndex])
}
}
return result
}

func tokenizeV2(str string) []string {
// remove protocol
startIndex := strings.Index(str, "://")
81 changes: 64 additions & 17 deletions tokenizer/tokenizer_test.go
@@ -11,25 +11,25 @@ func init() {
}
func Test_tokenizeCorrectPath(t *testing.T) {
path := "/some-thing/very/interesting?queryparam2=1&queryparam2=3"
result := tokenizeV2(path)
result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
}

func Test_tokenizePathWithDashes(t *testing.T) {
path := "/some-thing/very/interesting"
result := tokenizeV2(path)
result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
}

func Test_tokenizePathWithDashes2(t *testing.T) {
path := "/hsv-fussball"
result := tokenizeV2(path)
result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"hsv", "fussball"}, result)
}

func Test_tokenizeEmptyString(t *testing.T) {
path := ""
result := tokenizeV2(path)
result := tokenizeV3(path)
assert.ElementsMatch(t, []string{}, result)
}

@@ -47,36 +47,35 @@ func Test_filterStopWorlds(t *testing.T) {
}

func Test_URLTokenizer(t *testing.T) {
result := TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
result := TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
assert.ElementsMatch(t, []string{"path", "sport", "hsv", "fussball", "example.com"}, result)
}

func Test_URLTokenizerOneWord(t *testing.T) {
result := TokenizeV2("http://example.com/sport")
result := TokenizeV3("http://example.com/sport")
assert.ElementsMatch(t, []string{"example.com", "sport"}, result)
}

func Test_URLTokenizerOneWordMinSize(t *testing.T) {

result := TokenizeV2("http://www.test-page.de/aaa/bbb/bc/ccc")
result := TokenizeV3("http://www.test-page.de/aaa/bbb/bc/ccc")
assert.ElementsMatch(t, []string{"www.test-page.de", "aaa", "bbb", "ccc"}, result)
}

func Test_URLTokenizerWithScapedChars(t *testing.T) {
result := TokenizeV2("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
result := TokenizeV3("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
assert.ElementsMatch(t, []string{"emetriq", "com", "example.com"}, result)
}

func Test_URLTokenizerWithWrongEscapedChars(t *testing.T) {
result := TokenizeV2("http://example.com/%%ssomething/usefull")
result := TokenizeV3("http://example.com/%%ssomething/usefull")
assert.Equal(t, []string{"ssomething", "usefull", "example.com"}, result)
}
func Test_URLTokenizerWithWrongEscapedChars2(t *testing.T) {
DefaultStopWordFunc = IsGermanStopWord
result := TokenizeV2("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&amp_tf=Von %1$s")
result := TokenizeV3("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&amp_tf=Von %1$s")
assert.Equal(t, []string{
"vermischtes",
"article",
"marisa",
"burger",
"rosenheim",
@@ -86,36 +85,72 @@ func Test_URLTokenizerWithWrongEscapedChars2(t *testing.T) {
}, result)
}
func Test_URLTokenizerWithWrongHostEscapedChars(t *testing.T) {
result := TokenizeV2("http://..example.com/something")
result := TokenizeV3("http://..example.com/something")
assert.Equal(t, []string{"something", "example.com"}, result)
}

func Test_URLTokenizerWithCapitalChars(t *testing.T) {
DefaultStopWordFunc = IsGermanStopWord
result := TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
result := TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
assert.ElementsMatch(t, []string{"subdomain", "hsv", "fussbal", "some", "www.subdomain.example.com"}, result)
}

func Test_URLWithoutHTTP(t *testing.T) {
result := TokenizeV2("www.Subdomain.example.com")
result := TokenizeV3("www.Subdomain.example.com")
assert.ElementsMatch(t, []string{"subdomain", "www.subdomain.example.com"}, result)
}

func Test_URLWithoutHTTPAndWithoutSubdomain(t *testing.T) {
result := TokenizeV2("www.example.com")
result := TokenizeV3("www.example.com")
assert.ElementsMatch(t, []string{"www.example.com"}, result)
}

func Test_URLWithoutHTTPAndSubdomain(t *testing.T) {
result := TokenizeV2("sport.fussball.example.com")
result := TokenizeV3("sport.fussball.example.com")
assert.ElementsMatch(t, []string{"sport", "fussball", "sport.fussball.example.com"}, result)
}

func Test_URLWithoutHTTPButWithPath(t *testing.T) {
result := TokenizeV2("www.ironsrc.com/sports")
result := TokenizeV3("www.ironsrc.com/sports")
assert.ElementsMatch(t, []string{"sports", "www.ironsrc.com"}, result)
}

func Test_SkipWordsWithNumbers(t *testing.T) {
result := TokenizeV3("https://www.autoscout24.at/angebote/seat-altea-xl-reference-1-4-tfsi-motorschaden-benzin-grau-b82ebced-cb95-4f49-8038-5eb1c098e652")
// no 'ebced'
assert.ElementsMatch(t, []string{"angebote", "seat", "altea", "reference", "tfsi", "motorschaden", "benzin", "grau", "www.autoscout24.at"}, result)
assert.NotContains(t, result, "ebced")

result = TokenizeV3("https://www.coches.net/123nissan-interstar-25dci-120-pro-l2h2-3500-diesel-2009-en-barcelona-52386149-fuvivo.aspx1")
// no '123nissan', 'dci' and 'aspx1'
assert.ElementsMatch(t, []string{"interstar", "pro", "diesel", "barcelona", "fuvivo", "www.coches.net"}, result)
assert.NotContains(t, result, "dci")
}

func BenchmarkEscapedURLTokenizerV3(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
}
}

func BenchmarkURLTokenizerV3(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
}
}

func BenchmarkURLTokenizerV3Fast(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeFastV3("http://example.com/path/sport/hsv-fussball?bla=1")
}
}

func BenchmarkEscapedURLTokenizerV2(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
}
}

func BenchmarkURLTokenizerV2(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
@@ -128,6 +163,12 @@ func BenchmarkURLTokenizerV2Fast(b *testing.B) {
}
}

func BenchmarkEscapedURLTokenizerV1(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV1("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
}
}

func BenchmarkURLTokenizerV1(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV1("http://example.com/path/sport/hsv-fussball?bla=1")
@@ -144,3 +185,9 @@ func BenchmarkTokenizerV2(b *testing.B) {
tokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
}
}

func BenchmarkTokenizerV3(b *testing.B) {
for n := 0; n < b.N; n++ {
tokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
}
}
