Optimized Tokenize function #6

Merged · 2 commits · May 11, 2023
**README.md** (18 additions, 9 deletions)
```diff
@@ -17,9 +17,9 @@ tok.MinWordSize = 3
 // set default stop words
 tok.DefaultStopWordFunc = IsGermanStopWord
 
-reuslt := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
+reuslt := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
 // custom stop words
-reuslt2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
+reuslt2 := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
     if val == "fussball" {
         return true
     }
```
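For orientation, the README snippet fits into a complete program along these lines (a sketch, not part of the diff; the import path is taken from the `pkg:` line in the benchmark output below, and `IsGermanStopWord` is assumed to be exported by the package, as the README's unqualified reference suggests):

```go
package main

import (
    "fmt"

    tok "github.com/emetriq/gourltokenizer/tokenizer"
)

func main() {
    tok.MinWordSize = 3                            // drop tokens shorter than 3 bytes
    tok.DefaultStopWordFunc = tok.IsGermanStopWord // default stop-word filter

    result := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
    fmt.Println(result)
    // per Test_URLTokenizerWithCapitalChars in this PR (order aside):
    // [subdomain hsv fussbal some www.subdomain.example.com]
}
```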
````diff
@@ -31,11 +31,20 @@ reuslt2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
 ```
 # Benchmark Results
 
-goos: darwin
+goos: linux
 goarch: amd64
-cpu: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-|Benchmark|runs|time/op|B/op|allocs/op|
-|---|---|---|---|---|
-BenchmarkURLTokenizerV2-12|2026138|605.3 ns/op|256 B/op|1 allocs/op
-BenchmarkURLTokenizerV2Fast-12|3609961|330.6 ns/op|256 B/op|1 allocs/op
-BenchmarkURLTokenizerV1-12|1766235|676.3 ns/op|272 B/op|2 allocs/op
+pkg: github.com/emetriq/gourltokenizer/tokenizer
+cpu: 11th Gen Intel(R) Core(TM) i5-11500H @ 2.90GHz
+| Benchmark | runs | time/op | B/op | allocs/op |
+|-----------------------------------|---------|-------------|----------|-------------|
+| BenchmarkEscapedURLTokenizerV3-12 | 1000000 | 1080 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV3-12        | 4751826 | 255.5 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkURLTokenizerV3Fast-12    | 6231590 | 191.6 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkEscapedURLTokenizerV2-12 | 1000000 | 1042 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV2-12        | 3813273 | 484.2 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkURLTokenizerV2Fast-12    | 5835351 | 199.6 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkEscapedURLTokenizerV1-12 | 1942860 | 1084 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV1-12        | 2495599 | 510.7 ns/op | 272 B/op | 2 allocs/op |
+| BenchmarkTokenizerV1-12           | 9431893 | 122.9 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkTokenizerV2-12           | 7669710 | 157.0 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkTokenizerV3-12           | 8120326 | 158.3 ns/op | 256 B/op | 1 allocs/op |
````
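The headline numbers: on the plain-URL case, V3 lands at 255.5 ns/op against V2's 484.2 ns/op (roughly 1.9× faster) with the same single allocation, and the fast variants sit close together; the escaped-URL rows are nearly identical across versions, presumably because `url.QueryUnescape` dominates there. The table should be reproducible with `go test -bench . -benchmem` in the `tokenizer` package, with exact figures varying by machine.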
**tokenizer/tokenizer.go** (138 additions, 6 deletions)
```diff
@@ -10,15 +10,71 @@ var MinWordSize = 3
 
 var DefaultStopWordFunc = IsEnglishStopWord
 
+func isByteAllowed(b byte, isDotCountMode bool) bool {
+    if b >= 'a' && b <= 'z' {
+        return true
+    }
+    return isDotCountMode && b != '.' && b != '/'
+}
+
+func isRuneAllowed(r rune, isDotCountMode bool) bool {
+    if r >= 'a' && r <= 'z' {
+        return true
+    }
+    return isDotCountMode && r != '.' && r != '/'
+}
+
-//TokenizeV2 splits URL to host and path parts and tokenize path and host part
-//all terms are returned in lower case
+// faster solution than using strings.Contains(), because we are only looking
+// for a single char and can leave the loop after
+func stringContainsByteChar(s string, r byte) bool {
+    for i := 0; i < len(s); i++ {
+        if s[i] == r {
+            return true
+        }
+    }
+    return false
+}
+
+// TokenizeV3 splits URL to host and path parts and tokenize path and host part
+// all terms are returned in lower case. If numbers are within a word, the complete
+// word is filtered out.
+func TokenizeV3(encodedURL string, stopwordfunc ...func(string) bool) []string {
+    encodedURLLower := strings.ToLower(encodedURL)
+    var result []string
+
+    // check if url needs unescaping
+    if stringContainsByteChar(encodedURLLower, '%') {
+        decodedURL, err := url.QueryUnescape(encodedURLLower)
+        if err != nil {
+            escapedEncodedURL := url.QueryEscape(encodedURL)
+            decodedURL, err = url.QueryUnescape(escapedEncodedURL)
+        }
+
+        if err != nil {
+            return []string{}
+        }
+
+        result = filterStopWords(tokenizeV3(decodedURL), stopwordfunc...)
+    } else {
+        result = filterStopWords(tokenizeV3(encodedURLLower), stopwordfunc...)
+    }
+
+    return result
+}
+
+// TokenizeFastV3 splits URL to host and path parts and tokenize path and host part
+// all terms are returned in lower case
+func TokenizeFastV3(encodedURL string, stopwordfunc ...func(string) bool) []string {
+    urlLower := strings.ToLower(encodedURL)
+    result := tokenizeV3(urlLower)
+    if len(stopwordfunc) > 0 {
+        result = filterStopWords(result, stopwordfunc[0])
+    }
+    return result
+}
+
+// TokenizeV2 splits URL to host and path parts and tokenize path and host part
+// all terms are returned in lower case
 func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
     encodedURLLower := strings.ToLower(encodedURL)
     decodedURL, err := url.QueryUnescape(encodedURLLower)
```
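Two details in `TokenizeV3` are worth spelling out. First, `stringContainsByteChar` is a cheap pre-check: URLs without a `%` skip `url.QueryUnescape` entirely (the stdlib equivalent would be `strings.IndexByte(s, '%') >= 0`). Second, when unescaping fails on a malformed sequence such as `%%s`, the function re-escapes the raw input and unescapes that; since `QueryUnescape(QueryEscape(x)) == x` for any `x`, the retry always succeeds and tokenization proceeds on the original bytes instead of returning an empty slice. A minimal standalone sketch of that fallback, using only `net/url`:

```go
package main

import (
    "fmt"
    "net/url"
)

func main() {
    raw := "http://example.com/%%ssomething/usefull" // "%%s" is not a valid escape
    decoded, err := url.QueryUnescape(raw)
    if err != nil {
        // Re-escape, then unescape: this round-trips the raw string unchanged,
        // mirroring the fallback branch in TokenizeV3 above.
        decoded, err = url.QueryUnescape(url.QueryEscape(raw))
    }
    fmt.Println(decoded, err) // http://example.com/%%ssomething/usefull <nil>
}
```

One caveat a reviewer might flag: the retry escapes the original-cased `encodedURL` rather than `encodedURLLower`, so a malformed URL with uppercase letters outside the query string would be tokenized from the un-lowered bytes; the PR's tests don't exercise that combination.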
```diff
@@ -36,8 +92,8 @@ func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
     return result
 }
 
-//TokenizeFastV2 splits URL to host and path parts and tokenize path and host part without url decoding
-//all terms are returned in lower case
+// TokenizeFastV2 splits URL to host and path parts and tokenize path and host part without url decoding
+// all terms are returned in lower case
 func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
     urlLower := strings.ToLower(encodedURL)
     result := tokenizeV2(urlLower)
```
```diff
@@ -47,8 +103,8 @@ func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
     return result
 }
 
-//TokenizeURL splits URL to host and path parts and tokenize path part
-//all terms are returned in lower case
+// TokenizeURL splits URL to host and path parts and tokenize path part
+// all terms are returned in lower case
 func TokenizeV1(url string, stopwordfunc ...func(string) bool) []string {
     urlToParse := url
     if !strings.HasPrefix(url, "http") && !strings.HasPrefix(url, "mailto") {
```
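The two hunks above are mechanical: they only reflow the `//Tokenize…` doc comments to `// Tokenize…`, matching the doc-comment normalization `gofmt` applies since Go 1.19, so no behavior changes here (the stale `TokenizeURL` name in V1's comment is kept as-is).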
```diff
@@ -65,6 +121,82 @@ func TokenizeV1(url string, stopwordfunc ...func(string) bool) []string {
     return append(result, host)
 }
 
+func tokenizeV3(str string) []string {
+    // remove protocol
+    startIndex := strings.Index(str, "://")
+    if startIndex < 7 && startIndex > 0 && len(str) > startIndex+3 {
+        startIndex = startIndex + 3
+    } else {
+        startIndex = 0
+    }
+
+    strLen := len(str)
+    lastIndex := strLen - 1
+    result := make([]string, 0, strLen/MinWordSize)
+    start := -1
+    dotCounter := 0
+    isDotCountMode := true
+    isContainingNumber := false
+    domainNameEndIndex := -1
+    domainNameStartIndex := startIndex
+    var b byte
+    for idx := 0; idx < len(str); idx++ {
+        b = str[idx]
+        if idx < startIndex {
+            continue
+        }
+
+        if isByteAllowed(b, isDotCountMode) {
+            if start == -1 {
+                start = idx
+            }
+            if idx == lastIndex && ((lastIndex-start+1) >= MinWordSize || isDotCountMode) {
+                if !isContainingNumber {
+                    result = append(result, str[start:strLen])
+                }
+                isContainingNumber = false
+            }
+        } else if b >= '0' && b <= '9' && !isDotCountMode {
+            isContainingNumber = true
+        } else if ((idx-start) >= MinWordSize || isDotCountMode) && start > -1 {
+            if !isContainingNumber {
+                result = append(result, str[start:idx])
+            }
+
+            isContainingNumber = false
+            start = -1
+        } else {
+            isContainingNumber = false
+            start = -1
+        }
+        if b == '/' && isDotCountMode {
+            isDotCountMode = false
+            domainNameEndIndex = idx
+            dotCounter = len(result) - 1
+        }
+
+        if b == '?' { // skip query params
+            break
+        }
+    }
+
+    if isDotCountMode {
+        dotCounter = len(result) - 1
+        domainNameEndIndex = len(str)
+    }
+
+    if dotCounter > 0 && len(result) > 1 {
+        result = append(result[:(dotCounter-1)], result[dotCounter+1:]...)
+        if domainNameEndIndex-domainNameStartIndex > 3 { // if domain name is longer than 3 chars
+            for len(str) > domainNameStartIndex && str[domainNameStartIndex] == '.' {
+                domainNameStartIndex++
+            }
+            result = append(result, str[domainNameStartIndex:domainNameEndIndex])
+        }
+    }
+    return result
+}
+
 func tokenizeV2(str string) []string {
     // remove protocol
     startIndex := strings.Index(str, "://")
```
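`tokenizeV3` works in two phases over the raw bytes. Up to the first `/` it runs in "dot count mode", where any byte other than `.` and `/` may extend a token, so host labels survive whole; after the first `/`, only `a`–`z` may extend a token, any digit poisons the current word via `isContainingNumber` (the whole word is dropped, not split), and `?` aborts the scan so query parameters are never tokenized. As a final fix-up, the last two host labels (typically second-level domain plus TLD) are cut from the result and the full host string is appended in their place. A hypothetical trace, runnable under the same assumed import path:

```go
package main

import (
    "fmt"

    tok "github.com/emetriq/gourltokenizer/tokenizer"
)

func main() {
    // Host scan ("dot count mode") collects: sport, fussball, example, com.
    // The first '/' flips the mode; the path contributes: abc.
    // The fix-up cuts the last two host labels (example, com) and appends
    // the whole host, so the expected output is:
    //   [sport fussball abc sport.fussball.example.com]
    fmt.Println(tok.TokenizeFastV3("sport.fussball.example.com/abc"))
}
```

`TokenizeFastV3` is used here so no stop-word filter runs; with `TokenizeV3` the default filter would also drop tokens like `www`, which is apparently why the converted tests below never expect a bare `www` token.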
**tokenizer/tokenizer_test.go** (64 additions, 17 deletions)
```diff
@@ -11,25 +11,25 @@ func init() {
 }
 func Test_tokenizeCorrectPath(t *testing.T) {
     path := "/some-thing/very/interesting?queryparam2=1&queryparam2=3"
-    result := tokenizeV2(path)
+    result := tokenizeV3(path)
     assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
 }
 
 func Test_tokenizePathWithDashes(t *testing.T) {
     path := "/some-thing/very/interesting"
-    result := tokenizeV2(path)
+    result := tokenizeV3(path)
     assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
 }
 
 func Test_tokenizePathWithDashes2(t *testing.T) {
     path := "/hsv-fussball"
-    result := tokenizeV2(path)
+    result := tokenizeV3(path)
     assert.ElementsMatch(t, []string{"hsv", "fussball"}, result)
 }
 
 func Test_tokenizeEmptyString(t *testing.T) {
     path := ""
-    result := tokenizeV2(path)
+    result := tokenizeV3(path)
     assert.ElementsMatch(t, []string{}, result)
 }
 
```
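The four `tokenize*` conversions above keep their expected outputs untouched: for digit-free inputs, `tokenizeV3` behaves as a drop-in replacement for `tokenizeV2`, which is presumably the point of converting these tests wholesale.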
```diff
@@ -47,36 +47,35 @@ func Test_filterStopWorlds(t *testing.T) {
 }
 
 func Test_URLTokenizer(t *testing.T) {
-    result := TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
+    result := TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
     assert.ElementsMatch(t, []string{"path", "sport", "hsv", "fussball", "example.com"}, result)
 }
 
 func Test_URLTokenizerOneWord(t *testing.T) {
-    result := TokenizeV2("http://example.com/sport")
+    result := TokenizeV3("http://example.com/sport")
     assert.ElementsMatch(t, []string{"example.com", "sport"}, result)
 }
 
 func Test_URLTokenizerOneWordMinSize(t *testing.T) {
 
-    result := TokenizeV2("http://www.test-page.de/aaa/bbb/bc/ccc")
+    result := TokenizeV3("http://www.test-page.de/aaa/bbb/bc/ccc")
     assert.ElementsMatch(t, []string{"www.test-page.de", "aaa", "bbb", "ccc"}, result)
 }
 
 func Test_URLTokenizerWithScapedChars(t *testing.T) {
-    result := TokenizeV2("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
+    result := TokenizeV3("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
     assert.ElementsMatch(t, []string{"emetriq", "com", "example.com"}, result)
 }
 
 func Test_URLTokenizerWithWrongEscapedChars(t *testing.T) {
-    result := TokenizeV2("http://example.com/%%ssomething/usefull")
+    result := TokenizeV3("http://example.com/%%ssomething/usefull")
     assert.Equal(t, []string{"ssomething", "usefull", "example.com"}, result)
 }
 func Test_URLTokenizerWithWrongEscapedChars2(t *testing.T) {
     DefaultStopWordFunc = IsGermanStopWord
-    result := TokenizeV2("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&amp_tf=Von %1$s")
+    result := TokenizeV3("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&amp_tf=Von %1$s")
     assert.Equal(t, []string{
         "vermischtes",
-        "article",
         "marisa",
         "burger",
         "rosenheim",
```
```diff
@@ -86,36 +85,72 @@ func Test_URLTokenizerWithWrongEscapedChars2(t *testing.T) {
     }, result)
 }
 func Test_URLTokenizerWithWrongHostEscapedChars(t *testing.T) {
-    result := TokenizeV2("http://..example.com/something")
+    result := TokenizeV3("http://..example.com/something")
     assert.Equal(t, []string{"something", "example.com"}, result)
 }
 
 func Test_URLTokenizerWithCapitalChars(t *testing.T) {
     DefaultStopWordFunc = IsGermanStopWord
-    result := TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
+    result := TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
     assert.ElementsMatch(t, []string{"subdomain", "hsv", "fussbal", "some", "www.subdomain.example.com"}, result)
 }
 
 func Test_URLWithoutHTTP(t *testing.T) {
-    result := TokenizeV2("www.Subdomain.example.com")
+    result := TokenizeV3("www.Subdomain.example.com")
     assert.ElementsMatch(t, []string{"subdomain", "www.subdomain.example.com"}, result)
 }
 
 func Test_URLWithoutHTTPAndWithoutSubdomain(t *testing.T) {
-    result := TokenizeV2("www.example.com")
+    result := TokenizeV3("www.example.com")
     assert.ElementsMatch(t, []string{"www.example.com"}, result)
 }
 
 func Test_URLWithoutHTTPAndSubdomain(t *testing.T) {
-    result := TokenizeV2("sport.fussball.example.com")
+    result := TokenizeV3("sport.fussball.example.com")
     assert.ElementsMatch(t, []string{"sport", "fussball", "sport.fussball.example.com"}, result)
 }
 
 func Test_URLWithoutHTTPButWithPath(t *testing.T) {
-    result := TokenizeV2("www.ironsrc.com/sports")
+    result := TokenizeV3("www.ironsrc.com/sports")
     assert.ElementsMatch(t, []string{"sports", "www.ironsrc.com"}, result)
 }
 
+func Test_SkipWordsWithNumbers(t *testing.T) {
+    result := TokenizeV3("https://www.autoscout24.at/angebote/seat-altea-xl-reference-1-4-tfsi-motorschaden-benzin-grau-b82ebced-cb95-4f49-8038-5eb1c098e652")
+    // no 'ebced'
+    assert.ElementsMatch(t, []string{"angebote", "seat", "altea", "reference", "tfsi", "motorschaden", "benzin", "grau", "www.autoscout24.at"}, result)
+    assert.NotContains(t, result, "ebced")
+
+    result = TokenizeV3("https://www.coches.net/123nissan-interstar-25dci-120-pro-l2h2-3500-diesel-2009-en-barcelona-52386149-fuvivo.aspx1")
+    // no '123nissan', 'dci' and 'aspx1'
+    assert.ElementsMatch(t, []string{"interstar", "pro", "diesel", "barcelona", "fuvivo", "www.coches.net"}, result)
+    assert.NotContains(t, result, "dci")
+}
+
+func BenchmarkEscapedURLTokenizerV3(b *testing.B) {
+    for n := 0; n < b.N; n++ {
+        TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
+    }
+}
+
+func BenchmarkURLTokenizerV3(b *testing.B) {
+    for n := 0; n < b.N; n++ {
+        TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
+    }
+}
+
+func BenchmarkURLTokenizerV3Fast(b *testing.B) {
+    for n := 0; n < b.N; n++ {
+        TokenizeFastV3("http://example.com/path/sport/hsv-fussball?bla=1")
+    }
+}
+
+func BenchmarkEscapedURLTokenizerV2(b *testing.B) {
+    for n := 0; n < b.N; n++ {
+        TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
+    }
+}
+
 func BenchmarkURLTokenizerV2(b *testing.B) {
     for n := 0; n < b.N; n++ {
         TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
```
```diff
@@ -128,6 +163,12 @@ func BenchmarkURLTokenizerV2Fast(b *testing.B) {
     }
 }
 
+func BenchmarkEscapedURLTokenizerV1(b *testing.B) {
+    for n := 0; n < b.N; n++ {
+        TokenizeV1("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
+    }
+}
+
 func BenchmarkURLTokenizerV1(b *testing.B) {
     for n := 0; n < b.N; n++ {
         TokenizeV1("http://example.com/path/sport/hsv-fussball?bla=1")
```
```diff
@@ -144,3 +185,9 @@ func BenchmarkTokenizerV2(b *testing.B) {
         tokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
     }
 }
+
+func BenchmarkTokenizerV3(b *testing.B) {
+    for n := 0; n < b.N; n++ {
+        tokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
+    }
+}
```