Merge pull request #6 from dnnspaul/main
Optimized Tokenize function; numbers no longer act as word splitters
dnnspaul committed May 11, 2023
2 parents c64f984 + 89b982a commit c31f339
Showing 3 changed files with 220 additions and 32 deletions.
27 changes: 18 additions & 9 deletions README.md
@@ -17,9 +17,9 @@ tok.MinWordSize = 3
// set default stop words
tok.DefaultStopWordFunc = IsGermanStopWord

result := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
result := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
// custom stop words
result2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
result2 := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
if val == "fussball" {
return true
}
@@ -31,11 +31,20 @@ result2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asom
```
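
A complete program built on this package might look like the following sketch (the import alias `tok` and the sample URL are illustrative assumptions; `IsGermanStopWord` is assumed to be exported by the tokenizer package):

```go
package main

import (
	"fmt"

	tok "github.com/emetriq/gourltokenizer/tokenizer"
)

func main() {
	tok.MinWordSize = 3
	tok.DefaultStopWordFunc = tok.IsGermanStopWord

	// TokenizeV3 drops any word that contains a digit, e.g. "fussball2000"
	tokens := tok.TokenizeV3("https://example.com/sport/hsv-fussball2000")
	fmt.Println(tokens) // e.g. [sport hsv example.com]
}
```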
# Benchmark Results

goos: darwin
goos: linux
goarch: amd64
cpu: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
|Benchmark|runs|time/op|B/op|allocs/op|
|---|---|---|---|---|
BenchmarkURLTokenizerV2-12|2026138|605.3 ns/op|256 B/op|1 allocs/op
BenchmarkURLTokenizerV2Fast-12|3609961|330.6 ns/op|256 B/op|1 allocs/op
BenchmarkURLTokenizerV1-12|1766235|676.3 ns/op|272 B/op|2 allocs/op
pkg: github.com/emetriq/gourltokenizer/tokenizer
cpu: 11th Gen Intel(R) Core(TM) i5-11500H @ 2.90GHz
| Benchmark | runs | time/op | B/op | allocs/op |
|-----------------------------------|---------|-------------|----------|-------------|
| BenchmarkEscapedURLTokenizerV3-12 | 1000000 | 1080 ns/op | 496 B/op | 3 allocs/op |
| BenchmarkURLTokenizerV3-12 | 4751826 | 255.5 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkURLTokenizerV3Fast-12 | 6231590 | 191.6 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkEscapedURLTokenizerV2-12 | 1000000 | 1042 ns/op | 496 B/op | 3 allocs/op |
| BenchmarkURLTokenizerV2-12 | 3813273 | 484.2 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkURLTokenizerV2Fast-12 | 5835351 | 199.6 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkEscapedURLTokenizerV1-12 | 1942860 | 1084 ns/op | 496 B/op | 3 allocs/op |
| BenchmarkURLTokenizerV1-12 | 2495599 | 510.7 ns/op | 272 B/op | 2 allocs/op |
| BenchmarkTokenizerV1-12 | 9431893 | 122.9 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkTokenizerV2-12 | 7669710 | 157.0 ns/op | 256 B/op | 1 allocs/op |
| BenchmarkTokenizerV3-12 | 8120326 | 158.3 ns/op | 256 B/op | 1 allocs/op |
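
The B/op and allocs/op columns come from the Go benchmark tooling's memory statistics; an invocation along the lines of `go test -bench=. -benchmem ./tokenizer` (assumed, not shown in this commit) reproduces tables like the one above.
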
144 changes: 138 additions & 6 deletions tokenizer/tokenizer.go
@@ -10,15 +10,71 @@ var MinWordSize = 3

var DefaultStopWordFunc = IsEnglishStopWord

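// isByteAllowed reports whether b may be part of the current token: 'a'-'z'
// always is; in dot count mode (the host part) every byte except '.' and '/' is.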
func isByteAllowed(b byte, isDotCountMode bool) bool {
if b >= 'a' && b <= 'z' {
return true
}
return isDotCountMode && b != '.' && b != '/'
}

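// isRuneAllowed is the rune counterpart of isByteAllowed.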
func isRuneAllowed(r rune, isDotCountMode bool) bool {
if r >= 'a' && r <= 'z' {
return true
}
return isDotCountMode && r != '.' && r != '/'
}

//TokenizeV2 splits URL to host and path parts and tokenize path and host part
//all terms are returned in lower case
// stringContainsByteChar is faster than using strings.Contains because we are
// only looking for a single char and can leave the loop after the first match
func stringContainsByteChar(s string, r byte) bool {
for i := 0; i < len(s); i++ {
if s[i] == r {
return true
}
}
return false
}
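// Note: strings.IndexByte(s, r) >= 0 would perform the same check via the
// standard library's optimized implementation.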

// TokenizeV3 splits the URL into host and path parts and tokenizes both;
// all terms are returned in lower case. If a word contains a digit, the
// complete word is filtered out.
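//
// A hypothetical example (output inferred from the tests below):
//
//	TokenizeV3("http://example.com/sport/hsv-fussball2000")
//	// -> ["sport", "hsv", "example.com"]; "fussball2000" contains a digit and is dropped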
func TokenizeV3(encodedURL string, stopwordfunc ...func(string) bool) []string {
encodedURLLower := strings.ToLower(encodedURL)
var result []string

// check if url needs unescaping
if stringContainsByteChar(encodedURLLower, '%') {
decodedURL, err := url.QueryUnescape(encodedURLLower)
if err != nil {
// invalid escape sequence: re-escape the lowered URL so that
// QueryUnescape succeeds on the round-tripped string
escapedEncodedURL := url.QueryEscape(encodedURLLower)
decodedURL, err = url.QueryUnescape(escapedEncodedURL)
}

if err != nil {
return []string{}
}

result = filterStopWords(tokenizeV3(decodedURL), stopwordfunc...)
} else {
result = filterStopWords(tokenizeV3(encodedURLLower), stopwordfunc...)
}

return result
}

// TokenizeFastV3 splits the URL into host and path parts and tokenizes both
// without URL decoding; all terms are returned in lower case. Prefer it when
// inputs are known to contain no percent-escapes.
func TokenizeFastV3(encodedURL string, stopwordfunc ...func(string) bool) []string {
urlLower := strings.ToLower(encodedURL)
result := tokenizeV3(urlLower)
if len(stopwordfunc) > 0 {
result = filterStopWords(result, stopwordfunc[0])
}
return result
}

// TokenizeV2 splits the URL into host and path parts and tokenizes both;
// all terms are returned in lower case.
func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
encodedURLLower := strings.ToLower(encodedURL)
decodedURL, err := url.QueryUnescape(encodedURLLower)
@@ -36,8 +92,8 @@ func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
return result
}

//TokenizeFastV2 splits URL to host and path parts and tokenize path and host part without url decoding
//all terms are returned in lower case
// TokenizeFastV2 splits the URL into host and path parts and tokenizes both
// without URL decoding; all terms are returned in lower case.
func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
urlLower := strings.ToLower(encodedURL)
result := tokenizeV2(urlLower)
@@ -47,8 +103,8 @@ func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []stri
return result
}

//TokenizeURL splits URL to host and path parts and tokenize path part
//all terms are returned in lower case
// TokenizeV1 splits the URL into host and path parts, tokenizes the path part,
// and appends the host; all terms are returned in lower case.
func TokenizeV1(url string, stopwordfunc ...func(string) bool) []string {
urlToParse := url
if !strings.HasPrefix(url, "http") && !strings.HasPrefix(url, "mailto") {
@@ -65,6 +121,82 @@ func TokenizeV1(url string, stopwordfunc ...func(string) bool) []string {
return append(result, host)
}

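// tokenizeV3 scans the already-lowered URL byte by byte. Before the first '/'
// it runs in "dot count mode", where every byte except '.' and '/' may belong
// to a host label; after it, only runs of 'a'-'z' of at least MinWordSize
// become tokens, and any word containing a digit is dropped entirely.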
func tokenizeV3(str string) []string {
// remove protocol
startIndex := strings.Index(str, "://")
if startIndex < 7 && startIndex > 0 && len(str) > startIndex+3 {
startIndex = startIndex + 3
} else {
startIndex = 0
}

strLen := len(str)
lastIndex := strLen - 1
result := make([]string, 0, strLen/MinWordSize)
start := -1
dotCounter := 0
isDotCountMode := true
isContainingNumber := false
domainNameEndIndex := -1
domainNameStartIndex := startIndex
var b byte
for idx := 0; idx < len(str); idx++ {
b = str[idx]
if idx < startIndex {
continue
}

if isByteAllowed(b, isDotCountMode) {
if start == -1 {
start = idx
}
if idx == lastIndex && ((lastIndex-start+1) >= MinWordSize || isDotCountMode) {
if !isContainingNumber {
result = append(result, str[start:strLen])
}
isContainingNumber = false
}
} else if b >= '0' && b <= '9' && !isDotCountMode {
isContainingNumber = true
} else if ((idx-start) >= MinWordSize || isDotCountMode) && start > -1 {
if !isContainingNumber {
result = append(result, str[start:idx])
}

isContainingNumber = false
start = -1
} else {
isContainingNumber = false
start = -1
}
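// the first '/' ends the host part; remember its end and the index of the
// last host token collected so far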
if b == '/' && isDotCountMode {
isDotCountMode = false
domainNameEndIndex = idx
dotCounter = len(result) - 1
}

if b == '?' { // skip query params
break
}
}

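// no path at all: the whole string is the host part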
if isDotCountMode {
dotCounter = len(result) - 1
domainNameEndIndex = len(str)
}

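// drop the last two host labels (domain name and TLD) as separate tokens
// and append the full host name instead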
if dotCounter > 0 && len(result) > 1 {
result = append(result[:(dotCounter-1)], result[dotCounter+1:]...)
if domainNameEndIndex-domainNameStartIndex > 3 { // if domain name is longer than 3 chars
for len(str) > domainNameStartIndex && str[domainNameStartIndex] == '.' {
domainNameStartIndex++
}
result = append(result, str[domainNameStartIndex:domainNameEndIndex])
}
}
return result
}

func tokenizeV2(str string) []string {
// remove protocol
startIndex := strings.Index(str, "://")
81 changes: 64 additions & 17 deletions tokenizer/tokenizer_test.go
@@ -11,25 +11,25 @@ func init() {
}
func Test_tokenizeCorrectPath(t *testing.T) {
path := "/some-thing/very/interesting?queryparam2=1&queryparam2=3"
result := tokenizeV2(path)
result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
}

func Test_tokenizePathWithDashes(t *testing.T) {
path := "/some-thing/very/interesting"
result := tokenizeV2(path)
result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
}

func Test_tokenizePathWithDashes2(t *testing.T) {
path := "/hsv-fussball"
result := tokenizeV2(path)
result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"hsv", "fussball"}, result)
}

func Test_tokenizeEmptyString(t *testing.T) {
path := ""
result := tokenizeV2(path)
result := tokenizeV3(path)
assert.ElementsMatch(t, []string{}, result)
}

@@ -47,36 +47,35 @@ func Test_filterStopWorlds(t *testing.T) {
}

func Test_URLTokenizer(t *testing.T) {
result := TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
result := TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
assert.ElementsMatch(t, []string{"path", "sport", "hsv", "fussball", "example.com"}, result)
}

func Test_URLTokenizerOneWord(t *testing.T) {
result := TokenizeV2("http://example.com/sport")
result := TokenizeV3("http://example.com/sport")
assert.ElementsMatch(t, []string{"example.com", "sport"}, result)
}

func Test_URLTokenizerOneWordMinSize(t *testing.T) {

result := TokenizeV2("http://www.test-page.de/aaa/bbb/bc/ccc")
result := TokenizeV3("http://www.test-page.de/aaa/bbb/bc/ccc")
assert.ElementsMatch(t, []string{"www.test-page.de", "aaa", "bbb", "ccc"}, result)
}

func Test_URLTokenizerWithScapedChars(t *testing.T) {
result := TokenizeV2("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
result := TokenizeV3("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
assert.ElementsMatch(t, []string{"emetriq", "com", "example.com"}, result)
}

func Test_URLTokenizerWithWrongEscapedChars(t *testing.T) {
result := TokenizeV2("http://example.com/%%ssomething/usefull")
result := TokenizeV3("http://example.com/%%ssomething/usefull")
assert.Equal(t, []string{"ssomething", "usefull", "example.com"}, result)
}
func Test_URLTokenizerWithWrongEscapedChars2(t *testing.T) {
DefaultStopWordFunc = IsGermanStopWord
result := TokenizeV2("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&amp_tf=Von %1$s")
result := TokenizeV3("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&amp_tf=Von %1$s")
assert.Equal(t, []string{
"vermischtes",
"article",
"marisa",
"burger",
"rosenheim",
@@ -86,36 +85,72 @@ func Test_URLTokenizerWithWrongEscapedChars2(t *testing.T) {
}, result)
}
func Test_URLTokenizerWithWrongHostEscapedChars(t *testing.T) {
result := TokenizeV2("http://..example.com/something")
result := TokenizeV3("http://..example.com/something")
assert.Equal(t, []string{"something", "example.com"}, result)
}

func Test_URLTokenizerWithCapitalChars(t *testing.T) {
DefaultStopWordFunc = IsGermanStopWord
result := TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
result := TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
assert.ElementsMatch(t, []string{"subdomain", "hsv", "fussbal", "some", "www.subdomain.example.com"}, result)
}

func Test_URLWithoutHTTP(t *testing.T) {
result := TokenizeV2("www.Subdomain.example.com")
result := TokenizeV3("www.Subdomain.example.com")
assert.ElementsMatch(t, []string{"subdomain", "www.subdomain.example.com"}, result)
}

func Test_URLWithoutHTTPAndWithoutSubdomain(t *testing.T) {
result := TokenizeV2("www.example.com")
result := TokenizeV3("www.example.com")
assert.ElementsMatch(t, []string{"www.example.com"}, result)
}

func Test_URLWithoutHTTPAndSubdomain(t *testing.T) {
result := TokenizeV2("sport.fussball.example.com")
result := TokenizeV3("sport.fussball.example.com")
assert.ElementsMatch(t, []string{"sport", "fussball", "sport.fussball.example.com"}, result)
}

func Test_URLWithoutHTTPButWithPath(t *testing.T) {
result := TokenizeV2("www.ironsrc.com/sports")
result := TokenizeV3("www.ironsrc.com/sports")
assert.ElementsMatch(t, []string{"sports", "www.ironsrc.com"}, result)
}

func Test_SkipWordsWithNumbers(t *testing.T) {
result := TokenizeV3("https://www.autoscout24.at/angebote/seat-altea-xl-reference-1-4-tfsi-motorschaden-benzin-grau-b82ebced-cb95-4f49-8038-5eb1c098e652")
// no 'ebced'
assert.ElementsMatch(t, []string{"angebote", "seat", "altea", "reference", "tfsi", "motorschaden", "benzin", "grau", "www.autoscout24.at"}, result)
assert.NotContains(t, result, "ebced")

result = TokenizeV3("https://www.coches.net/123nissan-interstar-25dci-120-pro-l2h2-3500-diesel-2009-en-barcelona-52386149-fuvivo.aspx1")
// no '123nissan', 'dci' and 'aspx1'
assert.ElementsMatch(t, []string{"interstar", "pro", "diesel", "barcelona", "fuvivo", "www.coches.net"}, result)
assert.NotContains(t, result, "dci")
}

func BenchmarkEscapedURLTokenizerV3(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
}
}

func BenchmarkURLTokenizerV3(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
}
}

func BenchmarkURLTokenizerV3Fast(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeFastV3("http://example.com/path/sport/hsv-fussball?bla=1")
}
}

func BenchmarkEscapedURLTokenizerV2(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
}
}

func BenchmarkURLTokenizerV2(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
@@ -128,6 +163,12 @@ func BenchmarkURLTokenizerV2Fast(b *testing.B) {
}
}

func BenchmarkEscapedURLTokenizerV1(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV1("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
}
}

func BenchmarkURLTokenizerV1(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV1("http://example.com/path/sport/hsv-fussball?bla=1")
@@ -144,3 +185,9 @@ func BenchmarkTokenizerV2(b *testing.B) {
tokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
}
}

func BenchmarkTokenizerV3(b *testing.B) {
for n := 0; n < b.N; n++ {
tokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
}
}
