Commit: skip words with numbers
Dennis Paul committed May 10, 2023
1 parent 9f95bd8 commit 89b982a
Showing 3 changed files with 61 additions and 29 deletions.
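In effect, the V3 tokenizer now drops every path word that contains a digit, while the host part (tokenized in dot-count mode) keeps its digits; see the updated tests below. A minimal usage sketch of the behavior after this commit, with the expected output inferred from those tests rather than stated by the diff:

```go
package main

import (
	"fmt"

	"github.com/emetriq/gourltokenizer/tokenizer"
)

func main() {
	// "article233484549" contains digits, so after this commit the whole
	// word is skipped; hosts keep their digits (e.g. www.autoscout24.at).
	tokens := tokenizer.TokenizeV3("https://www.morgenpost.de/vermischtes/article233484549/sport")
	fmt.Println(tokens) // expected roughly: [vermischtes sport www.morgenpost.de]
}
```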
27 changes: 18 additions & 9 deletions README.md
@@ -17,9 +17,9 @@ tok.MinWordSize = 3
// set default stop words
tok.DefaultStopWordFunc = IsGermanStopWord

-reuslt := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
+reuslt := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
// custom stop words
-reuslt2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
+reuslt2 := tok.TokenizeV3("mailto://www.Subdomain.example.com/HSV-fussball%3asome/a", func(val string) bool {
if val == "fussball" {
return true
}
@@ -31,11 +31,20 @@ reuslt2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussball%3asom
```
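In the second call, the optional callback acts as a custom stop-word filter: returning true for a term (here "fussball") appears to mark it as a stop word, so it should be absent from reuslt2; with no callback, DefaultStopWordFunc applies.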
# Benchmark Results

-goos: darwin
+goos: linux
goarch: amd64
-cpu: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-|Benchmark|runs|time/op|B/op|allocs/op|
-|---|---|---|---|---|
-BenchmarkURLTokenizerV2-12|2026138|605.3 ns/op|256 B/op|1 allocs/op
-BenchmarkURLTokenizerV2Fast-12|3609961|330.6 ns/op|256 B/op|1 allocs/op
-BenchmarkURLTokenizerV1-12|1766235|676.3 ns/op|272 B/op|2 allocs/op
+pkg: github.com/emetriq/gourltokenizer/tokenizer
+cpu: 11th Gen Intel(R) Core(TM) i5-11500H @ 2.90GHz
+| Benchmark | runs | time/op | B/op | allocs/op |
+|-----------------------------------|---------|-------------|----------|-------------|
+| BenchmarkEscapedURLTokenizerV3-12 | 1000000 | 1080 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV3-12        | 4751826 | 255.5 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkURLTokenizerV3Fast-12    | 6231590 | 191.6 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkEscapedURLTokenizerV2-12 | 1000000 | 1042 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV2-12        | 3813273 | 484.2 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkURLTokenizerV2Fast-12    | 5835351 | 199.6 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkEscapedURLTokenizerV1-12 | 1942860 | 1084 ns/op  | 496 B/op | 3 allocs/op |
+| BenchmarkURLTokenizerV1-12        | 2495599 | 510.7 ns/op | 272 B/op | 2 allocs/op |
+| BenchmarkTokenizerV1-12           | 9431893 | 122.9 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkTokenizerV2-12           | 7669710 | 157.0 ns/op | 256 B/op | 1 allocs/op |
+| BenchmarkTokenizerV3-12           | 8120326 | 158.3 ns/op | 256 B/op | 1 allocs/op |
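Assuming the standard Go toolchain, a table like the one above can be regenerated with `go test -bench . -benchmem ./tokenizer`; absolute numbers depend on the machine, as the switch from the darwin/i7 to the linux/i5 run shows.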
30 changes: 21 additions & 9 deletions tokenizer/tokenizer.go
@@ -36,7 +36,8 @@ func stringContainsByteChar(s string, r byte) bool {
}

// TokenizeV3 splits URL to host and path parts and tokenize path and host part
-// all terms are returned in lower case
+// all terms are returned in lower case. If numbers are within a word, the complete
+// word is filtered out.
func TokenizeV3(encodedURL string, stopwordfunc ...func(string) bool) []string {
encodedURLLower := strings.ToLower(encodedURL)
var result []string
@@ -72,8 +73,8 @@ func TokenizeFastV3(encodedURL string, stopwordfunc ...func(string) bool) []stri
return result
}

-//TokenizeV2 splits URL to host and path parts and tokenize path and host part
-//all terms are returned in lower case
+// TokenizeV2 splits URL to host and path parts and tokenize path and host part
+// all terms are returned in lower case
func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
encodedURLLower := strings.ToLower(encodedURL)
decodedURL, err := url.QueryUnescape(encodedURLLower)
@@ -91,8 +92,8 @@ func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
return result
}

-//TokenizeFastV2 splits URL to host and path parts and tokenize path and host part without url decoding
-//all terms are returned in lower case
+// TokenizeFastV2 splits URL to host and path parts and tokenize path and host part without url decoding
+// all terms are returned in lower case
func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
urlLower := strings.ToLower(encodedURL)
result := tokenizeV2(urlLower)
@@ -102,8 +103,8 @@ func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []stri
return result
}

-//TokenizeURL splits URL to host and path parts and tokenize path part
-//all terms are returned in lower case
+// TokenizeURL splits URL to host and path parts and tokenize path part
+// all terms are returned in lower case
func TokenizeV1(url string, stopwordfunc ...func(string) bool) []string {
urlToParse := url
if !strings.HasPrefix(url, "http") && !strings.HasPrefix(url, "mailto") {
@@ -135,6 +136,7 @@ func tokenizeV3(str string) []string {
start := -1
dotCounter := 0
isDotCountMode := true
+isContainingNumber := false
domainNameEndIndex := -1
domainNameStartIndex := startIndex
var b byte
@@ -149,12 +151,22 @@
start = idx
}
if idx == lastIndex && ((lastIndex-start+1) >= MinWordSize || isDotCountMode) {
-result = append(result, str[start:strLen])
+if !isContainingNumber {
+result = append(result, str[start:strLen])
+}
+isContainingNumber = false
}
+} else if b >= '0' && b <= '9' && !isDotCountMode {
+isContainingNumber = true
} else if ((idx-start) >= MinWordSize || isDotCountMode) && start > -1 {
-result = append(result, str[start:idx])
+if !isContainingNumber {
+result = append(result, str[start:idx])
+}
+
+isContainingNumber = false
start = -1
} else {
+isContainingNumber = false
start = -1
}
if b == '/' && isDotCountMode {
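The core of the change is the isContainingNumber flag: a digit seen outside dot-count mode taints the current word, and tainted words are never appended. A minimal, self-contained sketch of that idea, assuming already lower-cased input; tokensWithoutNumbers is an illustrative name, not part of the package, and the real tokenizer additionally handles hosts, MinWordSize, and stop words:

```go
package main

import "fmt"

// tokensWithoutNumbers splits s on non-letter bytes and drops any word
// that contained a digit, mirroring the isContainingNumber flag added
// to tokenizeV3.
func tokensWithoutNumbers(s string) []string {
	result := []string{}
	start := -1
	hasDigit := false
	flush := func(end int) {
		if start >= 0 && !hasDigit {
			result = append(result, s[start:end])
		}
		start = -1
		hasDigit = false
	}
	for i := 0; i < len(s); i++ {
		b := s[i]
		switch {
		case b >= 'a' && b <= 'z':
			if start < 0 {
				start = i
			}
		case b >= '0' && b <= '9':
			hasDigit = true // taints the current word
		default:
			flush(i)
		}
	}
	flush(len(s))
	return result
}

func main() {
	fmt.Println(tokensWithoutNumbers("seat-altea-b82ebced-cb95"))
	// Output: [seat altea]
}
```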
33 changes: 22 additions & 11 deletions tokenizer/tokenizer_test.go
@@ -11,25 +11,25 @@ func init() {
}
func Test_tokenizeCorrectPath(t *testing.T) {
path := "/some-thing/very/interesting?queryparam2=1&queryparam2=3"
-result := tokenizeV2(path)
+result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
}

func Test_tokenizePathWithDashes(t *testing.T) {
path := "/some-thing/very/interesting"
-result := tokenizeV2(path)
+result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
}

func Test_tokenizePathWithDashes2(t *testing.T) {
path := "/hsv-fussball"
-result := tokenizeV2(path)
+result := tokenizeV3(path)
assert.ElementsMatch(t, []string{"hsv", "fussball"}, result)
}

func Test_tokenizeEmptyString(t *testing.T) {
path := ""
-result := tokenizeV2(path)
+result := tokenizeV3(path)
assert.ElementsMatch(t, []string{}, result)
}

@@ -47,36 +47,35 @@ func Test_filterStopWorlds(t *testing.T) {
}

func Test_URLTokenizer(t *testing.T) {
result := TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
result := TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1")
assert.ElementsMatch(t, []string{"path", "sport", "hsv", "fussball", "example.com"}, result)
}

func Test_URLTokenizerOneWord(t *testing.T) {
result := TokenizeV2("http://example.com/sport")
result := TokenizeV3("http://example.com/sport")
assert.ElementsMatch(t, []string{"example.com", "sport"}, result)
}

func Test_URLTokenizerOneWordMinSize(t *testing.T) {

result := TokenizeV2("http://www.test-page.de/aaa/bbb/bc/ccc")
result := TokenizeV3("http://www.test-page.de/aaa/bbb/bc/ccc")
assert.ElementsMatch(t, []string{"www.test-page.de", "aaa", "bbb", "ccc"}, result)
}

func Test_URLTokenizerWithScapedChars(t *testing.T) {
result := TokenizeV2("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
result := TokenizeV3("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord)
assert.ElementsMatch(t, []string{"emetriq", "com", "example.com"}, result)
}

func Test_URLTokenizerWithWrongEscapedChars(t *testing.T) {
result := TokenizeV2("http://example.com/%%ssomething/usefull")
result := TokenizeV3("http://example.com/%%ssomething/usefull")
assert.Equal(t, []string{"ssomething", "usefull", "example.com"}, result)
}
func Test_URLTokenizerWithWrongEscapedChars2(t *testing.T) {
DefaultStopWordFunc = IsGermanStopWord
result := TokenizeV2("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&amp_tf=Von %1$s")
result := TokenizeV3("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&amp_tf=Von %1$s")
assert.Equal(t, []string{
"vermischtes",
"article",
"marisa",
"burger",
"rosenheim",
@@ -116,6 +115,18 @@ func Test_URLWithoutHTTPButWithPath(t *testing.T) {
assert.ElementsMatch(t, []string{"sports", "www.ironsrc.com"}, result)
}

+func Test_SkipWordsWithNumbers(t *testing.T) {
+result := TokenizeV3("https://www.autoscout24.at/angebote/seat-altea-xl-reference-1-4-tfsi-motorschaden-benzin-grau-b82ebced-cb95-4f49-8038-5eb1c098e652")
+// no 'ebced'
+assert.ElementsMatch(t, []string{"angebote", "seat", "altea", "reference", "tfsi", "motorschaden", "benzin", "grau", "www.autoscout24.at"}, result)
+assert.NotContains(t, result, "ebced")
+
+result = TokenizeV3("https://www.coches.net/123nissan-interstar-25dci-120-pro-l2h2-3500-diesel-2009-en-barcelona-52386149-fuvivo.aspx1")
+// no '123nissan', 'dci' and 'aspx1'
+assert.ElementsMatch(t, []string{"interstar", "pro", "diesel", "barcelona", "fuvivo", "www.coches.net"}, result)
+assert.NotContains(t, result, "dci")
+}

func BenchmarkEscapedURLTokenizerV3(b *testing.B) {
for n := 0; n < b.N; n++ {
TokenizeV3("http://example.com/path/sport/hsv-fussball?bla=1&escaped=%2C%2C%3A%3A%3B%3B")
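To exercise just the new behavior, the added test can be run in isolation with `go test -run Test_SkipWordsWithNumbers ./tokenizer` (standard Go tooling, not part of this diff).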
