From 2356fb8329ad7b68840880afe2b1eb50ad67e14a Mon Sep 17 00:00:00 2001
From: Christoph Dieck
Date: Thu, 23 Dec 2021 09:36:09 +0100
Subject: [PATCH] feat: Initial commit

---
 .github/dependabot.yml         |   7 +
 .github/workflows/ci.yaml      |  31 ++
 .github/workflows/release.yaml |  40 ++
 README.md                      |  56 ++
 go.mod                         |  11 +
 go.sum                         |  10 +
 tokenizer/tokenizer.go         | 932 +++++++++++++++++++++++++++++++++
 tokenizer/tokenizer_test.go    | 163 ++++++
 8 files changed, 1250 insertions(+)
 create mode 100644 .github/dependabot.yml
 create mode 100644 .github/workflows/ci.yaml
 create mode 100644 .github/workflows/release.yaml
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 tokenizer/tokenizer.go
 create mode 100644 tokenizer/tokenizer_test.go

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..4b68edd
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,7 @@
+# Use https://dependabot.com/docs/config-file/validator/ to check for errors.
+version: 2
+updates:
+  - package-ecosystem: "gomod"
+    directory: "/"
+    schedule:
+      interval: "weekly"
\ No newline at end of file
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..42a01d9
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,31 @@
+on: [push, pull_request]
+name: CI
+jobs:
+  test:
+    strategy:
+      matrix:
+        go-version: [1.17.x] # go.mod declares go 1.17, older toolchains cannot build the module
+        platform: [ubuntu-latest, macos-latest, windows-latest]
+    runs-on: ${{ matrix.platform }}
+    steps:
+      - name: Install Go
+        uses: actions/setup-go@v2
+        with:
+          go-version: ${{ matrix.go-version }}
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Test
+        run: go test -race -covermode atomic -coverprofile profile.cov ./...
+      - name: Send coverage
+        uses: shogo82148/actions-goveralls@v1
+        with:
+          path-to-profile: profile.cov
+          flag-name: Go-${{ matrix.go-version }}
+          parallel: true
+  finish:
+    needs: test
+    runs-on: ubuntu-latest
+    steps:
+      - uses: shogo82148/actions-goveralls@v1
+        with:
+          parallel-finished: true
\ No newline at end of file
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
new file mode 100644
index 0000000..b639b18
--- /dev/null
+++ b/.github/workflows/release.yaml
@@ -0,0 +1,40 @@
+name: RELEASE
+
+on:
+  push:
+    branches:
+      - main
+jobs:
+  test:
+    strategy:
+      matrix:
+        go-version: [1.17.x] # go.mod declares go 1.17
+        platform: [ubuntu-latest, macos-latest, windows-latest]
+    runs-on: ${{ matrix.platform }}
+    steps:
+      - name: Install Go
+        uses: actions/setup-go@v2
+        with:
+          go-version: ${{ matrix.go-version }}
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Test
+        run: go test -race -covermode atomic -coverprofile profile.cov ./...
+      - name: Send coverage
+        uses: shogo82148/actions-goveralls@v1
+        with:
+          path-to-profile: profile.cov
+          flag-name: Go-${{ matrix.go-version }}
+          parallel: true
+  finish:
+    needs: test
+    runs-on: ubuntu-latest
+    steps:
+      - uses: shogo82148/actions-goveralls@v1
+        with:
+          parallel-finished: true
+      - uses: go-semantic-release/action@v1
+        id: semrel
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          changelog-generator-opt: "emojis=true"
\ No newline at end of file
diff --git a/README.md b/README.md
index 25c83d6..90be001 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,58 @@
 # gourltokenizer
+
 A powerful URL tokenizer
+
+# Install
+
+`go get github.com/emetriq/gourltokenizer`
+
+# Usage
+
+```golang
+import (
+	tok "github.com/emetriq/gourltokenizer/tokenizer"
+)
+
+// set min token size
+tok.MinWordSize = 3
+// set default stop words
+tok.DefaultStopWordFunc = tok.IsGermanStopWord
+
+result := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a")
+
+// custom stop words
+result2 := tok.TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a", func(val string) bool {
+	if val == "fussbal" {
+		return true
+	}
+	if val == "subdomain" { // tokens are lower-cased before filtering
+		return true
+	}
+	return false
+})
+```
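+
+If the input needs no URL decoding, `TokenizeFastV2` skips the
+`url.QueryUnescape` step and is nearly twice as fast in the benchmarks
+below. Note that, unlike `TokenizeV2`, it only filters stop words when a
+stop word function is passed explicitly:
+
+```golang
+// a sketch reusing the import alias from above
+fastResult := tok.TokenizeFastV2("http://example.com/sport/hsv-fussball")
+```
+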
+# Benchmark Results
+
+goos: darwin
+goarch: amd64
+cpu: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
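+
+The numbers should be reproducible with the standard benchmark runner:
+`go test -bench . -benchmem ./tokenizer`
+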
+|Benchmark|runs|time/op|B/op|allocs/op|
+|---|---|---|---|---|
+|BenchmarkURLTokenizerV2-12|2026138|605.3 ns/op|256 B/op|1 allocs/op|
+|BenchmarkURLTokenizerV2Fast-12|3609961|330.6 ns/op|256 B/op|1 allocs/op|
+|BenchmarkURLTokenizerV1-12|1766235|676.3 ns/op|272 B/op|2 allocs/op|
\ No newline at end of file
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..009e727
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,11 @@
+module github.com/emetriq/gourltokenizer
+
+go 1.17
+
+require github.com/stretchr/testify v1.7.0
+
+require (
+	github.com/davecgh/go-spew v1.1.0 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
+	gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..b380ae4
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,10 @@
+github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
+github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/tokenizer/tokenizer.go b/tokenizer/tokenizer.go
new file mode 100644
index 0000000..0a2fd47
--- /dev/null
+++ b/tokenizer/tokenizer.go
@@ -0,0 +1,932 @@
+package tokenizer
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+)
+
+var MinWordSize = 3
+
+var DefaultStopWordFunc = IsEnglishStopWord
+
+func isRuneAllowed(r rune, isDotCountMode bool) bool {
+	if r >= 'a' && r <= 'z' {
+		return true
+	}
+	return isDotCountMode && r != '.' && r != '/' // while scanning the host, any rune except the separators counts as a word character
+}
+
+//TokenizeV2 splits the URL into host and path parts and tokenizes both;
+//all terms are returned in lower case
+func TokenizeV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
+	encodedURLLower := strings.ToLower(encodedURL)
+	decodedURL, err := url.QueryUnescape(encodedURLLower)
+	if err != nil {
+		escapedEncodedURL := url.QueryEscape(encodedURLLower) // fall back to the raw string when unescaping fails
+		decodedURL, err = url.QueryUnescape(escapedEncodedURL)
+	}
+
+	if err != nil {
+		return []string{}
+	}
+
+	result := filterStopWords(tokenizeV2(decodedURL), stopwordfunc...)
+
+	return result
+}
+
+//TokenizeFastV2 splits the URL into host and path parts and tokenizes both without URL decoding;
+//all terms are returned in lower case
+func TokenizeFastV2(encodedURL string, stopwordfunc ...func(string) bool) []string {
+	urlLower := strings.ToLower(encodedURL)
+	result := tokenizeV2(urlLower)
+	if len(stopwordfunc) > 0 {
+		result = filterStopWords(result, stopwordfunc[0])
+	}
+	return result
+}
+
+//TokenizeV1 splits the URL into host and path parts and tokenizes the path part;
+//all terms are returned in lower case
+func TokenizeV1(rawURL string, stopwordfunc ...func(string) bool) []string {
+	urlToParse := rawURL
+	if !strings.HasPrefix(rawURL, "http") && !strings.HasPrefix(rawURL, "mailto") {
+		urlToParse = fmt.Sprintf("http://%s", urlToParse)
+	}
+	urlLower := strings.ToLower(urlToParse)
+	host, path, err := parseURL(urlLower)
+	if err != nil {
+		return []string{}
+	}
+
+	result := filterStopWords(tokenizeV1(path), stopwordfunc...)
+
+	return append(result, host)
+}
+
+func tokenizeV2(str string) []string {
+	// remove protocol
+	startIndex := strings.Index(str, "://")
+	if startIndex < 7 && startIndex > 0 && len(str) > startIndex+3 {
+		startIndex = startIndex + 3
+	} else {
+		startIndex = 0
+	}
+
+	strLen := len(str)
+	lastIndex := strLen - 1
+	result := make([]string, 0, strLen/MinWordSize)
+	start := -1
+	dotCounter := 0
+	isDotCountMode := true // true while scanning the host part
+	domainNameEndIndex := -1
+	domainNameStartIndex := startIndex
+	for idx, r := range str {
+		if idx < startIndex {
+			continue
+		}
+
+		if isRuneAllowed(r, isDotCountMode) {
+			if start == -1 {
+				start = idx
+			}
+			if idx == lastIndex && ((lastIndex-start+1) >= MinWordSize || isDotCountMode) {
+				result = append(result, str[start:strLen])
+			}
+		} else if ((idx-start) >= MinWordSize || isDotCountMode) && start > -1 {
+			result = append(result, str[start:idx])
+			start = -1
+		} else {
+			start = -1
+		}
+		if r == '/' && isDotCountMode { // the first slash ends the host part
+			isDotCountMode = false
+			domainNameEndIndex = idx
+			dotCounter = len(result) - 1
+		}
+
+		if r == '?' { // skip query params
+			break
+		}
+	}
+
+	if isDotCountMode { // no path: the whole string is the host
+		dotCounter = len(result) - 1
+		domainNameEndIndex = len(str)
+	}
+
+	if dotCounter > 0 && len(result) > 1 {
+		result = append(result[:(dotCounter-1)], result[dotCounter+1:]...) // drop the last two host tokens (typically domain and TLD)
+		if domainNameEndIndex-domainNameStartIndex > 3 { // if domain name is longer than 3 chars
+			for len(str) > domainNameStartIndex && str[domainNameStartIndex] == '.' { // skip leading dots in the host
+				domainNameStartIndex++
+			}
+			result = append(result, str[domainNameStartIndex:domainNameEndIndex]) // keep the full host as one token
+		}
+	}
+	return result
+}
+
+func tokenizeV1(str string) []string {
+	strLen := len(str)
+	lastIndex := strLen - 1
+	result := make([]string, 0, strLen/MinWordSize)
+	start := -1
+
+	for idx, r := range str {
+		if isRuneAllowed(r, false) {
+			if start == -1 {
+				start = idx
+			}
+			if idx == lastIndex && (lastIndex-start+1) >= MinWordSize {
+				result = append(result, str[start:strLen])
+			}
+		} else if (idx-start) >= MinWordSize && start > -1 {
+			result = append(result, str[start:idx])
+			start = -1
+		} else {
+			start = -1
+		}
+	}
+	return result
+}
+
+func parseURL(str string) (host string, path string, err error) {
+	u, err := url.Parse(str)
+	if err != nil {
+		return "", "", err
+	}
+	return u.Host, u.Path, nil
+}
+
+func filterStopWords(terms []string, stopwordfunc ...func(string) bool) []string {
+	filter := DefaultStopWordFunc
+	if len(stopwordfunc) > 0 {
+		filter = stopwordfunc[0]
+	} else if filter == nil {
+		return terms
+	}
+
+	for i := 0; i < len(terms); i++ {
+		if filter(terms[i]) || filter(terms[i][1:]) { // also check the term without its first character
+			terms = append(terms[:i], terms[i+1:]...)
+			i--
+		}
+	}
+	return terms
+}
+
+//IsGermanStopWord returns true if word is a German stop word
+func IsGermanStopWord(word string) bool {
+	switch word {
+	case "www":
+		return true
+	case "jenen":
+		return true
+	case "manchem":
+		return true
+	case "euren":
+		return true
+	case "ihres":
+		return true
+	case "war":
+		return true
+	case "meinem":
+		return true
+	case "jeden":
+		return true
+	case "thirdparty":
+		return true
+	case "bei":
+		return true
+	case "manchen":
+		return true
+	case "ander":
+		return true
+	case "solchem":
+		return true
+	case "habe":
+		return true
+	case "koennen":
+		return true
+	case "den":
+		return true
+	case "anders":
+		return true
+	case "muss":
+		return true
+	case "haben":
+		return true
+	case "demselben":
+		return true
+	case "aus":
+		return true
+	case "in":
+		return true
+	case "allen":
+		return true
+	case "keinem":
+		return true
+	case "während":
+		return true
+	case "eurer":
+		return true
+	case "derer":
+		return true
+	case "anderem":
+		return true
+	case "nichts":
+		return true
+	case "instantarticles":
+		return true
+	case "jeder":
+		return true
+	case "ein":
+		return true
+	case "eine":
+		return true
+	case "solches":
+		return true
+	case "von":
+		return true
+	case "denselben":
+		return true
+	case "andere":
+		return true
+	case "indem":
+		return true
+	case "eurem":
+		return true
+	case "selbst":
+		return true
+	case "zum":
+		return true
+	case "poid":
+		return true
+	case "getrenderedemetriqcontent":
+		return true
+	case "auch":
+		return true
+	case "keinen":
+		return true
+	case "alle":
+		return true
+	case "cms":
+		return true
+	case "htm":
+		return true
+	case "welchen":
+		return true
+	case "deines":
+		return true
+	case "anderr":
+		return true
+	case "derselben":
+		return true
+	case "sollte":
+		return true
+	case "könnte":
+		return true
+	case "wirst":
+		return true
+	case "eures":
+		return true
+	case "fuer":
+		return true
+	case "meinen":
+		return true
+	case "wo":
+		return true
+	case "ihrer":
+		return true
+	case "man":
+		return true
+	case "dazu":
+		return true
+	case "der":
+		return true
+	case "euer":
+		return true
+	case "will":
+		return true
+	case "sehr":
+		return true
+	case "ob":
+		return true
+	case "dem":
+		return true
+	case "ins":
+		return true
+	case "aber":
+		return true
+	case "einen":
+		return true
+	case "sonst":
+		return true
+	case "was":
+		return true
+	case "manche":
+		return true
+	case "static":
+		return true
+	case "im":
+		return true
+	case "weiter":
+		return true
+	case "eines":
+		return true
+	case "mancher":
+		return true
+	case "wir":
+		return true
+	case "würden":
+		return true
+	case "derselbe":
+		return true
+	case "deinem":
+		return true
+	case "wie":
+		return true
+	case "wieder":
+		return true
+	case "seine":
+		return true
+	case "mich":
+		return true
+	case "hatten":
+		return true
+	case "hatte":
+		return true
+	case "jener":
+		return true
+	case "daß":
+		return true
+	case "embedded":
+		return true
+	case "und":
+		return true
+	case "seinen":
+		return true
+	case "uns":
+		return true
+	case "forum":
+		return true
+	case "thread":
+		return true
+	case "jedem":
+		return true
+	case "meiner":
+		return true
+	case "über":
+		return true
+	case "jetzt":
+		return true
+	case "diese":
+		return true
+	case "ich":
+		return true
+	case "keines":
+		return true
+	case "aller":
+		return true
+	case "durch":
+		return true
+	case "meine":
+		return true
+	case "damit":
+		return true
+	case "weg":
+		return true
+	case "sondern":
+		return true
+	case "unseren":
+		return true
+	case "wollte":
+		return true
+	case "widget":
+		return true
+	case "anderm":
+		return true
+	case "wuerden":
+		return true
+	case "als":
+		return true
+	case "unserem":
+		return true
+	case "da":
+		return true
+	case "ueber":
+		return true
+	case "waehrend":
+		return true
+	case "koennte":
+		return true
+	case "zwar":
+		return true
+	case "hab":
+		return true
+	case "wuerde":
+		return true
+	case "anderes":
+		return true
+	case "so":
+		return true
+	case "lightbox":
+		return true
+	case "dieselben":
+		return true
+	case "dein":
+		return true
+	case "diesen":
+		return true
+	case "alles":
+		return true
+	case "wollen":
+		return true
+	case "zur":
+		return true
+	case "kein":
+		return true
+	case "etwas":
+		return true
+	case "mit":
+		return true
+	case "an":
+		return true
+	case "jedes":
+		return true
+	case "deine":
+		return true
+	case "oder":
+		return true
+	case "dort":
+		return true
+	case "bis":
+		return true
+	case "einiges":
+		return true
+	case "kann":
+		return true
+	case "waren":
+		return true
+	case "hin":
+		return true
+	case "das":
+		return true
+	case "wenn":
+		return true
+	case "php":
+		return true
+	case "dies":
+		return true
+	case "ihre":
+		return true
+	case "euch":
+		return true
+	case "unter":
+		return true
+	case "anderer":
+		return true
+	case "solchen":
+		return true
+	case "für":
+		return true
+	case "jenem":
+		return true
+	case "hinter":
+		return true
+	case "welcher":
+		return true
+	case "dieses":
+		return true
+	case "wird":
+		return true
+	case "pid":
+		return true
+	case "doch":
+		return true
+	case "dieselbe":
+		return true
+	case "werde":
+		return true
+	case "noch":
+		return true
+	case "ihren":
+		return true
+	case "machen":
+		return true
+	case "jenes":
+		return true
+	case "einige":
+		return true
+	case "einigen":
+		return true
+	case "welchem":
+		return true
+	case "ist":
+		return true
+	case "jene":
+		return true
+	case "um":
+		return true
+	case "ihnen":
+		return true
+	case "html":
+		return true
+	case "jede":
+		return true
+	case "du":
+		return true
+	case "es":
+		return true
+	case "zwischen":
+		return true
+	case "einer":
+		return true
+	case "nach":
+		return true
+	case "anderen":
+		return true
+	case "dass":
+		return true
+	case "jsp":
+		return true
+	case "seinem":
+		return true
+	case "manches":
+		return true
+	case "unsere":
+		return true
+	case "gegen":
+		return true
+	case "iframe":
+		return true
+	case "https":
+		return true
+	case "ihrem":
+		return true
+	case "weil":
+		return true
+	case "ihn":
+		return true
+	case "werden":
+		return true
+	case "andern":
+		return true
+	case "keine":
+		return true
+	case "desselben":
+		return true
+	case "viel":
+		return true
+	case "downloads":
+		return true
+	case "bin":
+		return true
+	case "deinen":
+		return true
+	case "hat":
+		return true
+	case "gewesen":
+		return true
+	case "nicht":
+		return true
+	case "diesem":
+		return true
+	case "ohne":
+		return true
+	case "welches":
+		return true
+	case "einigem":
+		return true
+	case "dann":
+		return true
+	case "einig":
+		return true
+	case "tid":
+		return true
+	case "zu":
+		return true
+	case "einmal":
+		return true
+	case "seines":
+		return true
+	case "er":
+		return true
+	case "mir":
+		return true
+	case "auf":
+		return true
+	case "dessen":
+		return true
+	case "sid":
+		return true
+	case "mein":
+		return true
+	case "seiner":
+		return true
+	case "musste":
+		return true
+	case "nur":
+		return true
+	case "einiger":
+		return true
+	case "nun":
+		return true
+	case "dich":
+		return true
+	case "stats":
+		return true
+	case "deiner":
+		return true
+	case "welche":
+		return true
+	case "unseres":
+		return true
+	case "am":
+		return true
+	case "warst":
+		return true
+	case "bist":
+		return true
+	case "würde":
+		return true
+	case "solche":
+		return true
+	case "einem":
+		return true
+	case "denn":
+		return true
+	case "diff":
+		return true
+	case "also":
+		return true
+	case "sie":
+		return true
+	case "hier":
+		return true
+	case "ihr":
+		return true
+	case "vor":
+		return true
+	case "des":
+		return true
+	case "allem":
+		return true
+	case "keiner":
+		return true
+	case "unser":
+		return true
+	case "titel":
+		return true
+	case "sein":
+		return true
+	case "vom":
+		return true
+	case "widgets":
+		return true
+	case "dieser":
+		return true
+	case "sind":
+		return true
+	case "meines":
+		return true
+	case "dir":
+		return true
+	case "eure":
+		return true
+	case "archiv":
+		return true
+	case "ihm":
+		return true
+	case "solcher":
+		return true
+	case "die":
+		return true
+	case "dasselbe":
+		return true
+	case "können":
+		return true
+	case "sich":
+		return true
+	case "http":
+		return true
+	case "soll":
+		return true
+	default:
+		return false
+	}
+}
"ihrem": + return true + case "weil": + return true + case "ihn": + return true + case "werden": + return true + case "andern": + return true + case "keine": + return true + case "desselben": + return true + case "viel": + return true + case "downloads": + return true + case "bin": + return true + case "deinen": + return true + case "hat": + return true + case "gewesen": + return true + case "nicht": + return true + case "diesem": + return true + case "ohne": + return true + case "welches": + return true + case "einigem": + return true + case "dann": + return true + case "einig": + return true + case "tid": + return true + case "zu": + return true + case "einmal": + return true + case "seines": + return true + case "er": + return true + case "mir": + return true + case "auf": + return true + case "dessen": + return true + case "sid": + return true + case "mein": + return true + case "seiner": + return true + case "musste": + return true + case "nur": + return true + case "einiger": + return true + case "nun": + return true + case "dich": + return true + case "stats": + return true + case "deiner": + return true + case "welche": + return true + case "unseres": + return true + case "am": + return true + case "warst": + return true + case "bist": + return true + case "würde": + return true + case "solche": + return true + case "einem": + return true + case "denn": + return true + case "diff": + return true + case "also": + return true + case "sie": + return true + case "hier": + return true + case "ihr": + return true + case "vor": + return true + case "des": + return true + case "allem": + return true + case "keiner": + return true + case "unser": + return true + case "titel": + return true + case "sein": + return true + case "vom": + return true + case "widgets": + return true + case "dieser": + return true + case "sind": + return true + case "meines": + return true + case "dir": + return true + case "eure": + return true + case "archiv": + return true + case "ihm": + return true + case "solcher": + return true + case "die": + return true + case "dasselbe": + return true + case "können": + return true + case "sich": + return true + case "http": + return true + case "soll": + return true + default: + return false + } +} + +//IsEnglishStopWord returns true if word is stop word +func IsEnglishStopWord(word string) bool { + switch word { + case "www": + return true + case "myself": + return true + case "our": + return true + case "ours": + return true + case "ourselves": + return true + case "you": + return true + case "your": + return true + case "yours": + return true + case "yourself": + return true + case "yourselves": + return true + case "him": + return true + case "his": + return true + case "himself": + return true + case "she": + return true + case "her": + return true + case "hers": + return true + case "herself": + return true + case "its": + return true + case "itself": + return true + case "they": + return true + case "them": + return true + case "their": + return true + case "theirs": + return true + case "themselves": + return true + case "what": + return true + case "which": + return true + case "who": + return true + case "whom": + return true + case "this": + return true + case "that": + return true + case "these": + return true + case "those": + return true + case "are": + return true + case "was": + return true + case "were": + return true + case "been": + return true + case "being": + return true + case "have": + return true + case "has": + return true + case "had": + 
diff --git a/tokenizer/tokenizer_test.go b/tokenizer/tokenizer_test.go
new file mode 100644
index 0000000..cab2653
--- /dev/null
+++ b/tokenizer/tokenizer_test.go
@@ -0,0 +1,163 @@
+package tokenizer
+
+import (
+	"strconv"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func Test_tokenizeCorrectPath(t *testing.T) {
+	path := "/some-thing/very/interesting?queryparam2=1&queryparam2=3"
+	result := tokenizeV2(path)
+	assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
+}
+
+func Test_tokenizePathWithDashes(t *testing.T) {
+	path := "/some-thing/very/interesting"
+	result := tokenizeV2(path)
+	assert.ElementsMatch(t, []string{"some", "thing", "very", "interesting"}, result)
+}
+
+func Test_tokenizePathWithDashes2(t *testing.T) {
+	path := "/hsv-fussball"
+	result := tokenizeV2(path)
+	assert.ElementsMatch(t, []string{"hsv", "fussball"}, result)
+}
+
+func Test_tokenizeEmptyString(t *testing.T) {
+	path := ""
+	result := tokenizeV2(path)
+	assert.ElementsMatch(t, []string{}, result)
+}
+
+func Test_filterStopWords(t *testing.T) {
+	result := filterStopWords([]string{"hallo", "cms", "titel", "welt"}, func(val string) bool {
+		if val == "cms" {
+			return true
+		}
+		if val == "titel" {
+			return true
+		}
+		return false
+	})
+	assert.ElementsMatch(t, []string{"hallo", "welt"}, result)
+}
+
+func Test_URLTokenizer(t *testing.T) {
+	result := TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1")
+	assert.ElementsMatch(t, []string{"path", "sport", "hsv", "fussball", "example.com"}, result)
+}
+
+func Test_URLTokenizerOneWord(t *testing.T) {
+	result := TokenizeV2("http://example.com/sport")
+	assert.ElementsMatch(t, []string{"example.com", "sport"}, result)
+}
[]string{"example.com", "sport"}, result) +} + +func Test_URLTokenizerOneWordMinSize(t *testing.T) { + result := TokenizeV2("http://www.test-page.de/aaa/bbb/bc/ccc") + assert.ElementsMatch(t, []string{"www.test-page.de", "aaa", "bbb", "ccc"}, result) +} + +func Test_URLTokenizerWithScapedChars(t *testing.T) { + result := TokenizeV2("http://example.com/%3ahttps%3A%2F%2Fwww.emetriq.com%2F", IsGermanStopWord) + assert.ElementsMatch(t, []string{"emetriq", "com", "example.com"}, result) +} + +func Test_URLTokenizerWithWrongEscapedChars(t *testing.T) { + result := TokenizeV2("http://example.com/%%ssomething/usefull") + assert.Equal(t, []string{"ssomething", "usefull", "example.com"}, result) +} +func Test_URLTokenizerWithWrongEscapedChars2(t *testing.T) { + DefaultStopWordFunc = IsGermanStopWord + result := TokenizeV2("https://www.morgenpost.de/vermischtes/article233484549/marisa-burger-rosenheim-cops-schaupielerin.html?service=amp#aoh=16333619698076&csi=0&referrer=https://www.google.com&_tf=Von %1$s") + assert.Equal(t, []string{ + "vermischtes", + "article", + "marisa", + "burger", + "rosenheim", + "cops", + "schaupielerin", + "www.morgenpost.de", + }, result) +} +func Test_URLTokenizerWithWrongHostEscapedChars(t *testing.T) { + result := TokenizeV2("http://..example.com/something") + assert.Equal(t, []string{"something", "example.com"}, result) +} + +func Test_URLTokenizerWithCapitalChars(t *testing.T) { + DefaultStopWordFunc = IsGermanStopWord + result := TokenizeV2("mailto://www.Subdomain.example.com/HSV-fussbal%3asome/a") + assert.ElementsMatch(t, []string{"subdomain", "hsv", "fussbal", "some", "www.subdomain.example.com"}, result) +} + +func Test_URLWithoutHTTP(t *testing.T) { + result := TokenizeV2("www.Subdomain.example.com") + assert.ElementsMatch(t, []string{"subdomain", "www.subdomain.example.com"}, result) +} + +func Test_URLWithoutHTTPAndWithoutSubdomain(t *testing.T) { + result := TokenizeV2("www.example.com") + assert.ElementsMatch(t, []string{"www.example.com"}, result) +} + +func Test_URLWithoutHTTPAndSubdomain(t *testing.T) { + result := TokenizeV2("sport.fussball.example.com") + assert.ElementsMatch(t, []string{"sport", "fussball", "sport.fussball.example.com"}, result) +} + +func Test_URLWithoutHTTPButWithPath(t *testing.T) { + result := TokenizeV2("www.ironsrc.com/sports") + assert.ElementsMatch(t, []string{"sports", "www.ironsrc.com"}, result) +} + +func BenchmarkURLTokenizerV2(b *testing.B) { + for n := 0; n < b.N; n++ { + TokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1") + } +} + +func BenchmarkURLTokenizerV2Fast(b *testing.B) { + for n := 0; n < b.N; n++ { + TokenizeFastV2("http://example.com/path/sport/hsv-fussball?bla=1") + } +} + +func BenchmarkURLTokenizerV1(b *testing.B) { + for n := 0; n < b.N; n++ { + TokenizeV1("http://example.com/path/sport/hsv-fussball?bla=1") + } +} + +func createSlices(size int) []string { + wordCombo := make([]string, size) + for i := 0; i < size; i++ { + wordCombo[i] = "word" + strconv.Itoa(i) + } + return wordCombo +} + +func BenchmarkTokenizerV1(b *testing.B) { + for n := 0; n < b.N; n++ { + tokenizeV1("http://example.com/path/sport/hsv-fussball?bla=1") + } +} +func BenchmarkTokenizerV2(b *testing.B) { + for n := 0; n < b.N; n++ { + tokenizeV2("http://example.com/path/sport/hsv-fussball?bla=1") + } +}