Commit 182a24f (1 parent: 2be1005)
Showing 4 changed files with 131 additions and 2 deletions.
Empty file.
70 changes: 70 additions & 0 deletions
src/pytargetingutilities/urltokenizer/regex_url_tokenizer.py
@@ -0,0 +1,70 @@
import re
from typing import List
from urllib.parse import urlparse


def tokens(url: str, min_length=3) -> List[str]:
    """Tokenize a URL using a regex.

    :param url: URL to tokenize
    :param min_length: minimum length of tokens
    :return: list of tokens
    """
    url = url.lower()
    parsed = urlparse(url)
    # Keep the host as a single token; split the path on punctuation and digits.
    token_list = [parsed.netloc] + [
        token
        for token in re.split(
            r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\'\d]', parsed.path
        )
        if token != ""
    ]
    return list(filter(lambda x: len(x) >= min_length, token_list))


def relevant_tokens(
    urls: List[str],
    compare_func=lambda x, y: x == y,
    min_length=3,
    not_allowed=(),
    sort_key=lambda x: (x[1], len(x[0]), x[0]),
) -> List[str]:
    """
    Find relevant tokens in a list of URLs for matching purposes.

    :param urls: list of URLs
    :param compare_func: function to compare two tokens (default: ==)
    :param min_length: minimum length of a token (default: 3)
    :param not_allowed: tokens that should not be considered (default: empty)
    :param sort_key: function to sort the tokens (default: sort by occurrence,
        then by token length, then by token)
    :return: list of relevant tokens
    """
    urls_tokens = list(map(lambda x: tokens(x, min_length), urls))
    all_tokens = [token for url_tokens in urls_tokens for token in url_tokens]
    # Rank tokens by occurrence count across all URLs (ties broken by sort_key).
    sorted_tokens = sorted(
        [(x, all_tokens.count(x)) for x in set(all_tokens)],
        key=sort_key,
        reverse=True,
    )
    candidates = list(map(lambda x: x[0], sorted_tokens))
    result = set()
    for url_tokens in urls_tokens:
        for token in candidates:
            for url_token in url_tokens:
                if (
                    compare_func(token, url_token)  # token matches url_token
                    and token not in not_allowed  # token is not excluded
                    and not any(
                        val in url_tokens for val in result
                    )  # URL is not already covered by a found token
                ):
                    result.add(token)
                    break
            else:
                continue  # Continue if the inner loop wasn't broken.
            break  # Inner loop was broken; break the outer loop.
    return list(result)
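As a quick sanity check of the two functions above, here is a small usage sketch (not part of the commit); the expected outputs mirror the unit tests further down:

# Usage sketch; outputs mirror the unit tests below.
from pytargetingutilities.urltokenizer.regex_url_tokenizer import (
    relevant_tokens,
    tokens,
)

print(tokens('http://ebay.de/autos/luxus/de'))
# ['ebay.de', 'autos', 'luxus'] -- 'de' is dropped (shorter than min_length=3)

print(relevant_tokens([
    'http://test.com/autos',
    'http://test1.com/autos',
    'http://test2.com/autos',
]))
# ['autos'] -- the most frequent token that covers every URL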
@@ -0,0 +1,59 @@
import unittest
from pytargetingutilities.urltokenizer.regex_url_tokenizer import (
    relevant_tokens,
    tokens,
)


class TestRegexURLTokens(unittest.TestCase):
    def test_local_tokenizer(self):
        url_tokens = tokens('http://ebay.de/autos/luxus/de')
        assert sorted(url_tokens) == sorted(['ebay.de', 'autos', 'luxus'])

    def test_local_tokenizer_with_min_length(self):
        url_tokens = tokens(
            'http://google.com/autos/luxus/deutschland', min_length=6
        )
        assert sorted(url_tokens) == sorted(['google.com', 'deutschland'])

    def test_relevant_token_eq(self):
        urls = [
            'http://test.com/autos',
            'http://test1.com/autos',
            'http://test2.com/autos',
        ]
        assert relevant_tokens(urls) == ['autos']

        urls = [
            'http://test.com/autos',
            'http://test1.com/autos',
            'http://test2.com/autos',
            'http://test.com/games/spiele/starcraft',
            'http://test1.com/games/spiele',
            'http://test2.com/autos/spiele',
        ]
        # Sort both sides: relevant_tokens builds its list from a set,
        # so the result order is not guaranteed.
        assert sorted(relevant_tokens(urls)) == sorted(['autos', 'spiele'])

    def test_relevant_tokens_in(self):
        urls = [
            'http://test.com/auto',
            'http://test1.com/auto',
            'http://test2.com/auto',
            'http://test3.com/autos',
        ]
        assert relevant_tokens(urls, lambda x, y: x in y) == ['auto']

    def test_relevant_tokens_exclude(self):
        urls1 = [
            'https://www.autobild.de/artikel/neue-zoe-elektroautos-bis-2024--5777435',
            'https://www.e-autos.de/reanault-zoe-elektroautos',
        ]
        urls2 = [
            'https://teslamag.de/news/luxus-haeuser-tesla-technik-siedlung-florida',
            'https://teslamag.de/news/luxus-eautos-teuer',
        ]

        result = relevant_tokens(urls1)
        assert result == ['elektroautos']
        result = relevant_tokens(urls2, not_allowed=['luxus', 'teslamag.de'])
        assert result == ['news']
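To run the suite locally, something like the following should work from the repository root; this is a sketch that assumes the tests sit in a discoverable layout (the test module's path is not shown in this diff):

# Hypothetical runner sketch; assumes the tests live under a 'tests/' directory.
import unittest

suite = unittest.defaultTestLoader.discover('tests')
unittest.TextTestRunner(verbosity=2).run(suite)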