feat: added regex tokenizer

SlashGordon committed Jan 31, 2022
1 parent 2be1005 commit 182a24f

Showing 4 changed files with 131 additions and 2 deletions.
Empty file.
70 changes: 70 additions & 0 deletions src/pytargetingutilities/urltokenizer/regex_url_tokenizer.py
@@ -0,0 +1,70 @@

import re
from typing import List
from urllib.parse import urlparse


def tokens(url: str, min_length=3) -> List[str]:
    """Tokenize a URL using regex.
    :param url: URL to tokenize
    :param min_length: minimum length of tokens
    :return: list of tokens
    """
    url = url.lower()
    parsed = urlparse(url)
    token_list = [parsed.netloc] + [
        token
        for token in re.split(
            r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\'\d]', parsed.path
        )
        if token != ""
    ]
    return list(filter(lambda x: len(x) >= min_length, token_list))


def relevant_tokens(
    urls: List[str],
    compare_func=lambda x, y: x == y,
    min_length=3,
    not_allowed=[],
    sort_key=lambda x: (x[1], len(x[0]), x[0]),
) -> List[str]:
    """
    Finds relevant tokens in a list of urls for matching purposes.
    :param urls: List of urls
    :param compare_func: Function to compare two tokens (default: ==)
    :param min_length: Minimum length of a token (default: 3)
    :param not_allowed: List of tokens that should not be considered
        (default: [])
    :param sort_key: Function to sort the tokens (default: sort by occurrence,
        then by token length, then by token)
    :return: List of relevant tokens
    """
    urls_tokens = list(map(lambda x: tokens(x, min_length), urls))
    all_tokens = [token for url_tokens in urls_tokens for token in url_tokens]
    sorted_tokens = sorted(
        [(x, all_tokens.count(x)) for x in set(all_tokens)],
        key=sort_key,
        reverse=True,
    )
    relevant_tokens = list(map(lambda x: x[0], sorted_tokens))
    result = set()
    for url_tokens in urls_tokens:
        for token in relevant_tokens:
            for url_token in url_tokens:
                if (
                    compare_func(
                        token, url_token
                    )  # check token against url_token
                    and token not in not_allowed
                    # check token against not_allowed tokens
                    and not any(
                        [val in url_tokens for val in result]
                    )  # check token against already found tokens
                ):
                    result.add(token)
                    break
            else:
                continue  # Continue if the inner loop wasn't broken.
            break  # Inner loop was broken, break the outer loop.
    return list(result)
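
A minimal usage sketch (not part of the commit), assuming the package is importable under pytargetingutilities; the expected outputs follow from the function logic above and mirror the new tests below.

from pytargetingutilities.urltokenizer.regex_url_tokenizer import (
    relevant_tokens,
    tokens,
)

# tokens() lowercases the URL, keeps the host whole, splits the path on
# punctuation and digits, and drops pieces shorter than min_length.
print(tokens('http://ebay.de/autos/luxus/de'))
# -> ['ebay.de', 'autos', 'luxus']  ('de' is shorter than the default 3)

# relevant_tokens() ranks tokens by occurrence across all URLs and picks,
# per URL, the highest-ranked matching token that is not excluded and not
# already covered by a previously found token.
urls = [
    'http://test.com/autos',
    'http://test1.com/autos',
    'http://test2.com/autos',
]
print(relevant_tokens(urls))
# -> ['autos']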
4 changes: 2 additions & 2 deletions test/test_log.py
@@ -10,12 +10,12 @@ def test_md5(self):
        self.assertIsNotNone(logger)
        try:
            logger.info('test')
-        except:
+        except Exception:
            self.fail("logger.info raised Exception unexpectedly!")

        logger = setup_logging_graylog("test", logging.INFO, True, "", 0)
        self.assertIsNotNone(logger)
        try:
            logger.info('test')
-        except:
+        except Exception:
            self.fail("logger.info raised Exception unexpectedly!")
59 changes: 59 additions & 0 deletions test/test_regex_url_tokens.py
@@ -0,0 +1,59 @@
import unittest
from pytargetingutilities.urltokenizer.regex_url_tokenizer import (
relevant_tokens,
tokens,
)


class TestRegexURLTokens(unittest.TestCase):
    def test_local_tokenizer(self):
        url_tokens = tokens('http://ebay.de/autos/luxus/de')
        assert sorted(url_tokens) == sorted(['ebay.de', 'autos', 'luxus'])

    def test_local_tokenizer_with_min_length(self):
        url_tokens = tokens(
            'http://google.com/autos/luxus/deutschland', min_length=6
        )
        assert sorted(url_tokens) == sorted(['google.com', 'deutschland'])

    def test_relevant_token_eq(self):
        urls = [
            'http://test.com/autos',
            'http://test1.com/autos',
            'http://test2.com/autos',
        ]
        assert relevant_tokens(urls) == ['autos']

        urls = [
            'http://test.com/autos',
            'http://test1.com/autos',
            'http://test2.com/autos',
            'http://test.com/games/spiele/starcraft',
            'http://test1.com/games/spiele',
            'http://test2.com/autos/spiele',
        ]
        assert relevant_tokens(urls) == ['autos', 'spiele']

    def test_relevant_tokens_in(self):
        urls = [
            'http://test.com/auto',
            'http://test1.com/auto',
            'http://test2.com/auto',
            'http://test3.com/autos',
        ]
        assert relevant_tokens(urls, lambda x, y: x in y) == ['auto']

    def test_relevant_tokens_exclude(self):
        urls1 = [
            'https://www.autobild.de/artikel/neue-zoe-elektroautos-bis-2024--5777435',
            'https://www.e-autos.de/reanault-zoe-elektroautos',
        ]
        urls2 = [
            'https://teslamag.de/news/luxus-haeuser-tesla-technik-siedlung-florida',
            'https://teslamag.de/news/luxus-eautos-teuer',
        ]

        tokens = relevant_tokens(urls1)
        assert tokens == ['elektroautos']
        tokens = relevant_tokens(urls2, not_allowed=['luxus', 'teslamag.de'])
        assert tokens == ['news']
