diff --git a/.gitignore b/.gitignore index b60dec8..f457410 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,4 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +.idea/ diff --git a/checks.sh b/checks.sh old mode 100755 new mode 100644 diff --git a/fastcrawler/exceptions.py b/fastcrawler/exceptions.py index 9284a8f..09a6a8d 100644 --- a/fastcrawler/exceptions.py +++ b/fastcrawler/exceptions.py @@ -26,6 +26,24 @@ def __init__(self, model): super().__init__(self.message) +class TaskNotFound(BaseModelError): + def __init__(self, task_name): + super().__init__( + f"The Task with name={task_name} has not been found", + "\nPlease check your input and be sure the task name is correct", + ) + + +class NoCrawlerFound(BaseModelError): + def __init__(self): + super().__init__( + "No task has been registered in the application." + "\nPlease make sure that you've assigned the crawlers to the application" + "so the application is aware of the crawlers." + "\nThis may also raise if you have overridden the library's startup built in method" + ) + + class ProcessorNotSupported(BaseModelError): def __init__(self, model): self.model = model diff --git a/fastcrawler/parsers/base.py b/fastcrawler/parsers/base.py index d0e0be0..6bf3b85 100644 --- a/fastcrawler/parsers/base.py +++ b/fastcrawler/parsers/base.py @@ -6,11 +6,9 @@ class ParserProtocol(Protocol): def __init__(self, scraped_data: Any): """Initilize the parser with the given data (html/json/etc)""" - ... def parse(self, model: Any) -> Any: """ Parse the saved data, with given model, which should be a pydantic model imported from fastcrawler library """ - ... diff --git a/fastcrawler/parsers/selectors/base.py b/fastcrawler/parsers/selectors/base.py index 50318e9..29e478c 100644 --- a/fastcrawler/parsers/selectors/base.py +++ b/fastcrawler/parsers/selectors/base.py @@ -36,7 +36,7 @@ def __init__( def __repr__(self): """Represents a selector for debugging purposes""" - return f"Field(type={self.__class__.__name__} extract={self.extract}, " f"many={self.many}, query={self.query})" + return f"Field(type={self.__class__.__name__} extract={self.extract}, many={self.many}, query={self.query})" def resolve(self, scraped_data, model): """Must be implemented by outer classes. diff --git a/fastcrawler/parsers/selectors/css.py b/fastcrawler/parsers/selectors/css.py index 7243800..ac6fba1 100644 --- a/fastcrawler/parsers/selectors/css.py +++ b/fastcrawler/parsers/selectors/css.py @@ -7,9 +7,10 @@ from fastcrawler.parsers.utils import _UNSET from ..processors.base import ProcessorInterface +from .base import BaseSelector -class _CSSField: +class _CSSField(BaseSelector): """ CSSSelectorField represents a field that can be retrieved from a given HTML document using CSS selectors. diff --git a/fastcrawler/parsers/selectors/regex.py b/fastcrawler/parsers/selectors/regex.py index 33d9ae6..1994f91 100644 --- a/fastcrawler/parsers/selectors/regex.py +++ b/fastcrawler/parsers/selectors/regex.py @@ -6,8 +6,10 @@ from fastcrawler.parsers.pydantic import BaseModelType from fastcrawler.parsers.utils import _UNSET +from .base import BaseSelector -class _RegexField: + +class _RegexField(BaseSelector): """ RegexField represents a field that can be retrieved from a given HTML document using Regex. diff --git a/fastcrawler/parsers/selectors/xpath.py b/fastcrawler/parsers/selectors/xpath.py index 0f5244a..e40c491 100644 --- a/fastcrawler/parsers/selectors/xpath.py +++ b/fastcrawler/parsers/selectors/xpath.py @@ -7,9 +7,10 @@ from fastcrawler.parsers.utils import _UNSET from ..processors.base import ProcessorInterface +from .base import BaseSelector -class _XPATHField: +class _XPATHField(BaseSelector): """ XPATHField represents a field that can be retrieved from a given HTML document using XPath. diff --git a/fastcrawler/schedule/adopter.py b/fastcrawler/schedule/adopter.py new file mode 100644 index 0000000..1aa3852 --- /dev/null +++ b/fastcrawler/schedule/adopter.py @@ -0,0 +1,90 @@ +from typing import Callable + +from rocketry import Rocketry +from rocketry.conditions.api import cron + +from fastcrawler.exceptions import TaskNotFound + +from .schema import Task + + +class RocketryApplication: + def __init__(self, *args, **kwargs): + self.task_lib: Rocketry = Rocketry(*args, **kwargs) + + async def serve(self, *args, **kwargs): # pragma: no cover + """Proto to serve with uvicorn""" + await self.start_up() + return await self.task_lib.serve(*args, **kwargs) + + async def get_all_tasks(self) -> set[Task]: + return self.task_lib.session.tasks + + async def add_task(self, task_func: Callable, settings: Task) -> None: + """ + ... + """ + self.task_lib.task(**dict(settings))(task_func) + return None + + async def start_up(self) -> None: + """ + Run Startup Event + """ + + async def shut_down(self) -> None: + self.task_lib.session.shut_down() + return None + + +class RocketryManager: + def __init__(self, app: RocketryApplication): + self.app = app + + async def all(self) -> set[Task]: + """ + Return all tasks from internal + """ + return await self.app.get_all_tasks() + + async def add_task(self, task_func: Callable, settings: Task) -> None: + """ + Add tasks within internal python API + """ + await self.app.add_task(task_func, settings) + return None + + async def change_task_schedule( + self, + task_name: str, + schedule: str, + ) -> None: + """ + Reschedule a task + schedule: + - can be string + `every 2 seconds` + - can be cron + `*/2 * * * *` + """ + for task in await self.app.get_all_tasks(): + if task.name == task_name: + if schedule.count(" ") == 4: + task.start_cond = cron(schedule) + else: + task.start_cond = schedule + return None + raise TaskNotFound(task_name) + + async def toggle_task(self, task_name: str) -> None: + """ + Disables or enable one task + """ + for task in await self.app.get_all_tasks(): + if task.name == task_name: + if task.disabled: + task.disabled = False + else: + task.disabled = True + return None + raise TaskNotFound(task_name) diff --git a/fastcrawler/schedule/proto.py b/fastcrawler/schedule/proto.py new file mode 100644 index 0000000..227dec5 --- /dev/null +++ b/fastcrawler/schedule/proto.py @@ -0,0 +1,61 @@ +from typing import Callable, Protocol + +from .schema import Task + + +class TaskApplicationProto(Protocol): # pragma: no cover + task_lib: Callable + + def __init__(self, *args, **kwargs): + """Initialize task application""" + + async def serve(self, *args, **kwargs): + """Proto to serve with Uvicorn""" + + async def get_all_tasks(self) -> set[Task]: + """Returns all tasks that exists in Fast Crawler""" + + async def add_task(self, *args, **kwargs) -> None: + """Dynamically add a task to fast crawler""" + + async def start_up(self) -> None: + """Manage start up actvity""" + + async def shut_down(self) -> None: + """Manage shut down activity""" + + +class TaskControllerProto(Protocol): # pragma: no cover + app: TaskApplicationProto + + def __init__(self, app: TaskApplicationProto): + """Initialize task application + + Args: + app (TaskProcessor): _description_ + """ + + async def all(self) -> list[Task]: + """ + Return all tasks from internal + """ + + async def add_task(self, task_func: Callable, settings: Task) -> None: + """ + Add tasks within internal python API + """ + + async def change_task_schedule(self, task_name: str, schedule: str) -> None: + """ + Reschedule a task + schedule: + - can be string + `every 2 seconds` + - can be cron + `*/2 * * * *` + """ + + async def toggle_task(self, task_name: str) -> None: + """ + Disables or enable one task + """ diff --git a/fastcrawler/schedule/schema.py b/fastcrawler/schedule/schema.py new file mode 100644 index 0000000..cd25853 --- /dev/null +++ b/fastcrawler/schedule/schema.py @@ -0,0 +1,101 @@ +import datetime +from typing import Literal + +from pydantic import BaseModel, Field # pylint: disable=no-name-in-module + +from .utilties import BaseCondition + + +class Task(BaseModel): + """Base class for Tasks. + + A task can be a function, command or other procedure that + does a specific thing. A task can be parametrized by supplying + + + Parameters + ---------- + name : str, optional + Name of the task. Ideally, all tasks + should have unique name. If None, the + return value of Task.get_default_name() + is used instead. + description : str, optional + Description of the task. This is purely + for task documentation purpose. + start_cond : BaseCondition, optional + Condition that when True the task + is to be started, by default AlwaysFalse() + end_cond : BaseCondition, optional + Condition that when True the task + will be terminated. Only works for for + tasks with execution='process' or 'thread' + if thread termination is implemented in + the task, by default AlwaysFalse() + execution : str, {'main', 'thread', 'process'}, default='process' + How the task is executed. Allowed values + 'main' (run on main thread & process), + 'thread' (run on another thread) and + 'process' (run on another process). + disabled : bool + If True, the task is not allowed to be run + regardless of the start_cond, + by default False + force_run : bool + If True, the task will be run once + regardless of the start_cond, + by default True + priority : int, optional + Priority of the task. Higher priority + tasks are first inspected whether they + can be executed. Can be any numeric value. + Setup tasks are recommended to have priority + >= 40 if they require loaded tasks, + >= 50 if they require loaded extensions. + By default 0 + timeout : str, int, timedelta, optional + If the task has not run in given timeout + the task will be terminated. Only applicable + for tasks with execution='process' or + with execution='thread'. + + Examples + -------- + Minimum example: + + >>> from fastcrawler.schedule.schema import Task + >>> class MyTask(Task): + ... def execute(self): + ... ... # What the task does. + ... return ... + + """ + + name: str | None = Field(description="Name of the task. Must be unique") + description: str | None = Field(description="Description of the task for documentation") + logger_name: str | None = Field( + description="Logger name to be used in logging the task record" + ) + execution: Literal["main", "async", "thread", "process"] | None = None + priority: int = 0 + disabled: bool = False + force_run: bool = False + status: Literal["run", "fail", "success", "terminate", "inaction", "crash"] | None = Field( + description="Latest status of the task", default=None + ) + timeout: datetime.timedelta | None = None + start_cond: BaseCondition | None = None + end_cond: BaseCondition | None = None + + _last_run: float | None = None + _last_success: float | None = None + _last_fail: float | None = None + _last_terminate: float | None = None + _last_inaction: float | None = None + _last_crash: float | None = None + + class Config: + arbitrary_types_allowed = True + protected_namespaces = () + underscore_attrs_are_private = True + validate_assignment = True diff --git a/fastcrawler/schedule/utilties.py b/fastcrawler/schedule/utilties.py new file mode 100644 index 0000000..00a7bee --- /dev/null +++ b/fastcrawler/schedule/utilties.py @@ -0,0 +1,53 @@ +from rocketry.core import BaseCondition as _BaseCondition + + +class BaseCondition(_BaseCondition): # pylint: disable=abstract-method + """A condition is a thing/occurence that should happen in + order to something happen. + + Conditions are used to determine whether a task can be started, + a task should be terminated or the scheduler should shut + down. Conditions are either true or false. + + A condition could answer for any of the following questions: + - Current time is as specified (ie. Monday afternoon). + - A given task has already run. + - The machine has at least a given amount of RAM. + - A specific file exists. + + Each condition should have the method ``__bool__`` specified + as minimum. This method should return ``True`` or ``False`` + depending on whether the condition holds or does not hold. + + Examples + -------- + + Minimum example: + + >>> from rocketry.core import BaseCondition + >>> class MyCondition(BaseCondition): + ... def __bool__(self): + ... ... # Code that defines state either + ... return True + + Complicated example with parser: + + >>> import os, re + >>> class IsFooBar(BaseCondition): + ... __parsers__ = { + ... re.compile(r"is foo '(?P.+)'"): "__init__" + ... } + ... + ... def __init__(self, outcome): + ... self.outcome = outcome + ... + ... def __bool__(self): + ... return self.outcome == "bar" + ... + ... def __repr__(self): + ... return f"IsFooBar('{self.outcome}')" + ... + >>> from rocketry.parse import parse_condition + >>> parse_condition("is foo 'bar'") + IsFooBar('bar') + """ diff --git a/requirements/base.txt b/requirements/base.txt index a08286b..b695798 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,4 +4,4 @@ playwright==1.34 cssselect==1.2.0 lxml redbird @ git+https://github.com/ManiMozaffar/red-bird@v2_but_v1 -rocketry @ git+https://github.com/ManiMozaffar/rocketry@master \ No newline at end of file +rocketry @ git+https://github.com/ManiMozaffar/rocketry@master diff --git a/setup.cfg b/setup.cfg index 9127bb0..92c23e9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,12 +13,12 @@ classifiers = [options] python_requires = >=3.9 package_dir = - =src + =fastcrawler packages = find_namespace: include_package_data = True [options.packages.find] -where = src +where = fastcrawler exclude = test* @@ -43,7 +43,7 @@ addopts = --cov --strict-markers xfail_strict = True [coverage:run] -source = tsp_wrapper +source = fastcrawler branch = True [coverage:report] @@ -53,8 +53,8 @@ skip_covered = True [coverage:paths] source = - src/tsp_wrapper - */site-packages/tsp_wrapper + fastcrawler + */site-packages/fastcrawler [tox:tox] envlist = py39,py310 @@ -74,14 +74,14 @@ deps = pytest types-termcolor commands = - mypy --ignore-missing-imports {posargs:src test} + mypy --ignore-missing-imports {posargs:fastcrawler test} [testenv:format] skip_install = True deps = black commands = - black {posargs:--check --diff src test} + black {posargs:--check --diff fastcrawler test} [testenv:lint] skip_install = True @@ -89,7 +89,7 @@ deps = flake8 flake8-bugbear commands = - flake8 {posargs:src test} + flake8 {posargs:fastcrawler test} [testenv:docs] deps = @@ -101,9 +101,9 @@ commands = --module-first \ --separate \ -o docs/reference/ \ - src/tsp_wrapper/ \ - src/tsp_wrapper/*.c \ - src/tsp_wrapper/*.so + fastcrawler/ \ + fastcrawler/*.c \ + fastcrawler/*.so sphinx-build -n -W --keep-going -b html docs/ docs/_build/ [testenv:devdocs] @@ -117,7 +117,7 @@ commands = --module-first \ --separate \ -o docs/reference/ \ - src/tsp_wrapper/ \ - src/tsp_wrapper/*.c \ - src/tsp_wrapper/*.so + fastcrawler/ \ + fastcrawler/*.c \ + fastcrawler/*.so sphinx-autobuild -n -W -b html docs/ docs/_build/ diff --git a/test/conftest.py b/test/conftest.py index c4484a2..b8e7461 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,5 +1,4 @@ # pylint: skip-file -import asyncio from test.shared.engine import ( get_aiohttp_engine, get_cookies, @@ -13,6 +12,8 @@ import pytest import pytest_asyncio +from fastcrawler.schedule.adopter import RocketryApplication, RocketryManager + @pytest.fixture def html(): @@ -51,6 +52,16 @@ def cookies(): @pytest_asyncio.fixture() async def aiohttp_engine(): - engine = await get_aiohttp_engine() - yield engine - await engine.teardown() + setuped_engine = await get_aiohttp_engine() + yield setuped_engine + await setuped_engine.teardown() + + +@pytest.fixture(scope="function") +def task_app(): + yield RocketryApplication() + + +@pytest.fixture(scope="function") +def task_manager(task_app): + yield RocketryManager(task_app) diff --git a/test/shared/engine.py b/test/shared/engine.py index dab8f18..b12cfd1 100644 --- a/test/shared/engine.py +++ b/test/shared/engine.py @@ -1,5 +1,4 @@ # pylint: skip-file -import asyncio from random import choice from pydantic_settings import BaseSettings diff --git a/test/test_parser.py b/test/test_parser.py index 6b2f483..708779d 100644 --- a/test/test_parser.py +++ b/test/test_parser.py @@ -1,4 +1,5 @@ # pylint: skip-file +import pytest from test.shared.schema import ( EmailData, @@ -11,9 +12,6 @@ VeryNestedCSS, VeryNestedJson, ) - -import pytest - from fastcrawler.exceptions import ( ParserInvalidModelType, ParserValidationError, diff --git a/test/test_schedule.py b/test/test_schedule.py new file mode 100644 index 0000000..1a6bcf4 --- /dev/null +++ b/test/test_schedule.py @@ -0,0 +1,147 @@ +import asyncio + +import pytest + +from fastcrawler.schedule.adopter import RocketryApplication, RocketryManager, TaskNotFound +from fastcrawler.schedule.schema import Task + + +async def task_function(sleep_sec=5): + print(f"going to sleep for {sleep_sec} seconds") + await asyncio.sleep(sleep_sec) + + +sample_tasks = [ + Task( + name="tast_1", + description="Task 1 Description", + logger_name="test_task_1", + ), + Task( + name="tast_2", + description="Task 2 Description", + logger_name="test_task_2", + ), +] + + +def get_task(num=1): + return sample_tasks[num - 1] + + +@pytest.mark.asyncio +async def test_add_task_to_rocketry_application(task_app: RocketryApplication): + new_task_1 = get_task(1) + task_names = {task.name for task in (new_task_1,)} + await task_app.add_task(task_function, new_task_1) + session_tasks = task_app.task_lib.session.tasks + session_task_names = {session_task.name for session_task in session_tasks} + assert session_task_names == task_names, "The task not added correctly!" + + +@pytest.mark.asyncio +async def test_get_all_task_to_rocketry_application(task_app: RocketryApplication): + new_task_1 = get_task(1) + new_task_2 = get_task(2) + task_names = {task.name for task in (new_task_1, new_task_2)} + await task_app.add_task(task_function, new_task_1) + await task_app.add_task(task_function, new_task_2) + session_tasks = await task_app.get_all_tasks() + session_task_names = {session_task.name for session_task in session_tasks} + assert session_task_names == task_names, "The task not added correctly!" + + +@pytest.mark.asyncio +async def test_shutdown_rocketry_application(task_app: RocketryApplication): + new_task_1 = get_task(1) + await task_app.add_task(task_function, new_task_1) + await asyncio.sleep(1) + await task_app.shut_down() + await asyncio.sleep(1) + assert not task_app.task_lib.session.scheduler.is_alive + + +# @pytest.mark.asyncio +# async def test_serve_rocketry_application(task_app: RocketryApplication): +# new_task_1 = get_task(1) +# await task_app.add_task(task_function, new_task_1) +# await asyncio.sleep(1) +# await task_app.serve() +# await asyncio.sleep(1) +# assert task_app.task_lib.session.scheduler.is_alive + + +@pytest.mark.asyncio +async def test_add_task_to_manager(task_manager: RocketryManager): + new_task_1 = get_task(1) + task_names = {new_task_1.name} + await task_manager.add_task(task_function, new_task_1) + session_tasks = await task_manager.app.get_all_tasks() + session_task_names = {session_task.name for session_task in session_tasks} + assert session_task_names == task_names, "The task not added correctly!" + + +@pytest.mark.asyncio +async def test_all_tasks_from_manager(task_manager: RocketryManager): + new_task_1 = get_task(1) + new_task_2 = get_task(2) + task_names = {task.name for task in (new_task_1, new_task_2)} + await task_manager.add_task(task_function, new_task_1) + await task_manager.add_task(task_function, new_task_2) + session_tasks = await task_manager.all() + session_task_names = {session_task.name for session_task in session_tasks} + assert session_task_names == task_names, "The task not added correctly!" + + +@pytest.mark.asyncio +async def test_change_task_schedule_from_manager(task_manager: RocketryManager): + new_task_1 = get_task(1) + await task_manager.add_task(task_function, new_task_1) + # if any problem is encountered during change_task_schedule it should raise an exception + await task_manager.change_task_schedule(new_task_1.name, "*/2 * * * *") + assert True + + +@pytest.mark.asyncio +async def test_change_task_schedule_string_from_manager(task_manager: RocketryManager): + new_task_1 = get_task(1) + await task_manager.add_task(task_function, new_task_1) + # if any problem is encountered during change_task_schedule it should raise an exception + await task_manager.change_task_schedule(new_task_1.name, "every 2 seconds") + assert True + + +@pytest.mark.asyncio +async def test_fail_test_change_task_schedule_from_manager(task_manager: RocketryManager): + new_task_1 = get_task(1) + with pytest.raises(TaskNotFound): + await task_manager.add_task(task_function, new_task_1) + await task_manager.change_task_schedule("wrong_task_name", "every 2 seconds") + + +@pytest.mark.asyncio +async def test_toggle_task_not_disabled_from_manager(task_manager: RocketryManager): + new_task_1 = get_task(1) + await task_manager.add_task(task_function, new_task_1) + # if any problem is encountered during toggle_task it should raise an exception + await task_manager.toggle_task(new_task_1.name) + assert True + + +@pytest.mark.asyncio +async def test_toggle_task_disabled_from_manager(task_manager: RocketryManager): + new_task_1 = get_task(1) + new_task_1.disabled = True + await task_manager.add_task(task_function, new_task_1) + # if any problem is encountered during toggle_task it should raise an exception + await task_manager.toggle_task(new_task_1.name) + assert True + + +@pytest.mark.asyncio +async def test_toggle_task_not_found_from_manager(task_manager: RocketryManager): + new_task_1 = get_task(1) + await task_manager.add_task(task_function, new_task_1) + # if any problem is encountered during toggle_task it should raise an exception + with pytest.raises(TaskNotFound): + await task_manager.toggle_task("wromg_task_name")