-
Notifications
You must be signed in to change notification settings - Fork 2
♻️ Refactor FastCrawler core codebase and synced to FastAPI backend client #53
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c74001b
598da61
4a5bc4a
9ad55ff
c093f04
2dc6e3f
ec0f56f
57e2339
6fcb37c
5a5a8a4
d03eaa7
eb58272
e4f3487
18ba626
4ff7d37
97c49b3
092c7bf
053b224
61b24b5
cebf63f
264fd68
a0d98a4
27b1ecc
61c7be4
50f5789
b3aa62d
301ee98
bf4c9c6
8181518
be2a0ce
ee4e234
c46ece3
53e6c71
8a87b19
2843f5b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,20 @@ | ||
| import asyncio | ||
| from typing import Callable | ||
| from typing import Any, Coroutine | ||
|
|
||
| from fastcrawler.exceptions import NoCrawlerFoundError | ||
| from fastcrawler.exceptions import NoCrawlerFoundErrorError | ||
| from fastcrawler.schedule.adopter import ProcessController, RocketryApplication | ||
| from fastcrawler.schedule.contracts import ControllerProto | ||
| from fastcrawler.schedule.contracts import ControllerABC | ||
|
|
||
| from .process import Process | ||
|
|
||
|
|
||
| def list_process(crawlers: list[Process] | Process) -> list[Process]: | ||
| if isinstance(crawlers, Process): | ||
| return [crawlers] | ||
| else: | ||
| return crawlers | ||
|
|
||
|
|
||
| class FastCrawler: | ||
| """The client interface to start all crawlers. | ||
| Initialize all crawlers | ||
|
|
@@ -22,47 +29,48 @@ class FastCrawler: | |
|
|
||
| """ | ||
|
|
||
| controller: ControllerProto | None = None | ||
| controller: ControllerABC | None = None | ||
|
|
||
| def __init__( | ||
| self, | ||
| crawlers: list[Process] | Process, | ||
| controller: ControllerProto | None = None, | ||
| controller: ControllerABC | None = None, | ||
| ): | ||
| """Initialize FastCrawler with defined crawlers""" | ||
| ... | ||
| if isinstance(crawlers, Process): | ||
| self.crawlers = [ | ||
| crawlers, | ||
| ] | ||
| else: | ||
| self.crawlers = crawlers | ||
|
|
||
| self.crawlers = list_process(crawlers) | ||
| self.controller = controller or ProcessController(app=RocketryApplication()) | ||
| if not self.crawlers or len(self.crawlers) == 0: | ||
| raise NoCrawlerFoundError | ||
| raise NoCrawlerFoundErrorError | ||
|
|
||
| @property | ||
| def get_all_serves(self) -> list[Callable]: | ||
| def get_all_serves(self) -> list[Coroutine[Any, Any, None]]: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We don't use `Any`.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What type should I use then? |
||
| """get all application to be served""" | ||
| assert self.controller is not None | ||
| return [ | ||
| self.controller.app.serve(), | ||
| ] | ||
|
|
||
| async def serve(self) -> None: | ||
| """Serve protocol for uvicorn""" | ||
| """ | ||
| Serve protocol for uvicorn, useful with combination | ||
| with other tools if get_all_serves is customized | ||
| """ | ||
| await asyncio.gather(*self.get_all_serves) | ||
| return None | ||
|
|
||
| async def start(self, silent=True) -> None: | ||
| """Start all crawlers in background explicitly without schedule""" | ||
|
|
||
| # TODO: make here multi processing, for more than one process! | ||
| # or use rocketry to trigger the tasks, if possible :) | ||
| await asyncio.gather(*[crawler.start(silent) for crawler in self.crawlers]) | ||
| return None | ||
|
|
||
| async def run(self) -> None: | ||
| """Run all crawlers in background explicitly with schedule""" | ||
| for crawler in self.crawlers: | ||
| await crawler.add_spiders() | ||
| crawler.controller = self.controller | ||
| await crawler.add_spiders_to_controller() | ||
| await self.serve() | ||
| return None | ||
|
|
||
|
|
@@ -76,6 +84,7 @@ async def shutdown(self) -> None: | |
|
|
||
| async def _shutdown(self) -> None: | ||
| """Safe shut down event for application crawler""" | ||
| assert self.controller is not None | ||
| await self.shutdown() | ||
| await self.controller.shut_down() | ||
| return None | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,7 @@ | ||
| from uuid import uuid4 | ||
|
|
||
| from fastcrawler.core.spider import Spider | ||
| from fastcrawler.schedule.contracts import ControllerProto | ||
| from fastcrawler.schedule.contracts import ControllerABC | ||
| from fastcrawler.schedule.schema import Task | ||
|
|
||
|
|
||
|
|
@@ -14,7 +14,7 @@ class Process: | |
| def __init__( | ||
| self, | ||
| spider: Spider, | ||
| controller: ControllerProto | None = None, | ||
| controller: ControllerABC | None = None, | ||
| cond: str | Task | None = None, | ||
| *args, | ||
| **kwargs, | ||
|
|
@@ -23,15 +23,15 @@ def __init__( | |
|
|
||
| Args: | ||
| spider (Spider): _description_ | ||
| controller (None | ControllerProto, optional): _description_. Defaults to None. | ||
| controller (None | ControllerABC, optional): _description_. Defaults to None. | ||
| cond (Task | None, optional): _description_. Defaults to None. | ||
| """ | ||
| if isinstance(cond, Task): | ||
| self.task = cond | ||
| else: | ||
| self.task = Task( | ||
| start_cond=cond or "every 1 second", | ||
| name=spider.__class__.__name__ + str(uuid4()), | ||
| name=f"{uuid4()}@{spider.__class__.__name__}", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This needs lots of comments and examples.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Will add doc src in the future. |
||
| ) | ||
| self.args = args | ||
| self.kwargs = kwargs | ||
|
|
@@ -44,23 +44,26 @@ async def start(self, silent: bool = True) -> None: | |
| This method will disable scheduler temporarily to avoid duplicate running | ||
| """ | ||
| if self.controller: | ||
| await self.controller.toggle_task(self.task.name, new_status=False) | ||
| await self.controller.toggle_task(str(self.task.name), new_status=False) | ||
| await self.spider.start(silent=silent) | ||
| if self.controller: | ||
| await self.controller.toggle_task(self.task.name, new_status=True) | ||
| await self.controller.toggle_task(str(self.task.name), new_status=True) | ||
| return None | ||
|
|
||
| async def stop(self) -> None: | ||
| """Stop the crawling process""" | ||
| self.spider.is_stopped = True | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Seems like you need a command pattern here.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we? There are only 2 commands which have external APIs.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. even if you don't use the command pattern |
||
| for instance in self.spider.instances: | ||
| instance.is_stopped = True | ||
|
|
||
| if self.controller: | ||
| self.controller.toggle_task(self.task.name, new_status=False) | ||
| await self.controller.toggle_task(str(self.task.name), new_status=False) | ||
| return None | ||
|
|
||
| async def add_spiders(self) -> None: | ||
| async def add_spiders_to_controller(self) -> None: | ||
| """ | ||
| Run the crawling process | ||
| """ | ||
| assert self.controller is not None | ||
| if self.task: | ||
| await self.controller.add_task(self.spider.start, self.task) | ||
| else: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bad naming
bad design
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will be fixed in next PR.