Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
c74001b
🐛 Fix: Controller injection to spiders from app
ManiMozaffar Jul 27, 2023
598da61
🐛 Fix Rocketry proto impl
ManiMozaffar Jul 27, 2023
4a5bc4a
♻️ Refactor interface to use list instead of set
ManiMozaffar Jul 28, 2023
9ad55ff
🎨 Refactor design of engine
ManiMozaffar Jul 29, 2023
c093f04
✅ Update new engine designs
ManiMozaffar Jul 29, 2023
2dc6e3f
🎨 Refactor spider w.r.t new engine design
ManiMozaffar Jul 29, 2023
ec0f56f
✨ Add batching requests
ManiMozaffar Jul 29, 2023
57e2339
✅ Add tests for batching
ManiMozaffar Jul 29, 2023
6fcb37c
✅ Refactor existing test designs
ManiMozaffar Jul 29, 2023
5a5a8a4
✨ Add max_depth to control recursion
ManiMozaffar Jul 29, 2023
d03eaa7
✅ Add tests for max_depth
ManiMozaffar Jul 29, 2023
eb58272
✨ Add Sleep interval between request (Optional)
ManiMozaffar Jul 29, 2023
e4f3487
✅ Add test for interval between request
ManiMozaffar Jul 29, 2023
18ba626
✨ Add cycle_sleep_interval & max_request_count
ManiMozaffar Jul 29, 2023
4ff7d37
⚰️ Remove old aio engine design
ManiMozaffar Jul 29, 2023
97c49b3
♻️ Refactor AioEngine to send data or json base on contents
ManiMozaffar Jul 30, 2023
092c7bf
✅ Refactor aio engine tests for con limit
ManiMozaffar Jul 30, 2023
053b224
🐛 Fix await a Coroutine
aerosadegh Aug 2, 2023
61b24b5
🐛 Fix use ssl instead verify_ssl key in aiohttp
aerosadegh Aug 3, 2023
cebf63f
✅ Update test approx time to pass
aerosadegh Aug 3, 2023
264fd68
🐛 Fix list of tasks return in the `RocketryApplication`
aerosadegh Aug 15, 2023
a0d98a4
⚡️ Replace if statement by an expression
aerosadegh Aug 15, 2023
27b1ecc
🚑 Fix toggle task new_status
aerosadegh Aug 16, 2023
61c7be4
⚡️ Change task schedule returns task instead of None
aerosadegh Aug 17, 2023
50f5789
♻️ Refactor core's structure for typing improvements
ManiMozaffar Aug 18, 2023
b3aa62d
♻️ Refactor task manager structure for typing improvements
ManiMozaffar Aug 18, 2023
301ee98
♻️ Refactor engine to use dataclass and have better typings
ManiMozaffar Aug 18, 2023
bf4c9c6
🐛 Fix minor issue with task registry
ManiMozaffar Aug 19, 2023
8181518
♻️ Refactor core with new design
ManiMozaffar Aug 19, 2023
be2a0ce
🏷️ Improve explicit typing for URL
ManiMozaffar Aug 19, 2023
ee4e234
♻️ Refactor tests to pass with new design
ManiMozaffar Aug 19, 2023
c46ece3
🎨 Add customized exception for bad configuration
ManiMozaffar Aug 19, 2023
53e6c71
:recycle: Rename Proto to ABC
ManiMozaffar Aug 19, 2023
8a87b19
:recycle: Refactor task to be immutable
ManiMozaffar Aug 19, 2023
2843f5b
:recycle: Refactor class variable names to be PEP-8 friendly
ManiMozaffar Aug 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs_src/processor/tutorial001.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ async def main():
cond="every 1 second",
controller=ProcessController(app=RocketryApplication()),
)
await process.add_spiders()
await process.add_spiders_to_controller()
assert len(await process.controller.app.get_all_tasks()) == 1
await process.start(silent=False)

Expand Down
43 changes: 26 additions & 17 deletions fastcrawler/core/app.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
import asyncio
from typing import Callable
from typing import Any, Coroutine

from fastcrawler.exceptions import NoCrawlerFoundError
from fastcrawler.exceptions import NoCrawlerFoundErrorError
from fastcrawler.schedule.adopter import ProcessController, RocketryApplication
from fastcrawler.schedule.contracts import ControllerProto
from fastcrawler.schedule.contracts import ControllerABC

from .process import Process


def list_process(crawlers: list[Process] | Process) -> list[Process]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bad naming
bad design

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will be fixed in next PR.

if isinstance(crawlers, Process):
return [crawlers]
else:
return crawlers


class FastCrawler:
"""The client interface to start all crawlers.
Initialize all crawlers
Expand All @@ -22,47 +29,48 @@ class FastCrawler:

"""

controller: ControllerProto | None = None
controller: ControllerABC | None = None

def __init__(
self,
crawlers: list[Process] | Process,
controller: ControllerProto | None = None,
controller: ControllerABC | None = None,
):
"""Initialize FastCrawler with defined crawlers"""
...
if isinstance(crawlers, Process):
self.crawlers = [
crawlers,
]
else:
self.crawlers = crawlers

self.crawlers = list_process(crawlers)
self.controller = controller or ProcessController(app=RocketryApplication())
if not self.crawlers or len(self.crawlers) == 0:
raise NoCrawlerFoundError
raise NoCrawlerFoundErrorError

@property
def get_all_serves(self) -> list[Callable]:
def get_all_serves(self) -> list[Coroutine[Any, Any, None]]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we don't use Any

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What type should I use then?
I would have to look into uvicorn to check the protocol, and then define the servable coroutine type.

"""get all application to be served"""
assert self.controller is not None
return [
self.controller.app.serve(),
]

async def serve(self) -> None:
"""Serve protocol for uvicorn"""
"""
Serve protocol for uvicorn, useful with combination
with other tools if get_all_serves is customized
"""
await asyncio.gather(*self.get_all_serves)
return None

async def start(self, silent=True) -> None:
"""Start all crawlers in background explicitly without schedule"""

# TODO: make here multi processing, for more than one process!
# or use rocketry to trigger the tasks, if possible :)
await asyncio.gather(*[crawler.start(silent) for crawler in self.crawlers])
return None

async def run(self) -> None:
"""Run all crawlers in background explicitly with schedule"""
for crawler in self.crawlers:
await crawler.add_spiders()
crawler.controller = self.controller
await crawler.add_spiders_to_controller()
await self.serve()
return None

Expand All @@ -76,6 +84,7 @@ async def shutdown(self) -> None:

async def _shutdown(self) -> None:
"""Safe shut down event for application crawler"""
assert self.controller is not None
await self.shutdown()
await self.controller.shut_down()
return None
21 changes: 12 additions & 9 deletions fastcrawler/core/process.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from uuid import uuid4

from fastcrawler.core.spider import Spider
from fastcrawler.schedule.contracts import ControllerProto
from fastcrawler.schedule.contracts import ControllerABC
from fastcrawler.schedule.schema import Task


Expand All @@ -14,7 +14,7 @@ class Process:
def __init__(
self,
spider: Spider,
controller: ControllerProto | None = None,
controller: ControllerABC | None = None,
cond: str | Task | None = None,
*args,
**kwargs,
Expand All @@ -23,15 +23,15 @@ def __init__(

Args:
spider (Spider): _description_
controller (None | ControllerProto, optional): _description_. Defaults to None.
controller (None | ControllerABC, optional): _description_. Defaults to None.
cond (Task | None, optional): _description_. Defaults to None.
"""
if isinstance(cond, Task):
self.task = cond
else:
self.task = Task(
start_cond=cond or "every 1 second",
name=spider.__class__.__name__ + str(uuid4()),
name=f"{uuid4()}@{spider.__class__.__name__}",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this needs lots of comments and examples

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will add doc src in the future.

)
self.args = args
self.kwargs = kwargs
Expand All @@ -44,23 +44,26 @@ async def start(self, silent: bool = True) -> None:
This method will disable scheduler temporarily to avoid duplicate running
"""
if self.controller:
await self.controller.toggle_task(self.task.name, new_status=False)
await self.controller.toggle_task(str(self.task.name), new_status=False)
await self.spider.start(silent=silent)
if self.controller:
await self.controller.toggle_task(self.task.name, new_status=True)
await self.controller.toggle_task(str(self.task.name), new_status=True)
return None

async def stop(self) -> None:
"""Stop the crawling process"""
self.spider.is_stopped = True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems like you need a command pattern here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we? There are only 2 commands that have external APIs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even if you don't use the command pattern,
I think stopping a spider should be a command, not a variable.

for instance in self.spider.instances:
instance.is_stopped = True

if self.controller:
self.controller.toggle_task(self.task.name, new_status=False)
await self.controller.toggle_task(str(self.task.name), new_status=False)
return None

async def add_spiders(self) -> None:
async def add_spiders_to_controller(self) -> None:
"""
Run the crawling process
"""
assert self.controller is not None
if self.task:
await self.controller.add_task(self.spider.start, self.task)
else:
Expand Down
Loading