Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This commit adds the new `Paginate` child class of the `Http` step class (easily accessible via `Http::get()->paginate()`). It takes an instance of the `PaginatorInterface` and uses it to iterate through pagination links. There is one implementation of that interface, the `SimpleWebsitePaginator`. The `Http::get()->paginate()` method uses it by default when called with just a CSS selector to get pagination links. Paginators receive all loaded pages and implement the logic to find pagination links. The paginator is also called before a request is sent, with the request object that is about to be sent as an argument (`prepareRequest()`). This way, it should even be possible to implement more complex pagination functionality, for example when pagination is built using POST requests with query strings in the request body.
- Loading branch information
Showing
11 changed files
with
648 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
<?php | ||
|
||
namespace Crwlr\Crawler\Steps\Loading\Http; | ||
|
||
use Crwlr\Crawler\Steps\Loading\Http; | ||
use Crwlr\Url\Url; | ||
use Generator; | ||
use Psr\Http\Message\StreamInterface; | ||
use Psr\Http\Message\UriInterface; | ||
|
||
/**
 * Http loading step that loads the input URL and then keeps loading follow-up
 * pages for as long as the configured paginator provides further URLs.
 */
class Paginate extends Http
{
    /**
     * @param Http\PaginatorInterface $paginator  Decides which URLs to load next. It is used as a
     *                                            pristine template: every invoke() call works on
     *                                            its own clone of it.
     */
    public function __construct(
        protected Http\PaginatorInterface $paginator,
        string $method = 'GET',
        array $headers = [],
        string|StreamInterface|null $body = null,
        string $httpVersion = '1.1',
    ) {
        parent::__construct($method, $headers, $body, $httpVersion);
    }

    /**
     * Loads the input URL, then every URL the paginator hands out, yielding each
     * response that was actually received.
     *
     * @param UriInterface $input
     */
    protected function invoke(mixed $input): Generator
    {
        // Paginators are stateful (found/loaded URLs, page counter). Working on a clone per
        // invocation keeps state from one input from leaking into the next one, so limits like
        // "max pages" apply per input. Note that clone is shallow: objects referenced by the
        // paginator are shared between clones.
        $paginator = clone $this->paginator;

        $currentUrl = $input;

        // The very first request has no previous response to hand to the paginator.
        $request = $paginator->prepareRequest($this->getRequestFromInputUri($currentUrl));

        while (true) {
            $response = $this->getResponseFromRequest($request);

            if ($response) {
                yield $response;
            }

            // The paginator is informed about every load attempt, even when there is no response.
            $paginator->processLoaded($currentUrl, $request, $response);

            if ($paginator->hasFinished()) {
                break;
            }

            $nextUrl = $paginator->getNextUrl();

            if (!$nextUrl) {
                break;
            }

            $currentUrl = Url::parsePsr7($nextUrl);

            $request = $paginator->prepareRequest($this->getRequestFromInputUri($currentUrl), $response);
        }

        if ($this->logger) {
            $paginator->logWhenFinished($this->logger);
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
<?php | ||
|
||
namespace Crwlr\Crawler\Steps\Loading\Http; | ||
|
||
use Crwlr\Crawler\Steps\Html\DomQueryInterface; | ||
use Crwlr\Crawler\Steps\Loading\Http\Paginators\SimpleWebsitePaginator; | ||
|
||
/**
 * Static factory for the paginator implementations.
 */
class Paginator
{
    /**
     * Default value for the $maxPages argument of paginators.
     */
    public const MAX_PAGES_DEFAULT = 1000;

    /**
     * Creates a SimpleWebsitePaginator.
     *
     * @param string|DomQueryInterface $paginationLinksSelector  selector for the pagination links,
     *                                                           passed on to the paginator as is
     * @param int $maxPages  passed on to the paginator as its max pages limit
     */
    public static function simpleWebsite(
        string|DomQueryInterface $paginationLinksSelector,
        int $maxPages = self::MAX_PAGES_DEFAULT,
    ): SimpleWebsitePaginator {
        $paginator = new SimpleWebsitePaginator($paginationLinksSelector, $maxPages);

        return $paginator;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
<?php | ||
|
||
namespace Crwlr\Crawler\Steps\Loading\Http; | ||
|
||
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; | ||
use Psr\Http\Message\RequestInterface; | ||
use Psr\Http\Message\UriInterface; | ||
use Psr\Log\LoggerInterface; | ||
|
||
/**
 * Contract for objects that drive pagination: they are told about every load
 * attempt and decide which URL (if any) should be requested next.
 */
interface PaginatorInterface
{
    /**
     * Tells whether paginating is done, so no further URL should be requested.
     */
    public function hasFinished(): bool;

    /**
     * Returns the next URL to load, or null when there is none.
     */
    public function getNextUrl(): ?string;

    /**
     * Hook to adapt a request right before it is sent.
     *
     * @param RequestInterface $request  the request that is about to be sent
     * @param RespondedRequest|null $previousResponse  the previously loaded page, if there is one
     * @return RequestInterface  the request that should actually be sent
     */
    public function prepareRequest(
        RequestInterface $request,
        ?RespondedRequest $previousResponse = null,
    ): RequestInterface;

    /**
     * Informs the paginator about a load attempt.
     *
     * @param UriInterface $url  the URL that was requested
     * @param RequestInterface $request  the request that was sent
     * @param RespondedRequest|null $respondedRequest  the result of loading, or null when there is none
     */
    public function processLoaded(
        UriInterface $url,
        RequestInterface $request,
        ?RespondedRequest $respondedRequest,
    ): void;

    /**
     * Lets the paginator write a message about how paginating ended to the given logger.
     */
    public function logWhenFinished(LoggerInterface $logger): void;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
<?php | ||
|
||
namespace Crwlr\Crawler\Steps\Loading\Http\Paginators; | ||
|
||
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; | ||
use Crwlr\Crawler\Steps\Loading\Http\Paginator; | ||
use Crwlr\Crawler\Steps\Loading\Http\PaginatorInterface; | ||
use Psr\Http\Message\RequestInterface; | ||
|
||
/**
 * Base class for paginators: stores the max pages limit and provides a
 * prepareRequest() implementation that leaves requests untouched.
 */
abstract class AbstractPaginator implements PaginatorInterface
{
    /**
     * Max pages limit for child classes to enforce.
     */
    protected int $maxPages;

    public function __construct(int $maxPages = Paginator::MAX_PAGES_DEFAULT)
    {
        $this->maxPages = $maxPages;
    }

    /**
     * Default implementation: hands the request back unchanged. Child classes can
     * override this when requests need to be adapted before being sent.
     */
    public function prepareRequest(
        RequestInterface $request,
        ?RespondedRequest $previousResponse = null,
    ): RequestInterface {
        return $request;
    }
}
169 changes: 169 additions & 0 deletions
169
src/Steps/Loading/Http/Paginators/SimpleWebsitePaginator.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
<?php | ||
|
||
namespace Crwlr\Crawler\Steps\Loading\Http\Paginators; | ||
|
||
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; | ||
use Crwlr\Crawler\Steps\Dom; | ||
use Crwlr\Crawler\Steps\Html\DomQuery; | ||
use Crwlr\Crawler\Steps\Html\DomQueryInterface; | ||
use Crwlr\Crawler\Steps\Loading\Http; | ||
use Crwlr\Url\Url; | ||
use Exception; | ||
use Psr\Http\Message\RequestInterface; | ||
use Psr\Http\Message\UriInterface; | ||
use Psr\Log\LoggerInterface; | ||
use Symfony\Component\DomCrawler\Crawler; | ||
|
||
/**
 * Paginator for classic websites: collects pagination URLs from link elements
 * matched by a selector in each loaded page and hands them out one by one.
 */
class SimpleWebsitePaginator extends AbstractPaginator
{
    /**
     * URLs that were found in loaded pages and are still waiting to be loaded (keyed by URL).
     *
     * @var array<string, string>
     */
    protected array $found = [];

    /**
     * URLs that were already loaded, including URLs of redirects on the way (keyed by URL).
     *
     * @var array<string, true>
     */
    protected array $loaded = [];

    /**
     * Number of load attempts reported via processLoaded().
     */
    protected int $loadedPagesCount = 0;

    protected DomQueryInterface $paginationLinksSelector;

    /**
     * @param string|DomQueryInterface $paginationLinksSelector  a string is treated as a CSS selector
     * @param int $maxPages  stop paginating after this many load attempts
     */
    public function __construct(
        string|DomQueryInterface $paginationLinksSelector,
        int $maxPages = \Crwlr\Crawler\Steps\Loading\Http\Paginator::MAX_PAGES_DEFAULT,
    ) {
        if (is_string($paginationLinksSelector)) {
            $this->paginationLinksSelector = Dom::cssSelector($paginationLinksSelector);
        } else {
            $this->paginationLinksSelector = $paginationLinksSelector;
        }

        parent::__construct($maxPages);
    }

    /**
     * Finished when either the max pages limit is reached or no found URL is left to load.
     */
    public function hasFinished(): bool
    {
        return $this->loadedPagesCount >= $this->maxPages || empty($this->found);
    }

    /**
     * Removes the oldest found URL from the queue and returns it (null when the queue is empty).
     */
    public function getNextUrl(): ?string
    {
        return array_shift($this->found);
    }

    /**
     * Registers a load attempt: remembers the URL (and redirect URLs) as loaded, counts the
     * page and, when there is a response, collects further pagination links from it.
     *
     * @throws Exception
     */
    public function processLoaded(
        UriInterface $url,
        RequestInterface $request,
        ?RespondedRequest $respondedRequest,
    ): void {
        $this->loaded[$url->__toString()] = true;

        $this->loadedPagesCount++;

        if ($respondedRequest) {
            // Also remember redirect URLs, so they aren't queued again as new pages.
            foreach ($respondedRequest->redirects() as $redirectUrl) {
                $this->loaded[$redirectUrl] = true;
            }

            $this->getPaginationLinksFromResponse($respondedRequest);
        }
    }

    /**
     * Logs a warning when paginating stopped at the max pages limit with URLs still waiting,
     * otherwise an info message that all found links were loaded.
     */
    public function logWhenFinished(LoggerInterface $logger): void
    {
        if ($this->loadedPagesCount >= $this->maxPages && !empty($this->found)) {
            $logger->warning('Max pages limit reached');
        } else {
            $logger->info('All found pagination links loaded');
        }
    }

    /**
     * Queues the URLs of all relevant links found via the pagination links selector.
     *
     * @throws Exception
     */
    protected function getPaginationLinksFromResponse(RespondedRequest $respondedRequest): void
    {
        $responseBody = Http::getBodyString($respondedRequest);

        $dom = new Crawler($responseBody);

        $paginationLinksElements = $this->paginationLinksSelector->filter($dom);

        foreach ($paginationLinksElements as $paginationLinksElement) {
            $paginationLinksElement = new Crawler($paginationLinksElement);

            // The selector may match the link elements themselves...
            $this->addFoundUrlFromLinkElement(
                $paginationLinksElement,
                $dom,
                $respondedRequest->effectiveUri(),
            );

            // ...or a wrapper element containing the links.
            foreach ($paginationLinksElement->filter('a') as $linkInPaginationLinksElement) {
                $linkInPaginationLinksElement = new Crawler($linkInPaginationLinksElement);

                $this->addFoundUrlFromLinkElement(
                    $linkInPaginationLinksElement,
                    $dom,
                    $respondedRequest->effectiveUri(),
                );
            }
        }
    }

    /**
     * Queues the absolute URL of the given element, if it is a relevant link.
     *
     * @throws Exception
     */
    protected function addFoundUrlFromLinkElement(
        Crawler $linkElement,
        Crawler $document,
        string $documentUrl,
    ): void {
        if ($this->isRelevantLinkElement($linkElement)) {
            $url = $this->getAbsoluteUrlFromLinkElement($linkElement, $document, $documentUrl);

            $this->addFoundUrl($url);
        }
    }

    /**
     * Resolves the link's href against the document URL, or against the document's
     * base href (itself resolved against the document URL) when it defines one.
     *
     * @throws Exception
     */
    protected function getAbsoluteUrlFromLinkElement(
        Crawler $linkElement,
        Crawler $document,
        string $documentUrl,
    ): string {
        $baseUrl = Url::parse($documentUrl);

        $baseHref = DomQuery::getBaseHrefFromDocument($document);

        if ($baseHref) {
            $baseUrl = $baseUrl->resolve($baseHref);
        }

        $linkHref = $linkElement->attr('href') ?? '';

        return $baseUrl->resolve($linkHref)->__toString();
    }

    /**
     * Only <a> elements with a non-empty href that is not a pure fragment link are relevant.
     */
    protected function isRelevantLinkElement(Crawler $element): bool
    {
        if ($element->nodeName() !== 'a') {
            return false;
        }

        $href = $element->attr('href');

        // Deliberately not empty(): that would also reject the valid relative URL "0".
        return $href !== null && $href !== '' && !str_starts_with($href, '#');
    }

    /**
     * Queues the URL unless it is already queued or was already loaded.
     */
    protected function addFoundUrl(string $url): void
    {
        if (!isset($this->found[$url]) && !isset($this->loaded[$url])) {
            $this->found[$url] = $url;
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.