Skip to content

Commit

Permalink
New functionality to paginate
Browse files Browse the repository at this point in the history
There is the new `Paginate` child class of the `Http` step class (easy
access via `Http::get()->paginate()`). It takes an instance of the
`PaginatorInterface` and uses it to iterate through pagination links.
There is one implementation of that interface, the
`SimpleWebsitePaginator`. The `Http::get()->paginate()` method uses it
by default, when called just with a CSS selector to get pagination
links. Paginators receive all loaded pages and implement the logic to
find pagination links. The paginator class is also called before sending
a request, with the request object that is about to be sent as an
argument (`prepareRequest()`). This way, it should even be doable to
implement more complex pagination functionality, for example when
pagination is built using POST requests with query strings in the
request body.
  • Loading branch information
otsch committed Oct 26, 2022
1 parent 18c4c39 commit 88d45db
Show file tree
Hide file tree
Showing 11 changed files with 648 additions and 11 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

### Added
* New functionality to paginate: There is the new `Paginate` child class of the `Http` step class (easy access via `Http::get()->paginate()`). It takes an instance of the `PaginatorInterface` and uses it to iterate through pagination links. There is one implementation of that interface, the `SimpleWebsitePaginator`. The `Http::get()->paginate()` method uses it by default, when called just with a CSS selector to get pagination links. Paginators receive all loaded pages and implement the logic to find pagination links. The paginator class is also called before sending a request, with the request object that is about to be sent as an argument (`prepareRequest()`). This way, it should even be doable to implement more complex pagination functionality, for example when pagination is built using POST requests with query strings in the request body.
* New methods `stopOnErrorResponse()` and `yieldErrorResponses()` that can be used with `Http` steps. By calling `stopOnErrorResponse()` the step will throw a `LoadingException` when a response has a 4xx or 5xx status code. By calling `yieldErrorResponses()`, even error responses will be yielded and passed on to the next steps (this was the default behaviour until this version; see the breaking change below).

### Changed
Expand Down
40 changes: 35 additions & 5 deletions src/Steps/Loading/Http.php
Expand Up @@ -3,6 +3,9 @@
namespace Crwlr\Crawler\Steps\Loading;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http\Paginate;
use Crwlr\Crawler\Steps\Loading\Http\Paginator;
use Crwlr\Crawler\Steps\Loading\Http\PaginatorInterface;
use Exception;
use Generator;
use GuzzleHttp\Psr7\Request;
Expand Down Expand Up @@ -109,6 +112,17 @@ public static function getBodyString(MessageInterface|RespondedRequest $message)
return $contents;
}

/**
 * Creates a Paginate step that reuses this step's method, headers, body and HTTP version.
 *
 * @param PaginatorInterface|string $paginator  a paginator instance, or a string that is handed to
 *                                              Paginator::simpleWebsite() as the pagination links selector
 * @param int $defaultPaginatorMaxPages  only used when $paginator is a string
 */
public function paginate(
    PaginatorInterface|string $paginator,
    int $defaultPaginatorMaxPages = Paginator::MAX_PAGES_DEFAULT
): Paginate {
    $resolvedPaginator = is_string($paginator)
        ? Paginator::simpleWebsite($paginator, $defaultPaginatorMaxPages)
        : $paginator;

    return new Paginate($resolvedPaginator, $this->method, $this->headers, $this->body, $this->httpVersion);
}

public function stopOnErrorResponse(): static
{
$this->stopOnErrorResponse = true;
Expand Down Expand Up @@ -136,22 +150,38 @@ protected function validateAndSanitizeInput(mixed $input): UriInterface
* @throws Exception
*/
protected function invoke(mixed $input): Generator
{
    $respondedRequest = $this->getResponseFromInputUri($input);

    // Nothing is yielded when no response is returned for the input URI.
    if ($respondedRequest === null) {
        return;
    }

    yield $respondedRequest;
}

/**
 * Builds the request for the given URI and returns what getResponseFromRequest() returns for it.
 */
protected function getResponseFromInputUri(UriInterface $input): ?RespondedRequest
{
    return $this->getResponseFromRequest(
        $this->getRequestFromInputUri($input),
    );
}

/**
 * Creates the request for the given URI from this step's method, headers, body and HTTP version.
 */
protected function getRequestFromInputUri(UriInterface $uri): RequestInterface
{
    $request = new Request($this->method, $uri, $this->headers, $this->body, $this->httpVersion);

    return $request;
}

/**
 * Sends the request through the loader and returns the response if it should be passed on.
 *
 * With stopOnErrorResponse set, the loader's loadOrFail() is used instead of load()
 * (NOTE(review): presumably that is what throws on 4xx/5xx responses; confirm in the loader).
 *
 * @return RespondedRequest|null  null when the loader returned nothing, or when the status code
 *                                is >= 400 and yieldErrorResponses is not set
 */
protected function getResponseFromRequest(RequestInterface $request): ?RespondedRequest
{
    if ($this->stopOnErrorResponse) {
        $response = $this->loader->loadOrFail($request);
    } else {
        $response = $this->loader->load($request);
    }

    if ($response !== null && ($response->response->getStatusCode() < 400 || $this->yieldErrorResponses)) {
        return $response;
    }

    return null;
}
}
62 changes: 62 additions & 0 deletions src/Steps/Loading/Http/Paginate.php
@@ -0,0 +1,62 @@
<?php

namespace Crwlr\Crawler\Steps\Loading\Http;

use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Url\Url;
use Generator;
use Psr\Http\Message\StreamInterface;
use Psr\Http\Message\UriInterface;

/**
 * Http step that, starting from the input URL, keeps loading the URLs its paginator hands out.
 *
 * The paginator can adapt every request before it is sent (prepareRequest()) and is told about
 * every loaded page (processLoaded()), which is where it can discover further pagination URLs.
 */
class Paginate extends Http
{
    public function __construct(
        protected Http\PaginatorInterface $paginator,
        string $method = 'GET',
        array $headers = [],
        string|StreamInterface|null $body = null,
        string $httpVersion = '1.1',
    ) {
        parent::__construct($method, $headers, $body, $httpVersion);
    }

    /**
     * @param UriInterface $input
     */
    protected function invoke(mixed $input): Generator
    {
        $currentUrl = $input;

        // There is no previous response for the very first request.
        $latestResponse = null;

        while (true) {
            $preparedRequest = $this->paginator->prepareRequest(
                $this->getRequestFromInputUri($currentUrl),
                $latestResponse,
            );

            $latestResponse = $this->getResponseFromRequest($preparedRequest);

            if ($latestResponse) {
                yield $latestResponse;
            }

            // The paginator is informed even when there is no response to yield.
            $this->paginator->processLoaded($currentUrl, $preparedRequest, $latestResponse);

            if ($this->paginator->hasFinished()) {
                break;
            }

            $upcomingUrl = $this->paginator->getNextUrl();

            if (!$upcomingUrl) {
                break;
            }

            $currentUrl = Url::parsePsr7($upcomingUrl);
        }

        if ($this->logger) {
            $this->paginator->logWhenFinished($this->logger);
        }
    }
}
18 changes: 18 additions & 0 deletions src/Steps/Loading/Http/Paginator.php
@@ -0,0 +1,18 @@
<?php

namespace Crwlr\Crawler\Steps\Loading\Http;

use Crwlr\Crawler\Steps\Html\DomQueryInterface;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\SimpleWebsitePaginator;

/**
 * Static entry point for creating paginators; also holds the default page limit.
 */
class Paginator
{
    public const MAX_PAGES_DEFAULT = 1000;

    /**
     * Creates a SimpleWebsitePaginator from the pagination links selector and the page limit.
     */
    public static function simpleWebsite(
        string|DomQueryInterface $paginationLinksSelector,
        int $maxPages = self::MAX_PAGES_DEFAULT,
    ): SimpleWebsitePaginator {
        return new SimpleWebsitePaginator(
            paginationLinksSelector: $paginationLinksSelector,
            maxPages: $maxPages,
        );
    }
}
28 changes: 28 additions & 0 deletions src/Steps/Loading/Http/PaginatorInterface.php
@@ -0,0 +1,28 @@
<?php

namespace Crwlr\Crawler\Steps\Loading\Http;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;

/**
 * Contract between the Paginate step and a pagination strategy.
 *
 * The Paginate step calls prepareRequest() before sending a request and processLoaded() after
 * loading it. It then keeps requesting getNextUrl() until hasFinished() returns true or no next
 * URL is returned.
 */
interface PaginatorInterface
{
/**
 * Tells the Paginate step whether it should stop asking for further URLs.
 */
public function hasFinished(): bool;

/**
 * Returns the next URL to load. The Paginate step stops when the returned value is null or empty.
 */
public function getNextUrl(): ?string;

/**
 * Called with the request that is about to be sent; the returned request is the one actually sent.
 *
 * @param RequestInterface $request  the request built for the URL to load
 * @param RespondedRequest|null $previousResponse  the previously loaded response; not provided for
 *                                                 the first request, and null when the previous load
 *                                                 yielded no response
 */
public function prepareRequest(
RequestInterface $request,
?RespondedRequest $previousResponse = null
): RequestInterface;

/**
 * Called after each load attempt, so the paginator can keep track of loaded pages.
 *
 * @param UriInterface $url  the URL that was requested
 * @param RequestInterface $request  the request that was sent
 * @param RespondedRequest|null $respondedRequest  null when the load yielded no response
 */
public function processLoaded(
UriInterface $url,
RequestInterface $request,
?RespondedRequest $respondedRequest,
): void;

/**
 * Called by the Paginate step after it stopped paginating, when it has a logger.
 */
public function logWhenFinished(LoggerInterface $logger): void;
}
22 changes: 22 additions & 0 deletions src/Steps/Loading/Http/Paginators/AbstractPaginator.php
@@ -0,0 +1,22 @@
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http\Paginator;
use Crwlr\Crawler\Steps\Loading\Http\PaginatorInterface;
use Psr\Http\Message\RequestInterface;

/**
 * Base class for paginators: stores the page limit and leaves requests untouched by default.
 */
abstract class AbstractPaginator implements PaginatorInterface
{
    protected int $maxPages;

    public function __construct(int $maxPages = Paginator::MAX_PAGES_DEFAULT)
    {
        $this->maxPages = $maxPages;
    }

    /**
     * Default implementation: the request is returned as it was passed in.
     */
    public function prepareRequest(
        RequestInterface $request,
        ?RespondedRequest $previousResponse = null,
    ): RequestInterface {
        return $request;
    }
}
169 changes: 169 additions & 0 deletions src/Steps/Loading/Http/Paginators/SimpleWebsitePaginator.php
@@ -0,0 +1,169 @@
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Dom;
use Crwlr\Crawler\Steps\Html\DomQuery;
use Crwlr\Crawler\Steps\Html\DomQueryInterface;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Url\Url;
use Exception;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Paginator for websites whose pagination consists of links in the HTML document.
 *
 * After each loaded page, the elements matching the pagination links selector are searched for
 * links: the matched elements themselves and the `a` elements found inside them. URLs that are
 * neither loaded nor already queued are added to the queue of found URLs.
 */
class SimpleWebsitePaginator extends AbstractPaginator
{
    /**
     * Queue of found URLs that were not handed out yet, keyed by the URL itself.
     *
     * @var array<string, string>
     */
    protected array $found = [];

    /**
     * URLs that were already requested, including the redirect URLs of loaded responses.
     *
     * @var array<string, true>
     */
    protected array $loaded = [];

    /**
     * Number of processLoaded() calls so far (counted even when there was no response).
     */
    protected int $loadedPagesCount = 0;

    protected DomQueryInterface $paginationLinksSelector;

    /**
     * @param string|DomQueryInterface $paginationLinksSelector  a string is used as a CSS selector
     * @param int $maxPages  finish after this many pages, even if more URLs were found
     */
    public function __construct(
        string|DomQueryInterface $paginationLinksSelector,
        int $maxPages = Http\Paginator::MAX_PAGES_DEFAULT,
    ) {
        if (is_string($paginationLinksSelector)) {
            $this->paginationLinksSelector = Dom::cssSelector($paginationLinksSelector);
        } else {
            $this->paginationLinksSelector = $paginationLinksSelector;
        }

        parent::__construct($maxPages);
    }

    /**
     * Finished when the page limit is reached or when no found URLs are left.
     */
    public function hasFinished(): bool
    {
        return $this->loadedPagesCount >= $this->maxPages || empty($this->found);
    }

    /**
     * Removes the oldest found URL from the queue and returns it; null when the queue is empty.
     */
    public function getNextUrl(): ?string
    {
        return array_shift($this->found);
    }

    /**
     * Remembers the URL (and all redirect URLs of the response) as loaded and collects
     * pagination links from the response, if there is one.
     *
     * @throws Exception
     */
    public function processLoaded(
        UriInterface $url,
        RequestInterface $request,
        ?RespondedRequest $respondedRequest,
    ): void {
        $this->loaded[$url->__toString()] = true;

        $this->loadedPagesCount++;

        if ($respondedRequest) {
            foreach ($respondedRequest->redirects() as $redirectUrl) {
                $this->loaded[$redirectUrl] = true;
            }

            $this->getPaginationLinksFromResponse($respondedRequest);
        }
    }

    /**
     * Logs a warning when the page limit stopped pagination while found URLs were left,
     * otherwise an info message.
     */
    public function logWhenFinished(LoggerInterface $logger): void
    {
        if ($this->loadedPagesCount >= $this->maxPages && !empty($this->found)) {
            $logger->warning('Max pages limit reached');
        } else {
            $logger->info('All found pagination links loaded');
        }
    }

    /**
     * Adds the URLs of all relevant links in the pagination elements of the response body.
     *
     * @throws Exception
     */
    protected function getPaginationLinksFromResponse(RespondedRequest $respondedRequest): void
    {
        $responseBody = Http::getBodyString($respondedRequest);

        $dom = new Crawler($responseBody);

        $paginationLinksElements = $this->paginationLinksSelector->filter($dom);

        // The same for every link of this response, so it is fetched only once.
        $documentUrl = $respondedRequest->effectiveUri();

        foreach ($paginationLinksElements as $paginationLinksElement) {
            $paginationLinksElement = new Crawler($paginationLinksElement);

            // The selector may match the link elements directly...
            $this->addFoundUrlFromLinkElement($paginationLinksElement, $dom, $documentUrl);

            // ...or a wrapper element containing the links.
            foreach ($paginationLinksElement->filter('a') as $linkInPaginationLinksElement) {
                $linkInPaginationLinksElement = new Crawler($linkInPaginationLinksElement);

                $this->addFoundUrlFromLinkElement($linkInPaginationLinksElement, $dom, $documentUrl);
            }
        }
    }

    /**
     * Queues the absolute URL of the element, if the element is a relevant link.
     *
     * @throws Exception
     */
    protected function addFoundUrlFromLinkElement(
        Crawler $linkElement,
        Crawler $document,
        string $documentUrl,
    ): void {
        if ($this->isRelevantLinkElement($linkElement)) {
            $url = $this->getAbsoluteUrlFromLinkElement($linkElement, $document, $documentUrl);

            $this->addFoundUrl($url);
        }
    }

    /**
     * Resolves the link's href against the document URL, or against the document's base href
     * (itself resolved against the document URL) when DomQuery finds one.
     *
     * @throws Exception
     */
    protected function getAbsoluteUrlFromLinkElement(
        Crawler $linkElement,
        Crawler $document,
        string $documentUrl,
    ): string {
        $baseUrl = Url::parse($documentUrl);

        $baseHref = DomQuery::getBaseHrefFromDocument($document);

        if ($baseHref) {
            $baseUrl = $baseUrl->resolve($baseHref);
        }

        $linkHref = $linkElement->attr('href') ?? '';

        return $baseUrl->resolve($linkHref)->__toString();
    }

    /**
     * Relevant are `a` elements with a non-empty href that does not start with `#`.
     */
    protected function isRelevantLinkElement(Crawler $element): bool
    {
        if ($element->nodeName() !== 'a') {
            return false;
        }

        $href = $element->attr('href');

        // No empty() here: it treats the string "0" as empty, but "0" is a valid relative URL.
        return $href !== null && $href !== '' && !str_starts_with($href, '#');
    }

    /**
     * Queues the URL unless it is already queued or was already loaded.
     */
    protected function addFoundUrl(string $url): void
    {
        if (!isset($this->found[$url]) && !isset($this->loaded[$url])) {
            $this->found[$url] = $url;
        }
    }
}
6 changes: 3 additions & 3 deletions src/Steps/Loading/HttpCrawl.php
Expand Up @@ -160,11 +160,11 @@ protected function loadUrls(): Generator
foreach ($this->urls as $url => $yieldResponse) {
$uri = Url::parsePsr7($url);

$response = $this->loader->load($this->getRequestFromInputUri($uri));

$this->addLoadedUrlsFromResponse($response);
$response = $this->getResponseFromInputUri($uri);

if ($response !== null) {
$this->addLoadedUrlsFromResponse($response);

if ($yieldResponse['yield'] === true) {
yield $response;
}
Expand Down

0 comments on commit 88d45db

Please sign in to comment.