Skip to content

Commit

Permalink
do not process redirected URLs outside base domains
Browse files Browse the repository at this point in the history
  • Loading branch information
fritzmg committed Feb 27, 2022
1 parent da55d19 commit 2466907
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ public function needsContent(CrawlUri $crawlUri, ResponseInterface $response, Ch
return SubscriberInterface::DECISION_NEGATIVE;
}

// Skip responses that were redirected to a host other than the requested one and outside our base URI collection
$actualHost = parse_url($response->getInfo('url'), PHP_URL_HOST);

if ($crawlUri->getUri()->getHost() !== $actualHost && !$this->escargot->getBaseUris()->containsHost($actualHost)) {
return SubscriberInterface::DECISION_NEGATIVE;
}

++$this->stats['ok'];

// When URI is part of the base uri collection, request content.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,19 @@ public function needsContent(CrawlUri $crawlUri, ResponseInterface $response, Ch
return SubscriberInterface::DECISION_NEGATIVE;
}

// Skip responses that were redirected to a host other than the requested one and outside our base URI collection
$actualHost = parse_url($response->getInfo('url'), PHP_URL_HOST);

if ($crawlUri->getUri()->getHost() !== $actualHost && !$this->escargot->getBaseUris()->containsHost($actualHost)) {
$this->logWithCrawlUri(
$crawlUri,
LogLevel::DEBUG,
'Did not index because it was not part of the base URI collection.'
);

return SubscriberInterface::DECISION_NEGATIVE;
}

// No HTML, no index
if (!Util::isOfContentType($response, 'text/html')) {
$this->logWithCrawlUri(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ public function needsContentProvider(): \Generator

yield 'Test does not report successful responses if url not in base collection' => [
new CrawlUri(new Uri('https://github.com'), 0),
$this->getResponse(),
$this->getResponse(200, 'https://github.com'),
SubscriberInterface::DECISION_NEGATIVE,
'',
'',
Expand All @@ -222,6 +222,15 @@ public function needsContentProvider(): \Generator
['ok' => 2, 'error' => 0],
['ok' => 1, 'error' => 0],
];

yield 'Test does not report redirected responses outside the target domain' => [
new CrawlUri(new Uri('https://contao.org'), 0),
$this->getResponse(200, 'https://example.com'),
SubscriberInterface::DECISION_NEGATIVE,
'',
'',
['ok' => 0, 'error' => 0],
];
}

/**
Expand Down Expand Up @@ -361,7 +370,7 @@ public function onHttpExceptionProvider(): \Generator
/**
* @return ResponseInterface&MockObject
*/
private function getResponse(int $statusCode = 200): ResponseInterface
private function getResponse(int $statusCode = 200, string $url = 'https://contao.org'): ResponseInterface
{
$response = $this->createMock(ResponseInterface::class);
$response
Expand All @@ -372,13 +381,13 @@ private function getResponse(int $statusCode = 200): ResponseInterface
$response
->method('getInfo')
->willReturnCallback(
static function (string $key) use ($statusCode) {
static function (string $key) use ($statusCode, $url) {
if ('http_code' === $key) {
return $statusCode;
}

if ('url' === $key) {
return '';
return $url;
}

if ('response_headers' === $key) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,13 @@ public function needsContentProvider(): \Generator
$this->getResponse(true),
SubscriberInterface::DECISION_POSITIVE,
];

yield 'Test skips redirected responses outside the target domain' => [
$this->getResponse(false, 200, 'https://example.com'),
SubscriberInterface::DECISION_NEGATIVE,
LogLevel::DEBUG,
'Did not index because it was not part of the base URI collection.',
];
}

/**
Expand Down Expand Up @@ -462,7 +469,7 @@ public function onHttpExceptionProvider(): \Generator
];
}

private function getResponse(bool $asHtml, int $statusCode = 200): ResponseInterface
private function getResponse(bool $asHtml, int $statusCode = 200, string $url = 'https://contao.org'): ResponseInterface
{
$headers = $asHtml ? ['content-type' => ['text/html']] : [];

Expand All @@ -480,11 +487,14 @@ private function getResponse(bool $asHtml, int $statusCode = 200): ResponseInter
$response
->method('getInfo')
->willReturnCallback(
static function (string $key) use ($statusCode) {
static function (string $key) use ($statusCode, $url) {
switch ($key) {
case 'http_code':
return $statusCode;

case 'url':
return $url;

case 'response_headers':
return [];

Expand Down

0 comments on commit 2466907

Please sign in to comment.