Fix Escargot 0.6 compat and skip broken link checker #1788

Merged
2 changes: 1 addition & 1 deletion composer.json
@@ -112,7 +112,7 @@
"symfony/var-dumper": "4.4.*",
"symfony/web-profiler-bundle": "4.4.*",
"symfony/yaml": "4.4.*",
"terminal42/escargot": "^0.5.1",
"terminal42/escargot": "^0.6.0",
"terminal42/service-annotation-bundle": "^1.0",
"toflar/psr6-symfony-http-cache-store": "^2.1",
"true/punycode": "^2.1",
2 changes: 1 addition & 1 deletion core-bundle/composer.json
@@ -97,7 +97,7 @@
"symfony/twig-bundle": "4.4.*",
"symfony/var-dumper": "4.4.*",
"symfony/yaml": "4.4.*",
"terminal42/escargot": "^0.5.1",
"terminal42/escargot": "^0.6.0",
"terminal42/service-annotation-bundle": "^1.0",
"true/punycode": "^2.1",
"twig/twig": "^2.7",
12 changes: 8 additions & 4 deletions core-bundle/src/Crawl/Escargot/Factory.php
@@ -165,8 +165,10 @@ public function getRootPageUriCollection(): BaseUriCollection
*/
public function create(BaseUriCollection $baseUris, QueueInterface $queue, array $selectedSubscribers, array $clientOptions = []): Escargot
{
$escargot = Escargot::create($baseUris, $queue, $this->createHttpClient($clientOptions));
$escargot = $escargot->withUserAgent(self::USER_AGENT);
$escargot = Escargot::create($baseUris, $queue)
->withHttpClient($this->createHttpClient($clientOptions))
->withUserAgent(self::USER_AGENT)
;

$this->registerDefaultSubscribers($escargot);
$this->registerSubscribers($escargot, $this->validateSubscribers($selectedSubscribers));
@@ -180,8 +182,10 @@ public function create(BaseUriCollection $baseUris, QueueInterface $queue, array
*/
public function createFromJobId(string $jobId, QueueInterface $queue, array $selectedSubscribers, array $clientOptions = []): Escargot
{
$escargot = Escargot::createFromJobId($jobId, $queue, $this->createHttpClient($clientOptions));
$escargot = $escargot->withUserAgent(self::USER_AGENT);
$escargot = Escargot::createFromJobId($jobId, $queue)
->withHttpClient($this->createHttpClient($clientOptions))
->withUserAgent(self::USER_AGENT)
;

$this->registerDefaultSubscribers($escargot);
$this->registerSubscribers($escargot, $this->validateSubscribers($selectedSubscribers));
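
Note on the migration above: in Escargot 0.6 the HTTP client is no longer passed to Escargot::create() or Escargot::createFromJobId(); it is attached afterwards with withHttpClient(). A minimal sketch of the old and new construction, assuming the package's usual namespaces and an illustrative user agent string:

use Nyholm\Psr7\Uri;
use Symfony\Component\HttpClient\HttpClient;
use Terminal42\Escargot\BaseUriCollection;
use Terminal42\Escargot\Escargot;
use Terminal42\Escargot\Queue\InMemoryQueue;

$baseUris = new BaseUriCollection([new Uri('https://example.com')]);
$queue = new InMemoryQueue();
$client = HttpClient::create();

// Escargot 0.5: the client was a third argument of the factory methods
// $escargot = Escargot::create($baseUris, $queue, $client);

// Escargot 0.6: the client is set fluently after creation
$escargot = Escargot::create($baseUris, $queue)
    ->withHttpClient($client)
    ->withUserAgent('Example Crawler') // the factory uses self::USER_AGENT here
;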
@@ -33,6 +33,8 @@ class BrokenLinkCheckerSubscriber implements EscargotSubscriberInterface, Escarg
use LoggerAwareTrait;
use SubscriberLoggerTrait;

public const TAG_SKIP = 'skip-broken-link-checker';

/**
* @var TranslatorInterface
*/
@@ -55,6 +57,16 @@ public function getName(): string

public function shouldRequest(CrawlUri $crawlUri): string
{
if ($crawlUri->hasTag(self::TAG_SKIP)) {
$this->logWithCrawlUri(
$crawlUri,
LogLevel::DEBUG,
'Did not check because it was marked to be skipped using the data-skip-broken-link-checker attribute.'
);

return SubscriberInterface::DECISION_NEGATIVE;
}

// Only check URIs that are part of our base collection or were found on one
$fromBaseUriCollection = $this->escargot->getBaseUris()->containsHost($crawlUri->getUri()->getHost());

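
For context on the new TAG_SKIP constant: shouldRequest() now declines any CrawlUri that carries the 'skip-broken-link-checker' tag and logs a debug message instead of issuing a request. The tag itself is expected to be set wherever the data-skip-broken-link-checker markup attribute is parsed, which is outside this diff. A minimal sketch of the tagging side, mirroring the test data provider further down (the use statement for BrokenLinkCheckerSubscriber is omitted because its namespace is not visible in this excerpt):

use Nyholm\Psr7\Uri;
use Terminal42\Escargot\CrawlUri;

// A link found on https://example.com that should not be checked
$crawlUri = (new CrawlUri(new Uri('https://example.com/skip-me'), 1, false, new Uri('https://example.com')))
    ->addTag(BrokenLinkCheckerSubscriber::TAG_SKIP);

// The subscriber now answers DECISION_NEGATIVE for this URI
var_dump($crawlUri->hasTag(BrokenLinkCheckerSubscriber::TAG_SKIP)); // bool(true)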
4 changes: 4 additions & 0 deletions core-bundle/src/Crawl/Monolog/CrawlCsvLogHandler.php
@@ -17,6 +17,8 @@

class CrawlCsvLogHandler extends StreamHandler
{
public const DATETIME_FORMAT = 'Y-m-d H:i:s.u';

/**
* @var string
*/
@@ -52,6 +54,7 @@ protected function streamWrite($resource, array $record): void

if (0 === $size) {
fputcsv($resource, [
'Time',
'Source',
'URI',
'Found on URI',
@@ -62,6 +65,7 @@
}

$columns = [
$record['datetime']->format(self::DATETIME_FORMAT),
$record['context']['source'],
null === $crawlUri ? '---' : (string) $crawlUri->getUri(),
null === $crawlUri ? '---' : (string) $crawlUri->getFoundOn(),
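
The handler change above prepends a Time column, formatted with the new DATETIME_FORMAT constant ('Y-m-d H:i:s.u'), to the CSV headline and to every data row. A minimal usage sketch along the lines of the updated tests, assuming the Contao\CoreBundle namespace implied by the file path; the timestamp in the expected output is illustrative:

use Contao\CoreBundle\Crawl\Monolog\CrawlCsvLogHandler;
use Monolog\Logger;

$stream = fopen('php://memory', 'r+');
$handler = new CrawlCsvLogHandler($stream);

// The record's 'datetime' value is written as the leading Time column
$handler->handle([
    'level' => Logger::DEBUG,
    'message' => 'foobar',
    'extra' => [],
    'datetime' => new \DateTime(),
    'context' => ['source' => 'source'],
]);

rewind($stream);
echo stream_get_contents($stream);
// Time,Source,URI,"Found on URI","Found on level",Tags,Message
// "2020-06-15 12:34:56.789012",source,---,---,---,foobar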
12 changes: 9 additions & 3 deletions core-bundle/tests/Command/CrawlCommandTest.php
@@ -57,7 +57,9 @@ public function testOptionsConfigureEscargotCorrectly(): void
$client = new MockHttpClient();

// Test defaults
$escargot = Escargot::create($this->createBaseUriCollection(), new InMemoryQueue(), $client);
$escargot = Escargot::create($this->createBaseUriCollection(), new InMemoryQueue())
->withHttpClient($client)
;
$escargotFactory = $this->createValidEscargotFactory($escargot);
$command = new CrawlCommand($escargotFactory, new Filesystem());

@@ -71,7 +73,9 @@ public function testOptionsConfigureEscargotCorrectly(): void
$this->assertSame(0, $command->getEscargot()->getMaxDepth());

// Test options
$escargot = Escargot::create($this->createBaseUriCollection(), new InMemoryQueue(), $client);
$escargot = Escargot::create($this->createBaseUriCollection(), new InMemoryQueue())
->withHttpClient($client)
;
$escargotFactory = $this->createValidEscargotFactory($escargot);
$command = new CrawlCommand($escargotFactory, new Filesystem());

@@ -92,7 +96,9 @@ public function testEmitsWarningIfLocalhostIsInCollection(): void

$baseUriCollection = new BaseUriCollection([new Uri('http://localhost')]);

$escargot = Escargot::create($baseUriCollection, new InMemoryQueue(), $client);
$escargot = Escargot::create($baseUriCollection, new InMemoryQueue())
->withHttpClient($client)
;
$escargotFactory = $this->createValidEscargotFactory($escargot, $baseUriCollection);
$command = new CrawlCommand($escargotFactory, new Filesystem());

@@ -107,6 +107,14 @@ public function shouldRequestProvider(): \Generator
(new CrawlUri(new Uri('https://github.com'), 0, true)),
];

yield 'Test skips URIs that were marked to be skipped by the data attribute' => [
(new CrawlUri(new Uri('https://github.com/foobar'), 1, false, new Uri('https://github.com')))->addTag(BrokenLinkCheckerSubscriber::TAG_SKIP),
SubscriberInterface::DECISION_NEGATIVE,
LogLevel::DEBUG,
'Did not check because it was marked to be skipped using the data-skip-broken-link-checker attribute.',
(new CrawlUri(new Uri('https://github.com'), 0, true)),
];

yield 'Test requests if everything is okay' => [
(new CrawlUri(new Uri('https://contao.org/foobar'), 0)),
SubscriberInterface::DECISION_POSITIVE,
35 changes: 24 additions & 11 deletions core-bundle/tests/Crawl/Monolog/CrawlCsvLogHandlerTest.php
@@ -23,7 +23,7 @@ class CrawlCsvLogHandlerTest extends TestCase
/**
* @dataProvider writesCsvStreamProvider
*/
public function testWritesCsvStream(array $context, string $expectedContent, string $existingCsvContent = '', $message = 'foobar'): void
public function testWritesCsvStream(\DateTime $dt, array $context, string $expectedContent, string $existingCsvContent = '', $message = 'foobar'): void
{
$stream = fopen('php://memory', 'r+');

@@ -32,7 +32,7 @@ public function testWritesCsvStream(array $context, string $expectedContent, str
}

$handler = new CrawlCsvLogHandler($stream);
$handler->handle(['level' => Logger::DEBUG, 'message' => $message, 'extra' => [], 'context' => $context]);
$handler->handle(['level' => Logger::DEBUG, 'message' => $message, 'extra' => [], 'context' => $context, 'datetime' => $dt]);

rewind($stream);
$content = stream_get_contents($stream);
@@ -42,10 +42,14 @@

public function testSourceFilter(): void
{
$dt = new \DateTime();
$formattedDt = '"'.$dt->format(CrawlCsvLogHandler::DATETIME_FORMAT).'"';

$record = [
'level' => Logger::DEBUG,
'message' => 'foobar',
'extra' => [],
'datetime' => $dt,
'context' => [
'source' => 'source',
'crawlUri' => new CrawlUri(new Uri('https://contao.org'), 0),
@@ -68,42 +72,51 @@ public function testSourceFilter(): void
rewind($stream);
$content = stream_get_contents($stream);

$this->assertSame('Source,URI,"Found on URI","Found on level",Tags,Message'."\n".'source,https://contao.org/,,0,,foobar'."\n", $content);
$this->assertSame('Time,Source,URI,"Found on URI","Found on level",Tags,Message'."\n".$formattedDt.',source,https://contao.org/,,0,,foobar'."\n", $content);
}

public function writesCsvStreamProvider(): \Generator
{
$dt = new \DateTime();
$formattedDt = '"'.$dt->format(CrawlCsvLogHandler::DATETIME_FORMAT).'"';

yield 'Should not write anything if the source is missing' => [
$dt,
[],
'',
];

yield 'Correctly logs with no CrawlUri provided and empty log file (should write headlines)' => [
$dt,
['source' => 'source'],
'Source,URI,"Found on URI","Found on level",Tags,Message'."\n".'source,---,---,---,---,foobar'."\n",
'Time,Source,URI,"Found on URI","Found on level",Tags,Message'."\n".$formattedDt.',source,---,---,---,---,foobar'."\n",
];

yield 'Correctly logs with CrawlUri provided and empty log file (should write headlines)' => [
$dt,
['source' => 'source', 'crawlUri' => new CrawlUri(new Uri('https://contao.org'), 0)],
'Source,URI,"Found on URI","Found on level",Tags,Message'."\n".'source,https://contao.org/,,0,,foobar'."\n",
'Time,Source,URI,"Found on URI","Found on level",Tags,Message'."\n".$formattedDt.',source,https://contao.org/,,0,,foobar'."\n",
];

yield 'Correctly logs with no CrawlUri provided and a non-empty log file (should not write headlines)' => [
$dt,
['source' => 'source'],
'Source,URI,"Found on URI","Found on level",Tags,Message'."\n".'source,---,---,---,---,foobar'."\n",
'Source,URI,"Found on URI","Found on level",Tags,Message'."\n",
'Time,Source,URI,"Found on URI","Found on level",Tags,Message'."\n".$formattedDt.',source,---,---,---,---,foobar'."\n",
'Time,Source,URI,"Found on URI","Found on level",Tags,Message'."\n",
];

yield 'Correctly logs with CrawlUri provided and a non-empty log file (should not write headlines)' => [
$dt,
['source' => 'source', 'crawlUri' => new CrawlUri(new Uri('https://contao.org'), 0)],
'Source,URI,"Found on URI","Found on level",Tags,Message'."\n".'source,https://contao.org/,,0,,foobar'."\n",
'Source,URI,"Found on URI","Found on level",Tags,Message'."\n",
'Time,Source,URI,"Found on URI","Found on level",Tags,Message'."\n".$formattedDt.',source,https://contao.org/,,0,,foobar'."\n",
'Time,Source,URI,"Found on URI","Found on level",Tags,Message'."\n",
];

yield 'Correctly logs with new lines in message' => [
$dt,
['source' => 'source', 'crawlUri' => new CrawlUri(new Uri('https://contao.org'), 0)],
'Source,URI,"Found on URI","Found on level",Tags,Message'."\n".'source,https://contao.org/,,0,,"foobar with new lines"'."\n",
'Source,URI,"Found on URI","Found on level",Tags,Message'."\n",
'Time,Source,URI,"Found on URI","Found on level",Tags,Message'."\n".$formattedDt.',source,https://contao.org/,,0,,"foobar with new lines"'."\n",
'Time,Source,URI,"Found on URI","Found on level",Tags,Message'."\n",
"foobar\rwith\nnew\r\nlines",
];
}