Skip to content

Commit

Permalink
Implement the contao:crawl command (see #985)
Browse files Browse the repository at this point in the history
Description
-----------

This is a sub PR of #981 only containing the CLI variant. It will allow us to focus more on the technical implementation first before we have all UI/CSS changes.

Commits
-------

7bcce12 Implemented contao:crawl command
c305881 Removed left-over controller
35e68cc Added unit tests for the Escargot factory
9a2f0ea CS
772ed1f Added tests for the subscriber result and fixed issues found along the way
0c8d30b Added unit tests for the searchindexsubscriber
ceff452 CS
e52c855 Added unit test for the EscargotSubscriberPass
749e00b Added unit tests for the extension as well as configuration
10f18aa CS
281e43b Adjusted to latest Escargot changes
452dfcc Added more command options and unit tests for the command
7d22b89 Merge branch 'master' into feature/crawl-command
072be02 CS
b990477 Catch the new indexer exceptions in SearchIndexListener
06b6d9a Updated the deps
7d2df71 Fix the coding style
63b0fdf CS
369ad57 Merge branch 'master' into feature/crawl-command
2d95608 Add unit tests for the service definitions
e714b07 Correctly name the interface
  • Loading branch information
Toflar authored and leofeyer committed Dec 4, 2019
1 parent dd4ef4b commit 23630c1
Show file tree
Hide file tree
Showing 26 changed files with 1,674 additions and 21 deletions.
2 changes: 2 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
"phpspec/php-diff": "^1.0",
"phpunit/php-token-stream": "^1.4 || ^2.0 || ^3.0",
"psr/log": "^1.0",
"ramsey/uuid": "^3.8",
"scheb/two-factor-bundle": "^4.7",
"scssphp/scssphp": "^1.0",
"sensiolabs/ansi-to-html": "^1.1",
Expand Down Expand Up @@ -109,6 +110,7 @@
"symfony/var-dumper": "4.4.*",
"symfony/web-profiler-bundle": "4.4.*",
"symfony/yaml": "4.4.*",
"terminal42/escargot": "^0.1",
"terminal42/service-annotation-bundle": "^1.0",
"toflar/psr6-symfony-http-cache-store": "^2.1",
"true/punycode": "^2.1",
Expand Down
2 changes: 2 additions & 0 deletions core-bundle/composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
"phpunit/php-token-stream": "^1.4 || ^2.0 || ^3.0",
"psr/log": "^1.0",
"scheb/two-factor-bundle": "^4.7",
"ramsey/uuid": "^3.8",
"scssphp/scssphp": "^1.0",
"simplepie/simplepie": "^1.3",
"spomky-labs/otphp": "^9.1",
Expand All @@ -95,6 +96,7 @@
"symfony/twig-bundle": "4.4.*",
"symfony/var-dumper": "4.4.*",
"symfony/yaml": "4.4.*",
"terminal42/escargot": "^0.1",
"terminal42/service-annotation-bundle": "^1.0",
"true/punycode": "^2.1",
"twig/twig": "^2.7",
Expand Down
212 changes: 212 additions & 0 deletions core-bundle/src/Command/CrawlCommand.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
<?php

declare(strict_types=1);

/*
* This file is part of Contao.
*
* (c) Leo Feyer
*
* @license LGPL-3.0-or-later
*/

namespace Contao\CoreBundle\Command;

use Contao\CoreBundle\Search\Escargot\Factory;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Helper\ProgressBar;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Logger\ConsoleLogger;
use Symfony\Component\Console\Output\ConsoleOutput;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
use Symfony\Contracts\HttpClient\ChunkInterface;
use Symfony\Contracts\HttpClient\ResponseInterface;
use Terminal42\Escargot\CrawlUri;
use Terminal42\Escargot\Escargot;
use Terminal42\Escargot\EscargotAwareInterface;
use Terminal42\Escargot\EscargotAwareTrait;
use Terminal42\Escargot\Exception\InvalidJobIdException;
use Terminal42\Escargot\Queue\InMemoryQueue;
use Terminal42\Escargot\Subscriber\FinishedCrawlingSubscriberInterface;
use Terminal42\Escargot\Subscriber\SubscriberInterface;

/**
 * Console command "contao:crawl".
 *
 * Creates an Escargot crawler through the factory (either for a new job or for
 * an existing job ID), applies the command line options to it, runs the crawl
 * and finally prints the result of every enabled subscriber.
 */
class CrawlCommand extends Command
{
    /**
     * @var Factory
     */
    private $escargotFactory;

    /**
     * Only assigned inside execute(), so it is not available before the command has run.
     *
     * @var Escargot
     */
    private $escargot;

    public function __construct(Factory $escargotFactory)
    {
        // Must be assigned before calling the parent constructor: configure()
        // reads the subscriber names from the factory and Symfony invokes
        // configure() from within the Command constructor.
        $this->escargotFactory = $escargotFactory;

        parent::__construct();
    }

    /**
     * Returns the Escargot instance that was configured by execute().
     */
    public function getEscargot(): Escargot
    {
        return $this->escargot;
    }

    /**
     * {@inheritdoc}
     */
    protected function configure(): void
    {
        $this
            ->setName('contao:crawl')
            ->addArgument('job', InputArgument::OPTIONAL, 'An optional existing job ID')
            ->addOption('subscribers', 's', InputOption::VALUE_REQUIRED | InputOption::VALUE_IS_ARRAY, 'A list of subscribers to enable.', $this->escargotFactory->getSubscriberNames())
            ->addOption('concurrency', 'c', InputOption::VALUE_REQUIRED, 'The number of concurrent requests that are going to be executed.', 10)
            ->addOption('delay', null, InputOption::VALUE_REQUIRED, 'The number of microseconds to wait between requests. (0 = throttling is disabled)', 0)
            ->addOption('max-requests', null, InputOption::VALUE_REQUIRED, 'The maximum number of requests to execute. (0 = no limit)', 0)
            ->addOption('max-depth', null, InputOption::VALUE_REQUIRED, 'The maximum depth to crawl for. (0 = no limit)', 0)
            ->addOption('no-progress', null, InputOption::VALUE_NONE, 'Disables the progress bar output')
            ->setDescription('Crawls all Contao root pages plus additional URIs configured using (contao.crawl.additionalURIs) and triggers the desired subscribers.')
        ;
    }

    /**
     * {@inheritdoc}
     *
     * Returns 1 if the crawler could not be created or if at least one
     * subscriber result was not successful, 0 otherwise.
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $io = new SymfonyStyle($input, $output);
        $io->title('Contao Crawler');

        $subscribers = $input->getOption('subscribers');
        $queue = new InMemoryQueue();
        $baseUris = $this->escargotFactory->getSearchUriCollection();

        try {
            // Resume an existing job if a job ID was given, otherwise start a new one
            if ($jobId = $input->getArgument('job')) {
                $this->escargot = $this->escargotFactory->createFromJobId($jobId, $queue, $subscribers);
            } else {
                $this->escargot = $this->escargotFactory->create($baseUris, $queue, $subscribers);
            }
        } catch (InvalidJobIdException $e) {
            $io->error('Could not find the given job ID.');

            return 1;
        } catch (\InvalidArgumentException $e) {
            $io->error($e->getMessage());

            return 1;
        }

        // Use a dedicated output section for the log lines if the output supports sections
        $logOutput = $output instanceof ConsoleOutput ? $output->section() : $output;

        // The with*() methods return an instance, so the property has to be re-assigned every time
        $this->escargot = $this->escargot->withLogger($this->createSourceProvidingConsoleLogger($logOutput));
        $this->escargot = $this->escargot->withConcurrency((int) $input->getOption('concurrency'));
        $this->escargot = $this->escargot->withRequestDelay((int) $input->getOption('delay'));
        $this->escargot = $this->escargot->withMaxRequests((int) $input->getOption('max-requests'));
        $this->escargot = $this->escargot->withMaxDepth((int) $input->getOption('max-depth'));

        $io->comment('Started crawling...');

        if (!$input->getOption('no-progress')) {
            $this->addProgressBar($output);
        }

        $this->escargot->crawl();

        $output->writeln('');
        $output->writeln('');
        $io->comment('Finished crawling! Find the details for each subscriber below:');

        $errored = false;

        foreach ($this->escargotFactory->getSubscribers($subscribers) as $subscriber) {
            $io->section($subscriber->getName());
            $result = $subscriber->getResult();

            if ($result->wasSuccessful()) {
                $io->success($result->getSummary());
            } else {
                $io->error($result->getSummary());
                $errored = true;
            }

            if ($result->getWarning()) {
                $io->warning($result->getWarning());
            }
        }

        // false => exit code 0, true => exit code 1
        return (int) $errored;
    }

    /**
     * Creates a console logger that prefixes every message with "[{source}] ".
     *
     * NOTE(review): the {source} placeholder is presumably replaced by the
     * logger from the "source" key of the log context - confirm that every
     * caller provides it.
     */
    private function createSourceProvidingConsoleLogger(OutputInterface $output): ConsoleLogger
    {
        return new class($output) extends ConsoleLogger {
            public function log($level, $message, array $context = []): void
            {
                parent::log($level, '[{source}] '.$message, $context);
            }
        };
    }

    /**
     * Starts a progress bar (in its own output section if available) and
     * registers a subscriber on the crawler that keeps it up to date.
     */
    private function addProgressBar(OutputInterface $output): void
    {
        $processOutput = $output instanceof ConsoleOutput ? $output->section() : $output;

        $progressBar = new ProgressBar($processOutput);
        $progressBar->setFormat("%title%\n%current%/%max% [%bar%] %percent:3s%%");
        $progressBar->setMessage('Starting to crawl...', 'title');
        $progressBar->start();

        $this->escargot->addSubscriber($this->getProgressSubscriber($progressBar));
    }

    /**
     * Returns a subscriber that only drives the given progress bar; it always
     * abstains from the request and content decisions.
     */
    private function getProgressSubscriber(ProgressBar $progressBar): SubscriberInterface
    {
        return new class($progressBar) implements SubscriberInterface, EscargotAwareInterface, FinishedCrawlingSubscriberInterface {
            use EscargotAwareTrait;

            /**
             * @var ProgressBar
             */
            private $progressBar;

            public function __construct(ProgressBar $progressBar)
            {
                $this->progressBar = $progressBar;
            }

            public function shouldRequest(CrawlUri $crawlUri): string
            {
                // We advance with every shouldRequest() call to update the progress bar frequently enough
                $this->progressBar->advance();
                // The maximum is refreshed from the total number of queue entries of the current job
                $this->progressBar->setMaxSteps($this->escargot->getQueue()->countAll($this->escargot->getJobId()));

                return SubscriberInterface::DECISION_ABSTAIN;
            }

            public function needsContent(CrawlUri $crawlUri, ResponseInterface $response, ChunkInterface $chunk): string
            {
                return SubscriberInterface::DECISION_ABSTAIN;
            }

            public function onLastChunk(CrawlUri $crawlUri, ResponseInterface $response, ChunkInterface $chunk): void
            {
                // We only update the message here, otherwise too many nonsense URIs will be shown
                $this->progressBar->setMessage((string) $crawlUri->getUri(), 'title');
            }

            public function finishedCrawling(): void
            {
                $this->progressBar->setMessage('Done!', 'title');
                $this->progressBar->finish();
                $this->progressBar->display();
            }
        };
    }
}
2 changes: 2 additions & 0 deletions core-bundle/src/ContaoCoreBundle.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
use Contao\CoreBundle\DependencyInjection\Compiler\AddResourcesPathsPass;
use Contao\CoreBundle\DependencyInjection\Compiler\AddSessionBagsPass;
use Contao\CoreBundle\DependencyInjection\Compiler\DataContainerCallbackPass;
use Contao\CoreBundle\DependencyInjection\Compiler\EscargotSubscriberPass;
use Contao\CoreBundle\DependencyInjection\Compiler\MakeServicesPublicPass;
use Contao\CoreBundle\DependencyInjection\Compiler\MapFragmentsToGlobalsPass;
use Contao\CoreBundle\DependencyInjection\Compiler\PickerProviderPass;
Expand Down Expand Up @@ -83,5 +84,6 @@ public function build(ContainerBuilder $container): void
$container->addCompilerPass(new TranslationDataCollectorPass());
$container->addCompilerPass(new RegisterHookListenersPass(), PassConfig::TYPE_OPTIMIZE);
$container->addCompilerPass(new SearchIndexerPass());
$container->addCompilerPass(new EscargotSubscriberPass());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<?php

declare(strict_types=1);

/*
* This file is part of Contao.
*
* (c) Leo Feyer
*
* @license LGPL-3.0-or-later
*/

namespace Contao\CoreBundle\DependencyInjection\Compiler;

use Symfony\Component\DependencyInjection\Compiler\CompilerPassInterface;
use Symfony\Component\DependencyInjection\Compiler\PriorityTaggedServiceTrait;
use Symfony\Component\DependencyInjection\ContainerBuilder;

/**
 * Compiler pass that hands all services tagged "contao.escargot_subscriber"
 * to the Escargot factory service.
 */
class EscargotSubscriberPass implements CompilerPassInterface
{
    use PriorityTaggedServiceTrait;

    /**
     * Adds one addSubscriber() method call to the "contao.search.escargot_factory"
     * definition per tagged service, in the order returned by
     * findAndSortTaggedServices(). Does nothing if the factory is not registered.
     */
    public function process(ContainerBuilder $container): void
    {
        $factoryId = 'contao.search.escargot_factory';

        if ($container->has($factoryId)) {
            $factory = $container->findDefinition($factoryId);
            $subscribers = $this->findAndSortTaggedServices('contao.escargot_subscriber', $container);

            foreach ($subscribers as $subscriber) {
                $factory->addMethodCall('addSubscriber', [$subscriber]);
            }
        }
    }
}
35 changes: 35 additions & 0 deletions core-bundle/src/DependencyInjection/Configuration.php
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ function (string $value): string {
->append($this->addImageNode())
->append($this->addSecurityNode())
->append($this->addSearchNode())
->append($this->addCrawlNode())
->end()
;

Expand Down Expand Up @@ -357,6 +358,40 @@ private function addSearchNode(): NodeDefinition
;
}

/**
 * Builds the "crawl" configuration node.
 *
 * It contains the list of additional URIs to crawl (each of which has to
 * start with http:// or https://) and the default HttpClient options. Both
 * default to an empty array.
 */
private function addCrawlNode(): NodeDefinition
{
    return (new TreeBuilder('crawl'))
        ->getRootNode()
        ->addDefaultsIfNotSet()
        ->children()
            ->arrayNode('additionalURIs')
                ->info('Additional URIs to crawl (by default, only the ones defined in the root pages are crawled).')
                ->validate()
                    ->ifTrue(
                        static function (array $uris): bool {
                            // Every entry has to be checked, so only report "valid"
                            // (false) once the whole list has been traversed. This
                            // also guarantees a bool return value for an empty list.
                            foreach ($uris as $uri) {
                                // The prototype allows any scalar, so reject non-strings
                                // here instead of passing them on to preg_match()
                                if (!\is_string($uri) || !preg_match('@^https?://@', $uri)) {
                                    return true;
                                }
                            }

                            return false;
                        }
                    )
                    ->thenInvalid('All provided additional URIs must start with either http:// or https://.')
                ->end()
                ->prototype('scalar')->end()
                ->defaultValue([])
            ->end()
            ->arrayNode('defaultHttpClientOptions')
                ->info('Allows to configure the default HttpClient options (useful for proxy settings, SSL certificate validation and more).')
                ->prototype('scalar')->end()
                ->defaultValue([])
            ->end()
        ->end()
    ;
}

/**
* Canonicalizes a path preserving the directory separators.
*/
Expand Down
18 changes: 18 additions & 0 deletions core-bundle/src/DependencyInjection/ContaoCoreExtension.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

use Contao\CoreBundle\EventListener\SearchIndexListener;
use Contao\CoreBundle\Picker\PickerProviderInterface;
use Contao\CoreBundle\Search\Escargot\Subscriber\EscargotSubscriberInterface;
use Contao\CoreBundle\Search\Indexer\IndexerInterface;
use Imagine\Exception\RuntimeException;
use Imagine\Gd\Imagine;
Expand Down Expand Up @@ -88,6 +89,7 @@ public function load(array $configs, ContainerBuilder $container): void
}

$this->handleSearchConfig($config, $container);
$this->handleCrawlConfig($config, $container);
$this->setPredefinedImageSizes($config, $container);
$this->setImagineService($config, $container);
$this->overwriteImageTargetDir($config, $container);
Expand Down Expand Up @@ -137,6 +139,22 @@ private function handleSearchConfig(array $config, ContainerBuilder $container):
}
}

/**
 * Tags every autoconfigured EscargotSubscriberInterface implementation with
 * "contao.escargot_subscriber" and, if the Escargot factory is defined,
 * passes the "crawl" configuration values to it as arguments 2 and 3.
 */
private function handleCrawlConfig(array $config, ContainerBuilder $container): void
{
    $autoconfigured = $container->registerForAutoconfiguration(EscargotSubscriberInterface::class);
    $autoconfigured->addTag('contao.escargot_subscriber');

    $factoryId = 'contao.search.escargot_factory';

    // Only touch the factory arguments if the service has actually been registered
    if ($container->hasDefinition($factoryId)) {
        $container
            ->getDefinition($factoryId)
            ->setArgument(2, $config['crawl']['additionalURIs'])
            ->setArgument(3, $config['crawl']['defaultHttpClientOptions'])
        ;
    }
}

/**
* Validates and sets the "contao.image.sizes" parameter.
*/
Expand Down
Loading

0 comments on commit 23630c1

Please sign in to comment.