From 17d0e8a79db6f49c36ef0d9d233211314efc6c05 Mon Sep 17 00:00:00 2001
From: Leo Feyer
Date: Tue, 8 Dec 2020 11:21:10 +0100
Subject: [PATCH] Do not index preview URLs for searching

The search index listener now returns without indexing when the response
has an X-Robots-Tag header containing "noindex" or when the document has
a <meta name="robots"> tag whose content contains "noindex". The preview
front controller sets "X-Robots-Tag: noindex" on the response it sends.

The value of the meta tag's content attribute is cast to string before it
is searched, because a meta tag without a content attribute yields null.
The meta robots test case sends a document that actually contains the
robots meta tag and a JSON-LD script, so that the noindex check is what
prevents indexing.
---
 .../src/EventListener/SearchIndexListener.php | 21 ++++++++++++++++++-
 .../EventListener/SearchIndexListenerTest.php | 19 +++++++++++++++++
 .../src/Resources/skeleton/web/preview.php    |  3 +++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/core-bundle/src/EventListener/SearchIndexListener.php b/core-bundle/src/EventListener/SearchIndexListener.php
index bcf5c7fa37c..0a95b25a487 100644
--- a/core-bundle/src/EventListener/SearchIndexListener.php
+++ b/core-bundle/src/EventListener/SearchIndexListener.php
@@ -71,7 +71,26 @@ public function __invoke(TerminateEvent $event): void
             return;
         }

-        $document = Document::createFromRequestResponse($request, $event->getResponse());
+        $response = $event->getResponse();
+
+        // Do not index if the X-Robots-Tag header contains "noindex"
+        if (false !== strpos($response->headers->get('X-Robots-Tag', ''), 'noindex')) {
+            return;
+        }
+
+        $document = Document::createFromRequestResponse($request, $response);
+
+        try {
+            $robots = $document->getContentCrawler()->filterXPath('//head/meta[@name="robots"]')->first()->attr('content');
+
+            // Do not index if the meta robots tag contains "noindex"
+            if (false !== strpos((string) $robots, 'noindex')) {
+                return;
+            }
+        } catch (\Exception $e) {
+            // No meta robots tag found
+        }
+
         $lds = $document->extractJsonLdScripts();

         // If there are no json ld scripts at all, this should not be handled by our indexer
diff --git a/core-bundle/tests/EventListener/SearchIndexListenerTest.php b/core-bundle/tests/EventListener/SearchIndexListenerTest.php
index f367eac8417..551ad7243c9 100644
--- a/core-bundle/tests/EventListener/SearchIndexListenerTest.php
+++ b/core-bundle/tests/EventListener/SearchIndexListenerTest.php
@@ -121,5 +121,24 @@ public function getRequestResponse(): \Generator
             false,
             false,
         ];
+
+        $response = new Response('', 403);
+        $response->headers->set('X-Robots-Tag', 'noindex');
+
+        yield 'Should not be handled because the X-Robots-Tag header contains "noindex" ' => [
+            Request::create('/foobar'),
+            $response,
+            SearchIndexListener::FEATURE_DELETE | SearchIndexListener::FEATURE_INDEX,
+            false,
+            false,
+        ];
+
+        yield 'Should not be handled because the meta robots tag contains "noindex" ' => [
+            Request::create('/foobar'),
+            new Response('<html><head><meta name="robots" content="noindex"></head><body><script type="application/ld+json">{"@context":"https://schema.org","@type":"WebPage"}</script></body></html>'),
+            SearchIndexListener::FEATURE_DELETE | SearchIndexListener::FEATURE_INDEX,
+            false,
+            false,
+        ];
     }
 }
diff --git a/manager-bundle/src/Resources/skeleton/web/preview.php b/manager-bundle/src/Resources/skeleton/web/preview.php
index ab329196c96..a4f0467a442 100644
--- a/manager-bundle/src/Resources/skeleton/web/preview.php
+++ b/manager-bundle/src/Resources/skeleton/web/preview.php
@@ -30,5 +30,8 @@
 $kernel = ContaoKernel::fromRequest(\dirname(__DIR__), $request);
 $response = $kernel->handle($request);

+// Prevent preview URLs from being indexed
+$response->headers->set('X-Robots-Tag', 'noindex');
+
 // Force no-cache on all responses in the preview front controller
 $response->headers->set('Cache-Control', 'no-store');