Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delete search index entries under more specific conditions #7266

Merged
merged 9 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 40 additions & 4 deletions core-bundle/src/EventListener/SearchIndexListener.php
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,14 @@ public function __invoke(TerminateEvent $event): void

$document = Document::createFromRequestResponse($request, $response);
$needsIndex = $this->needsIndex($request, $response, $document);
$needsDelete = $this->needsDelete($request, $response, $document);

try {
$success = $event->getResponse()->isSuccessful();

if ($needsIndex && $success && $this->enabledFeatures & self::FEATURE_INDEX) {
if ($needsIndex && $this->enabledFeatures & self::FEATURE_INDEX) {
$this->indexer->index($document);
}

if (!$success && $this->enabledFeatures & self::FEATURE_DELETE) {
if ($needsDelete && $this->enabledFeatures & self::FEATURE_DELETE) {
$this->indexer->delete($document);
}
} catch (IndexerException $e) {
Expand All @@ -77,6 +76,11 @@ public function __invoke(TerminateEvent $event): void

private function needsIndex(Request $request, Response $response, Document $document): bool
{
// Do not index if response was not successful
if (!$response->isSuccessful()) {
return false;
}

// Do not index if called by crawler
if (Factory::USER_AGENT === $request->headers->get('User-Agent')) {
return false;
Expand Down Expand Up @@ -108,4 +112,36 @@ private function needsIndex(Request $request, Response $response, Document $docu
// If there are no json ld scripts at all, this should not be handled by our indexer
return 0 !== \count($lds);
}

private function needsDelete(Request $request, Response $response, Document $document): bool
fritzmg marked this conversation as resolved.
Show resolved Hide resolved
{
// Always delete on 404 and 410 responses
if (\in_array($response->getStatusCode(), [Response::HTTP_NOT_FOUND, Response::HTTP_GONE], true)) {
return true;
}

fritzmg marked this conversation as resolved.
Show resolved Hide resolved
// Ignore the following conditions if the response is not successful and do not delete.
if (!$response->isSuccessful()) {
return false;
}

// Delete if the X-Robots-Tag header contains "noindex"
if (false !== strpos($response->headers->get('X-Robots-Tag', ''), 'noindex')) {
leofeyer marked this conversation as resolved.
Show resolved Hide resolved
return true;
}

try {
$robots = $document->getContentCrawler()->filterXPath('//head/meta[@name="robots"]')->first()->attr('content');

// Delete if the meta robots tag contains "noindex"
if (false !== strpos($robots, 'noindex')) {
return true;
}
} catch (\Exception $e) {
// No meta robots tag found
}

// Otherwise do not delete
return false;
}
}
52 changes: 43 additions & 9 deletions core-bundle/tests/EventListener/SearchIndexListenerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -99,47 +99,81 @@ public function getRequestResponse(): \Generator
false,
];

yield 'Should be deleted because the response was not successful (404)' => [
yield 'Should be deleted because the response was "not found" (404)' => [
Request::create('/foobar'),
new Response('', 404),
SearchIndexListener::FEATURE_DELETE | SearchIndexListener::FEATURE_INDEX,
false,
true,
];

yield 'Should be deleted because the response was not successful (403)' => [
yield 'Should be deleted because the response was "gone" (410)' => [
Request::create('/foobar'),
new Response('', 403),
new Response('', 404),
SearchIndexListener::FEATURE_DELETE | SearchIndexListener::FEATURE_INDEX,
false,
true,
];

yield 'Should not be deleted because even though the response was not successful (403), it was disabled by the feature flag ' => [
yield 'Should not be deleted because even though the response was "not found" (404), it was disabled by the feature flag ' => [
Request::create('/foobar'),
new Response('<html><body><script type="application/ld+json">{"@context":"https:\/\/contao.org\/","@type":"Page","pageId":2,"noSearch":false,"protected":false,"groups":[],"fePreview":false}</script></body></html>', 403),
new Response('<html><body><script type="application/ld+json">{"@context":"https:\/\/contao.org\/","@type":"Page","pageId":2,"noSearch":false,"protected":false,"groups":[],"fePreview":false}</script></body></html>', 404),
SearchIndexListener::FEATURE_INDEX,
false,
false,
];

$response = new Response('<html><body><script type="application/ld+json">{"@context":"https:\/\/contao.org\/","@type":"Page","pageId":2,"noSearch":false,"protected":false,"groups":[],"fePreview":false}</script></body></html>', 403);
$response = new Response('<html><body><script type="application/ld+json">{"@context":"https:\/\/contao.org\/","@type":"Page","pageId":2,"noSearch":false,"protected":false,"groups":[],"fePreview":false}</script></body></html>', 200);
$response->headers->set('X-Robots-Tag', 'noindex');

yield 'Should not be handled because the X-Robots-Tag header contains "noindex" ' => [
yield 'Should not index but should delete because the X-Robots-Tag header contains "noindex" ' => [
fritzmg marked this conversation as resolved.
Show resolved Hide resolved
Request::create('/foobar'),
$response,
SearchIndexListener::FEATURE_DELETE | SearchIndexListener::FEATURE_INDEX,
false,
true,
];

yield 'Should not be handled because the meta robots tag contains "noindex" ' => [
$response = new Response('<html><body><script type="application/ld+json">{"@context":"https:\/\/contao.org\/","@type":"Page","pageId":2,"noSearch":false,"protected":false,"groups":[],"fePreview":false}</script></body></html>', 500);
$response->headers->set('X-Robots-Tag', 'noindex');

yield 'Should not index and delete because the X-Robots-Tag header contains "noindex" and response is unsuccesful ' => [
fritzmg marked this conversation as resolved.
Show resolved Hide resolved
Request::create('/foobar'),
new Response('<html><head><meta name="robots" content="noindex,nofollow"/></head><body><script type="application/ld+json">{"@context":"https:\/\/contao.org\/","@type":"Page","pageId":2,"noSearch":false,"protected":false,"groups":[],"fePreview":false}</script></body></html>', 403),
$response,
SearchIndexListener::FEATURE_DELETE | SearchIndexListener::FEATURE_INDEX,
false,
false,
];

yield 'Should not index but should delete because the meta robots tag contains "noindex" ' => [
fritzmg marked this conversation as resolved.
Show resolved Hide resolved
Request::create('/foobar'),
new Response('<html><head><meta name="robots" content="noindex,nofollow"/></head><body><script type="application/ld+json">{"@context":"https:\/\/contao.org\/","@type":"Page","pageId":2,"noSearch":false,"protected":false,"groups":[],"fePreview":false}</script></body></html>', 200),
SearchIndexListener::FEATURE_DELETE | SearchIndexListener::FEATURE_INDEX,
false,
true,
];

yield 'Should not index and delete because the meta robots tag contains "noindex" and response is unsuccesful ' => [
fritzmg marked this conversation as resolved.
Show resolved Hide resolved
Request::create('/foobar'),
new Response('<html><head><meta name="robots" content="noindex,nofollow"/></head><body><script type="application/ld+json">{"@context":"https:\/\/contao.org\/","@type":"Page","pageId":2,"noSearch":false,"protected":false,"groups":[],"fePreview":false}</script></body></html>', 500),
SearchIndexListener::FEATURE_DELETE | SearchIndexListener::FEATURE_INDEX,
false,
false,
];

// From the unsucessful responses only the 404 and 410 status codes should execute a deletion.
for ($status = 400; $status < 600; ++$status) {
if (\in_array($status, [Response::HTTP_NOT_FOUND, Response::HTTP_GONE], true)) {
continue;
}

yield 'Should be skipped because the response status ('.$status.') is not successful and not a "not found" or "gone" response' => [
Request::create('/foobar'),
new Response('', $status),
SearchIndexListener::FEATURE_DELETE | SearchIndexListener::FEATURE_INDEX,
false,
false,
];
}
}
}