From 73a2770e2d3535ec9f1b03d54be00e56ebb8ff16 Mon Sep 17 00:00:00 2001 From: Yanick Witschi Date: Tue, 9 Apr 2024 07:19:52 +0200 Subject: [PATCH] Merge pull request from GHSA-9jh5-qf84-x6pr * Ensure confidential headers are not leaked to external URIs when crawling * CS --------- Co-authored-by: Leo Feyer <1192057+leofeyer@users.noreply.github.com> --- core-bundle/src/Crawl/Escargot/Factory.php | 92 ++++++++++++++++--- .../ContaoCoreExtension.php | 4 +- core-bundle/src/Resources/config/services.yml | 1 + .../src/Resources/contao/classes/Crawl.php | 5 +- .../tests/Crawl/Escargot/FactoryTest.php | 90 +++++++++++++++++- .../ContaoCoreExtensionTest.php | 4 +- 6 files changed, 174 insertions(+), 22 deletions(-) diff --git a/core-bundle/src/Crawl/Escargot/Factory.php b/core-bundle/src/Crawl/Escargot/Factory.php index 547e0e3a25f..0b48413a2e7 100644 --- a/core-bundle/src/Crawl/Escargot/Factory.php +++ b/core-bundle/src/Crawl/Escargot/Factory.php @@ -17,7 +17,10 @@ use Contao\PageModel; use Doctrine\DBAL\Connection; use Nyholm\Psr7\Uri; +use Psr\Http\Message\UriInterface; use Symfony\Component\HttpClient\HttpClient; +use Symfony\Component\HttpClient\ScopingHttpClient; +use Symfony\Component\HttpFoundation\RequestStack; use Symfony\Component\Uid\Uuid; use Symfony\Contracts\HttpClient\HttpClientInterface; use Terminal42\Escargot\BaseUriCollection; @@ -36,6 +39,7 @@ class Factory private Connection $connection; private ContaoFramework $framework; + private RequestStack $requestStack; private array $defaultHttpClientOptions; /** @@ -48,12 +52,19 @@ class Factory */ private array $subscribers = []; - public function __construct(Connection $connection, ContaoFramework $framework, array $additionalUris = [], array $defaultHttpClientOptions = []) + /** + * @var \Closure(array): HttpClientInterface|null + */ + private ?\Closure $httpClientFactory; + + public function __construct(Connection $connection, ContaoFramework $framework, RequestStack $requestStack, array $additionalUris = [], array $defaultHttpClientOptions = [], \Closure $httpClientFactory = null) { $this->connection = $connection; $this->framework = $framework; + $this->requestStack = $requestStack; $this->additionalUris = $additionalUris; $this->defaultHttpClientOptions = $defaultHttpClientOptions; + $this->httpClientFactory = $httpClientFactory ?? static fn (array $defaultOptions) => HttpClient::create($defaultOptions); } public function addSubscriber(EscargotSubscriberInterface $subscriber): self @@ -163,18 +174,77 @@ public function createFromJobId(string $jobId, QueueInterface $queue, array $sel private function createHttpClient(array $options = []): HttpClientInterface { - return HttpClient::create( - array_merge_recursive( - [ - 'headers' => [ - 'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'user-agent' => self::USER_AGENT, - ], - 'max_duration' => 10, // Ignore requests that take longer than 10 seconds + $options = array_merge_recursive( + [ + 'headers' => [ + 'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'user-agent' => self::USER_AGENT, ], - array_merge_recursive($this->getDefaultHttpClientOptions(), $options) - ) + 'max_duration' => 10, // Ignore requests that take longer than 10 seconds + ], + array_merge_recursive($this->getDefaultHttpClientOptions(), $options) ); + + // Make sure confidential headers force a scoped client so external domains do not leak data + $cleanOptions = $this->cleanOptionsFromConfidentialData($options); + + if ($options === $cleanOptions) { + return ($this->httpClientFactory)($options); + } + + $scopedOptionsByRegex = []; + + // All options including the confidential headers for our root page collection + foreach ($this->getRootPageUriCollection()->all() as $rootPageUri) { + $scopedOptionsByRegex[preg_quote($this->getOriginFromUri($rootPageUri))] = $options; + } + + // Closing the session is necessary here as otherwise we might run into our own session lock + $request = $this->requestStack->getMainRequest(); + + if ($request && $request->hasSession()) { + $request->getSession()->save(); + } + + return new ScopingHttpClient(($this->httpClientFactory)($cleanOptions), $scopedOptionsByRegex); + } + + private function getOriginFromUri(UriInterface $uri): string + { + $origin = $uri->getScheme().'://'.$uri->getHost(); + + if ($uri->getPort()) { + $origin .= ':'.$uri->getPort(); + } + + return $origin.'/'; + } + + private function cleanOptionsFromConfidentialData(array $options): array + { + $cleanOptions = []; + + foreach ($options as $k => $v) { + if ('headers' === $k) { + foreach ($v as $header => $value) { + if (\in_array(strtolower($header), ['authorization', 'cookie'], true)) { + continue; + } + + $cleanOptions['headers'][$header] = $value; + } + + continue; + } + + if ('basic_auth' === $k || 'bearer_auth' === $k) { + continue; + } + + $cleanOptions[$k] = $v; + } + + return $cleanOptions; } private function registerDefaultSubscribers(Escargot $escargot): void diff --git a/core-bundle/src/DependencyInjection/ContaoCoreExtension.php b/core-bundle/src/DependencyInjection/ContaoCoreExtension.php index c17057103f4..0ef3a2fcb02 100644 --- a/core-bundle/src/DependencyInjection/ContaoCoreExtension.php +++ b/core-bundle/src/DependencyInjection/ContaoCoreExtension.php @@ -277,8 +277,8 @@ private function handleCrawlConfig(array $config, ContainerBuilder $container): } $factory = $container->getDefinition('contao.crawl.escargot.factory'); - $factory->setArgument(2, $config['crawl']['additional_uris']); - $factory->setArgument(3, $config['crawl']['default_http_client_options']); + $factory->setArgument(3, $config['crawl']['additional_uris']); + $factory->setArgument(4, $config['crawl']['default_http_client_options']); } /** diff --git a/core-bundle/src/Resources/config/services.yml b/core-bundle/src/Resources/config/services.yml index 2ff64a78e29..b49d5b490c3 100644 --- a/core-bundle/src/Resources/config/services.yml +++ b/core-bundle/src/Resources/config/services.yml @@ -82,6 +82,7 @@ services: arguments: - '@database_connection' - '@contao.framework' + - '@request_stack' contao.crawl.escargot.search_index_subscriber: class: Contao\CoreBundle\Crawl\Escargot\Subscriber\SearchIndexSubscriber diff --git a/core-bundle/src/Resources/contao/classes/Crawl.php b/core-bundle/src/Resources/contao/classes/Crawl.php index 8d98e14a7bf..14dad9472b2 100644 --- a/core-bundle/src/Resources/contao/classes/Crawl.php +++ b/core-bundle/src/Resources/contao/classes/Crawl.php @@ -125,12 +125,9 @@ public function run() } else { + // TODO: we need a way to authenticate with a token instead of our own cookie $session = System::getContainer()->get('session'); $clientOptions = array('headers' => array('Cookie' => sprintf('%s=%s', $session->getName(), $session->getId()))); - - // Closing the session is necessary here as otherwise we run into our own session lock - // TODO: we need a way to authenticate with a token instead of our own cookie - $session->save(); } } else diff --git a/core-bundle/tests/Crawl/Escargot/FactoryTest.php b/core-bundle/tests/Crawl/Escargot/FactoryTest.php index cfea1c57dff..e1f240d0e7b 100644 --- a/core-bundle/tests/Crawl/Escargot/FactoryTest.php +++ b/core-bundle/tests/Crawl/Escargot/FactoryTest.php @@ -18,6 +18,10 @@ use Contao\PageModel; use Doctrine\DBAL\Connection; use Nyholm\Psr7\Uri; +use Symfony\Component\HttpClient\MockHttpClient; +use Symfony\Component\HttpClient\Response\MockResponse; +use Symfony\Component\HttpClient\ScopingHttpClient; +use Symfony\Component\HttpFoundation\RequestStack; use Terminal42\Escargot\BaseUriCollection; use Terminal42\Escargot\Queue\InMemoryQueue; @@ -37,7 +41,7 @@ public function testHandlesSubscribersCorrectly(): void ->willReturn('subscriber-2') ; - $factory = new Factory($this->createMock(Connection::class), $this->mockContaoFramework()); + $factory = new Factory($this->createMock(Connection::class), $this->mockContaoFramework(), new RequestStack()); $factory->addSubscriber($subscriber1); $factory->addSubscriber($subscriber2); @@ -65,6 +69,7 @@ public function testBuildsUriCollectionsCorrectly(): void $factory = new Factory( $this->createMock(Connection::class), $this->mockContaoFramework([PageModel::class => $pageModelAdapter]), + new RequestStack(), ['https://example.com'] ); @@ -87,7 +92,7 @@ public function testCreatesEscargotCorrectlyWithNewJobId(): void ->willReturn('subscriber-1') ; - $factory = new Factory($this->createMock(Connection::class), $this->mockContaoFramework()); + $factory = new Factory($this->createMock(Connection::class), $this->mockContaoFramework(), new RequestStack()); $factory->addSubscriber($subscriber1); $uriCollection = new BaseUriCollection([new Uri('https://contao.org')]); @@ -110,7 +115,7 @@ public function testCreatesEscargotCorrectlyWithExistingJobId(): void ->willReturn('subscriber-1') ; - $factory = new Factory($this->createMock(Connection::class), $this->mockContaoFramework()); + $factory = new Factory($this->createMock(Connection::class), $this->mockContaoFramework(), new RequestStack()); $factory->addSubscriber($subscriber1); $queue = new InMemoryQueue(); @@ -126,4 +131,83 @@ public function testCreatesEscargotCorrectlyWithExistingJobId(): void $escargot = $factory->createFromJobId($jobId, $queue, ['subscriber-8']); $this->assertSame(Factory::USER_AGENT, $escargot->getUserAgent()); } + + public function testScopesConfidentialHeadersAutomatically(): void + { + $expectedRequests = [ + function (string $method, string $url, array $options): MockResponse { + $this->assertSame('GET', $method); + $this->assertSame('https://contao.org/robots.txt', $url); + $this->assertContains('Cookie: Confidential', $options['headers']); + $this->assertContains('Authorization: Basic dXNlcm5hbWU6cGFzc3dvcmQ=', $options['headers']); + + return new MockResponse(); + }, + function (string $method, string $url, array $options): MockResponse { + $this->assertSame('GET', $method); + $this->assertSame('https://contao.de/robots.txt', $url); + $this->assertContains('Cookie: Confidential', $options['headers']); + $this->assertContains('Authorization: Basic dXNlcm5hbWU6cGFzc3dvcmQ=', $options['headers']); + + return new MockResponse(); + }, + function (string $method, string $url, array $options): MockResponse { + $this->assertSame('GET', $method); + $this->assertSame('https://www.foreign-domain.com/robots.txt', $url); + $this->assertNotContains('Cookie: Confidential', $options['headers']); + $this->assertNotContains('Authorization: Basic dXNlcm5hbWU6cGFzc3dvcmQ=', $options['headers']); + + return new MockResponse(); + }, + ]; + + $mockClient = new MockHttpClient($expectedRequests); + $clientFactory = static fn (array $defaultOptions) => $mockClient; + + $rootPage1 = $this->createMock(PageModel::class); + $rootPage1 + ->method('getAbsoluteUrl') + ->willReturn('https://contao.org') + ; + + $rootPage2 = $this->createMock(PageModel::class); + $rootPage2 + ->method('getAbsoluteUrl') + ->willReturn('https://contao.de') + ; + + $pageModelAdapter = $this->mockAdapter(['findPublishedRootPages']); + $pageModelAdapter + ->method('findPublishedRootPages') + ->willReturn([$rootPage1, $rootPage2]) + ; + + $subscriber1 = $this->createMock(EscargotSubscriberInterface::class); + $subscriber1 + ->method('getName') + ->willReturn('subscriber-1') + ; + + $factory = new Factory( + $this->createMock(Connection::class), + $this->mockContaoFramework([PageModel::class => $pageModelAdapter]), + new RequestStack(), + ['https://www.foreign-domain.com'], + [ + 'headers' => [ + 'Cookie' => 'Confidential', + ], + 'auth_basic' => 'username:password', + ], + $clientFactory + ); + + $factory->addSubscriber($subscriber1); + + $escargot = $factory->create($factory->getCrawlUriCollection(), new InMemoryQueue(), ['subscriber-1']); + $escargot->crawl(); + + $this->assertSame(3, $mockClient->getRequestsCount()); + $this->assertInstanceOf(ScopingHttpClient::class, $escargot->getClient()); + } } diff --git a/core-bundle/tests/DependencyInjection/ContaoCoreExtensionTest.php b/core-bundle/tests/DependencyInjection/ContaoCoreExtensionTest.php index c3f7eb5d29f..97d2a6a6c00 100644 --- a/core-bundle/tests/DependencyInjection/ContaoCoreExtensionTest.php +++ b/core-bundle/tests/DependencyInjection/ContaoCoreExtensionTest.php @@ -338,8 +338,8 @@ public function testSetsTheCrawlOptionsOnTheEscargotFactory(): void $definition = $container->getDefinition('contao.crawl.escargot.factory'); - $this->assertSame(['https://example.com'], $definition->getArgument(2)); - $this->assertSame(['proxy' => 'http://localhost:7080', 'headers' => ['Foo' => 'Bar']], $definition->getArgument(3)); + $this->assertSame(['https://example.com'], $definition->getArgument(3)); + $this->assertSame(['proxy' => 'http://localhost:7080', 'headers' => ['Foo' => 'Bar']], $definition->getArgument(4)); } public function testConfiguresTheBackupManagerCorrectly(): void