Skip to content

Commit

Permalink
respected own_host_only settings, resolves #74
Browse files Browse the repository at this point in the history
  • Loading branch information
solverat committed Sep 12, 2018
1 parent b095ebc commit 4ae7ac0
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/00_Configuration_Values.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Here you'll find all the configuration possibilities, default values and also so
| enabled | bool | false | Enable and configure the search frontend if you want to include a full text search on your website. |
| fuzzy_search_results | bool | false | Fuzzy search results: When enabled, a fuzzy search is performed. The search will automatically include related terms. |
| search_suggestion | bool | true | Search suggestions: When enabled, a fuzzy search for similar search terms is performed. If no results could be found for the search term entered by the user, similar search terms are presented as suggestions. |
| own_host_only | bool | false | Own Host only: Check to limit search results to results from the current (sub-)domain only. |
| own_host_only | bool | true | Own Host only: When enabled, search results are limited to the (sub-)domain of the current seed. |
| seeds | array | [] | Start URLs (Seeds): Specify the start URLs for the crawler, including the protocol (e.g. http://www.pimcore.org). Enter a starting URL on your main domain first and any subdomains after it, because the domain of the first URL is used as the main domain for sitemap generation. |
| categories | service | ~ | If search results should be displayed by categories, please enter all valid categories here. The crawler sorts a page into a category if it contains an HTML meta tag with the name "lucene-search:categories". |
| filter:allow_query_in_url | bool | false | When checked, LuceneSearch will crawl URLs with query fragments. |
Expand Down
2 changes: 1 addition & 1 deletion src/LuceneSearchBundle/Resources/config/pimcore/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ lucene_search:
fuzzy_search_results: false
search_suggestion: true

own_host_only: false
own_host_only: true

seeds: []
categories: ~
Expand Down
11 changes: 6 additions & 5 deletions src/LuceneSearchBundle/Task/Crawler/CrawlerTask.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
use LuceneSearchBundle\Task\Crawler\Event\Logger;
use LuceneSearchBundle\Task\Crawler\Event\Statistics;
use LuceneSearchBundle\Task\Crawler\PersistenceHandler;

use Psr\Http\Message\RequestInterface;
use VDB\Spider\Spider;
use VDB\Spider\QueueManager;
Expand Down Expand Up @@ -97,7 +96,7 @@ class CrawlerTask extends AbstractTask
/**
* @var bool
*/
protected $allowSubDomains = false;
protected $ownHostOnly = true;

/**
* @var int
Expand All @@ -109,8 +108,6 @@ class CrawlerTask extends AbstractTask
*/
public function isValid()
{
$this->allowSubDomains = false;

$filterLinks = $this->configuration->getConfig('filter');
$crawlerConfig = $this->configuration->getConfig('crawler');

Expand All @@ -123,6 +120,7 @@ public function isValid()
$this->invalidLinks = $this->getInvalidLinks();
$this->contentMaxSize = $crawlerConfig['content_max_size'];

$this->ownHostOnly = $this->configuration->getConfig('own_host_only');
$this->validMimeTypes = $this->configuration->getConfig('allowed_mime_types');
$this->allowedSchemes = $this->configuration->getConfig('allowed_schemes');
$this->downloadLimit = $crawlerConfig['max_download_limit'];
Expand Down Expand Up @@ -193,7 +191,10 @@ public function process($previousData)
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//link[@hreflang]|//a[not(@rel='nofollow')]"));

$spider->getDiscovererSet()->addFilter(new Filter\Prefetch\AllowedSchemeFilter($this->allowedSchemes));
$spider->getDiscovererSet()->addFilter(new Filter\Prefetch\AllowedHostsFilter([$this->seed], $this->allowSubDomains));

if ($this->ownHostOnly === true) {
$spider->getDiscovererSet()->addFilter(new Filter\Prefetch\AllowedHostsFilter([$this->seed], true));
}

if ($this->allowHashInUrl === false) {
$spider->getDiscovererSet()->addFilter(new Filter\Prefetch\UriWithHashFragmentFilter());
Expand Down

0 comments on commit 4ae7ac0

Please sign in to comment.