diff --git a/README.md b/README.md
index 6523957..4fdace8 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ An instance of CrawlKit has the following properties/methods:
 * `.url`: `String` the URL where the crawling/scraping is supposed to start. This is automatically set from the `CrawlKit` constructor, but can be changed afterwards.
 * `.finder`: `Function` allows you to set a method for link discovery that gets called on a page. See an example in `finders/genericAnchors.js`.
-* `.urlFilter`: `Function` allows you to set a method for filtering discovered URLs. For an example see `examples/advanced.js`.
+* `.urlFilter`: `Function` allows you to set a method for filtering and rewriting discovered URLs. The first parameter is the URL about to be added. The second parameter is the URL where this URL was discovered. Return `false` to discard the URL. Any other return value (as long as it is a valid URL) will be used instead. If you return a relative URL, it will be rewritten absolute to the URL where it was found. For an example see `examples/advanced.js`.
 * `.addRunner(runnerId, runnerInstance)`: `void` allows you to add a runner that is executed on each crawled page. A runner instance has to have a `getCompanionFiles` method returning an array of (local) file paths and a `getRunnable` method returning a method to run in the context of the webpage. For an example see `examples/simple.js`.
 * `.timeout`: `int` (ms) allows you to set the timeout for the finder and runners. The timeout starts fresh for each runner. Default is `10000` (10 seconds).
 * `.concurrency`: `int` allows you to define how many Phantom browsers are used in parallel. Defaults to `1`.
diff --git a/examples/advanced.js b/examples/advanced.js
index ac5d7a7..ff0d5be 100644
--- a/examples/advanced.js
+++ b/examples/advanced.js
@@ -8,7 +8,11 @@ const crawler = new CrawlKit(baseURL);
 crawler.finder = genericAnchors;
 
 crawler.urlFilter = function onlySameDomain(url) {
-    return urijs(url).domain() === urijs(baseURL).domain();
+    if (urijs(url).domain() !== urijs(baseURL).domain()) {
+        // discard URL
+        return false;
+    }
+    return url;
 };
 
 class TitleRunner {
diff --git a/index.js b/index.js
index 0198eba..fb47e26 100644
--- a/index.js
+++ b/index.js
@@ -267,10 +267,18 @@ class CrawlKit {
             urls.forEach((url) => {
                 try {
                     const uri = new URI(url);
-                    const absoluteUrl = uri.absoluteTo(new URI(task.url)).toString();
-                    if (self.urlFilter && !self.urlFilter(absoluteUrl)) {
-                        workerDebug(`Discovered URL ${url} ignored due to URL filter.`);
-                        return;
+                    const fromUri = new URI(task.url);
+                    let absoluteUrl = uri.absoluteTo(fromUri).toString();
+                    if (self.urlFilter) {
+                        const rewrittenUrl = self.urlFilter(absoluteUrl, task.url);
+                        if (rewrittenUrl === false) {
+                            workerDebug(`Discovered URL ${url} ignored due to URL filter.`);
+                            return;
+                        }
+                        if (rewrittenUrl !== absoluteUrl) {
+                            workerDebug(`${url} was rewritten to ${rewrittenUrl}.`);
+                            absoluteUrl = new URI(rewrittenUrl).absoluteTo(fromUri).toString();
+                        }
                     }
                     addUrl(absoluteUrl);
                 } catch (e) {
diff --git a/package.json b/package.json
index 849887b..b55736e 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "crawlkit",
-  "version": "1.1.5",
+  "version": "1.2.0",
   "description": "A crawler based on Phantom. Allows discovery of dynamic content and supports custom scrapers.",
   "main": "index.js",
   "repository": "https://github.com/crawlkit/crawlkit.git",
diff --git a/test/index.js b/test/index.js
index b80abc0..2928e87 100644
--- a/test/index.js
+++ b/test/index.js
@@ -181,22 +181,62 @@ describe('CrawlKit', function main() {
             });
         });
 
-        it('and filter the results', () => {
-            const crawler = new CrawlKit(url);
+        describe('urlFilter', () => {
+            it('and filter the results', () => {
+                const crawler = new CrawlKit(url);
 
-            const results = {};
-            results[`${url}/`] = {};
-            results[`${url}/other.html`] = {};
+                const results = {};
+                results[`${url}/`] = {};
+                results[`${url}/other.html`] = {};
 
-            crawler.finder = genericLinkFinder;
+                crawler.finder = genericLinkFinder;
 
-            const spy = sinon.spy((u) => u.indexOf('somehash') === -1);
-            crawler.urlFilter = spy;
+                const spy = sinon.spy((u) => {
+                    if (u.indexOf('somehash') !== -1) {
+                        return false;
+                    }
+                    return u;
+                });
+                crawler.urlFilter = spy;
+
+                return crawler.crawl().then((result) => {
+                    spy.callCount.should.equal(2);
+                    return result.results;
+                }).should.eventually.deep.equal(results);
+            });
+
+            it('and rewrite the results', () => {
+                const crawler = new CrawlKit(url);
+
+                const results = {};
+                results[`${url}/`] = {};
+                results[`${url}/redirected.html`] = {};
+                results[`${url}/other.html`] = {};
 
-            return crawler.crawl().then((result) => {
-                spy.callCount.should.equal(2);
-                return result.results;
-            }).should.eventually.deep.equal(results);
+                crawler.finder = genericLinkFinder;
+
+                crawler.urlFilter = (u) => {
+                    if (u.indexOf('somehash') !== -1) {
+                        return 'redirected.html';
+                    }
+                    return u;
+                };
+
+                return crawler.crawl().should.eventually.deep.equal({results});
+            });
+
+            it('should handle faulty rewrites', () => {
+                const crawler = new CrawlKit(url);
+
+                const results = {};
+                results[`${url}/`] = {};
+
+                crawler.finder = genericLinkFinder;
+
+                crawler.urlFilter = () => {};
+
+                return crawler.crawl().should.eventually.deep.equal({results});
+            });
         });
     });
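
A minimal usage sketch of the filter-and-rewrite `urlFilter` contract described in the README hunk above. The start URL, the require path for the finder, and the fragment-dropping rule are illustrative assumptions rather than code taken from this change; the `crawl()` result shape (`{ results }`) follows the tests in this diff.

    const CrawlKit = require('crawlkit');
    // Assumed require path; the finder ships as finders/genericAnchors.js in the package.
    const genericAnchors = require('crawlkit/finders/genericAnchors');

    const crawler = new CrawlKit('http://example.com'); // hypothetical start URL
    crawler.finder = genericAnchors;

    // First parameter: the absolute URL about to be added.
    // Second parameter: the URL on which it was discovered.
    crawler.urlFilter = (url, fromUrl) => {
        if (url.indexOf('#') !== -1) {
            return false; // discard fragment links entirely
        }
        // Any other return value replaces the discovered URL;
        // relative values are resolved against fromUrl by CrawlKit.
        return url;
    };

    crawler.crawl().then((data) => console.log(data.results));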