
Commit

feat: allow rewriting in URL filter (breaking change, hence bumping the version)
joscha committed Nov 24, 2015
1 parent 4a5f07a commit 1affaec
Showing 5 changed files with 71 additions and 19 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -25,7 +25,7 @@ An instance of CrawlKit has the following properties/methods:

* `.url`: `String` the URL where the crawling/scraping is supposed to start. This is automatically set from the `CrawlKit` constructor, but can be changed afterwards.
* `.finder`: `Function` allows you to set a method for link discovery that gets called on a page. See an example in `finders/genericAnchors.js`.
-* `.urlFilter`: `Function` allows you to set a method for filtering discovered URLs. For an example see `examples/advanced.js`.
+* `.urlFilter`: `Function` allows you to set a method for filtering and rewriting discovered URLs. The first parameter is the URL about to be added; the second is the URL on which it was discovered. Return `false` to discard the URL. Any other return value (as long as it is a valid URL) is used instead. If you return a relative URL, it is resolved against the URL where it was found. For an example see `examples/advanced.js`.
* `.addRunner(runnerId, runnerInstance)`: `void` allows you to add a runner that is executed on each crawled page. A runner instance has to have a `getCompanionFiles` method returning an array of (local) file paths and a `getRunnable` method returning a method to run in the context of the webpage. For an example see `examples/simple.js`.
* `.timeout`: `int` (ms) allows you to set the timeout for the finder and runners. The timeout starts fresh for each runner. Default is `10000` (10 seconds).
* `.concurrency`: `int` allows you to define how many Phantom browsers are used in parallel. Defaults to `1`.
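For illustration, the `.urlFilter` contract described above has three possible outcomes: discard, rewrite, or keep. A minimal sketch with hypothetical URLs and filter logic, not part of this commit:

```js
const CrawlKit = require('crawlkit');

const crawler = new CrawlKit('http://example.com/start.html');

crawler.urlFilter = (url, foundOnUrl) => {
    if (url.indexOf('/logout') !== -1) {
        return false; // discard the discovered URL entirely
    }
    if (url.indexOf('#') !== -1) {
        return url.split('#')[0]; // rewrite: strip the fragment
    }
    // a relative return value would be resolved against `foundOnUrl`
    return url; // keep the URL unchanged
};
```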
6 changes: 5 additions & 1 deletion examples/advanced.js
@@ -8,7 +8,11 @@ const crawler = new CrawlKit(baseURL);

crawler.finder = genericAnchors;
crawler.urlFilter = function onlySameDomain(url) {
-    return urijs(url).domain() === urijs(baseURL).domain();
+    if (urijs(url).domain() !== urijs(baseURL).domain()) {
+        // discard URL
+        return false;
+    }
+    return url;
};

class TitleRunner {
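With rewriting supported, the example filter above could also normalize links instead of only discarding them, for instance forcing HTTPS. A hypothetical extension reusing `urijs`, `baseURL` and `crawler` from the example, not part of this commit:

```js
crawler.urlFilter = function sameDomainHttpsOnly(url) {
    const uri = urijs(url);
    if (uri.domain() !== urijs(baseURL).domain()) {
        // discard off-domain URLs
        return false;
    }
    // rewrite: force the https scheme before the URL is queued
    return uri.protocol('https').toString();
};
```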
16 changes: 12 additions & 4 deletions index.js
@@ -267,10 +267,18 @@ class CrawlKit {
    urls.forEach((url) => {
        try {
            const uri = new URI(url);
-            const absoluteUrl = uri.absoluteTo(new URI(task.url)).toString();
-            if (self.urlFilter && !self.urlFilter(absoluteUrl)) {
-                workerDebug(`Discovered URL ${url} ignored due to URL filter.`);
-                return;
+            const fromUri = new URI(task.url);
+            let absoluteUrl = uri.absoluteTo(fromUri).toString();
+            if (self.urlFilter) {
+                const rewrittenUrl = self.urlFilter(absoluteUrl, task.url);
+                if (rewrittenUrl === false) {
+                    workerDebug(`Discovered URL ${url} ignored due to URL filter.`);
+                    return;
+                }
+                if (rewrittenUrl !== absoluteUrl) {
+                    workerDebug(`${url} was rewritten to ${rewrittenUrl}.`);
+                    absoluteUrl = new URI(rewrittenUrl).absoluteTo(fromUri).toString();
+                }
            }
            addUrl(absoluteUrl);
        } catch (e) {
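The rewrite branch resolves whatever the filter returns against the page it was found on, so a filter may return a relative URL. A quick sketch of the URI.js resolution the new code relies on, using hypothetical URLs:

```js
const URI = require('urijs');

// A filter returning 'redirected.html' for a link found on
// http://example.com/section/page.html ends up queued as:
const absolute = new URI('redirected.html')
    .absoluteTo(new URI('http://example.com/section/page.html'))
    .toString();

console.log(absolute); // http://example.com/section/redirected.html
```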
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
  "name": "crawlkit",
-  "version": "1.1.5",
+  "version": "1.2.0",
  "description": "A crawler based on Phantom. Allows discovery of dynamic content and supports custom scrapers.",
  "main": "index.js",
  "repository": "https://github.com/crawlkit/crawlkit.git",
64 changes: 52 additions & 12 deletions test/index.js
@@ -181,22 +181,62 @@ describe('CrawlKit', function main() {
            });
        });

-        it('and filter the results', () => {
-            const crawler = new CrawlKit(url);
+        describe('urlFilter', () => {
+            it('and filter the results', () => {
+                const crawler = new CrawlKit(url);

-            const results = {};
-            results[`${url}/`] = {};
-            results[`${url}/other.html`] = {};
+                const results = {};
+                results[`${url}/`] = {};
+                results[`${url}/other.html`] = {};

-            crawler.finder = genericLinkFinder;
+                crawler.finder = genericLinkFinder;

-            const spy = sinon.spy((u) => u.indexOf('somehash') === -1);
-            crawler.urlFilter = spy;
+                const spy = sinon.spy((u) => {
+                    if (u.indexOf('somehash') !== -1) {
+                        return false;
+                    }
+                    return u;
+                });
+                crawler.urlFilter = spy;

-            return crawler.crawl().then((result) => {
-                spy.callCount.should.equal(2);
-                return result.results;
-            }).should.eventually.deep.equal(results);
+                return crawler.crawl().then((result) => {
+                    spy.callCount.should.equal(2);
+                    return result.results;
+                }).should.eventually.deep.equal(results);
+            });
+
+            it('and rewrite the results', () => {
+                const crawler = new CrawlKit(url);
+
+                const results = {};
+                results[`${url}/`] = {};
+                results[`${url}/redirected.html`] = {};
+                results[`${url}/other.html`] = {};
+
+                crawler.finder = genericLinkFinder;
+
+                crawler.urlFilter = (u) => {
+                    if (u.indexOf('somehash') !== -1) {
+                        return 'redirected.html';
+                    }
+                    return u;
+                };
+
+                return crawler.crawl().should.eventually.deep.equal({results});
+            });
+
+            it('should handle faulty rewrites', () => {
+                const crawler = new CrawlKit(url);
+
+                const results = {};
+                results[`${url}/`] = {};
+
+                crawler.finder = genericLinkFinder;
+
+                crawler.urlFilter = () => {};
+
+                return crawler.crawl().should.eventually.deep.equal({results});
+            });
        });
    });
