
Commit

feat: allow rewriting in URL filter (breaking change, hence bumping the version)
joscha committed Nov 24, 2015
1 parent 4a5f07a commit 1affaec
Showing 5 changed files with 71 additions and 19 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -25,7 +25,7 @@ An instance of CrawlKit has the following properties/methods:

* `.url`: `String` the URL where the crawling/scraping is supposed to start. This is automatically set from the `CrawlKit` constructor, but can be changed afterwards.
* `.finder`: `Function` allows you to set a method for link discovery that gets called on a page. See an example in `finders/genericAnchors.js`.
-* `.urlFilter`: `Function` allows you to set a method for filtering discovered URLs. For an example see `examples/advanced.js`.
+* `.urlFilter`: `Function` allows you to set a method for filtering and rewriting discovered URLs. The first parameter is the URL about to be added; the second is the URL on which it was discovered. Return `false` to discard the URL. Any other return value (as long as it is a valid URL) is used instead. If you return a relative URL, it is resolved against the URL where it was found. For an example see `examples/advanced.js`.
* `.addRunner(runnerId, runnerInstance)`: `void` allows you to add a runner that is executed on each crawled page. A runner instance has to have a `getCompanionFiles` method returning an array of (local) file paths and a `getRunnable` method returning a method to run in the context of the webpage. For an example see `examples/simple.js`.
* `.timeout`: `int` (ms) allows you to set the timeout for the finder and runners. The timeout starts fresh for each runner. Default is `10000` (10 seconds).
* `.concurrency`: `int` allows you to define how many Phantom browsers are used in parallel. Defaults to `1`.
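For illustration, the `.urlFilter` contract described above has three possible outcomes: discard, rewrite, or keep. A minimal sketch with hypothetical URLs and filter logic, not part of this commit:

```js
const CrawlKit = require('crawlkit');

const crawler = new CrawlKit('http://example.com/start.html');

crawler.urlFilter = (url, foundOnUrl) => {
    if (url.indexOf('/logout') !== -1) {
        return false; // discard the discovered URL entirely
    }
    if (url.indexOf('#') !== -1) {
        return url.split('#')[0]; // rewrite: strip the fragment
    }
    // a relative return value would be resolved against `foundOnUrl`
    return url; // keep the URL unchanged
};
```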
6 changes: 5 additions & 1 deletion examples/advanced.js
@@ -8,7 +8,11 @@ const crawler = new CrawlKit(baseURL);

crawler.finder = genericAnchors;
crawler.urlFilter = function onlySameDomain(url) {
-    return urijs(url).domain() === urijs(baseURL).domain();
+    if (urijs(url).domain() !== urijs(baseURL).domain()) {
+        // discard URL
+        return false;
+    }
+    return url;
};

class TitleRunner {
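With rewriting supported, the example filter above could also normalize links instead of only discarding them, for instance forcing HTTPS. A hypothetical extension reusing `urijs`, `baseURL` and `crawler` from the example, not part of this commit:

```js
crawler.urlFilter = function sameDomainHttpsOnly(url) {
    const uri = urijs(url);
    if (uri.domain() !== urijs(baseURL).domain()) {
        // discard off-domain URLs
        return false;
    }
    // rewrite: force the https scheme before the URL is queued
    return uri.protocol('https').toString();
};
```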
16 changes: 12 additions & 4 deletions index.js
@@ -267,10 +267,18 @@ class CrawlKit {
    urls.forEach((url) => {
        try {
            const uri = new URI(url);
-            const absoluteUrl = uri.absoluteTo(new URI(task.url)).toString();
-            if (self.urlFilter && !self.urlFilter(absoluteUrl)) {
-                workerDebug(`Discovered URL ${url} ignored due to URL filter.`);
-                return;
+            const fromUri = new URI(task.url);
+            let absoluteUrl = uri.absoluteTo(fromUri).toString();
+            if (self.urlFilter) {
+                const rewrittenUrl = self.urlFilter(absoluteUrl, task.url);
+                if (rewrittenUrl === false) {
+                    workerDebug(`Discovered URL ${url} ignored due to URL filter.`);
+                    return;
+                }
+                if (rewrittenUrl !== absoluteUrl) {
+                    workerDebug(`${url} was rewritten to ${rewrittenUrl}.`);
+                    absoluteUrl = new URI(rewrittenUrl).absoluteTo(fromUri).toString();
+                }
            }
            addUrl(absoluteUrl);
        } catch (e) {
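The rewrite branch resolves whatever the filter returns against the page it was found on, so a filter may return a relative URL. A quick sketch of the URI.js resolution the new code relies on, using hypothetical URLs:

```js
const URI = require('urijs');

// A filter returning 'redirected.html' for a link found on
// http://example.com/section/page.html ends up queued as:
const absolute = new URI('redirected.html')
    .absoluteTo(new URI('http://example.com/section/page.html'))
    .toString();

console.log(absolute); // http://example.com/section/redirected.html
```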
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
  "name": "crawlkit",
-  "version": "1.1.5",
+  "version": "1.2.0",
  "description": "A crawler based on Phantom. Allows discovery of dynamic content and supports custom scrapers.",
  "main": "index.js",
  "repository": "https://github.com/crawlkit/crawlkit.git",
64 changes: 52 additions & 12 deletions test/index.js
@@ -181,22 +181,62 @@ describe('CrawlKit', function main() {
            });
        });

-        it('and filter the results', () => {
-            const crawler = new CrawlKit(url);
+        describe('urlFilter', () => {
+            it('and filter the results', () => {
+                const crawler = new CrawlKit(url);

-            const results = {};
-            results[`${url}/`] = {};
-            results[`${url}/other.html`] = {};
+                const results = {};
+                results[`${url}/`] = {};
+                results[`${url}/other.html`] = {};

-            crawler.finder = genericLinkFinder;
+                crawler.finder = genericLinkFinder;

-            const spy = sinon.spy((u) => u.indexOf('somehash') === -1);
-            crawler.urlFilter = spy;
+                const spy = sinon.spy((u) => {
+                    if (u.indexOf('somehash') !== -1) {
+                        return false;
+                    }
+                    return u;
+                });
+                crawler.urlFilter = spy;

-            return crawler.crawl().then((result) => {
-                spy.callCount.should.equal(2);
-                return result.results;
-            }).should.eventually.deep.equal(results);
+                return crawler.crawl().then((result) => {
+                    spy.callCount.should.equal(2);
+                    return result.results;
+                }).should.eventually.deep.equal(results);
+            });
+
+            it('and rewrite the results', () => {
+                const crawler = new CrawlKit(url);
+
+                const results = {};
+                results[`${url}/`] = {};
+                results[`${url}/redirected.html`] = {};
+                results[`${url}/other.html`] = {};
+
+                crawler.finder = genericLinkFinder;
+
+                crawler.urlFilter = (u) => {
+                    if (u.indexOf('somehash') !== -1) {
+                        return 'redirected.html';
+                    }
+                    return u;
+                };
+
+                return crawler.crawl().should.eventually.deep.equal({results});
+            });
+
+            it('should handle faulty rewrites', () => {
+                const crawler = new CrawlKit(url);
+
+                const results = {};
+                results[`${url}/`] = {};
+
+                crawler.finder = genericLinkFinder;
+
+                crawler.urlFilter = () => {};
+
+                return crawler.crawl().should.eventually.deep.equal({results});
+            });
        });
    });
