diff --git a/README.md b/README.md index d475f5f..8c7aa3b 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ Uses the implementation of Rabin fingerprinting from [LBFS](https://github.com/f Rabin fingerprinting is useful for finding the chunks of a file that differ from a previous version. It's one implementation of a technique called "Content-defined chunking", meaning the chunk boundaries are determinstic to the content (as opposed to "fixed-sized chunking"). -## API +There's a JavaScript API and an accompanying command-line tool. + +## JavaScript API ### `var createRabin = require('rabin')` @@ -16,7 +18,7 @@ Rabin fingerprinting is useful for finding the chunks of a file that differ from `rabin` is a duplex stream. You write raw data in, and buffers chunked by rabin fingerprints will be written out. -## Example +## JavaScript Example ```js // require and create an instance @@ -32,3 +34,20 @@ rabin.on('data', function (chunk) { // and splitting on each rabin fingerprint found }) ``` + +## CLI API + +``` +$ npm install rabin -g +$ rabin myfile.txt --bits=14 --min=8192 --max=32768 # defaults +{"length":12182,"offset":0,"hash":"5df6245b5897336ebf611d7f10fb90eea2d63c5b9ec9ad76dfb1ac72b8249dcb"} +{"length":13190,"offset":12182,"hash":"67d5aaac9cf7b8432cb3c8071d726dc38f1138957c30719f8b166116a90950a1"} +{"length":11609,"offset":25372,"hash":"976a0e3dc43de3abdf50b984a102c5fb7c2550e3dc5e44e4a8f7d4241276683b"} +{"length":10010,"offset":36981,"hash":"7145d10f93ea03e6c8b4dd5ab148e2c3c08f9c71bf71c7559dffdfcef48112c1"} +{"length":13623,"offset":46991,"hash":"76470d5047f9fb31bd75364d90355fdbf913aaa1df934251f43c894f01381f1b"} +{"length":8197,"offset":60614,"hash":"88abce05bc75f72cdafeabd5125eb46fa8f73eab2d75a29076aeb3f99ef35548"} +{"length":16242,"offset":68811,"hash":"08d60789c1e901d6a8e474aeb5de4746af1648e7f3a4ac7a3dba87d9e73fca56"} +{"length":14947,"offset":85053,"hash":"4224e6f4361fa8bdefb9d8e10ebd046e2869af2c44ea7e84c7efaeedd5423b30"} +average 12500 +``` + diff --git 
a/bindings.cc b/bindings.cc index a594124..5e64d1f 100644 --- a/bindings.cc +++ b/bindings.cc @@ -32,13 +32,21 @@ void get_fingerprints(rabin_t *hasher, Local bufs, Local lengths) NAN_METHOD(Initialize) { if (instance_counter >= 1024) return Nan::ThrowError("the value of instance_counter is too damn high"); struct rabin_t *hasher = (struct rabin_t *) malloc(sizeof(struct rabin_t)); + + if (!info[0]->IsNumber()) return Nan::ThrowError("first arg must be a number"); + if (!info[1]->IsNumber()) return Nan::ThrowError("second arg must be a number"); + if (!info[2]->IsNumber()) return Nan::ThrowError("third arg must be a number"); + + hasher->average_bits = info[0]->Uint32Value(); + hasher->minsize = info[1]->Uint32Value(); + hasher->maxsize = info[2]->Uint32Value(); + + // Open a pull request if you need these to be configurable + hasher->mask = ((1<average_bits)-1); hasher->polynomial = 0x3DA3358B4DC173LL; hasher->polynomial_degree = 53; - hasher->average_bits = 14; - hasher->minsize = 8 * 1024; - hasher->maxsize = 32 * 1024; - hasher->mask = ((1<average_bits)-1); hasher->polynomial_shift = (hasher->polynomial_degree-8); + rabin_init(hasher); instances[instance_counter++] = hasher; info.GetReturnValue().Set(instance_counter - 1); diff --git a/cli.js b/cli.js index caeb872..78f446d 100755 --- a/cli.js +++ b/cli.js @@ -1,9 +1,10 @@ #!/usr/bin/env node var fs = require('fs') var crypto = require('crypto') -var rabin = require('./')() +var args = require('minimist')(process.argv.slice(2)) +var rabin = require('./')(args) var offset = 0 -var rs = fs.createReadStream(process.argv[2]) +var rs = fs.createReadStream(args._[0]) var count = 0 rs.pipe(rabin).on('data', function (ch) { offset += ch.length diff --git a/index.js b/index.js index f7ffac6..a682966 100644 --- a/index.js +++ b/index.js @@ -6,11 +6,14 @@ var debug = require('debug')('rabin') module.exports = Rabin -function Rabin () { - if (!(this instanceof Rabin)) return new Rabin() +function Rabin (opts) { + if 
(!(this instanceof Rabin)) return new Rabin(opts) this.destroyed = false this.rabinEnded = false - this.rabin = rabin.initialize() + var avgBits = +opts.bits || 12 + var min = +opts.min || 8 * 1024 + var max = +opts.max || 32 * 1024 + this.rabin = rabin.initialize(avgBits, min, max) this.nextCb = null this.buffers = new BufferList() this.on('finish', function () { diff --git a/package.json b/package.json index 9022572..49f432d 100644 --- a/package.json +++ b/package.json @@ -6,6 +6,7 @@ "bin": { "rabin": "cli.js" }, + "keywords": ["rabin", "cdc", "chunking", "fingerprint", "rolling hash", "dedupe", "deduplication", "rsync"], "scripts": { "test": "echo \"Error: no test specified\" && exit 1", "install": "prebuild --download && echo \"Installed prebuilt binary successfully.\n\"", @@ -18,6 +19,7 @@ "bindings": "^1.2.1", "bl": "^1.0.0", "debug": "^2.2.0", + "minimist": "^1.2.0", "nan": "^2.1.0", "prebuild": "^2.6.2", "readable-stream": "^2.0.4"