Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
116 lines (106 sloc) 3.16 KB
<?php
namespace nsqphp\Dedupe;
use nsqphp\Message\MessageInterface;
/**
* Lossy hash map that is able to tell us whether we have seen a message
* before, with a chance of a false negative (eg: saying we haven't, when we
* have), but no chance of a false positive (eg: saying we have when we
* haven't).
*
* Stores the hash map internally as a PHP array; hence bound within a
* single specific process (see Memcached implementation for one that uses
* external memory storage).
*
* This actually uses a hash of the content, so theoretically if both hash
* functions collided (eg: the one to pick index and the one to hash content)
* then we would return a false positive. I'm assuming this is vanishingly
* small. This should probably be investigated some more.
*
* May contain cruft.
*
* http://somethingsimilar.com/2012/05/21/the-opposite-of-a-bloom-filter/
*/
class OppositeOfBloomFilter implements DedupeInterface
{
/**
* Deleted placeholder
*/
const DELETED = 'D';
/**
* Hash map
*
* @var array
*/
private $map = array();
/**
* Size of hash map
*
* @var integer
*/
private $size;
/**
*
* @param integer $size
*/
public function __construct($size = 100000)
{
$this->size = $size;
}
/**
* Contains and add
*
* Test if we have seen this message before, whilst also adding to our
* knowledge the fact we have seen it now. We deduplicate against message
* content, topic and channel (eg: all three have to be the same to consider
* as a duplicate).
*
* @param string $topic
* @param string $channel
* @param MessageInterface $msg
*
* @return boolean
*/
public function containsAndAdd($topic, $channel, MessageInterface $msg)
{
$hashed = $this->hash($topic, $channel, $msg);
$this->map[$hashed['index']] = $hashed['content'];
return $hashed['seen'];
}
/**
* Remove knowledge of msg
*
* Test if we have seen this message before and if we have (eg: if we still
* have knowledge of the message) then "remove" it (so that we won't think
* we have seen it).
*
* @param string $topic
* @param string $channel
* @param MessageInterface $msg
*/
public function erase($topic, $channel, MessageInterface $msg)
{
$hashed = $this->hash($topic, $channel, $msg);
if ($hashed['seen']) {
$this->map[$hashed['index']] = self::DELETED;
}
}
/**
* Get bucket / content hash
*
* @param string $topic
* @param string $channel
* @param MessageInterface $msg
*
* @return array index, content, seen (boolean)
*/
private function hash($topic, $channel, MessageInterface $msg)
{
$element = "$topic:$channel:" . $msg->getPayload();
$hash = hash('adler32', $element, TRUE);
list(, $val) = unpack('N', $hash);
$index = $val % $this->size;
$content = md5($element);
$seen = isset($this->map[$index]) && $this->map[$index] === $content;
return array('index' => $index, 'content' => $content, 'seen' => $seen);
}
}