## Bloom Filter Implementation
- References
   - https://en.wikipedia.org/wiki/Bloom_filter
   - https://brilliant.org/wiki/bloom-filter/#:~:text=A%20bloom%20filter%20is%20a,is%20added%20to%20the%20set.
   - https://www.geeksforgeeks.org/bloom-filters-introduction-and-python-implementation/

In [None]:
from typing import List, Tuple
from itertools import product

import matplotlib.pyplot as plt
import mmh3
import pandas as pd
import seaborn as sns
from bitarray import bitarray


In [None]:
import hashlib

class BloomFilter:
    """Bloom filter stored as a list of ``m`` 0/1 flags.

    Slot indices come from the module-level ``h1`` and ``h2`` helpers, so only
    ``k == 1`` (``h1`` alone) and ``k == 2`` (``h1`` and ``h2``) are supported.

    Attributes are ``m`` (number of slots), ``k`` (number of hash functions),
    ``data`` (the list of flags) and ``n`` (number of ``insert`` calls so far).
    """

    # Only two hash helpers (h1, h2) exist, which caps the usable k.
    _MAX_HASHES = 2

    def __init__(self, m, k):
        """Create an empty filter.

        Args:
            m: Number of slots; must be at least 1.
            k: Number of hash functions; must be 1 or 2.

        Raises:
            ValueError: If ``m`` or ``k`` is outside the supported range.
        """
        if m < 1:
            # A zero-sized table would only fail later with ZeroDivisionError.
            raise ValueError("m must be a positive number of slots, got %r" % (m,))
        if not 1 <= k <= self._MAX_HASHES:
            # Any other k would set no bits at all, so every lookup would
            # report a possible match.
            raise ValueError(
                "k must be between 1 and %d, got %r" % (self._MAX_HASHES, k)
            )
        self.m = m
        self.k = k
        self.data = [0] * m
        self.n = 0

    def _positions(self, element):
        """Return the ``k`` slot indices that ``element`` maps to."""
        # Resolved at call time because h1/h2 are defined after this class.
        hashers = (h1, h2)[: self.k]
        return [hasher(element) % self.m for hasher in hashers]

    def insert(self, element):
        """Set every slot ``element`` maps to and count the insertion."""
        for position in self._positions(element):
            self.data[position] = 1
        self.n += 1

    def search(self, element):
        """Report whether ``element`` may have been inserted.

        Returns:
            ``"Not in Bloom Filter"`` when at least one of the element's slots
            is still 0; otherwise a message that includes the estimated false
            positive probability ``(1 - (1 - 1/m) ** (k * n)) ** k``.
        """
        if any(self.data[position] == 0 for position in self._positions(element)):
            return "Not in Bloom Filter"
        prob = (1.0 - ((1.0 - 1.0 / self.m) ** (self.k * self.n))) ** self.k
        return "Might be in Bloom Filter with false positive probability " + str(prob)

def h1(w):
    """Return a deterministic integer hash of ``w`` based on its MD5 digest.

    Args:
        w: Text or bytes to hash. ``str`` input is encoded as UTF-8 first.

    Returns:
        The first 8 digest bytes read as a non-negative big-endian integer.
    """
    # hashlib only accepts bytes-like input on Python 3.
    payload = w.encode("utf-8") if isinstance(w, str) else w
    digest = hashlib.md5(payload).digest()
    # Read the digest directly instead of passing it through the built-in
    # hash(), which is salted per process for str/bytes. The value is left
    # unreduced so callers can take it modulo any table size.
    return int.from_bytes(digest[:8], "big")

def h2(w):
    """Return a deterministic integer hash of ``w`` based on its SHA-256 digest.

    Args:
        w: Text or bytes to hash. ``str`` input is encoded as UTF-8 first.

    Returns:
        The first 8 digest bytes read as a non-negative big-endian integer.
    """
    # hashlib only accepts bytes-like input on Python 3.
    payload = w.encode("utf-8") if isinstance(w, str) else w
    digest = hashlib.sha256(payload).digest()
    # Read the digest directly instead of passing it through the built-in
    # hash(), which is salted per process for str/bytes. The value is left
    # unreduced so callers can take it modulo any table size.
    return int.from_bytes(digest[:8], "big")

In [None]:
class BloomFilter:
    """Bloom filter that keeps its bits in a ``bitarray`` and hashes with ``mmh3``.

    One ``mmh3.hash`` call is made per seed in ``seeds`` (``0 .. num_hash - 1``),
    and each result is reduced modulo ``num_bits`` to pick a bit.
    """

    def __init__(self, num_bits: int, num_hash: int) -> None:
        """Build an all-zero filter of ``num_bits`` bits with ``num_hash`` seeds."""
        self.num_bits = num_bits
        self.num_hash = num_hash

        bits = bitarray(self.num_bits)
        bits.setall(False)
        self.bit_array = bits

        # Number of insert() calls made so far.
        self.counter = 0
        self.seeds = list(range(self.num_hash))

    def insert(self, element: str) -> None:
        """Turn on the bit selected by each seeded hash of ``element``."""
        for seed in self.seeds:
            self.bit_array[mmh3.hash(element, seed) % self.num_bits] = True

        self.counter += 1

    def does_exist(self, element: str) -> bool:
        """Return ``True`` only if every bit selected for ``element`` is set."""
        return all(
            self.bit_array[mmh3.hash(element, seed) % self.num_bits]
            for seed in self.seeds
        )

    def predict_false_positive_rate(self) -> float:
        """Return ``(1 - (1 - 1/num_bits) ** (num_hash * counter)) ** num_hash``."""
        miss_one_write = 1 - 1 / self.num_bits
        still_unset = miss_one_write ** (self.num_hash * self.counter)
        return (1 - still_unset) ** self.num_hash


In [None]:
def examine_result(
    num_bits: int,
    num_hash: int,
    positive_elements: List[str],
    test_elements: List[str],
) -> Tuple[float, float]:
    """Compare the predicted and the observed positive rate of a fresh filter.

    A ``BloomFilter(num_bits, num_hash)`` is filled with ``positive_elements``
    and then queried with every entry of ``test_elements``.

    Returns:
        ``(predicted, actual)``, where ``predicted`` is the filter's own
        estimate after the insertions and ``actual`` is the fraction of
        ``test_elements`` that the filter reports as present.
    """
    bloom = BloomFilter(num_bits=num_bits, num_hash=num_hash)
    for word in positive_elements:
        bloom.insert(word)

    predicted = bloom.predict_false_positive_rate()

    reported_present = 0
    for candidate in test_elements:
        if bloom.does_exist(candidate):
            reported_present += 1

    return predicted, reported_present / len(test_elements)


### Example

In [None]:
# Words that get inserted into the filter.
present_words = (
    "Yesterday All my troubles seemed so far away "
    "Now it looks as though they're here to stay "
    "Oh, I believe in yesterday"
).split()
# Words that are never inserted; used to probe for false positives.
absent_words = "When find myself times of trouble".split()

#### Case I

In [None]:
# Case I: one configuration with m = 128 bits and k = 10 hash functions.
# m and k are bound here so the summary line below can print them.
m, k = 128, 10
predicted_false_positive_rate, actual_false_positive_rate = examine_result(m, k, present_words, absent_words)
print(f"m = {m}, k = {k}")
print(f"The estimated false positive probability is {predicted_false_positive_rate}")
print(f"The actual false positive probability is {actual_false_positive_rate}")

#### Case II - Comparison

In [None]:
# Case II: evaluate every (m, k) pair on the powers of two from 2 to 128.
m_candidates = [2 ** exponent for exponent in range(1, 8)]
k_candidates = list(m_candidates)

# One row per pair: (m, k, predicted rate, observed rate).
results = [
    (*params, *examine_result(*params, present_words, absent_words))
    for params in product(m_candidates, k_candidates)
]
result_df = pd.DataFrame(
    data=results,
    columns=["m", "k", "predicted_fpr", "actual_fpr"],
)


In [None]:
# Side-by-side heatmaps of the predicted and the observed rate over the
# (m, k) grid held in result_df.
fig, (ax1, ax2) = plt.subplots(ncols=2)
fig.subplots_adjust(wspace=0.1)

# DataFrame.pivot only accepts keyword arguments on pandas >= 2.0.
# Both panels share a fixed 0..1 colour scale: with the colour bars hidden,
# independently scaled panels could not be compared by eye.
sns.heatmap(
    result_df.pivot(index="m", columns="k", values="predicted_fpr"),
    ax=ax1,
    cbar=False,
    vmin=0.0,
    vmax=1.0,
)
ax1.set_title('Predicted False Positive Rate')
sns.heatmap(
    result_df.pivot(index="m", columns="k", values="actual_fpr"),
    ax=ax2,
    cbar=False,
    vmin=0.0,
    vmax=1.0,
)
ax2.set_title('Actual False Positive Rate')

# Put the right-hand panel's tick labels on its outer edge.
ax2.yaxis.tick_right()
plt.show()