## Bloom filter implementation
- References
   - https://en.wikipedia.org/wiki/Bloom_filter
   - https://brilliant.org/wiki/bloom-filter/
   - https://www.geeksforgeeks.org/bloom-filters-introduction-and-python-implementation/

In [71]:
from typing import List
from itertools import product

import mmh3
from bitarray import bitarray


In [75]:
class BloomFilter:
    """Fixed-size Bloom filter over strings, backed by a bit array.

    Each element is mapped to ``num_hash`` bit positions by hashing it with
    ``mmh3.hash`` under the seeds ``0 .. num_hash - 1``. Bits are only ever
    set, never cleared, so a membership query can report a false positive
    but never a false negative.
    """

    def __init__(self, num_bits: int, num_hash: int) -> None:
        """Create an empty filter.

        Args:
            num_bits: Size of the bit array (``m``); must be at least 1.
            num_hash: Number of hash functions (``k``); must be at least 1.

        Raises:
            ValueError: If ``num_bits`` or ``num_hash`` is smaller than 1.
        """
        if num_bits < 1:
            raise ValueError(f"num_bits must be >= 1, got {num_bits}")
        if num_hash < 1:
            raise ValueError(f"num_hash must be >= 1, got {num_hash}")

        self.num_bits = num_bits
        self.num_hash = num_hash

        self.bit_array = bitarray(self.num_bits)
        self.bit_array.setall(False)

        # Number of insert() calls so far (duplicates are counted each time).
        self.counter = 0
        # One seed per hash function.
        self.seeds = list(range(self.num_hash))

    def _bit_positions(self, element: str):
        """Yield the bit index selected by each seeded hash of ``element``."""
        for seed in self.seeds:
            # The hash value may be negative; Python's % with a positive
            # modulus still yields an index in [0, num_bits).
            yield mmh3.hash(element, seed) % self.num_bits

    def insert(self, element: str) -> None:
        """Add ``element`` to the filter by setting all of its bit positions."""
        for position in self._bit_positions(element):
            self.bit_array[position] = True

        self.counter += 1

    def does_exist(self, element: str) -> bool:
        """Return True if ``element`` may have been inserted, False if it was not.

        A True result can be a false positive; a False result is definitive.
        """
        return all(self.bit_array[position] for position in self._bit_positions(element))

    def predict_false_positive_rate(self) -> float:
        """Return the theoretical false positive probability.

        Uses the standard approximation ``(1 - (1 - 1/m) ** (k * n)) ** k``
        with ``m = num_bits``, ``k = num_hash`` and ``n = counter`` (the
        number of insert() calls so far).
        """
        # Probability that one given bit is still 0 after k * n bit writes.
        prob_bit_unset = (1 - 1 / self.num_bits) ** (self.num_hash * self.counter)
        return (1 - prob_bit_unset) ** self.num_hash


In [76]:
def examine_result(
    num_bits: int,
    num_hash: int,
    positive_elements: List[str],
    test_elements: List[str],
) -> tuple:
    """Compare the predicted and the measured false positive rate of a filter.

    Builds a ``BloomFilter`` with the given size, inserts every element of
    ``positive_elements``, and then queries ``test_elements``.

    Args:
        num_bits: Size of the filter's bit array.
        num_hash: Number of hash functions used by the filter.
        positive_elements: Elements to insert into the filter.
        test_elements: Elements to query. Only those that were NOT inserted
            count toward the measured rate, because a false positive is by
            definition a "present" answer for an element that was never added.

    Returns:
        A ``(predicted, actual)`` tuple of floats. ``actual`` is the fraction
        of non-inserted test elements that the filter reports as present; it
        is ``0.0`` when ``test_elements`` contains no non-inserted element.
    """
    bloom_filter = BloomFilter(num_bits=num_bits, num_hash=num_hash)
    for element in positive_elements:
        bloom_filter.insert(element)

    predicted_false_positive_rate = bloom_filter.predict_false_positive_rate()

    # Drop true positives: inserted elements are always reported present and
    # would otherwise inflate the measured false positive rate.
    inserted = set(positive_elements)
    negatives = [element for element in test_elements if element not in inserted]
    if not negatives:
        # Nothing to measure; avoid dividing by zero.
        return predicted_false_positive_rate, 0.0

    false_positives = sum(bloom_filter.does_exist(element) for element in negatives)
    actual_false_positive_rate = false_positives / len(negatives)
    return predicted_false_positive_rate, actual_false_positive_rate


### Example

In [61]:
# Words inserted into the filter.
present_words = "Yesterday All my troubles seemed so far away Now it looks as though they're here to stay Oh, I believe in yesterday".split()
# Words that are never inserted; they must not overlap with present_words,
# otherwise they cannot be used to measure false positives.
absent_words = "There's a shadow hanging over me Love was such an easy game".split()

In [None]:
# Candidate bit-array sizes (m) and hash-function counts (k) to compare.
m_candidates = [1, 10, 100, 1000]
k_candidates = [1, 10, 100, 1000]

for m, k in product(m_candidates, k_candidates):
    # Only the words meant to be absent are used as test elements; inserted
    # words are always reported present and would inflate the measured rate.
    predicted_false_positive_rate, actual_false_positive_rate = examine_result(m, k, present_words, absent_words)
    print(f"m = {m}, k = {k}")
    print(f"The estimated false positive probability is {predicted_false_positive_rate}")
    print(f"The actual false positive probability is {actual_false_positive_rate}")
