In [1]:
!pip install mmh3 
!pip install bitarray 
from bitarray import bitarray 
import math 
import mmh3 
class BloomFilter():
    """Bloom filter backed by a ``bitarray`` and seeded ``mmh3`` hashes.

    A Bloom filter answers "is this item possibly in the set?". It never
    reports a stored item as missing, but it may report an item that was
    never stored as present (a false positive).

    Attributes:
        FP_probability: the false-positive probability requested at creation.
        array_size: number of bits in the underlying bit array.
        hashfuns_count: number of hash functions (seeds) used per item.
        bit_array: the ``bitarray`` holding the filter state.
    """

    def __init__(self, expected_itemsCount, FP_probability):
        """Size the filter and allocate a zeroed bit array.

        Args:
            expected_itemsCount: number of items the filter is expected to
                hold; must be positive.
            FP_probability: desired false-positive probability; must lie
                strictly between 0 and 1.

        Raises:
            ValueError: if either argument is outside its valid range.
        """
        # Reject inputs for which the sizing formulas are undefined
        # (log of a non-positive number, division by zero, negative sizes).
        if expected_itemsCount <= 0:
            raise ValueError("expected_itemsCount must be a positive number")
        if not 0 < FP_probability < 1:
            raise ValueError("FP_probability must be strictly between 0 and 1")

        self.FP_probability = FP_probability
        # Array size follows from the expected item count and the target
        # false-positive probability.
        self.array_size = self.get_arraySize(expected_itemsCount, FP_probability)
        # The hash-function count follows from the array size and item count.
        self.hashfuns_count = self.get_hashfuns_count(self.array_size, expected_itemsCount)
        self.bit_array = bitarray(self.array_size)
        self.bit_array.setall(0)  # start with every bit cleared

    def _indexes(self, item):
        """Yield the bit positions that ``item`` maps to, one per hash function."""
        for seed in range(self.hashfuns_count):
            # A different seed gives a different hash value for the same item.
            # Python's % with a positive divisor is never negative, so the
            # result is always a valid index.
            yield mmh3.hash(item, seed) % self.array_size

    def add_item(self, item):
        """Insert ``item`` by setting every bit position it hashes to."""
        for array_index in self._indexes(item):
            self.bit_array[array_index] = True

    def check_item(self, item):
        """Return False if ``item`` was definitely never added, True if it may have been.

        A single unset bit proves the item is absent; ``all`` stops at the
        first such bit.
        """
        return all(self.bit_array[array_index] for array_index in self._indexes(item))

    def get_hashfuns_count(self, array_size, n):
        """Calculate the number of hash functions to use.

        Formula: hashfuns_count = (array_size / n) * log(2), truncated to an
        integer and never less than 1.

        Args:
            array_size: number of bits in the filter.
            n: expected number of items to store.

        Returns:
            The number of hash functions as an int (at least 1).
        """
        # Use the ``n`` argument, not a module-level variable, so the result
        # depends only on what the caller passed in.
        hashfuns_count = (array_size / n) * math.log(2)
        # With zero hash functions every lookup would report "present".
        return max(1, int(hashfuns_count))

    def get_arraySize(self, expected_itemsCount, FP_probability):
        """Calculate the size of the bit array.

        Formula:
            array_size = -(expected_itemsCount * log(FP_probability)) / (log(2) ** 2)
        truncated to an integer and never less than 1.

        Args:
            expected_itemsCount: expected number of items to store.
            FP_probability: desired false-positive probability.

        Returns:
            The number of bits as an int (at least 1).
        """
        array_size = -(expected_itemsCount * math.log(FP_probability)) / (math.log(2) ** 2)
        # A zero-length array would make the modulo in _indexes divide by zero.
        return max(1, int(array_size))

Collecting mmh3
  Downloading https://files.pythonhosted.org/packages/fa/7e/3ddcab0a9fcea034212c02eb411433db9330e34d626360b97333368b4052/mmh3-2.5.1.tar.gz
Building wheels for collected packages: mmh3
  Building wheel for mmh3 (setup.py) ... [?25l[?25hdone
  Created wheel for mmh3: filename=mmh3-2.5.1-cp36-cp36m-linux_x86_64.whl size=37862 sha256=563c8f90b97782b784f2b7b6bbcd37e23342c0e6f58e2f3b44d59843e38e5ce2
  Stored in directory: /root/.cache/pip/wheels/38/b4/ea/6e4e321c625d3320c0c496bf4088371546d8fce5f1dd71b219
Successfully built mmh3
Installing collected packages: mmh3
Successfully installed mmh3-2.5.1
Collecting bitarray
  Downloading https://files.pythonhosted.org/packages/c7/2a/35d3bd5bffa9e179267318057a12adc41f837310edf043d8e6d939719f95/bitarray-1.0.1.tar.gz
Building wheels for collected packages: bitarray
  Building wheel for bitarray (setup.py) ... [?25l[?25hdone
  Created wheel for bitarray: filename=bitarray-1.0.1-cp36-cp36m-linux_x86_64.whl size=76956 sha256=01edfb28ce

In [2]:
#test the bloom filter 
from random import shuffle 

# --- Filter configuration -------------------------------------------------
FP_probability = 0.01       # target false-positive probability
expected_itemsCount = 10    # number of items the filter is sized for
bloomfilter = BloomFilter(expected_itemsCount, FP_probability)
print(
    "Size of the array: {} , and the number of hash functions: {}".format(
        bloomfilter.array_size, bloomfilter.hashfuns_count
    )
)

# --- Sample data ----------------------------------------------------------
# Countries that get inserted into the filter.
# NOTE: this list has 12 entries while the filter above is sized for 10.
country_present = [
    'Albania', 'Australia', 'Greece', 'Canada', 'China', 'Mexico',
    'Cyprus', 'France', 'Netherlands', 'Norway', 'Poland', 'Portugal',
]
# Countries that are never inserted.
country_absent = ['Romania', 'Russia', 'Singapore', 'Slovakia', 'Slovenia', 'Spain']

# --- Populate the filter --------------------------------------------------
for item in country_present:
    bloomfilter.add_item(item)

shuffle(country_absent)
shuffle(country_present)

# --- Query a random mix of inserted and never-inserted names ---------------
# Six of the inserted countries plus all of the absent ones, in random order.
test_countries = country_present[:6] + country_absent
shuffle(test_countries)

for country in test_countries:
    if bloomfilter.check_item(country):
        # The filter says "maybe present": tell real hits from false positives.
        if country in country_absent:
            message = "'{}' its a false positive "
        else:
            message = "'{}' it's probably present!"
    else:
        # A negative answer from the filter is always correct.
        message = " '{}'  it's not present!"
    print(message.format(country))

Size of the array: 95 , and the number of hash functions: 6
'China' it's probably present!
 'Slovenia'  it's not present!
 'Slovakia'  it's not present!
'Australia' it's probably present!
'Cyprus' it's probably present!
'Poland' it's probably present!
'Canada' it's probably present!
'Albania' it's probably present!
 'Spain'  it's not present!
 'Singapore'  it's not present!
 'Romania'  it's not present!
 'Russia'  it's not present!
