In [1]:
import math 
import numpy as no
from learnedbf import *
from learnedbf.BF import *
from learnedbf.classifiers import ScoredDecisionTreeClassifier
import numpy as np
from sklearn.datasets import make_classification

%load_ext autoreload
%autoreload 2

In [2]:
def optimal_bloom_filter_bits(n, p): 
    """ 
    Calculate the optimal number of bits (m) for a Bloom filter. 
    
    Parameters: 
    n (int): Number of keys 
    p (float): Desired false positive rate 
    Returns: 
    int: Optimal number of bits 
    """ 
    m = -(n * math.log(p)) / math.log(2) 
    return int(m)

def extract_negative_subsets1(X, y, size1, seed):
    """
    Extracts one subset from the negative items and returns it along 
       with the remaining training data.
    
    Parameters:
    X (array-like): Training set features.
    y (array-like): Training set labels.
    size1 (int): Fractions of negative items for the first subset.
    
    
    Returns:
    array-like: Two subsets of negative items and the remaining training data.
    """
    # Find the indices of negative items
    negative_indices = np.where(y == 0)[0]
    np.random.seed(seed)
    # Ensure the sizes do not exceed the number of negative items
    if size1 >= 1 :
        raise ValueError("The combined size of subsets exceeds the number of negative items.")
    
    # Shuffle the negative indices to ensure random selection
    np.random.shuffle(negative_indices)
    n = len(y)
    size1 = int(size1*n)
    
    # Extract the subsets
    if size1>0:
        subset1_indices = negative_indices[:size1]
        subset1_X = X[subset1_indices]    
        
        # Get the remaining data
        remaining_indices = np.setdiff1d(np.arange(X.shape[0]), 
                                     np.concatenate([subset1_indices]))
    else:            
        raise ValueError(" extract_negative_subsets: size must be > 0")
    
    remaining_X = X[remaining_indices]
    remaining_y = y[remaining_indices]
    
    return subset1_X , remaining_X, remaining_y

In [3]:


n_samples = 500000
n_clusters_per_class = 1
eps= [0.1, 0.01, 0.001]
class_sep_list = [0.1, 0.5, 1]
n_feat = 5
seed = 52

In [8]:


ext_LBF_fpr=[]
ext_LBF_ratio=[]
ext_LBF_space=[]
ext_SLBF_fpr=[]
ext_SLBF_ratio=[]
ext_SLBF_space=[]
ext_PLBF_fpr=[]
ext_PLBF_ratio=[]
ext_PLBF_space=[] 
ext_FLBF_use_BF = []
ext_FLBF_failed = []
ext_FLBF_fpr = []
ext_FLBF_space = []
ext_FLBF_ratio = []

ext_for_print = []
for j, epsilon in enumerate(eps):
    LBF_fpr=[]
    LBF_ratio=[]
    LBF_space=[]
    SLBF_fpr=[]
    SLBF_ratio=[]
    SLBF_space=[]
    PLBF_fpr=[]
    PLBF_ratio=[]
    PLBF_space=[] 
    FLBF_fpr = []
    FLBF_space = []
    FLBF_ratio = []
    FLBF_use_BF = []
    FLBF_failed = []
    for_print = []
    print(f"----------------------------\n--------------------------\nEpsilon: {epsilon}")
       
    for class_sep in class_sep_list:
        print(f"\t class_sep: {class_sep}")        
        X, y = make_classification(n_samples=n_samples, n_features=n_feat, n_informative=3, 
                                    n_redundant=1, n_clusters_per_class=n_clusters_per_class, 
                                    weights=[0.6, 0.4], flip_y=0, class_sep=class_sep, random_state=seed)
          
        #print(f"X.shape: {X.shape}")
        n_pos = np.sum(y==1)
        #print(f"Positives: {n_pos}")
        test_rate= 0.15
        X_test1, X, y = extract_negative_subsets1(X, y, test_rate, seed)
        
        #spazio necessario filtro classico
        bf_space = optimal_bloom_filter_bits(n_pos, epsilon)
        
        ###ALTRI FILTRI APPRESI##################à
        # print(f"\t\t----- Training LBF......")
        # dt = ScoredDecisionTreeClassifier(float_size=32)        
        # lbf = LBF(epsilon=epsilon, classifier=dt, threshold_test_size=0.2, 
        #           fpr_test_size=0, random_state=seed,
        #           hyperparameters={'max_leaf_nodes':[5, 10, 20, 35, 50]})
        # dt = lbf.classifier
        # lbf.fit(X, y.astype('bool'))
        # fpr = lbf.estimate_FPR(X_test1)
        # LBF_fpr.append(fpr)
        # tmp_bf_space = optimal_bloom_filter_bits(n_pos, fpr)
        # size_dict = lbf.get_size()
        # print(f"\t lbf size {size_dict}")
        # lbf_space = sum([size_dict[k] for k in size_dict])
        # LBF_space.append(lbf_space)
        # ratio = lbf_space / tmp_bf_space
        # LBF_ratio.append(ratio)

        #print(f'\tclass_sep={class_sep:.3f}, FPR={fpr:.3f}, lbf/bf={ratio:.3f}')


        print(f"\t\t----- Training SLBF......")
        dt = ScoredDecisionTreeClassifier(float_size=32)
        slbf = SLBF(epsilon=epsilon, classifier=dt, random_state=seed, fpr_test_size=0,
                    threshold_test_size=0.2,
                    hyperparameters={'max_leaf_nodes':[5, 10, 20, 35, 50]})
        slbf.fit(X, y.astype('bool'))
        fpr = slbf.estimate_FPR(X_test1)
        SLBF_fpr.append(fpr)
        #print(f"\t slbf fpr con DT: {fpr}")
        tmp_bf_space = optimal_bloom_filter_bits(n_pos, fpr)
        size_dict = slbf.get_size()
        print(f"\t slbf size {size_dict}")
        slbf_space = sum([size_dict[k] for k in size_dict])
        
        SLBF_space.append(slbf_space)
        ratio = slbf_space / tmp_bf_space
        SLBF_ratio.append(ratio)
        print(f'\tclass_sep={class_sep:.3f}, FPR={fpr:.3f}, slbf/bf={ratio:.3f}')

        
        
        print(f"\t\t----- Training PLBF......")
        dt = ScoredDecisionTreeClassifier(float_size=32)
        plbf = PLBF(epsilon=epsilon, classifier=dt, random_state=seed, fpr_test_size=0,
                    threshold_test_size=0.2,
                    hyperparameters={'max_leaf_nodes':[5, 10, 20, 35, 50]})
        plbf.fit(X, y.astype('bool'))
        fpr = plbf.estimate_FPR(X_test1)
        PLBF_fpr.append(fpr)
        tmp_bf_space = optimal_bloom_filter_bits(n_pos, fpr)
        size_dict = plbf.get_size()
        print(f"\t plbf size {size_dict}")
        plbf_space = sum([size_dict[k] for k in size_dict])
        
        PLBF_space.append(plbf_space)
        ratio = plbf_space / tmp_bf_space
        PLBF_ratio.append(ratio)
        print(f'\tclass_sep={class_sep:.3f}, FPR={fpr:.3f}, plbf/bf={ratio:.3f}')
                  
    
    ext_SLBF_fpr.append(SLBF_fpr)
    ext_SLBF_space.append(SLBF_space)
    ext_SLBF_ratio.append(SLBF_ratio)
    ext_PLBF_fpr.append(PLBF_fpr)
    ext_PLBF_space.append(PLBF_space)
    ext_PLBF_ratio.append(PLBF_ratio)

    ext_LBF_fpr.append(LBF_fpr)    
    ext_LBF_space.append(LBF_space)
    ext_LBF_ratio.append(LBF_ratio)
        #plot_results(epsilon, inner_dim_list, FLF_fpr, FLF_space, inner_classif, n, bf_space)
    #plot_results(epsilon, for_print, FLF_fpr, FLF_space, FLF_occ, inner_classif, n_pos, bf_space, cur_t)


----------------------------
--------------------------
Epsilon: 0.1
	 class_sep: 0.1
		----- Training SLBF......
	 slbf size {'backup_filter': 120110, 'initial_filter': 329478, 'classifier': 7872}
	class_sep=0.100, FPR=0.101, slbf/bf=0.690
		----- Training PLBF......
	 plbf size {'backup_filters': 242147, 'classifier': 7872}
	class_sep=0.100, FPR=0.107, plbf/bf=0.388
	 class_sep: 0.5
		----- Training SLBF......
	 slbf size {'backup_filter': 104121, 'initial_filter': 126313, 'classifier': 7872}
	class_sep=0.500, FPR=0.134, slbf/bf=0.410
		----- Training PLBF......
	 plbf size {'backup_filters': 113665, 'classifier': 7872}
	class_sep=0.500, FPR=0.167, plbf/bf=0.236
	 class_sep: 1
		----- Training SLBF......
	 slbf size {'backup_filter': 5171, 'initial_filter': 374233, 'classifier': 7872}
	class_sep=1.000, FPR=0.102, slbf/bf=0.587
		----- Training PLBF......
	 plbf size {'backup_filters': 12146, 'classifier': 7872}
	class_sep=1.000, FPR=0.101, plbf/bf=0.030
----------------------------
-

In [None]:


ext_LBF_fpr=[]
ext_LBF_ratio=[]
ext_LBF_space=[]
ext_SLBF_fpr=[]
ext_SLBF_ratio=[]
ext_SLBF_space=[]
ext_PLBF_fpr=[]
ext_PLBF_ratio=[]
ext_PLBF_space=[] 
ext_FLBF_use_BF = []
ext_FLBF_failed = []
ext_FLBF_fpr = []
ext_FLBF_space = []
ext_FLBF_ratio = []

ext_for_print = []
for j, m in enumerate([200000, 500000, 1000000]):
    LBF_fpr=[]
    LBF_ratio=[]
    LBF_space=[]
    SLBF_fpr=[]
    SLBF_ratio=[]
    SLBF_space=[]
    PLBF_fpr=[]
    PLBF_ratio=[]
    PLBF_space=[] 
    FLBF_fpr = []
    FLBF_space = []
    FLBF_ratio = []
    FLBF_use_BF = []
    FLBF_failed = []
    for_print = []
    print(f"----------------------------\n--------------------------\nEpsilon: {epsilon}")
       
    for class_sep in class_sep_list:
        print(f"\t class_sep: {class_sep}")        
        X, y = make_classification(n_samples=n_samples, n_features=n_feat, n_informative=3, 
                                    n_redundant=1, n_clusters_per_class=n_clusters_per_class, 
                                    weights=[0.6, 0.4], flip_y=0, class_sep=class_sep, random_state=seed)
          
        #print(f"X.shape: {X.shape}")
        n_pos = np.sum(y==1)
        #print(f"Positives: {n_pos}")
        test_rate= 0.15
        X_test1, X, y = extract_negative_subsets1(X, y, test_rate, seed)
        
        #spazio necessario filtro classico
        #bf_space = optimal_bloom_filter_bits(n_pos, epsilon)
        
        ###ALTRI FILTRI APPRESI##################à
        # print(f"\t\t----- Training LBF......")
        # dt = ScoredDecisionTreeClassifier(float_size=32)        
        # lbf = LBF(epsilon=epsilon, classifier=dt, threshold_test_size=0.2, 
        #           fpr_test_size=0, random_state=seed,
        #           hyperparameters={'max_leaf_nodes':[5, 10, 20, 35, 50]})
        # dt = lbf.classifier
        # lbf.fit(X, y.astype('bool'))
        # fpr = lbf.estimate_FPR(X_test1)
        # LBF_fpr.append(fpr)
        # tmp_bf_space = optimal_bloom_filter_bits(n_pos, fpr)
        # size_dict = lbf.get_size()
        # print(f"\t lbf size {size_dict}")
        # lbf_space = sum([size_dict[k] for k in size_dict])
        # LBF_space.append(lbf_space)
        # ratio = lbf_space / tmp_bf_space
        # LBF_ratio.append(ratio)

        #print(f'\tclass_sep={class_sep:.3f}, FPR={fpr:.3f}, lbf/bf={ratio:.3f}')


        print(f"\t\t----- Training SLBF......")
        dt = ScoredDecisionTreeClassifier(float_size=32)
        slbf = SLBF(m=m, classifier=dt, random_state=seed, fpr_test_size=0,
                    threshold_test_size=0.2,
                    hyperparameters={'max_leaf_nodes':[5, 10, 20, 35, 50]})
        slbf.fit(X, y.astype('bool'))
        fpr = slbf.estimate_FPR(X_test1)
        SLBF_fpr.append(fpr)
        #print(f"\t slbf fpr con DT: {fpr}")
        tmp_bf_space = optimal_bloom_filter_bits(n_pos, fpr)
        size_dict = slbf.get_size()
        print(f"\t slbf size {size_dict}")
        slbf_space = sum([size_dict[k] for k in size_dict])
        
        SLBF_space.append(slbf_space)
        ratio = slbf_space / tmp_bf_space
        SLBF_ratio.append(ratio)
        print(f'\tclass_sep={class_sep:.3f}, FPR={fpr:.3f}, slbf/bf={ratio:.3f}')

        
        
        print(f"\t\t----- Training PLBF......")
        dt = ScoredDecisionTreeClassifier(float_size=32)
        plbf = PLBF(m=m, classifier=dt, random_state=seed, fpr_test_size=0,
                    threshold_test_size=0.2,
                    hyperparameters={'max_leaf_nodes':[5, 10, 20, 35, 50]})
        plbf.fit(X, y.astype('bool'))
        fpr = plbf.estimate_FPR(X_test1)
        PLBF_fpr.append(fpr)
        tmp_bf_space = optimal_bloom_filter_bits(n_pos, fpr)
        size_dict = plbf.get_size()
        print(f"\t plbf size {size_dict}")
        plbf_space = sum([size_dict[k] for k in size_dict])
        
        PLBF_space.append(plbf_space)
        ratio = plbf_space / tmp_bf_space
        PLBF_ratio.append(ratio)
        print(f'\tclass_sep={class_sep:.3f}, FPR={fpr:.3f}, plbf/bf={ratio:.3f}')
                  
    
    ext_SLBF_fpr.append(SLBF_fpr)
    ext_SLBF_space.append(SLBF_space)
    ext_SLBF_ratio.append(SLBF_ratio)
    ext_PLBF_fpr.append(PLBF_fpr)
    ext_PLBF_space.append(PLBF_space)
    ext_PLBF_ratio.append(PLBF_ratio)

    ext_LBF_fpr.append(LBF_fpr)    
    ext_LBF_space.append(LBF_space)
    ext_LBF_ratio.append(LBF_ratio)
        #plot_results(epsilon, inner_dim_list, FLF_fpr, FLF_space, inner_classif, n, bf_space)
    #plot_results(epsilon, for_print, FLF_fpr, FLF_space, FLF_occ, inner_classif, n_pos, bf_space, cur_t)


NameError: name 'epsilon' is not defined

In [4]:
X, y = make_classification(n_samples=n_samples, n_features=n_feat, n_informative=3, 
                                    n_redundant=1, n_clusters_per_class=1, 
                                    weights=[0.6, 0.4], flip_y=0, class_sep=0.5, random_state=seed)

#print(f"X.shape: {X.shape}")
n_pos = np.sum(y==1)
#print(f"Positives: {n_pos}")
test_rate= 0.15
X_test1, X, y = extract_negative_subsets1(X, y, test_rate, seed)

lbf.fit(X, y.astype('bool'))
dt = lbf.classifier
fpr = lbf.estimate_FPR(X_test1)

print(f"\t\t FPR lbf: {fpr}")

#LBF_fpr.append(fpr)
tmp_bf_space = optimal_bloom_filter_bits(n_pos, fpr)
size_dict = lbf.get_size()
print(f"\t lbf size {size_dict}")
lbf_space = sum([size_dict[k] for k in size_dict])
ratio = lbf_space / tmp_bf_space


print(f'\tclass_sep={class_sep:.3f}, FPR={fpr:.3f}, lbf/bf={ratio:.3f}')


print(f"\t\t----- Training SLBF......")
#dt = ScoredDecisionTreeClassifier(float_size=32)
slbf = SLBF(epsilon=epsilon, classifier=dt, random_state=seed, fpr_test_size=0,
        threshold_test_size=0.2)#,
       #  hyperparameters={'max_leaf_nodes':[5, 10, 20, 35, 50]})
slbf.fit(X, y.astype('bool'))

fpr = slbf.estimate_FPR(X_test1)
print(f"\t\t FPR slbf: {fpr} ")

fpr = slbf.estimate_FPR(X_test1)
SLBF_fpr.append(fpr)
print(f"\t slbf fpr con DT: {fpr}")
tmp_bf_space = optimal_bloom_filter_bits(n_pos, fpr)
size_dict = slbf.get_size()
print(f"\t slbf size {size_dict}")
slbf_space = sum([size_dict[k] for k in size_dict])

#SLBF_space.append(slbf_space)
ratio = slbf_space / tmp_bf_space
#SLBF_ratio.append(ratio)
print(f'\tclass_sep={class_sep:.3f}, FPR={fpr:.3f}, slbf/bf={ratio:.3f}')

NameError: name 'lbf' is not defined

In [9]:
X, y = make_classification(n_samples=n_samples, n_features=n_feat, n_informative=3, 
                                    n_redundant=1, n_clusters_per_class=1, 
                                    weights=[0.6, 0.4], flip_y=0, class_sep=0.5, random_state=seed)

#print(f"X.shape: {X.shape}")
n_pos = np.sum(y==1)
#print(f"Positives: {n_pos}")
test_rate= 0.15
X_test1, X, y = extract_negative_subsets1(X, y, test_rate, seed)

lbf.fit(X, y.astype('bool'))
dt = lbf.classifier
fpr = lbf.estimate_FPR(X_test1)

print(f"\t\t FPR lbf: {fpr}")

#LBF_fpr.append(fpr)
tmp_bf_space = optimal_bloom_filter_bits(n_pos, fpr)
size_dict = lbf.get_size()
print(f"\t lbf size {size_dict}")
lbf_space = sum([size_dict[k] for k in size_dict])
ratio = lbf_space / tmp_bf_space


print(f'\tclass_sep={class_sep:.3f}, FPR={fpr:.3f}, lbf/bf={ratio:.3f}')


print(f"\t\t----- Training SLBF......")
dt = ScoredDecisionTreeClassifier(float_size=32)
slbf = SLBF(epsilon=epsilon, classifier=dt, random_state=seed, fpr_test_size=0,
        threshold_test_size=0.2,
       hyperparameters={'max_leaf_nodes':[5, 10, 20, 35, 50]})
slbf.fit(X, y.astype('bool'))

fpr = slbf.estimate_FPR(X_test1)
print(f"\t\t FPR slbf: {fpr} ")

fpr = slbf.estimate_FPR(X_test1)
SLBF_fpr.append(fpr)
print(f"\t slbf fpr con DT: {fpr}")
tmp_bf_space = optimal_bloom_filter_bits(n_pos, fpr)
size_dict = slbf.get_size()
print(f"\t slbf size {size_dict}")
slbf_space = sum([size_dict[k] for k in size_dict])

#SLBF_space.append(slbf_space)
ratio = slbf_space / tmp_bf_space
#SLBF_ratio.append(ratio)
print(f'\tclass_sep={class_sep:.3f}, FPR={fpr:.3f}, slbf/bf={ratio:.3f}')

		 FPR lbf: 9.333333333333333e-05
	 lbf size {'backup_filter': 2724900, 'classifier': 7872}
	class_sep=1.000, FPR=0.000, lbf/bf=1.021
		----- Training SLBF......
		 FPR slbf: 0.00092 
	 slbf fpr con DT: 0.00092
	 slbf size {'backup_filter': 662679, 'initial_filter': 953306, 'classifier': 7872}
	class_sep=1.000, FPR=0.001, slbf/bf=0.805
