In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Functions

In [9]:
# Functions

def load_dataset(path, 
                 dataset_filename, 
                 sep=",", 
                 header=None,
                 nrows=None,
                 names=None, 
                 index_col=False, 
                 na_values='?'):
    """
    Load a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    path : str
        Directory containing the file
    dataset_filename : str
        Name of the file inside `path`
    sep : str
        Field separator
    header : int or None
        Row number to use as the header (None means the file has no header row)
    nrows : int or None
        Maximum number of rows to read (None reads the whole file)
    names : list or None
        Column names to assign to the loaded frame
    index_col : int, str, False or None
        Forwarded to pandas.read_csv
    na_values : scalar, str, list-like or dict
        Additional strings to recognise as missing values

    Returns
    -------
    pandas.DataFrame
        The loaded dataset
    """
    # Forward the caller's arguments: `names` and `na_values` must come from
    # the parameters, not from a notebook-level global or a hard-coded literal.
    return pd.read_csv(path+"/"+dataset_filename, 
                       sep=sep, 
                       header=header,
                       nrows=nrows,
                       names=names, 
                       index_col=index_col, 
                       na_values=na_values)

def binarize_data(data, label, threshold):
    """
    Replace column `label` of `data` with a +1 / -1 encoding, in place.

    Values greater than or equal to `threshold` become 1, all others become -1.
    The frame is modified directly and nothing is returned, so calling this
    twice on the same frame re-thresholds the already binarized values.
    """
    at_or_above = data[label] >= threshold
    data[label] = np.where(at_or_above, 1, -1)

def serialize_dataset(p_data, 
                      path, 
                      dataset_filename, 
                      suffix, 
                      sep=",", 
                      compression="bz2", 
                      index=False):
    """
    Write `p_data` as delimited text to `path`.

    The output file name is the part of `dataset_filename` before its first
    dot, followed by `suffix` (e.g. "train.csv" + "_ori.csv.bz2" gives
    "train_ori.csv.bz2"). `sep`, `compression` and `index` are forwarded to
    DataFrame.to_csv.
    """
    stem = dataset_filename.split(".")[0]
    destination = "/".join((path, stem + suffix))
    p_data.to_csv(destination, sep=sep, compression=compression, index=index)

In [3]:
def perturb_instance(x, rules, budget, max_budget_per_feature, thresholds):
    """
    Returns the set of possible perturbations of a given instance.

    Starting from the original instance, every affordable and valid rule is applied
    to each instance discovered so far. Every instance reached is recorded together with
    the residual budget it was reached with; an instance is only revisited when it is
    reached again with a strictly larger residual budget.

    Parameters
    ----------
    x : pandas.Series
        The original instance
    rules : list
        The list of modification rules. Each rule is a dict with the keys
        'f' (name of the feature it modifies), 'cost' (budget consumed by one application),
        'valid' (callable taking an instance, truthy when the rule may be applied),
        'is_cat' (truthy for categorical rules) and 'value' (the replacement value when
        'is_cat' is truthy, otherwise the signed offset added to the feature).
    budget : float
        The attacker's budget
    max_budget_per_feature : dict
        The maximum allowed amount of budget units that can be spent on each feature
    thresholds : dict
        feature -> array of relevant thresholds. Each array is compared against scalars
        and indexed with a boolean mask, so it must support NumPy-style indexing.

    Returns
    -------
    pandas.DataFrame
        The set of perturbations, de-duplicated. The original instance is the first key
        inserted into the bookkeeping dict, so it is the first row of the result.
    """
    
    # initialize the work list with the original instance, 
    # the initial budget, and an empty dictionary of budget units spent so far
    queue = [(x, budget, {})]
    # visited perturbations: tuple(instance) -> [residual budget, budget units spent per feature]
    seen = { tuple(x): [budget, {}] }
    
    # loop until the work list is empty
    while len(queue)>0:
        # NOTE: list.pop() removes the most recently appended element, so despite
        # its name `queue` is processed as a stack (LIFO), not as a FIFO queue
        item = queue.pop() # take the last inserted element
        x = item[0] # get the instance
        b = item[1] # get the residual budget
        budget_units_spent = item[2] # get the dictionary containing the amount of budget spent on each feature, so far
        
        # loop through all the attack rules
        for r in rules:
            f = r['f']  # feature the rule applies to
            # check budget: the rule must fit both the residual budget and the per-feature cap
            if not( r['cost'] <= b and budget_units_spent.get(f, 0) + r['cost'] <= max_budget_per_feature[f] ):
                continue
            # check validity of the rule on the current instance
            if not r['valid'](x):
                continue
            
            # apply rule to copies of the instance
            x_atks = []
            if r['is_cat']:
                # categorical rule: overwrite the feature with the rule's value
                xx = x.copy()
                xx[f] = r['value']
                x_atks += [xx]
            else:
                # numerical rule: instead of only producing x[f] + value, produce one
                # candidate for every threshold lying between the current and the
                # shifted value, plus the two endpoints themselves
                low,high=sorted([x[f], x[f]+r['value']])
                z = set(thresholds[f][np.logical_and(thresholds[f]>=low, thresholds[f]<=high)])
                z |= set([low,high])
                for zi in z:
                    xx = x.copy()
                    xx[f] = zi
                    x_atks += [xx]

                # all candidates are offered to both seen and queue below;
                # one endpoint equals the unchanged x[f] and yields a copy of x itself
                # NOTE(review): that copy appears to be rejected by the dominance check
                #     below, since x was recorded with a larger budget - confirm
                # TODO (from original author): thresholds at which a rule's validity
                #     changes are not handled yet

            # process all atks
            for xx in x_atks:
                # skip if already seen with an equal or larger residual budget
                xx_t = tuple(xx)
                res_b = b - r['cost']
                seen_budgets = seen.get(xx_t)
                if seen_budgets is not None and seen_budgets[0] >= res_b:
                    continue

                # update budgets spent (on a copy, the dict is shared with sibling candidates)
                updated_budget_units_spent = budget_units_spent.copy()
                updated_budget_units_spent[f] = updated_budget_units_spent.get(f,0) + r['cost']
                # add to frontier and to past seen elements
                seen[xx_t] = [res_b, updated_budget_units_spent]
                queue.append([xx, res_b, updated_budget_units_spent])
    
    # every key of `seen` is one reached instance; x has been rebound inside the loop
    # but is still used here to supply the column labels via its index
    perturbations_df = pd.DataFrame.from_records(list(seen.keys()), columns=x.index.values)
    perturbations_df = perturbations_df.drop_duplicates()
    return perturbations_df

In [4]:
def perturb_dataset(data, budget, max_budget_per_feature, rules, skip_class=None):
    """
    Returns the dataset extended with all instance perturbations.

    This function takes as input a dataset and returns another dataset which is obtained from the original
    by adding all the possible perturbations an attacker with budget B can apply to every instance.

    Parameters
    ----------
    data : pandas.DataFrame
        The original dataset
    budget : float
        The attacker's budget
    max_budget_per_feature : dict
        The maximum allowed amount of budget units that can be spent on each feature
    rules : list
        The list of modification rules
    skip_class : int
        if class (i.e. last column) equals skip_class, then the instance is kept
        as is and no perturbation is generated for it

    Returns
    -------
    pandas.DataFrame or None
        The perturbed dataset, with an extra leading "instance_id" column holding the
        1-based position of the originating instance. None if `data` is None or empty.

    """
    if data is None or data.empty:
        return None
    
    # compute valid thresholds: the distinct values observed for every column
    thresholds = {c:np.unique(data[c]) for c in data.columns}

    # Per-instance frames are collected in a list and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copying the accumulated
    # frame on every iteration is quadratic in the number of rows.
    frames = []
    
    # loop through every single instance in the original dataset
    print("***** Loop through all the original instances... *****")
    # instance_id starts at 1
    for instance_id, (index, instance) in enumerate(data.iterrows(), start=1):
        if instance_id%500==0:
            print("***** Perturbing instance [ID = #{}]... *****".format(instance_id))  
        
        # .iloc makes the "last column" lookup explicitly positional; plain
        # instance[-1] on a label-indexed Series is deprecated in pandas
        if skip_class is not None and instance.iloc[-1]==skip_class:
            # keep the original instance only
            perturbations = pd.DataFrame([instance])
        else:
            # apply perturbations
            perturbations = perturb_instance(x=instance, rules=rules, budget=budget, 
                                             max_budget_per_feature=max_budget_per_feature,
                                             thresholds=thresholds)
            
        # a scalar value is broadcast to every row of this instance's frame
        perturbations.insert(loc=0, 
                             column="instance_id", 
                             value=instance_id, 
                             allow_duplicates=True)

        frames.append(perturbations)
        
    # eventually, return the perturbed dataset
    print("***** Return the final perturbed dataset *****")
    return pd.concat(frames)

# Wine Dataset

In [5]:
# Name of the dataset; it is interpolated into the data directory path (PATH)
DATASET_NAME = "wine2"

# Column labels assigned to the loaded CSV files, one per line
colnames = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'pH',
    'sulphites',
    'alcohol',
    'is_white',
    'quality',
]

In [6]:
# Directory holding the splits of the selected dataset
PATH="../data/{}".format(DATASET_NAME)
# File names of the three splits inside PATH
TRAINING_SET="train.csv"
VALIDATION_SET="valid.csv"
TEST_SET="test.csv"
# Row caps forwarded as `nrows` when loading the training and test sets
N_TRAIN_INSTANCES=None # None loads the whole training set; set an integer to cap the rows read
N_TEST_INSTANCES=None # None loads the whole test set; set an integer to cap the rows read

In [15]:
######################
# Attacker Definition
######################
# Each rule is a dict describing one elementary modification:
#   'f'      feature the rule modifies
#   'valid'  predicate on an instance; the rule may only be applied when it holds
#   'value'  signed offset added to the feature (numerical rules)
#   'cost'   budget units consumed by one application
#   'is_cat' whether the feature is categorical
attacker_rules = [
    # alcohol increment rule: applicable while alcohol <= 10.0
    {'f'    :'alcohol',
     'valid': lambda x: x['alcohol'] <= 10.0,
     'value': .75,
     'cost' : 10,
     'is_cat': False },
    # residual sugar decrement rule: applicable while residual_sugar >= 8.0
    {'f'    : 'residual_sugar',
     'valid': lambda x: x['residual_sugar'] >= 8.0,
     'value': -1.2,
     'cost' : 10,
     'is_cat': False },
    # volatile acidity decrement rule: applicable while volatile_acidity >= 0.6
    {'f'    : 'volatile_acidity',
     'valid': lambda x: x['volatile_acidity'] >= 0.6,
     'value': -0.3,
     'cost' : 10,
     'is_cat': False }    
]

# Upper bound on the budget units that may be spent on each attackable feature
max_budget_per_feature = {
    'alcohol'     : 100,
    'residual_sugar': 100,
    'volatile_acidity'    : 100
}

# Attacker budgets to process. Only 40 is active; a wider sweep [20, 30, 40]
# used to be assigned here and was immediately overwritten (dead store).
B = [40]

# Load dataset

In [16]:
# load
# load the three splits; header=0 is passed together with names=colnames so that
# the header row of each file is replaced by colnames (pandas read_csv semantics).
# Only the training and test sets honour a row cap; validation is always read in full.
train = load_dataset(PATH, TRAINING_SET, names=colnames, header=0, nrows=N_TRAIN_INSTANCES)
valid = load_dataset(PATH, VALIDATION_SET, names=colnames, header=0)
test = load_dataset(PATH, TEST_SET, names=colnames, header=0, nrows=N_TEST_INSTANCES)

# binarize the label in place: quality >= 6 becomes 1, everything else -1.
# Re-running this cell without reloading re-thresholds the already binarized labels.
binarize_data(train, "quality", 6)
binarize_data(valid, "quality", 6)
binarize_data(test, "quality", 6)

In [17]:
# Report the (rows, columns) shape of each loaded split, one line per split
print("\n".join((
    "Shape of training set: {}".format(train.shape),
    "Shape of validation set: {}".format(valid.shape),
    "Shape of test set: {}".format(test.shape),
)))

Shape of training set: (4547, 13)
Shape of validation set: (650, 13)
Shape of test set: (1300, 13)


In [18]:
# train.head()
# valid.head()
# test.head()

# Save binarized and perturbed datasets

In [19]:
# Persist the loaded (label-binarized) splits next to the source files.
# serialize_dataset keeps the part of the file name before the first dot and appends
# the suffix, so these calls write train_ori.csv.bz2, valid_ori.csv.bz2 and
# test_ori.csv.bz2 under PATH (bz2-compressed by default).
serialize_dataset(train, PATH, TRAINING_SET, "_ori.csv.bz2")
serialize_dataset(valid, PATH, VALIDATION_SET, "_ori.csv.bz2")
serialize_dataset(test, PATH, TEST_SET, "_ori.csv.bz2")

In [20]:
# For every attacker budget: generate the perturbed version of each split,
# save it, and report the resulting shapes.
for budget in B:
    # NOTE(review): "Budged" is a typo for "Budget"; left untouched here because it is runtime output
    print ("Processing Budged: ", budget)
    # skip_class=1: instances whose last column (the binarized label) equals 1 are
    # kept as they are, only the remaining instances are perturbed
    train_att = perturb_dataset(train, budget, max_budget_per_feature, attacker_rules, skip_class=1)
    valid_att = perturb_dataset(valid, budget, max_budget_per_feature, attacker_rules, skip_class=1)
    test_att  = perturb_dataset(test, budget, max_budget_per_feature, attacker_rules, skip_class=1)
    
    # output names follow the pattern <split>_B<budget>.csv.bz2 under PATH
    serialize_dataset(train_att, PATH, TRAINING_SET, "_B{}".format(budget)+".csv.bz2")
    serialize_dataset(valid_att, PATH, VALIDATION_SET, "_B{}".format(budget)+".csv.bz2")
    serialize_dataset(test_att, PATH, TEST_SET, "_B{}".format(budget)+".csv.bz2")
    
    # the *_att names are rebound on every iteration; after the loop they hold
    # the result for the last budget in B
    print("Shape of attacked training set: {}".format(train_att.shape))
    print("Shape of attacked validation set: {}".format(valid_att.shape))
    print("Shape of attacked test set: {}".format(test_att.shape))

Processing Budged:  40
***** Loop through all the original instances... *****
***** Perturbing instance [ID = #500]... *****
***** Perturbing instance [ID = #1000]... *****
***** Perturbing instance [ID = #1500]... *****
***** Perturbing instance [ID = #2000]... *****
***** Perturbing instance [ID = #2500]... *****
***** Perturbing instance [ID = #3000]... *****
***** Perturbing instance [ID = #3500]... *****
***** Perturbing instance [ID = #4000]... *****
***** Perturbing instance [ID = #4500]... *****
***** Return the final perturbed dataset *****
***** Loop through all the original instances... *****
***** Perturbing instance [ID = #500]... *****
***** Return the final perturbed dataset *****
***** Loop through all the original instances... *****
***** Perturbing instance [ID = #500]... *****
***** Perturbing instance [ID = #1000]... *****
***** Return the final perturbed dataset *****
Shape of attacked training set: (2222084, 14)
Shape of attacked validation set: (75813, 14)
Shape 

# Debugging

In [210]:
# Debug run: perturb the training set with a budget of 20.
# This rebinds train_att, replacing the value left by the budget loop.
train_att = perturb_dataset(train, 20, max_budget_per_feature, attacker_rules, skip_class=1)

***** Loop through all the original instances... *****
***** Perturbing instance [ID = #500]... *****
***** Perturbing instance [ID = #1000]... *****
***** Perturbing instance [ID = #1500]... *****
***** Perturbing instance [ID = #2000]... *****
***** Perturbing instance [ID = #2500]... *****
***** Perturbing instance [ID = #3000]... *****
***** Perturbing instance [ID = #3500]... *****
***** Perturbing instance [ID = #4000]... *****
***** Perturbing instance [ID = #4500]... *****
***** Return the final perturbed dataset *****


In [211]:
# Size of the perturbed training set produced by the budget-20 debug run
train_att.shape

(361282, 14)

In [117]:
# NOTE(review): this cell's execution count is lower than that of the preceding cells,
# so its recorded output presumably comes from an earlier run - re-execute or delete
train_att.shape

(7811, 14)

In [212]:
# Size of the unperturbed training set, for comparison with train_att.shape
train.shape

(4547, 13)

In [218]:
# Sorted distinct alcohol values in the training set; perturb_dataset builds the
# per-feature thresholds the same way (np.unique over a column)
u = np.unique(train[['alcohol']]) 
u

array([ 8.        ,  8.4       ,  8.5       ,  8.6       ,  8.7       ,
        8.8       ,  8.9       ,  9.        ,  9.05      ,  9.1       ,
        9.2       ,  9.23333333,  9.25      ,  9.3       ,  9.4       ,
        9.5       ,  9.53333333,  9.55      ,  9.6       ,  9.63333333,
        9.7       ,  9.73333333,  9.75      ,  9.8       ,  9.9       ,
        9.95      , 10.        , 10.03333333, 10.1       , 10.13333333,
       10.15      , 10.2       , 10.3       , 10.4       , 10.46666667,
       10.5       , 10.53333333, 10.55      , 10.56666667, 10.6       ,
       10.65      , 10.7       , 10.75      , 10.8       , 10.9       ,
       10.93333333, 10.96666667, 11.        , 11.05      , 11.06666667,
       11.1       , 11.2       , 11.26666667, 11.3       , 11.33333333,
       11.4       , 11.43333333, 11.45      , 11.46666667, 11.5       ,
       11.55      , 11.6       , 11.63333333, 11.65      , 11.7       ,
       11.73333333, 11.75      , 11.8       , 11.85      , 11.9 

In [199]:
# Mimics the threshold-crossing expansion of perturb_instance: thresholds inside
# the interval, plus the two endpoints.
# NOTE(review): the recorded output lacks 8.0 although the mask is u>=8, so the output
# was presumably produced by an earlier version of this cell - re-execute to confirm
set(u[np.logical_and(u>=8 , u<=8.65)]) | set([8.3,8.65] )
# lower in queue but not in seen
# upper in both queue and in seen

{8.3, 8.4, 8.5, 8.6, 8.65}

In [220]:
# Number of distinct alcohol values lying in the closed interval [10, 11.5]
len( u[np.logical_and(u>=10 , u<=11.5)] )

34

In [170]:
# Position at which 8.6 would be inserted into the sorted distinct alcohol values
# (side='left': before any equal entry)
np.searchsorted(np.unique(train[['alcohol']]), [8.6], side='left') 

array([3])

In [204]:
# Checks that unpacking sorted([...]) yields (smaller, larger).
# NOTE(review): this rebinds `u`, which other debugging cells use for the array of
# distinct alcohol values; those cells fail if re-run after this one. The names also
# look swapped if u/l stand for upper/lower, since u receives the smaller value - confirm
u,l=sorted([5,4])

In [202]:
# Display the unpacked pair from sorted([5,4])
u,l


(4, 5)

In [216]:
# First training instance, as a Series indexed by column name
train.iloc[0,:]

fixed_acidity            8.9000
volatile_acidity         0.5900
citric_acid              0.3900
residual_sugar           2.3000
chlorides                0.0950
free_sulfur_dioxide      5.0000
total_sulfur_dioxide    22.0000
density                  0.9986
pH                       3.3700
sulphites                0.5800
alcohol                 10.3000
is_white                 0.0000
quality                 -1.0000
Name: 0, dtype: float64