In [13]:
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from joblib import Parallel,delayed

# Functions

In [6]:
# Functions

def load_dataset(path, 
                 dataset_filename, 
                 sep=",", 
                 header=None,
                 nrows=None,
                 names=None, 
                 index_col=False, 
                 na_values='?'):
    """
    Read a delimited text file into a pandas DataFrame.

    Args:
        path (str): directory containing the file; joined to the filename with "/".
        dataset_filename (str): name of the file inside `path`.
        sep (str): field separator forwarded to `pandas.read_csv`.
        header (int or None): row number to use as header, forwarded to `pandas.read_csv`.
        nrows (int or None): number of rows to read; None reads the whole file.
        names (list or None): column names forwarded to `pandas.read_csv`.
        index_col: forwarded to `pandas.read_csv`.
        na_values: additional strings to recognise as missing values.

    Return:
        pandas.DataFrame: the loaded dataset.
    """
    # Forward the caller's `names` and `na_values`; the previous version read a
    # notebook-global `colnames` and a hard-coded '?' instead of its own arguments.
    return pd.read_csv(path+"/"+dataset_filename, 
                       sep=sep, 
                       header=header,
                       nrows=nrows,
                       names=names, 
                       index_col=index_col, 
                       na_values=na_values)

def binarize_data(data, label, threshold):
    """
    Replace column `label` of `data`, in place, with a +1 / -1 encoding.

    Entries greater than or equal to `threshold` become 1, all others become -1.
    Nothing is returned; the DataFrame passed in is modified.
    """
    at_or_above = data[label] >= threshold
    data[label] = np.where(at_or_above, 1, -1)

def serialize_dataset(p_data, 
                      path, 
                      dataset_filename, 
                      suffix, 
                      sep=",", 
                      compression="bz2", 
                      index=False):
    """
    Write `p_data` as a (compressed) CSV file under `path`.

    The output file name is the part of `dataset_filename` before its first "."
    followed by `suffix`; e.g. "train.csv.bz2" with suffix "_B30.atks.bz2"
    becomes "train_B30.atks.bz2". `sep`, `compression` and `index` are forwarded
    to `DataFrame.to_csv`.
    """
    base_name = dataset_filename.split(".")[0]
    destination = path + "/" + base_name + suffix
    p_data.to_csv(destination, sep=sep, compression=compression, index=index)

In [49]:

class AttackerRule:
    """
    Class AttackerRule represents a rule of attack.
    """

    def __init__(self, pre_conditions, post_condition, cost, is_numerical=True):
        """
        Class constructor.

        Args:
            pre_conditions (tuple): pair `(feature_id, condition)` which must be met in order for this
                                    rule to be applied. For a numerical rule `condition` is a pair
                                    `(left, right)`; for a categorical rule it is a single string or an
                                    iterable of admissible values (stored internally as a set).
            post_condition (tuple): pair `(feature_id, value)` indicating the outcome of this rule once applied.
            cost (float): cost of rule application.
            is_numerical (boolean): flag to indicate whether the attack specified by this rule operates on
                                    a numerical (perturbation) or a categorical (assignment) feature.
        """
        # pre_conditions = (feature_id, (value_left, value_right))  or  (feature_id, {values})
        self.pre_conditions = pre_conditions
        # post_condition = (feature_id, new_value)
        self.post_condition = post_condition
        self.cost = cost
        self.is_numerical = is_numerical
        if not self.is_numerical:
            feature_id, admissible = self.pre_conditions
            if isinstance(admissible, str):
                # a bare string is one value, not an iterable of characters
                admissible = (admissible,)
            self.pre_conditions = (feature_id, set(admissible))

#         self.logger = logging.getLogger(__name__)

#     def __getstate__(self):
#         d = dict(self.__dict__)
#         del d['logger']
#         return d

#     def __setstate__(self, d):
#         if 'logger' in d:
#             d['logger'] = logging.getLogger(d['logger'])
#         else:
#             self.logger = logging.getLogger(__name__)
#         self.__dict__.update(d)

    def get_cost(self):
        """
        Return the cost of this rule.
        """
        return self.cost

    def get_target_feature(self):
        """
        Return the feature (id) targeted by this rule.
        """
        return self.post_condition[0]

    def is_applicable(self, x, numerical_idx):
        """
        Returns whether the rule can be applied to the input instance x or not.

        Args:
            x (numpy.array): 1-dimensional array representing an instance.
            numerical_idx (list): binary array which indicates whether a feature is numerical or not;
                                  numerical_idx[i] = 1 iff feature id i is numerical, 0 otherwise.

        Return:
            True iff this rule is applicable to x (i.e., if x satisfies the pre-condition of this rule).
        """
        # The previous version evaluated `feature_id in ...` (a membership test on an
        # undefined name) instead of assigning, and unpacked the categorical value set
        # into (left, right); both made the method unusable.
        feature_id, condition = self.pre_conditions
        if numerical_idx[feature_id]:  # the feature is numeric
            left, right = condition
            # NOTE(review): both bounds are exclusive, as in the original expression;
            # confirm whether inclusive bounds are intended for ranges such as (13, 16).
            return left < x[feature_id] < right
        # the feature is categorical: condition is the set of admissible values
        return x[feature_id] in condition

    def apply(self, x):
        """
        Application of the rule to the input instance x.

        Args:
            x (numpy.array): 1-dimensional array representing an instance.

        Return:
            x_prime (numpy.array): A copy of x (made with numpy.copy) modified according to the
                                   post-condition of this rule: the target feature is shifted by the
                                   post-condition value for numerical rules, or overwritten with it
                                   for categorical ones.
        """
        x_prime = np.copy(x)
        feature_id, feature_attack = self.post_condition
        if self.is_numerical:
            x_prime[feature_id] += feature_attack
        else:
            x_prime[feature_id] = feature_attack
        return x_prime

def load_attack_rules(attack_rules_filename, colnames):
    """
    Build a list of AttackerRule objects from a JSON rule file.

    The file must contain an "attacks" list; each entry maps a feature name to a
    list of rule specifications with the keys "pre", "post", "cost" and
    "is_numerical". Feature names are translated to positional ids through
    `colnames.index`.

    Args:
        attack_rules_filename (str): path of the JSON file to read.
        colnames (list): ordered feature names used to resolve feature ids.

    Return:
        list: one AttackerRule per rule specification, in file order.
    """
    rules = []

    with open(attack_rules_filename) as json_file:
        for attack in json.load(json_file)["attacks"]:
            for feature in attack:
                for spec in attack[feature]:
                    pre = spec["pre"]
                    post = spec["post"]
                    cost = spec["cost"]
                    is_numerical = spec["is_numerical"]
                    # NOTE(review): "pre" is a string evaluated with eval(), so the rule
                    # file can execute arbitrary code; only load trusted files.
                    pre_conditions = (colnames.index(feature), eval(pre))
                    post_condition = (colnames.index(feature), post)
                    rule = AttackerRule(pre_conditions,
                                        post_condition,
                                        cost=cost,
                                        is_numerical=is_numerical)
                    rules.append(rule)

    return rules

In [8]:
def perturb_instance(x, rules, budget, max_budget_per_feature, thresholds):
    """
    Returns the set of possible perturbations of a given instance.

    This function takes as input an instance and returns a set of perturbations of that instance, 
    using the specified amount of budget and considering the cost of perturbing each individual feature.
    Starting from the original instance, it repeatedly applies every affordable and valid rule to the
    instances on a work list, recording each distinct resulting instance together with the residual
    budget it was reached with.

    Parameters
    ----------
    x : pandas.Series
        The original instance
    rules : list
        The list of modification rules. Each rule is a dict with the keys
        'f' (feature name), 'cost', 'valid' (callable taking an instance),
        'is_cat' (truthy for assignment rules) and 'value' (the value to assign,
        or the offset to add for non-categorical rules).
    budget : float
        The attacker's budget
    max_budget_per_feature : dict
        The maximum allowed amount of budget units that can be spend on each feature
    thresholds : dict
        feature -> array of relevant thresholds; indexed with a boolean mask, so a
        numpy array is expected for every feature touched by a non-categorical rule

    Returns
    -------
    pandas.DataFrame
        The set of perturbations (including the original instance, placed at the very beginning)
    """
    
    # work list seeded with the original instance, the initial budget,
    # and an empty dictionary of budget units spent so far
    queue = [(x, budget, {})]
    # visited perturbations: instance (as tuple) -> [residual budget, budget spent per feature]
    seen = { tuple(x): [budget, {}] }
    
    # loop until the work list is empty
    while len(queue)>0:
        # NOTE: list.pop() removes the LAST appended element, so despite its name
        # the work list is processed in LIFO (stack) order, not FIFO.
        item = queue.pop()
        x = item[0] # get the instance (rebinds the parameter `x`)
        b = item[1] # get the residual budget
        budget_units_spent = item[2] # get the dictionary containing the amount of budget spent on each feature, so far
        
        # loop through all the attack rules
        for r in rules:
            f = r['f']  # feature the rule applies to
            # check budget: the rule must fit both the residual budget and the per-feature cap
            if not( r['cost'] <= b and budget_units_spent.get(f, 0) + r['cost'] <= max_budget_per_feature[f] ):
                continue
            # check validity of the rule on the current instance
            if not r['valid'](x):
                continue
            
            # apply rule to a copy (or several copies, for non-categorical rules)
            x_atks = []
            if r['is_cat']:
                # categorical rule: overwrite the feature with the rule's value
                xx = x.copy()
                xx[f] = r['value']
                x_atks += [xx]
            else:
                # Simple alternative kept for reference (single shifted copy):
#                 xx = x.copy()
#                 xx[f] += r['value']
#                 x_atks += [xx]
                
                # Evaluate crossing of multiple thresholds: generate one candidate for every
                # threshold lying between the current value and the shifted value, plus the
                # two endpoints themselves (one of which is the unchanged current value).
                low,high=sorted([x[f], x[f]+r['value']])
                z = set(thresholds[f][np.logical_and(thresholds[f]>=low, thresholds[f]<=high)])
                z |= set([low,high])
                for zi in z:
                    xx = x.copy()
                    xx[f] = zi
                    x_atks += [xx]

                # we are adding all of this to both seen and queue
                # the smallest element might not be included in seen
                #     as it is not an interesting attack
                # still we are missing managing of rule validity thresholds

            # process all atks
            for xx in x_atks:
                # skip if already seen with an equal or larger residual budget
                xx_t = tuple(xx)
                res_b = b - r['cost']
                seen_budgets = seen.get(xx_t)
                if seen_budgets is not None and seen_budgets[0] >= res_b:
                    continue

                # update budgets spent (on a copy, so sibling candidates are unaffected)
                updated_budget_units_spent = budget_units_spent.copy()
                updated_budget_units_spent[f] = updated_budget_units_spent.get(f,0) + r['cost']
                # add to frontier and to past seen elements
                seen[xx_t] = [res_b, updated_budget_units_spent]
                queue.append([xx, res_b, updated_budget_units_spent])
    
    # `x` here is the last instance popped from the work list; only its index (the
    # feature names) is used, to label the columns of the result.
    perturbations_df = pd.DataFrame.from_records(list(seen.keys()), columns=x.index.values)
    perturbations_df = perturbations_df.drop_duplicates()
    return perturbations_df

In [9]:
def perturb_thread(x, rules, budget, max_budget_per_feature, thresholds,skip_class,instance_id =-1):
    """
    Compute the perturbations of a single instance and tag them with its id.

    Parameters
    ----------
    x : pandas.Series
        The instance to perturb; its last entry is treated as the class label.
    rules, budget, max_budget_per_feature, thresholds
        Forwarded unchanged to `perturb_instance`.
    skip_class : object or None
        When not None and equal to the instance's last entry, the instance is not
        perturbed and only the original row is returned.
    instance_id : int
        Identifier written to the leading "instance_id" column of every returned row.

    Returns
    -------
    pandas.DataFrame
        The perturbations of `x`, with an extra first column "instance_id".
    """
    # progress message every 500 instances
    if instance_id % 500 == 0:
        print("***** Perturbing instance [ID = #{}]... *****".format(instance_id))

    # The previous version tested an undefined name `instance` here, raising
    # NameError whenever skip_class was given; the label lives in `x` itself.
    if skip_class is not None and x.iloc[-1] == skip_class:
        # keep the original instance only
        perturbations = pd.DataFrame([x])
    else:
        perturbations = perturb_instance(x=x, rules=rules, budget=budget,
                                         max_budget_per_feature=max_budget_per_feature,
                                         thresholds=thresholds)

    perturbations.insert(loc=0, column="instance_id",
                         value=[instance_id] * perturbations.shape[0],
                         allow_duplicates=True)
    return perturbations

In [10]:
def perturb_dataset(data, budget, max_budget_per_feature, rules, skip_class=None):
    """
    Returns the dataset extended with all instance perturbations.

    This function takes as input a dataset and returns another dataset which is obtained from the original
    by adding all the possible perturbations an attacker with budget B can apply to every instance.
    Instances are processed in parallel with joblib (one `perturb_thread` call per row).

    Parameters
    ----------
    data : pandas.DataFrame
        The original dataset
    budget : float
        The attacker's budget
    max_budget_per_feature : dict
        The maximum allowed amount of budget units that can be spend on each feature
    rules : list
        The list of modification rules
    skip_class : int
        if class (i.e. last columns) equals skip_class, then instance is skipped

    Returns
    -------
    pandas.DataFrame or None
        The perturbed dataset, with a leading "instance_id" column (ids start at 1);
        None when `data` is None or empty.

    """
    if data is None or data.empty:
        return None # nothing to perturb
    
    # candidate thresholds per feature: the distinct values observed in the data
    thresholds = {c:np.unique(data[c]) for c in data.columns}

    # one DataFrame of perturbations per instance; ids start from 1
    # (an unused empty placeholder DataFrame was previously built here and
    # immediately overwritten)
    per_instance = Parallel(n_jobs=-1)(delayed(perturb_thread)
                                           (x=instance, rules=rules, budget=budget,
                                            max_budget_per_feature=max_budget_per_feature,
                                            thresholds=thresholds, skip_class=skip_class,
                                            instance_id=instance_id)
                                       for instance_id, (_, instance) in enumerate(data.iterrows(), start=1))
    
    perturbed_data = pd.concat(per_instance)
    
    # eventually, return the perturbed dataset
    print("***** Return the final perturbed dataset *****")
    
    return perturbed_data

# WINE Dataset

In [None]:
# Configuration of the wine experiment: file locations, attacker rules, budgets.
DATASET_NAME="wine"

# column names applied when loading; the last one ('quality') is the label
colnames = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide',
            'total_sulfur_dioxide', 'density', 'pH', 'sulphites', 'alcohol', 'is_white', 'quality']

DATASET_PATH="../data/{}".format(DATASET_NAME)
RAW_DATASET_PATH=DATASET_PATH + "/raw"
ATK_DATASET_PATH=DATASET_PATH + "/attacks"

TRAINING_SET="train.csv.bz2"
VALIDATION_SET="valid.csv.bz2"
TEST_SET="test.csv.bz2"
N_TRAIN_INSTANCES=None # None loads the whole training set; set an int to read only that many rows
N_TEST_INSTANCES=None # None loads the whole test set; set an int to read only that many rows

######################
# Attacker Definition
######################
# Each rule: 'f' = target feature, 'valid' = applicability predicate on an instance,
# 'value' = offset added to the feature, 'cost' = budget consumed, 'is_cat' = categorical flag.
attacker_rules = [
    # alcohol increment rule
    {'f'    :'alcohol',
     'valid': lambda x: x['alcohol'] <= 10.0,
     'value': .75,
     'cost' : 10,
     'is_cat': False },
    # residual sugar decrement rule
    {'f'    : 'residual_sugar',
     'valid': lambda x: x['residual_sugar'] >= 8.0,
     'value': -1.2,
     'cost' : 10,
     'is_cat': False },
    # volatile acidity decrement rule
    {'f'    : 'volatile_acidity',
     'valid': lambda x: x['volatile_acidity'] >= 0.6,
     'value': -0.3,
     'cost' : 10,
     'is_cat': False }    
]

# per-feature cap on the budget an attacker may spend
max_budget_per_feature = {
    'alcohol'          : 1000,
    'residual_sugar'   : 1000,
    'volatile_acidity' : 1000
}

# Budgets to process. A previous `B = [30, 60]` assignment was immediately
# overwritten by this one, so only budget 30 has ever been in effect here.
B = [30]

In [None]:
# Load the raw wine splits. header=0 together with names=colnames makes pandas
# discard the header row of each file and use `colnames` instead.
train = load_dataset(RAW_DATASET_PATH, TRAINING_SET, names=colnames, header=0, nrows=N_TRAIN_INSTANCES)
valid = load_dataset(RAW_DATASET_PATH, VALIDATION_SET, names=colnames, header=0)
test  = load_dataset(RAW_DATASET_PATH, TEST_SET, names=colnames, header=0, nrows=N_TEST_INSTANCES)

# Binarize the label in place: quality >= 6 becomes 1, everything else -1.
# NOTE: this mutates the frames, so re-running this cell without re-loading
# would binarize the already-binarized labels.
binarize_data(train, "quality", 6)
binarize_data(valid, "quality", 6)
binarize_data(test, "quality", 6)

print("Shape of training set: {}".format(train.shape))
print("Shape of validation set: {}".format(valid.shape))
print("Shape of test set: {}".format(test.shape))

# train.head()
# valid.head()
# test.head()

In [None]:
# Write the binarized splits to DATASET_PATH (not the raw folder). serialize_dataset
# keeps the filename part before the first "." and appends the suffix, so the
# outputs are train.csv.bz2, valid.csv.bz2 and test.csv.bz2.
serialize_dataset(train, DATASET_PATH, TRAINING_SET, ".csv.bz2")
serialize_dataset(valid, DATASET_PATH, VALIDATION_SET, ".csv.bz2")
serialize_dataset(test, DATASET_PATH, TEST_SET, ".csv.bz2")

In [None]:
# For every attacker budget, generate the perturbed version of each split and
# store it under ATK_DATASET_PATH as <split>_B<budget>.atks.bz2.
for budget in B:
    print ("Processing Budget: ", budget)
    # skip_class=None: instances of every class are perturbed
    train_att = perturb_dataset(train, budget, max_budget_per_feature, attacker_rules, skip_class=None)
    valid_att = perturb_dataset(valid, budget, max_budget_per_feature, attacker_rules, skip_class=None)
    test_att  = perturb_dataset(test, budget, max_budget_per_feature, attacker_rules, skip_class=None)
    
    serialize_dataset(train_att, ATK_DATASET_PATH, TRAINING_SET, "_B{}".format(budget)+".atks.bz2")
    serialize_dataset(valid_att, ATK_DATASET_PATH, VALIDATION_SET, "_B{}".format(budget)+".atks.bz2")
    serialize_dataset(test_att, ATK_DATASET_PATH, TEST_SET, "_B{}".format(budget)+".atks.bz2")
    
    print("Shape of attacked training set: {}".format(train_att.shape))
    print("Shape of attacked validation set: {}".format(valid_att.shape))
    print("Shape of attacked test set: {}".format(test_att.shape))

# CENSUS Dataset

In [16]:
# Configuration of the census experiment: file locations, attacker rules, budgets.
DATASET_NAME = "census"

# column names applied when loading; the last one is the label
colnames = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income_greater_than_50k',
]

DATASET_PATH = "../data/" + DATASET_NAME
RAW_DATASET_PATH = DATASET_PATH + "/raw"
ATK_DATASET_PATH = DATASET_PATH + "/attacks"

# JSON file with the same attacks expressed as AttackerRule specifications
ATKS_FILE = "census_attacks.json"

TRAINING_SET = "train.csv.bz2"
VALIDATION_SET = "valid.csv.bz2"
TEST_SET = "test.csv.bz2"
N_TRAIN_INSTANCES = None  # None loads the whole training set
N_TEST_INSTANCES = None   # None loads the whole test set

######################
# Attacker Definition
######################
# Each rule: f = target feature, valid = applicability predicate on an instance,
# value = value assigned (categorical) or offset added (numerical),
# cost = budget consumed, is_cat = categorical flag.
attacker_rules = [
    dict(f='workclass',
         valid=lambda x: x['workclass'] == 'Never-worked',
         value='Without-pay',
         cost=1,
         is_cat=True),
    dict(f='marital_status',
         valid=lambda x: x['marital_status'] in ('Divorced', 'Separated'),
         value='Never-married',
         cost=1,
         is_cat=True),
    dict(f='occupation',
         valid=lambda x: x['occupation'] != 'Other-service',
         value='Other-service',
         cost=1,
         is_cat=True),
    dict(f='education_num',
         valid=lambda x: 13 <= x['education_num'] <= 16,
         value=-1,
         cost=20,
         is_cat=False),
    dict(f='capital_gain',
         valid=lambda x: True,
         value=2000,
         cost=50,
         is_cat=False),
    # NOTE(review): cost 100 exceeds both the per-feature cap (90) and every
    # budget in B, so perturb_instance can never apply this rule as configured.
    dict(f='hours_per_week',
         valid=lambda x: True,
         value=4,
         cost=100,
         is_cat=False),
]

# per-feature cap on the budget an attacker may spend
max_budget_per_feature = dict(
    workclass=90,
    marital_status=90,
    occupation=90,
    education_num=90,
    hours_per_week=90,
    capital_gain=90,
)

B = [30, 60]  # 90

In [55]:
# Load the JSON-defined attacks as AttackerRule objects.
# NOTE(review): this rebinds `attacker_rules`, replacing the dict-based rules
# defined in the configuration cell. perturb_instance indexes rules as dicts
# (r['f'], r['cost'], r['valid'], ...), so running the perturbation cell after
# this one would fail on AttackerRule objects — confirm which representation is
# intended, or bind the result to a different name.
attacker_rules = load_attack_rules(ATKS_FILE, colnames)

In [56]:
# Dump the attributes of every loaded rule for a visual sanity check.
for rule in attacker_rules:
    print(vars(rule))

{'pre_conditions': (1, {'Never-worked'}), 'post_condition': (1, 'Without-pay'), 'cost': 1, 'is_numerical': False}
{'pre_conditions': (5, {'Divorced', 'Separated'}), 'post_condition': (5, 'Never-married'), 'cost': 1, 'is_numerical': False}
{'pre_conditions': (6, {'Priv-house-serv', 'Prof-specialty', 'Transport-moving', 'Exec-managerial', 'Handlers-cleaners', 'Farming-fishing', 'Machine-op-inspct', 'Craft-repair', 'Tech-support', 'Protective-serv', 'Armed-Forces', 'Adm-clerical', 'Sales'}), 'post_condition': (6, 'Other-service'), 'cost': 1, 'is_numerical': False}
{'pre_conditions': (4, (13, 16)), 'post_condition': (4, -1), 'cost': 20, 'is_numerical': True}
{'pre_conditions': (10, (0, inf)), 'post_condition': (10, 2000), 'cost': 50, 'is_numerical': True}
{'pre_conditions': (12, (0, inf)), 'post_condition': (12, 4), 'cost': 100, 'is_numerical': True}


In [7]:
# Load the raw census splits (no validation file is read for this dataset).
# header=0 together with names=colnames replaces the file header with `colnames`.
train = load_dataset(RAW_DATASET_PATH, TRAINING_SET, names=colnames, header=0, nrows=N_TRAIN_INSTANCES)
test = load_dataset(RAW_DATASET_PATH, TEST_SET, names=colnames, header=0, nrows=N_TEST_INSTANCES)

# remove the 'education' string column ('education_num' is kept)
train = train.drop(['education'], axis=1)
test  = test.drop(['education'], axis=1)
# drop every row that has at least one missing value
train = train[~train.isnull().any(axis=1)]
test  = test[~test.isnull().any(axis=1)]

# create the missing validation set: hold out 10% of the training rows,
# stratified on the label, with a fixed seed for reproducibility.
# NOTE: `train` is rebound to the remaining 90%, so re-running this cell
# without re-loading would split the already-reduced training set again.
train, valid = train_test_split( train, 
                                 test_size=0.1, 
                                 random_state=42, 
                                 stratify=train["income_greater_than_50k"])

print("Shape of training set: {}".format(train.shape))
print("Shape of validation set: {}".format(valid.shape))
print("Shape of test set: {}".format(test.shape))

# train.head()
# valid.head()
# test.head()

Shape of training set: (27144, 14)
Shape of validation set: (3017, 14)
Shape of test set: (15059, 14)


In [8]:
# Write the cleaned splits (including the newly created validation set) to
# DATASET_PATH. serialize_dataset keeps the filename part before the first "."
# and appends the suffix, giving train.csv.bz2, valid.csv.bz2 and test.csv.bz2.
serialize_dataset(train, DATASET_PATH, TRAINING_SET, ".csv.bz2")
serialize_dataset(valid, DATASET_PATH, VALIDATION_SET, ".csv.bz2")
serialize_dataset(test, DATASET_PATH, TEST_SET, ".csv.bz2")

In [None]:
# For every attacker budget, generate the perturbed version of each split and
# store it under ATK_DATASET_PATH as <split>_B<budget>.atks.bz2.
# NOTE(review): perturb_instance reads rules as dicts (r['f'], r['cost'], ...).
# If `attacker_rules` has been rebound to the AttackerRule objects returned by
# load_attack_rules, this loop will fail — make sure the dict-based rules from
# the configuration cell are the ones in scope.
for budget in B:
    print ("Processing Budget: ", budget)
    # skip_class=None: instances of every class are perturbed
    train_att = perturb_dataset(train, budget, max_budget_per_feature, attacker_rules, skip_class=None)
    valid_att = perturb_dataset(valid, budget, max_budget_per_feature, attacker_rules, skip_class=None)
    test_att  = perturb_dataset(test, budget, max_budget_per_feature, attacker_rules, skip_class=None)
    
    serialize_dataset(train_att, ATK_DATASET_PATH, TRAINING_SET, "_B{}".format(budget)+".atks.bz2")
    serialize_dataset(valid_att, ATK_DATASET_PATH, VALIDATION_SET, "_B{}".format(budget)+".atks.bz2")
    serialize_dataset(test_att, ATK_DATASET_PATH, TEST_SET, "_B{}".format(budget)+".atks.bz2")
    
    print("Shape of attacked training set: {}".format(train_att.shape))
    print("Shape of attacked validation set: {}".format(valid_att.shape))
    print("Shape of attacked test set: {}".format(test_att.shape))

# SPAM Dataset

In [None]:
# Configuration of the spam experiment: file locations, attacker rules, budgets.
DATASET_NAME = "spam2"

# column names applied when loading; the last one ('spam') is the label
colnames = [
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove',
    'word_freq_internet', 'word_freq_order', 'word_freq_mail',
    'word_freq_receive', 'word_freq_will', 'word_freq_people',
    'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you',
    'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000',
    'word_freq_money', 'word_freq_hp', 'word_freq_hpl', 'word_freq_george',
    'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet',
    'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',
    'word_freq_technology', 'word_freq_1999', 'word_freq_parts',
    'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting',
    'word_freq_original', 'word_freq_project', 'word_freq_re',
    'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!',
    'char_freq_$', 'char_freq_#', 'capital_run_length_average',
    'capital_run_length_longest', 'capital_run_length_total', 'spam',
]

DATASET_PATH = "../data/" + DATASET_NAME
RAW_DATASET_PATH = DATASET_PATH + "/raw"
ATK_DATASET_PATH = DATASET_PATH + "/attacks"

TRAINING_SET = "train.csv.bz2"
VALIDATION_SET = "valid.csv.bz2"
TEST_SET = "test.csv.bz2"
N_TRAIN_INSTANCES = None  # None loads the whole training set
N_TEST_INSTANCES = None   # None loads the whole test set

######################
# Attacker Definition
######################
# Each rule: 'f' = target feature, 'valid' = applicability predicate on an instance,
# 'value' = offset added to the feature, 'cost' = budget consumed, 'is_cat' = categorical flag.
attacker_rules = [
    {'f': 'char_freq_!',
     'valid': lambda x: x['char_freq_!'] >= 1,
     'value': -1, 'cost': 15, 'is_cat': False},
    {'f': 'word_freq_remove',
     'valid': lambda x: x['word_freq_remove'] >= .5,
     'value': -0.5, 'cost': 15, 'is_cat': False},
    {'f': 'char_freq_$',
     'valid': lambda x: x['char_freq_$'] >= .1,
     'value': -0.1, 'cost': 15, 'is_cat': False},
    {'f': 'capital_run_length_average',
     'valid': lambda x: x['capital_run_length_average'] >= 5,
     'value': -1, 'cost': 30, 'is_cat': False},
    {'f': 'capital_run_length_total',
     'valid': lambda x: x['capital_run_length_total'] >= 400,
     'value': -50, 'cost': 30, 'is_cat': False},
    # disabled rule, kept for reference:
    # {'f': 'word_freq_hp',
    #  'valid': lambda x: x['word_freq_hp'] >= 1,
    #  'value': -1, 'cost': 30, 'is_cat': False},
]

# per-feature cap on the budget an attacker may spend
# ('word_freq_hp' is listed although its rule above is disabled)
max_budget_per_feature = {
    feature: 1000
    for feature in ('char_freq_!',
                    'word_freq_remove',
                    'char_freq_$',
                    'capital_run_length_average',
                    'capital_run_length_total',
                    'word_freq_hp')
}

B = [30, 60]

In [None]:
# Load the raw spam splits. header=0 together with names=colnames makes pandas
# discard the header row of each file and use `colnames` instead.
train = load_dataset(RAW_DATASET_PATH, TRAINING_SET, names=colnames, header=0, nrows=N_TRAIN_INSTANCES)
valid = load_dataset(RAW_DATASET_PATH, VALIDATION_SET, names=colnames, header=0)
test = load_dataset(RAW_DATASET_PATH, TEST_SET, names=colnames, header=0, nrows=N_TEST_INSTANCES)

# Binarize the label in place: spam >= 0.5 becomes 1, everything else -1.
# NOTE: this mutates the frames, so re-running this cell without re-loading
# would binarize the already-binarized labels.
binarize_data(train, "spam", 0.5)
binarize_data(valid, "spam", 0.5)
binarize_data(test, "spam", 0.5)

print("Shape of training set: {}".format(train.shape))
print("Shape of validation set: {}".format(valid.shape))
print("Shape of test set: {}".format(test.shape))

# train.head()
# valid.head()
# test.head()

In [None]:
# Write the binarized splits to DATASET_PATH (not the raw folder). serialize_dataset
# keeps the filename part before the first "." and appends the suffix, so the
# outputs are train.csv.bz2, valid.csv.bz2 and test.csv.bz2.
serialize_dataset(train, DATASET_PATH, TRAINING_SET, ".csv.bz2")
serialize_dataset(valid, DATASET_PATH, VALIDATION_SET, ".csv.bz2")
serialize_dataset(test, DATASET_PATH, TEST_SET, ".csv.bz2")

In [None]:
# For every attacker budget, generate the perturbed version of each split and
# store it under ATK_DATASET_PATH as <split>_B<budget>.atks.bz2.
for budget in B:
    print ("Processing Budget: ", budget)
    # skip_class=None: instances of every class are perturbed
    train_att = perturb_dataset(train, budget, max_budget_per_feature, attacker_rules, skip_class=None)
    valid_att = perturb_dataset(valid, budget, max_budget_per_feature, attacker_rules, skip_class=None)
    test_att  = perturb_dataset(test, budget, max_budget_per_feature, attacker_rules, skip_class=None)
    
    serialize_dataset(train_att, ATK_DATASET_PATH, TRAINING_SET, "_B{}".format(budget)+".atks.bz2")
    serialize_dataset(valid_att, ATK_DATASET_PATH, VALIDATION_SET, "_B{}".format(budget)+".atks.bz2")
    serialize_dataset(test_att, ATK_DATASET_PATH, TEST_SET, "_B{}".format(budget)+".atks.bz2")
    
    print("Shape of attacked training set: {}".format(train_att.shape))
    print("Shape of attacked validation set: {}".format(valid_att.shape))
    print("Shape of attacked test set: {}".format(test_att.shape))