# In this notebook I add noise to all the previously created Adult validation datasets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from pathlib import Path
import os
import sys
import bz2
import pickle
np.random.seed(10)
# adds the visibility of the mlem module, needed to load the attack models
sys.path.append("../../../") 
import mlem

Functions used to generate the noise

In [2]:
from typing import Callable
from numpy import array
def frequency_based_noise(column, size):
    """
    Draw `size` random values from `column`, with replacement.

    The draw is made over the entries of the column itself, so a value that
    occurs more often in the column is proportionally more likely to be drawn.

    Args:
        column: pandas Series supplying the values to draw from
        size: how many values to draw

    Returns:
        NumPy array holding the drawn values
    """
    drawn = column.sample(n=size, replace=True)
    return drawn.to_numpy()

def insert_noise_categorical(dataset: pd.DataFrame, perc: float = 0.1,
                             noise_generating_function: Callable[[pd.Series, int], np.ndarray] = frequency_based_noise):
    """
    Insert noise in a categorical dataset, in place, and return that same dataset.

    For every column, ``int(perc * n_rows)`` row positions are drawn at random and
    the cells at those positions are overwritten with values produced by
    ``noise_generating_function``. Positions are drawn with replacement
    (the default of ``np.random.choice``), so the same row can be drawn more than
    once and the number of distinct rows modified in a column can be smaller than
    ``int(perc * n_rows)``.

    Args:
        dataset (DataFrame): dataset on which to insert the noise ( it should only contain categorical variables ).
                             It is modified in place; pass a copy to keep the original intact.
        perc (float): percentage of noise in the range [0,1]
        noise_generating_function (Callable[[pd.Series, int], ndarray]): function used to generate the noise. It is called
                                   with the column being processed as first positional argument and the number of noisy
                                   values to generate in a keyword argument named size, and must return an array
                                   containing that many values.

    Returns:
        dataset
    """
    n_rows, n_col = dataset.shape
    n_noisy = int(perc * n_rows)
    if n_noisy == 0:
        # Nothing to replace (perc too small or empty dataset).
        return dataset

    for c in range(n_col):
        # Draw row *positions*, not index labels: the cells are written through
        # .iloc, which is purely positional, so drawing labels is only correct
        # when the index happens to be 0..n_rows-1.
        positions = np.random.choice(n_rows, size=n_noisy)
        new_values = noise_generating_function(dataset.iloc[:, c], size=n_noisy)
        assert len(positions) == len(new_values)
        for pos, val in zip(positions, new_values):
            dataset.iloc[pos, c] = val
    return dataset

In [3]:
def insert_noise_numerical(dataset: pd.DataFrame, perc: float = 0.1,
                           noise_generating_function: Callable[[int], np.ndarray] = np.random.normal):
    """
    Insert noise in a numerical dataset, in place, and return that same dataset.

    For every column, ``int(perc * n_rows)`` row positions are drawn at random and
    the cells at those positions are overwritten (not incremented) with values
    produced by ``noise_generating_function``. Positions are drawn with replacement
    (the default of ``np.random.choice``), so the same row can be drawn more than
    once and the number of distinct rows modified in a column can be smaller than
    ``int(perc * n_rows)``.

    Args:
        dataset (DataFrame): dataset on which to insert the noise. It is modified in place;
                             pass a copy to keep the original intact.
        perc (float): percentage of noise in the range [0,1]
        noise_generating_function (Callable[[int], ndarray]): function used to generate the noise, must take as input the
                                   number of noisy values to generate inside a keyword argument named size and return an
                                   array containing the random values.

    Returns:
        dataset

    Examples:

        >>> df = pd.DataFrame(data={'col1': 10 * [1.0], 'col2': 10 * [2.0], 'col3': 10 * [3.0]})
        >>> cols = ['col1', 'col2']
        >>> # np.random.random accepts a `size` keyword (np.random.rand does not)
        >>> df[cols] = insert_noise_numerical(df[cols].copy(), perc=0.1, noise_generating_function=np.random.random)

    """
    n_rows, n_col = dataset.shape
    n_noisy = int(perc * n_rows)
    if n_noisy == 0:
        # Nothing to replace (perc too small or empty dataset).
        return dataset

    for c in range(n_col):
        # Draw row *positions*, not index labels: the cells are written through
        # .iloc, which is purely positional, so drawing labels is only correct
        # when the index happens to be 0..n_rows-1.
        positions = np.random.choice(n_rows, size=n_noisy)
        new_values = noise_generating_function(size=n_noisy)
        assert len(positions) == len(new_values)
        for pos, val in zip(positions, new_values):
            dataset.iloc[pos, c] = val
    return dataset


Paths of all the validation datasets on which to insert the noise

In [4]:
# One validation split for every K in 2..6, followed by the random-forest validation set.
PATHS = [Path(f"adult_no_target_division/K{k}/validation_k{k}.csv") for k in range(2, 7)]
PATHS += [Path("adult_randomforest_and_datasets/adult_validationset.csv")]
# Stop right here if any of the expected files is not on disk.
assert all(path.is_file() for path in PATHS)

for p in PATHS:
    print(p)

adult_no_target_division/K2/validation_k2.csv
adult_no_target_division/K3/validation_k3.csv
adult_no_target_division/K4/validation_k4.csv
adult_no_target_division/K5/validation_k5.csv
adult_no_target_division/K6/validation_k6.csv
adult_randomforest_and_datasets/adult_validationset.csv


In [5]:
# Columns handed to insert_noise_numerical.
NUMERICAL_FEATURES = [
    'Age',
    'Fnlwgt',
    'Education-num',
    'Capital-gain',
    'Capital-loss',
    'Hours-per-week',
]
# Columns handed to insert_noise_categorical.
CATEGORICAL_FEATURES = [
    'Relationship',
    'Native-country',
    'Workclass',
    'Sex',
    'Marital-status',
    'Education',
    'Occupation',
    'Race',
]

In [6]:
# Value passed as `perc` to insert_noise_numerical for the numerical columns.
NUMERICAL_NOISE_PERC = 0.1
# Value passed as `perc` to insert_noise_categorical for the categorical columns.
CATEGORICAL_NOISE_PERC = 0.1

Adding the noise and saving each noisy dataset in the same folder as its clean counterpart

In [7]:
for p in PATHS:
    noisy = pd.read_csv(p)

    # Numerical noise is drawn before categorical noise; keep this order so the
    # seeded random stream is consumed the same way on every run.
    numerical_part = noisy[NUMERICAL_FEATURES].copy()
    noisy[NUMERICAL_FEATURES] = insert_noise_numerical(numerical_part, perc=NUMERICAL_NOISE_PERC)

    categorical_part = noisy[CATEGORICAL_FEATURES].copy()
    noisy[CATEGORICAL_FEATURES] = insert_noise_categorical(categorical_part, perc=CATEGORICAL_NOISE_PERC)

    # Write "<name>-noisy.csv" next to the file that was just read.
    new_path = p.with_name(p.name.replace(".csv", "-noisy.csv"))
    noisy.to_csv(new_path, index=False)