# In this notebook I add noise to all the Diva validation datasets that were created earlier

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from pathlib import Path
import os
import sys
import bz2
import pickle
np.random.seed(10)
# adds the visibility of the mlem module, needed to load the attack models
sys.path.append("../../../") 
import mlem

Functions used to generate the noise

In [2]:
def insert_noise_numerical_diva(dataset: pd.DataFrame, perc: float = 0.1):
    """
    Insert Gaussian noise into a fraction of the values of every column of a Diva dataset.

    Assumes all columns are numerical. For each column, ``int(perc * n_rows)`` row
    positions are drawn and the corresponding values are overwritten with samples
    from a normal distribution that has the same mean and std as that column.

    Row positions are drawn with replacement, so the same row may be picked more
    than once and the number of distinct noisy values in a column can be lower
    than ``int(perc * n_rows)``.

    The noise comes from the global NumPy random state: call ``np.random.seed``
    beforehand to get reproducible results.

    Args:
        dataset (DataFrame): dataset on which to insert the noise. It is modified
            in place; pass a copy if the original values must be preserved.
        perc (float): fraction of rows to perturb in each column, in the range [0,1].

    Returns:
        DataFrame: the same ``dataset`` object, with the noise inserted.

    Examples:

        >>> df = pd.DataFrame(data={'col1': 10 * [1.0], 'col2': 10 * [2.0], 'col3': 10 * [3.0]})
        >>> noisy = insert_noise_numerical_diva(df.copy(), perc=0.1)

    """
    n_rows, n_col = dataset.shape
    percentage = int(perc * n_rows)

    for c in range(n_col):
        column = dataset.iloc[:, c]
        # Sample row *positions* rather than index labels: the values are written
        # back through the positional `.iloc`, so label sampling would hit the
        # wrong rows (or raise IndexError) whenever the index is not 0..n_rows-1.
        # For a default RangeIndex this draws exactly the same rows as before.
        positions_to_replace = np.random.choice(n_rows, size=percentage)
        mean = column.mean()
        std = column.std()

        new_values = np.random.normal(loc=mean, scale=std, size=percentage)
        assert len(positions_to_replace) == len(new_values)
        # Assign one by one so that, for a position drawn more than once,
        # the last sampled value deterministically wins.
        for pos, val in zip(positions_to_replace, new_values):
            dataset.iloc[pos, c] = val
    return dataset


Paths of all the validation datasets on which to insert the noise

In [3]:
# Clean validation datasets that will receive noise.
PATHS = [
    Path("diva_outputs/diva_validation.csv"),
]

# Fail fast if any of the listed datasets is missing on disk.
assert all(dataset_path.is_file() for dataset_path in PATHS)

In [4]:
# Show which datasets are going to be processed, one path per line.
for p in PATHS:
    print(str(p))

diva_outputs/diva_validation.csv


Adding the noise and saving the datasets in the same folder as the clean dataset

In [5]:
for p in PATHS:
    clean = pd.read_csv(p)
    # Noise is only added to the columns selected here: the first column and the
    # last two columns are left untouched.
    # NOTE(review): presumably an id column and target/label columns — confirm
    # against the code that writes the validation CSV.
    feature_columns = clean.columns[1:-2]

    # Copy only the feature columns (the noise function modifies its input in
    # place) instead of copying the whole frame and then discarding most of it.
    noised = insert_noise_numerical_diva(clean[feature_columns].copy())

    # From here on `clean` holds the noisy feature values.
    clean[feature_columns] = noised

    # Build "<stem>-noisy<suffix>" from the path parts. A plain string replace of
    # ".csv" would leave the name unchanged for any other suffix, so the noisy
    # data would silently overwrite the clean file.
    new_name = f"{p.stem}-noisy{p.suffix}"
    new_path = p.parent / new_name
    clean.to_csv(new_path, index=False)

In [8]:
# Load the saved NumPy archive from the diva_outputs folder and bind it to `loaded`.
# NOTE(review): `loaded` is not used anywhere in the visible part of the notebook,
# and this cell's execution count (8) skips ahead of the previous cell (5) —
# confirm the notebook still reproduces under Restart Kernel -> Run All.
loaded = np.load("diva_outputs/diva_minmax_randfor_data.npz")