# In this notebook I add noise to all the Diva validation datasets created

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from pathlib import Path
import os
import sys
import bz2
import pickle
np.random.seed(10)
# adds the visibility of the mlem module, needed to load the attack models
sys.path.append("../../../") 
import mlem

Functions used to generate the noise

In [2]:
def insert_noise_numerical_diva(dataset: pd.DataFrame, perc: float = 0.1):
    """
    Insert noise in the Diva dataset. Assumes all columns are numerical and the noise is sampled from a normal distr, with same mean and std of the column.
    
    Args:
        dataset (DataFrame): dataset on which to insert the noise.
        perc (float): percentage of noise in the range [0,1]

    Examples:

        >>> df = pd.DataFrame(data={'col1': 10 * [1], 'col2': 10 * [2], 'col3': 10 * [3]})
        >>> df[NUMERICAL] = insert_noise_numerical(df[NUMERICAL].copy(), perc=0.1, noise_generating_function=np.random.rand) # note np.random.rand has a size parameter

    """
    n_rows, n_col = dataset.shape
    percentage = int(perc * n_rows)

    for c in range(n_col):
        index_to_replace = np.random.choice(dataset.index,
                                            size=percentage)
        
        new_values = np.random.normal(size=percentage)
        assert (len(index_to_replace) == len(new_values))
        for ind, val in zip(index_to_replace, new_values):
            dataset.iloc[ind, c] = val
    return dataset


Paths of all the validation datasets on which to insert the noise

In [3]:
PATHS = [
    Path("diva_outputs_26_feat_stdscal/diva_std_validation.csv")
]

assert all(map(lambda x: x.is_file(), PATHS))

In [4]:
for p in PATHS:
    print(p)

diva_outputs_26_feat_stdscal/diva_std_validation.csv


Adding the noise and saving the datasets in the same folder of the clean dataset

In [10]:
for p in PATHS:
    clean = pd.read_csv(p)
    feature_columns = clean.columns[:-2]
    noised = insert_noise_numerical_diva(clean.copy()[feature_columns])
    clean[feature_columns] = noised 
    new_name = p.name.replace(".csv", "-noisy.csv")
    new_path = p.parent / new_name
    clean.to_csv(new_path, index=False)

In [16]:
noisy = pd.read_csv("diva_outputs_26_feat_stdscal/diva_std_validation-noisy.csv")
normal = pd.read_csv("diva_outputs_26_feat_stdscal/diva_std_validation.csv")
(noisy == normal).mean() # 90% of the entries are equal, so the 10% has noise in it

FLG_PRES_RISCOSS      0.902748
IMP_V_AGG_IVA         0.901903
SOLVIBILITA           0.906977
VOL_AFF_DICH          0.904017
PESO_ADESIONE         0.904017
TIPO_DICH_ACCERT      0.905708
IMP_ESISTZ            0.906131
DETR_IVA_ACC          0.904017
VAL_ALIQ_MEDIA_ACQ    0.902748
FLG_PRES_RICORSO      0.904017
STATO_CONTROLLO       0.902748
VAL_ALIQ_M_ACQ_IMP    0.904017
FLG_VC                0.906554
IMP_V_AGG_IMPON       0.903594
VAR_RIMBORSO          0.904863
VAL_ALIQ_MEDIA_VOL    0.904440
IMP_IMPST_CREDIT      0.904440
IMP_ACQ_NOIMP         0.903171
COD_ATTIV_GEN         0.904863
IVA_OP_IMPON_DIC      0.904017
IMP_BEN_AMM           0.906131
IMP_ECC_PREC          0.904440
FLG_PRES_BILANCIO     0.904440
imp_tot_pos           0.906131
VAR_DETRAZIONE        0.905708
MAG_IMP_RIT_ACC       0.905285
Target                1.000000
Cluster               1.000000
dtype: float64