# Generalization
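This notebook demonstrates the numeric generalization mechanism: a numeric column is generalized into bins with `fit_transform`, and the binned values are mapped back to numeric values with `inverse_transform`.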

## Load libraries and define settings

In [1]:
# general dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from pathlib import Path

In [2]:
# data-specific
from examples.synthesization.adult_synthesis import PATH_ADULT

In [3]:
# Load the autoreload extension and use mode 2 so edits to local modules are
# picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# pandas display options: show up to 500 rows and 200 columns in outputs
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

In [5]:
# Seed shared by the cells below so stochastic steps are repeatable
RANDOM_STATE = 42

# Default privacy budget (epsilon) for differentially private algorithms.
# NOTE(review): the mechanism cells in this notebook currently pass literal
# epsilon values instead of this constant -- confirm which is intended.
EPSILON = 1

## Load and process dataset

In [6]:
# Read the dataset from the path exported by the example module,
# then preview the first five rows
df_adult = pd.read_csv(filepath_or_buffer=PATH_ADULT)
df_adult.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Generalization

In [11]:
from crnsynth.processing.generalization import NumericGeneralizationMech

# Operate on a copy so the frame loaded above is left as-is
df = df_adult.copy()

# Configure the mechanism for the 'age' column; the seed comes from the
# settings cell so repeated runs use the same random state
num_gen_mech = NumericGeneralizationMech(
    column='age',
    epsilon=0.1,
    bins=10,
    bounds=(18, 90),
    inverse='truncated_normal',
    random_state=RANDOM_STATE,
)

# Fit on the copy and generalize it in a single step, then preview the result
df_gen = num_gen_mech.fit_transform(df)
df_gen.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,4,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,4,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [12]:
# Map the generalized frame back with the fitted mechanism and preview it
df_inv = num_gen_mech.inverse_transform(df_gen)
df_inv.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,35,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,54,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,51,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,27,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [13]:
# Inline check of the mechanism on a tiny frame (written like a unit test).
# The sample column includes a value above the upper bound (11) and one NaN.
data = pd.DataFrame({"test": [0, 2, 5, 11, 8, np.nan, 6]})
mech = NumericGeneralizationMech(
    column="test",
    epsilon=1.0,
    bins=5,
    bounds=(0, 10),
    inverse="truncated_normal",
    random_state=RANDOM_STATE,
)

# Fitting should yield six evenly spaced edges across the bounds: 0, 2, ..., 10
mech.fit(data)
assert np.allclose(mech.bin_edges_, np.linspace(0, 10, num=6))

# Forward transform: every non-missing value is a bin index in [0, 4],
# and exactly one entry is still missing
transformed_data = mech.transform(data)
print(transformed_data)
assert transformed_data["test"].dropna().between(0, 4).all(), 'Transformed values should be between 0 and 4'
assert transformed_data["test"].isna().sum() == 1, 'One value should be NaN'

# Inverse transform: every non-missing value falls inside the configured bounds
inverse_transformed_data = mech.inverse_transform(transformed_data)
assert inverse_transformed_data["test"].dropna().between(0, 10).all(), 'Inverse transform should be between 0 and 10'


   test
0   0.0
1   0.0
2   2.0
3   4.0
4   3.0
5   NaN
6   2.0
