In [1]:
import pandas as pd
import numpy as np

THE_MOST_IMPORTANT_K = 32 #Changed this, so i can use it in the split function as well, without any kind of refactoring.

In [2]:
names = (
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
)

In [3]:
categorical = set((
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'sex',
    'native-country',
    'race',
    'income',
))
df = pd.read_csv("adult.data", sep=", ", header=None, names=names, index_col=False, engine='python');# We load the data using Pandas

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
for name in categorical:
    df[name] = df[name].astype('category')

In [6]:
def get_spans(df, partition, scale=None):
    """
    :param        df: the dataframe for which to calculate the spans
    :param partition: the partition for which to calculate the spans
    :param     scale: if given, the spans of each column will be divided
                      by the value in `scale` for that column
    :        returns: The spans of all columns in the partition
    """
    spans = {}
    for column in df.columns:
        if column in categorical:
            span = len(df[column][partition].unique())
        else:
            span = df[column][partition].max()-df[column][partition].min()
        if scale is not None:
            span = span/scale[column]
        spans[column] = span
    return spans

In [7]:
full_spans = get_spans(df, df.index)

In [8]:
def split(df, partition, column):
    """
    :param        df: The dataframe to split
    :param partition: The partition to split
    :param    column: The column along which to split
    :        returns: A tuple containing a split of the original partition
    """
    dfp = df[column][partition]
    if column in categorical:
        # Commenting out the next 4 lines should make the world a better place, but who am i to know this for sure...
        # But it does not... Reality is often disappointing.... -_- 
        values = dfp.unique()
        lv = set(values[:len(values)//2])
        rv = set(values[len(values)//2:])
        return dfp.index[dfp.isin(lv)], dfp.index[dfp.isin(rv)]
        
        indexes = dfp.value_counts().index
        values = dfp.value_counts().values
        
        zipped = zip(indexes,values)
        sortedvals = np.array(sorted(zipped, key=lambda x: x[1]))
        sortedvalues = np.array(sortedvals[:,1],dtype=int)
        cumulated = np.cumsum(sortedvalues)
        
        bestindex = 0
        for i in range(len(cumulated)):
            if cumulated[i]>=THE_MOST_IMPORTANT_K and (cumulated[-1]-cumulated[i])>=THE_MOST_IMPORTANT_K and cumulated[i]<cumulated[-1]/2:
                bestindex = i
        if cumulated[bestindex+1]>=THE_MOST_IMPORTANT_K and (cumulated[-1]-cumulated[bestindex+1])>=THE_MOST_IMPORTANT_K:
            if np.abs(cumulated[-1]/2-cumulated[bestindex])>np.abs(cumulated[-1]/2-cumulated[bestindex+1]):
                bestindex+=1
        
        lv = set(sortedvals[:bestindex,0])
        rv = set(sortedvals[bestindex:,0])
        return dfp.index[dfp.isin(lv)], dfp.index[dfp.isin(rv)]
    else:        
        median = dfp.median()
        dfl = dfp.index[dfp < median]
        dfr = dfp.index[dfp >= median]
        
        if len(dfl) < THE_MOST_IMPORTANT_K or len(dfr) < THE_MOST_IMPORTANT_K:
            median = dfp.mean() #Very clever hack, 200IQ code... xD
        
        dfl = dfp.index[dfp < median]
        dfr = dfp.index[dfp >= median]
        
        return (dfl, dfr)

In [9]:
def is_k_anonymous(df, partition, sensitive_column):
    """
    :param               df: The dataframe on which to check the partition.
    :param        partition: The partition of the dataframe to check.
    :param sensitive_column: The name of the sensitive column
    :param                k: The desired k
    :returns               : True if the partition is valid according to our k-anonymity criteria, False otherwise.
    """
    #COMMENT OUT THE NEXT 4 LINES IF WE DO NOT WANT TO ENFORCE DIFFERENT SENSITIVE COLUMNS IN PARTITIONS
    
    #sensitive_counts = df.loc[partition].groupby(sensitive_column).agg({sensitive_column : 'count'})
    #for sensitive_value, count in sensitive_counts[sensitive_column].items():
        #if count == 0:
            #return False
    
    if len(partition) < THE_MOST_IMPORTANT_K:
        return False
    return True

def partition_dataset(df, feature_columns, sensitive_column, scale, is_valid):
    """
    :param               df: The dataframe to be partitioned.
    :param  feature_columns: A list of column names along which to partition the dataset.
    :param sensitive_column: The name of the sensitive column (to be passed on to the `is_valid` function)
    :param            scale: The column spans as generated before.
    :param         is_valid: A function that takes a dataframe and a partition and returns True if the partition is valid.
    :returns               : A list of valid partitions that cover the entire dataframe.
    """
    finished_partitions = []
    partitions = [df.index]
    while partitions:
        partition = partitions.pop(0)
        #Lil' faster
        if len(partition)<2*THE_MOST_IMPORTANT_K:
            finished_partitions.append(partition)
            continue
        spans = get_spans(df[feature_columns], partition, scale)
        for column, span in sorted(spans.items(), key=lambda x:-x[1]):
            lp, rp = split(df, partition, column)
            if not is_valid(df, lp, sensitive_column) or not is_valid(df, rp, sensitive_column):
                continue
            partitions.extend((lp, rp))
            break
        else:
            finished_partitions.append(partition)
    return finished_partitions

In [10]:
feature_columns = ['age', 'education-num','race','native-country','workclass']
sensitive_column = 'income'
finished_partitions = partition_dataset(df, feature_columns, sensitive_column, full_spans, is_k_anonymous)

In [11]:
# we get the number of partitions that were created
len(finished_partitions)

527

In [12]:
def agg_categorical_column(series):
    return [','.join(set(series))]

def agg_numerical_column(series):
    return [series.mean()]

In [13]:
def build_anonymized_dataset_2(df, partitions, feature_columns, sensitive_column, max_partitions=None):
    aggregations = {}
    for column in feature_columns:
        if column in categorical:
            aggregations[column] = agg_categorical_column
        else:
            aggregations[column] = agg_numerical_column
    rows = []
    for i, partition in enumerate(partitions):
        if i % 100 == 1:
            print("Finished {} partitions...".format(i))
        if max_partitions is not None and i > max_partitions:
            break
        grouped_columns = df.loc[partition].agg(aggregations, squeeze=False)
        sensitive_counts = df.loc[partition].groupby(sensitive_column).agg({sensitive_column : 'count'})
        values = grouped_columns.iloc[0].to_dict()
        IsOkay = True
        #COMMENT OUT THE NEXT 3 LINES IF WE DO NOT WANT TO ENFORCE DIFFERENT SENSITIVE COLUMNS IN PARTITIONS
        #for sensitive_value, count in sensitive_counts[sensitive_column].items():
            #if count == 0:
                #IsOkay=False
        if IsOkay:
            for rowind in range(len(df.loc[partition])):
                currow = df.loc[partition].iloc[rowind,:].copy()
                for feature in feature_columns:
                    currow[feature]=grouped_columns[feature][0]
                rows.append(currow.copy())
    return pd.DataFrame(rows)

In [14]:
dfn2 = build_anonymized_dataset_2(df, finished_partitions, feature_columns, sensitive_column)

Finished 1 partitions...
Finished 101 partitions...
Finished 201 partitions...
Finished 301 partitions...
Finished 401 partitions...
Finished 501 partitions...


In [15]:
dfn2

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
1854,28.929825,"State-gov,Self-emp-inc,Self-emp-not-inc,Never-...",229842,HS-grad,8.491228,Never-married,Transport-moving,Unmarried,"Asian-Pac-Islander,Other,Black",Male,0,0,45,"South,Germany,Thailand,Vietnam,Dominican-Repub...",<=50K
2130,28.929825,"State-gov,Self-emp-inc,Self-emp-not-inc,Never-...",130620,11th,8.491228,Separated,Adm-clerical,Unmarried,"Asian-Pac-Islander,Other,Black",Female,0,0,40,"South,Germany,Thailand,Vietnam,Dominican-Repub...",<=50K
3179,28.929825,"State-gov,Self-emp-inc,Self-emp-not-inc,Never-...",216129,12th,8.491228,Never-married,Other-service,Not-in-family,"Asian-Pac-Islander,Other,Black",Female,0,0,40,"South,Germany,Thailand,Vietnam,Dominican-Repub...",<=50K
3376,28.929825,"State-gov,Self-emp-inc,Self-emp-not-inc,Never-...",462890,10th,8.491228,Married-civ-spouse,Transport-moving,Husband,"Asian-Pac-Islander,Other,Black",Male,0,0,50,"South,Germany,Thailand,Vietnam,Dominican-Repub...",<=50K
3812,28.929825,"State-gov,Self-emp-inc,Self-emp-not-inc,Never-...",70447,HS-grad,8.491228,Married-civ-spouse,Protective-serv,Husband,"Asian-Pac-Islander,Other,Black",Male,0,0,40,"South,Germany,Thailand,Vietnam,Dominican-Repub...",>50K
3909,28.929825,"State-gov,Self-emp-inc,Self-emp-not-inc,Never-...",139057,HS-grad,8.491228,Married-civ-spouse,Exec-managerial,Husband,"Asian-Pac-Islander,Other,Black",Male,0,0,84,"South,Germany,Thailand,Vietnam,Dominican-Repub...",>50K
4066,28.929825,"State-gov,Self-emp-inc,Self-emp-not-inc,Never-...",288585,HS-grad,8.491228,Married-civ-spouse,Other-service,Wife,"Asian-Pac-Islander,Other,Black",Female,0,0,20,"South,Germany,Thailand,Vietnam,Dominican-Repub...",<=50K
4457,28.929825,"State-gov,Self-emp-inc,Self-emp-not-inc,Never-...",93589,HS-grad,8.491228,Divorced,Protective-serv,Own-child,"Asian-Pac-Islander,Other,Black",Male,0,0,40,"South,Germany,Thailand,Vietnam,Dominican-Repub...",<=50K
6316,28.929825,"State-gov,Self-emp-inc,Self-emp-not-inc,Never-...",93076,HS-grad,8.491228,Never-married,Other-service,Own-child,"Asian-Pac-Islander,Other,Black",Male,0,0,40,"South,Germany,Thailand,Vietnam,Dominican-Repub...",<=50K
7404,28.929825,"State-gov,Self-emp-inc,Self-emp-not-inc,Never-...",493034,HS-grad,8.491228,Never-married,Other-service,Not-in-family,"Asian-Pac-Islander,Other,Black",Male,13550,0,50,"South,Germany,Thailand,Vietnam,Dominican-Repub...",>50K


In [16]:
import matplotlib.pyplot as plt
import numpy as np

sizes = np.zeros(len(finished_partitions))
for i in range(len(finished_partitions)):
    sizes[i]=finished_partitions[i].shape[0]

plt.figure()
plt.hist(sizes)
plt.savefig(str(THE_MOST_IMPORTANT_K)+"_census.png")

In [17]:
#ONE HOT encoding the whole thing

uniqitems = {}
for col in feature_columns:
    if col in categorical:
        uniqitems[col]=df[col].unique()

rows = []

for rowind in range(len(dfn2)):
    if rowind %100 ==0:
        print(rowind/len(dfn2)*100)
    currow = dfn2.iloc[rowind,:].copy()
    for col in feature_columns:
        if col in categorical:
            values = currow[col].split(',')
            
            for possibleitem in uniqitems[col]:
                if possibleitem in values:
                    currow=currow.append(pd.Series([1/len(values)],[col+'_'+possibleitem]))
                else:
                    currow=currow.append(pd.Series([0],[col+'_'+possibleitem]))
            currow=currow.drop(col)
    rows.append(currow.copy())

0.0
0.3071158748195694
0.6142317496391388
0.9213476244587083
1.2284634992782777
1.5355793740978472
1.8426952489174167
2.149811123736986
2.4569269985565554
2.7640428733761246
3.0711587481956943
3.3782746230152636
3.6853904978348333
3.992506372654402
4.299622247473972
4.606738122293541
4.913853997113111
5.220969871932681
5.528085746752249
5.835201621571819
6.142317496391389
6.449433371210958
6.756549246030527
7.0636651208500965
7.370780995669667
7.677896870489236
7.985012745308804
8.292128620128375
8.599244494947945
8.906360369767514
9.213476244587081
9.520592119406652
9.827707994226222
10.13482386904579
10.441939743865362
10.74905561868493
11.056171493504499
11.363287368324068
11.670403243143639
11.977519117963208
12.284634992782777
12.591750867602347
12.898866742421916
13.205982617241485
13.513098492061054
13.820214366880624
14.127330241700193
14.434446116519764
14.741561991339333
15.048677866158902
15.355793740978472
15.662909615798041
15.970025490617608
16.277141365437178
16.58425724

In [18]:
final_set = pd.DataFrame(rows)

In [19]:
final_set.to_csv(str(THE_MOST_IMPORTANT_K)+"_census.csv")

In [20]:
final_set

Unnamed: 0,age,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,...,native-country_Holand-Netherlands,workclass_State-gov,workclass_Self-emp-not-inc,workclass_Private,workclass_Federal-gov,workclass_Local-gov,workclass_?,workclass_Self-emp-inc,workclass_Without-pay,workclass_Never-worked
0,28.929825,229842,HS-grad,8.491228,Never-married,Transport-moving,Unmarried,Male,0,0,...,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.2
1,28.929825,130620,11th,8.491228,Separated,Adm-clerical,Unmarried,Female,0,0,...,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.2
2,28.929825,216129,12th,8.491228,Never-married,Other-service,Not-in-family,Female,0,0,...,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.2
3,28.929825,462890,10th,8.491228,Married-civ-spouse,Transport-moving,Husband,Male,0,0,...,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.2
4,28.929825,70447,HS-grad,8.491228,Married-civ-spouse,Protective-serv,Husband,Male,0,0,...,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.2
5,28.929825,139057,HS-grad,8.491228,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,...,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.2
6,28.929825,288585,HS-grad,8.491228,Married-civ-spouse,Other-service,Wife,Female,0,0,...,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.2
7,28.929825,93589,HS-grad,8.491228,Divorced,Protective-serv,Own-child,Male,0,0,...,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.2
8,28.929825,93076,HS-grad,8.491228,Never-married,Other-service,Own-child,Male,0,0,...,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.2
9,28.929825,493034,HS-grad,8.491228,Never-married,Other-service,Not-in-family,Male,13550,0,...,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.2
