In [1]:
import pandas as pd
import numpy as np

THE_MOST_IMPORTANT_K = 64 #Changed this, so i can use it in the split function as well, without any kind of refactoring.

In [2]:
df = pd.read_csv("./titanic/train.csv", sep=",", header=0, index_col=False, engine='python');# We load the data using Pandas

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.drop(columns=["PassengerId", "Name"], inplace=True) # dropped because unique for every row
df.drop(columns=["Ticket", "Cabin"], inplace=True) # dropped because almost unique for every row
df.dropna(inplace=True)

In [5]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
categorical = [
    'Survived',
    'Pclass',
    'Sex',
    'SibSp',
    'Parch',
    'Embarked'
]

for name in categorical:
    df[name] = df[name].astype('category')
    
cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Survived']
df = df[cols]
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,male,22.0,1,0,7.25,S,0
1,1,female,38.0,1,0,71.2833,C,1
2,3,female,26.0,0,0,7.925,S,1
3,1,female,35.0,1,0,53.1,S,1
4,3,male,35.0,0,0,8.05,S,0


In [7]:
def get_spans(df, partition, scale=None):
    """
    :param        df: the dataframe for which to calculate the spans
    :param partition: the partition for which to calculate the spans
    :param     scale: if given, the spans of each column will be divided
                      by the value in `scale` for that column
    :        returns: The spans of all columns in the partition
    """
    spans = {}
    for column in df.columns:
        if column in categorical:
            span = len(df[column][partition].unique())
        else:
            span = df[column][partition].max()-df[column][partition].min()
        if scale is not None:
            span = span/scale[column]
        spans[column] = span
    return spans

In [8]:
full_spans = get_spans(df, df.index)
full_spans

{'Age': 79.58,
 'Embarked': 3,
 'Fare': 512.3292,
 'Parch': 7,
 'Pclass': 3,
 'Sex': 2,
 'SibSp': 6,
 'Survived': 2}

In [9]:
def split(df, partition, column):
    """
    :param        df: The dataframe to split
    :param partition: The partition to split
    :param    column: The column along which to split
    :        returns: A tuple containing a split of the original partition
    """
    dfp = df[column][partition]
    if column in categorical:
        # Commenting out the next 4 lines should make the world a better place, but who am i to know this for sure...
        # But it does not... Reality is 
        values = dfp.unique()
        lv = set(values[:len(values)//2])
        rv = set(values[len(values)//2:])
        return dfp.index[dfp.isin(lv)], dfp.index[dfp.isin(rv)]
        
        indexes = dfp.value_counts().index
        values = dfp.value_counts().values
        
        zipped = zip(indexes,values)
        sortedvals = np.array(sorted(zipped, key=lambda x: x[1]))
        sortedvalues = np.array(sortedvals[:,1],dtype=int)
        cumulated = np.cumsum(sortedvalues)
        
        bestindex = 0
        for i in range(len(cumulated)):
            if cumulated[i]>=THE_MOST_IMPORTANT_K and (cumulated[-1]-cumulated[i])>=THE_MOST_IMPORTANT_K and cumulated[i]<cumulated[-1]/2:
                bestindex = i
        if cumulated[bestindex+1]>=THE_MOST_IMPORTANT_K and (cumulated[-1]-cumulated[bestindex+1])>=THE_MOST_IMPORTANT_K:
            if np.abs(cumulated[-1]/2-cumulated[bestindex])>np.abs(cumulated[-1]/2-cumulated[bestindex+1]):
                bestindex+=1
        
        lv = set(sortedvals[:bestindex,0])
        rv = set(sortedvals[bestindex:,0])
        return dfp.index[dfp.isin(lv)], dfp.index[dfp.isin(rv)]
    else:        
        median = dfp.median()
        dfl = dfp.index[dfp < median]
        dfr = dfp.index[dfp >= median]
        
        if len(dfl) < THE_MOST_IMPORTANT_K or len(dfr) < THE_MOST_IMPORTANT_K:
            median = dfp.mean() #Very clever hack, 200IQ code... xD
        
        dfl = dfp.index[dfp < median]
        dfr = dfp.index[dfp >= median]
        
        return (dfl, dfr)

In [10]:
def is_k_anonymous(df, partition, sensitive_column):
    """
    :param               df: The dataframe on which to check the partition.
    :param        partition: The partition of the dataframe to check.
    :param sensitive_column: The name of the sensitive column
    :param                k: The desired k
    :returns               : True if the partition is valid according to our k-anonymity criteria, False otherwise.
    """
    #COMMENT OUT THE NEXT 4 LINES IF WE DO NOT WANT TO ENFORCE DIFFERENT SENSITIVE COLUMNS IN PARTITIONS
    
    #sensitive_counts = df.loc[partition].groupby(sensitive_column).agg({sensitive_column : 'count'})
    #for sensitive_value, count in sensitive_counts[sensitive_column].items():
        #if count == 0:
            #return False
    
    if len(partition) < THE_MOST_IMPORTANT_K:
        return False
    return True

def partition_dataset(df, feature_columns, sensitive_column, scale, is_valid):
    """
    :param               df: The dataframe to be partitioned.
    :param  feature_columns: A list of column names along which to partition the dataset.
    :param sensitive_column: The name of the sensitive column (to be passed on to the `is_valid` function)
    :param            scale: The column spans as generated before.
    :param         is_valid: A function that takes a dataframe and a partition and returns True if the partition is valid.
    :returns               : A list of valid partitions that cover the entire dataframe.
    """
    finished_partitions = []
    partitions = [df.index]
    while partitions:
        partition = partitions.pop(0)
        #Lil' faster
        if len(partition)<2*THE_MOST_IMPORTANT_K:
            finished_partitions.append(partition)
            continue
        spans = get_spans(df[feature_columns], partition, scale)
        for column, span in sorted(spans.items(), key=lambda x:-x[1]):
            lp, rp = split(df, partition, column)
            if not is_valid(df, lp, sensitive_column) or not is_valid(df, rp, sensitive_column):
                continue
            partitions.extend((lp, rp))
            break
        else:
            finished_partitions.append(partition)
    return finished_partitions

In [11]:
feature_columns = ['Sex', 'Age', 'SibSp', 'Parch', 'Embarked']
sensitive_column = 'Survived'
finished_partitions = partition_dataset(df, feature_columns, sensitive_column, full_spans, is_k_anonymous)

In [12]:
# we get the number of partitions that were created
len(finished_partitions)

6

In [13]:
def agg_categorical_column(series):
    series = series.astype(str)
    return [','.join(set(series))]

def agg_numerical_column(series):
    return [series.mean()]

In [14]:
def build_anonymized_dataset_2(df, partitions, feature_columns, sensitive_column, max_partitions=None):
    aggregations = {}
    for column in feature_columns:
        if column in categorical:
            aggregations[column] = agg_categorical_column
        else:
            aggregations[column] = agg_numerical_column
    rows = []
    #print(aggregations)
    #print(partitions)
    for i, partition in enumerate(partitions):
        if i % 10 == 1:
            print("Finished {} partitions...".format(i))
        if max_partitions is not None and i > max_partitions:
            break
        grouped_columns = df.loc[partition].agg(aggregations, squeeze=False)
        sensitive_counts = df.loc[partition].groupby(sensitive_column).agg({sensitive_column : 'count'})
        values = grouped_columns.iloc[0].to_dict()
        IsOkay = True
        #COMMENT OUT THE NEXT 3 LINES IF WE DO NOT WANT TO ENFORCE DIFFERENT SENSITIVE COLUMNS IN PARTITIONS
        #for sensitive_value, count in sensitive_counts[sensitive_column].items():
            #if count == 0:
                #IsOkay=False
        if IsOkay:
            for rowind in range(len(df.loc[partition])):
                currow = df.loc[partition].iloc[rowind,:].copy()
                for feature in feature_columns:
                    currow[feature]=grouped_columns[feature][0]
                rows.append(currow.copy())
    return pd.DataFrame(rows)

In [15]:
dfn2 = build_anonymized_dataset_2(df, finished_partitions, feature_columns, sensitive_column)

Finished 1 partitions...


In [16]:
dfn2

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
2,3,female,16.377953,435102,2310,7.9250,"C,Q,S",1
9,2,female,16.377953,435102,2310,30.0708,"C,Q,S",1
10,3,female,16.377953,435102,2310,16.7000,"C,Q,S",1
14,3,female,16.377953,435102,2310,7.8542,"C,Q,S",0
22,3,female,16.377953,435102,2310,8.0292,"C,Q,S",1
24,3,female,16.377953,435102,2310,21.0750,"C,Q,S",0
38,3,female,16.377953,435102,2310,18.0000,"C,Q,S",0
39,3,female,16.377953,435102,2310,11.2417,"C,Q,S",1
43,2,female,16.377953,435102,2310,41.5792,"C,Q,S",1
44,3,female,16.377953,435102,2310,7.8792,"C,Q,S",1


In [17]:
import matplotlib.pyplot as plt
import numpy as np

sizes = np.zeros(len(finished_partitions))
for i in range(len(finished_partitions)):
    sizes[i]=finished_partitions[i].shape[0]

plt.figure()
plt.hist(sizes)
plt.savefig(str(THE_MOST_IMPORTANT_K)+"_titanic.png")

In [18]:
#ONE HOT encoding the whole thing

uniqitems = {}
for col in categorical:
    uniqitems[col]=df[col].unique()

rows = []

for rowind in range(len(dfn2)):
    if rowind %71 ==0:
        print(rowind/len(dfn2)*100)
    currow = dfn2.iloc[rowind,:].copy()
    for col in categorical:
        if col != 'Survived':
            values = str(currow[col]).split(',')
            
            for possibleitem in uniqitems[col]:
                possibleitem = str(possibleitem)
                if possibleitem in values:
                    currow=currow.append(pd.Series([1/len(values)],[col+'_'+possibleitem]))
                else:
                    currow=currow.append(pd.Series([0],[col+'_'+possibleitem]))
            currow=currow.drop(col)
    rows.append(currow.copy())

0.0
9.97191011235955
19.9438202247191
29.91573033707865
39.8876404494382
49.859550561797754
59.8314606741573
69.80337078651685
79.7752808988764
89.74719101123596
99.71910112359551


In [19]:
final_set = pd.DataFrame(rows)

In [20]:
final_set.to_csv(str(THE_MOST_IMPORTANT_K)+"_titanic.csv")

In [21]:
final_set

Unnamed: 0,Age,Fare,Survived,Pclass_3,Pclass_1,Pclass_2,Sex_male,Sex_female,SibSp_1,SibSp_0,...,Parch_0,Parch_1,Parch_2,Parch_5,Parch_3,Parch_4,Parch_6,Embarked_S,Embarked_C,Embarked_Q
0,16.377953,7.9250,1,1.0,0.0,0.0,0.0,1.0,0.166667,0.166667,...,0.25,0.25,0.25,0.0,0.25,0.0,0.0,0.333333,0.333333,0.333333
1,16.377953,30.0708,1,0.0,0.0,1.0,0.0,1.0,0.166667,0.166667,...,0.25,0.25,0.25,0.0,0.25,0.0,0.0,0.333333,0.333333,0.333333
2,16.377953,16.7000,1,1.0,0.0,0.0,0.0,1.0,0.166667,0.166667,...,0.25,0.25,0.25,0.0,0.25,0.0,0.0,0.333333,0.333333,0.333333
3,16.377953,7.8542,0,1.0,0.0,0.0,0.0,1.0,0.166667,0.166667,...,0.25,0.25,0.25,0.0,0.25,0.0,0.0,0.333333,0.333333,0.333333
4,16.377953,8.0292,1,1.0,0.0,0.0,0.0,1.0,0.166667,0.166667,...,0.25,0.25,0.25,0.0,0.25,0.0,0.0,0.333333,0.333333,0.333333
5,16.377953,21.0750,0,1.0,0.0,0.0,0.0,1.0,0.166667,0.166667,...,0.25,0.25,0.25,0.0,0.25,0.0,0.0,0.333333,0.333333,0.333333
6,16.377953,18.0000,0,1.0,0.0,0.0,0.0,1.0,0.166667,0.166667,...,0.25,0.25,0.25,0.0,0.25,0.0,0.0,0.333333,0.333333,0.333333
7,16.377953,11.2417,1,1.0,0.0,0.0,0.0,1.0,0.166667,0.166667,...,0.25,0.25,0.25,0.0,0.25,0.0,0.0,0.333333,0.333333,0.333333
8,16.377953,41.5792,1,0.0,0.0,1.0,0.0,1.0,0.166667,0.166667,...,0.25,0.25,0.25,0.0,0.25,0.0,0.0,0.333333,0.333333,0.333333
9,16.377953,7.8792,1,1.0,0.0,0.0,0.0,1.0,0.166667,0.166667,...,0.25,0.25,0.25,0.0,0.25,0.0,0.0,0.333333,0.333333,0.333333
