# DataFrame anonymization with pandas

In [1]:
import os

import pandas as pd

# Location of the Titanic training CSV.
# The TITANIC_CSV environment variable, when set, overrides the original
# machine-specific default, so the notebook can be run elsewhere without
# editing this cell.
fpath = os.environ.get("TITANIC_CSV", "/home/eolus/Downloads/train_titanic.csv")

# Raw input frame; the last expression renders a preview of the first rows.
X = pd.read_csv(fpath)
X.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Names of the columns that hold sensitive data and must be pseudonymized.
ANON_FIELDS = ['Name', 'Age', 'Sex', 'Fare', 'Cabin']

In [3]:
def pseudonymizer(X, colNames=None, cat_ratio=0.5):
    '''
    Pseudonymize sensitive fields of a Pandas DataFrame.

    Every requested column is routed to exactly one strategy:

    * categorical -- the column has the ``category`` dtype, or holds fewer
      than ``cat_ratio * len(X)`` distinct values (a missing value counts as
      one distinct value): each value is replaced by its group number, as
      given by ``groupby(...).ngroup()``;
    * numerical -- each value is centred on the column mean and divided by
      the column standard deviation;
    * anything else (strings, mixed objects, ...) -- each value is replaced
      by the SHA-1 hex digest of its text representation.

    Cells that are missing in the input are missing in the output.
    The input DataFrame is never modified: the work is done on a copy.

    NOTE: this is pseudonymization, not strong anonymization. The SHA-1
    digests are unsalted (equal inputs give equal digests) and the
    categorical group numbers follow the sort order of the original values.

    Params:
        X : the input Pandas DataFrame
        colNames : the column(s) to be pseudonymised (list of str, or a
            single str). When omitted, the notebook-level ``ANON_FIELDS``
            list is looked up at call time, so re-running the cell that
            defines it takes effect without re-defining this function.
        cat_ratio : a column with fewer than ``cat_ratio * len(X)`` distinct
            values is treated as categorical (float, default 0.5)

    Return:
        X_result : the output Pandas DataFrame (a new object)

    Raises:
        KeyError : if a requested column is not present in ``X``
    '''
    # Function-scope imports, as in the original cell, so that this cell
    # does not depend on any import other than the notebook's pandas.
    from hashlib import sha1
    from pandas.api.types import is_numeric_dtype

    if colNames is None:
        colNames = ANON_FIELDS
    if isinstance(colNames, str):
        # A bare string would otherwise be iterated character by character.
        colNames = [colNames]
    # De-duplicate while keeping order: a column listed twice must not be
    # transformed twice (e.g. a hash of a hash).
    targets = list(dict.fromkeys(colNames))

    missing = [name for name in targets if name not in X.columns]
    if missing:
        raise KeyError("Column(s) not found in DataFrame: {}".format(missing))

    def anon_str(X, colName):
        """
        Anonymizes field of string data in a DataFrame

        Params:
            X : the DataFrame to update (modified in place)
            colName: name of column to be anonymized (str)

        Return:
            The same DataFrame; on failure a message is printed and the
            column is left unchanged (best effort)
        """

        def sha1_hex(value):
            return sha1("{}".format(value).encode('utf-8')).hexdigest()

        try:
            X[colName] = X[colName].map(sha1_hex)
        except Exception as e:
            print(e)
            print("Exception on column :{}; no anonymization".format(colName))
        return X

    def anon_num(X, colName):
        """
        Anonymize a numerical column in a dataframe (centre and scale)

        params:
            X : the DataFrame to update (modified in place)
            colName : name of column to be anonymized

        return:
            The same DataFrame; on failure a message is printed and the
            column is left unchanged (best effort)
        """
        try:
            column = X[colName]
            centred = column - column.mean()
            spread = column.std()
            # `spread > 0` is False for a zero as well as for an undefined
            # (NaN) deviation, e.g. a constant or single-row column: keep the
            # centred values rather than dividing by zero.
            X[colName] = centred / spread if spread > 0 else centred
        except Exception as e:
            print(e)
            print("Column :{}, not anonymized".format(colName))
        return X

    def anon_cat(X, colName):
        """
        Anonymize a categorical column in a dataframe

        params:
            X : the DataFrame to update (modified in place)
            colName : name of column to be anonymized

        return:
            The same DataFrame, with the column replaced by group numbers
        """
        X[colName] = X.groupby(colName).ngroup().astype('object')
        return X

    # ---
    # 0. Record where the input holds non-missing values
    # 1. Work on a copy so the caller's DataFrame keeps its raw values
    # 2. Sort the requested columns by kind (category, numeric, other)
    # 3. Apply the matching anonymizer to each column
    # 4. Blank out every cell that was missing in the input
    # ---
    present_mask = X.notna()
    X = X.copy()
    n_rows = len(X)

    colNames_cat = []  # Categorical data, column names
    colNames_str = []  # String / other data, column names
    colNames_num = []  # Numerical data, column names

    for colName in targets:
        column = X[colName]
        if (column.dtype.name == 'category'
                or column.nunique(dropna=False) < cat_ratio * n_rows):
            colNames_cat.append(colName)
        elif is_numeric_dtype(column):
            colNames_num.append(colName)
        else:
            # Fail closed: whatever is neither categorical nor numeric is
            # hashed instead of being silently left in clear.
            colNames_str.append(colName)

    # Columns are independent of each other, so these loops could be run
    # in parallel if needed.
    for names, anon in (
        (colNames_num, anon_num),
        (colNames_str, anon_str),
        (colNames_cat, anon_cat),
    ):
        for colName in names:
            X = anon(X, colName)

    # Missing inputs were transformed like any other value (hash of their
    # text form, group number -1, ...): reset them to missing.
    return X.where(present_mask)

In [4]:
# Pseudonymize the configured sensitive columns, then preview the result.
X_anon = pseudonymizer(X, colNames=ANON_FIELDS)
X_anon.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,15ad0f99384bb905c6e24446885e60900cbbfa24,1,28,1,0,A/5 21171,18,,S
1,2,1,1,1c980d28a0aeefcd688d81f6b3fc053ad7b7b967,0,51,1,0,PC 17599,207,81.0,C
2,3,1,3,c9bf698f1f97320e62917e8bd2109b7cc20cdcdc,0,34,0,0,STON/O2. 3101282,41,,S
3,4,1,1,1f3c61887b3f0ef942a84374be8f922da816bdae,0,47,1,0,113803,189,55.0,S
4,5,0,3,63a6a50e3e4ad7e967def958b0b8ab51d889c7bf,1,47,0,0,373450,43,,S
