## Imports and Data Initialization

In [9]:
import pandas as pd

In [10]:
# Load the ground-truth transactions; "date" is parsed as a datetime column.
df = pd.read_csv("../data/ground_truth.csv", parse_dates=["date"])
# "hours" holds a time of day such as 08:02:06, so the format needs a colon
# before the seconds ('%H:%M%S' cannot match that layout).
# to_datetime fills in the placeholder date 1900-01-01.
# TODO: find a way to drop that placeholder date while keeping a datetime dtype.
df["hours"] = pd.to_datetime(df["hours"], format="%H:%M:%S")
df.head()

FileNotFoundError: [Errno 2] File b'data/ground_truth.csv' does not exist: b'data/ground_truth.csv'

In [None]:
# Store the identifier-like columns with the "category" dtype.
df["id_item"] = df["id_item"].astype("category")
df["date"] = df["date"].astype("category")
# Columns whose span is measured by counting distinct values (see get_spans).
categorical = {"date", "id_item"}

In [None]:
# Initial memory usage (as reported by df.info()): 14.1+ MB
df.info()

In [None]:
# Quasi-identification based on the date and the time of day
matches_day = df["date"] == "2010-12-01"
matches_time = df["hours"] == "1900-01-01 08:02:06"
df[matches_day & matches_time]

In [None]:
# Number of distinct users (nunique ignores missing values by default)

df["id_user"].nunique()

In [None]:
# The quantity bought by user 17850 is very often 6 (215/297 = 72%)

df.loc[df["id_user"] == 17850, "qty"].value_counts()

In [None]:
# Item 84406B is bought most often by user 17850: 15 of 176 purchases (8%),
# versus a mean of 0.9% per other user, i.e. more than 8 times as often.
# (Appending .mean() to the expression below gives the mean count per user.)

df.loc[df["id_item"] == "84406B", "id_user"].value_counts()

In [None]:
# User 12688 is the one who bought the most on 2011-08-18

df.loc[df["date"] == "2011-08-18", "id_user"].value_counts()

In [None]:
# Range of purchased quantities as an ordered (min, max) pair. A set literal
# does not guarantee the display order and collapses to one element when
# min == max, so a tuple is used instead.
(df["qty"].min(), df["qty"].max())

# Implementation of k-anonymity

In [None]:
def get_spans(df, partition, scale=None):
    """Measure how widely each column of ``df`` varies inside ``partition``.

    :param df: dataframe whose columns are measured
    :param partition: index labels of the rows to consider
    :param scale: optional mapping ``column -> value``; when given, each span
                  is divided by ``scale[column]``
    :return: dict mapping every column name of ``df`` to its span. Columns
             listed in the module-level ``categorical`` set are measured by
             their number of distinct values, all others by ``max - min``.
    """
    def _span_of(column):
        if column in categorical:
            width = len(df[column][partition].unique())
        else:
            width = df[column][partition].max() - df[column][partition].min()
        # Optional normalisation by the reference span of the same column.
        return width if scale is None else width / scale[column]

    return {column: _span_of(column) for column in df.columns}

In [None]:
full_spans = get_spans(df,df.index)
full_spans

In [None]:
def split(df, partition, column):
    """Split ``partition`` in two according to the values found in ``column``.

    :param df: the dataframe from which we want to split a partition
    :param partition: the index labels (rows) we want to select from ``df``
    :param column: the column of ``df`` used to operate the split
    :return: a pair ``(left, right)`` of index objects taken from ``partition``

    ``df[column][partition]`` is the sub-series of ``df[column]`` restricted to
    the rows whose labels are in ``partition``; a partition is nothing more
    than a set of row labels (``df.index`` being the partition of all rows).

    Example::

        index   age (numerical)   gender (categorical)
          0          19              female
          1          25              female
          2          14              male
          3          51              female

    * ``column = age``: sorted ages are {14, 19, 25, 51}, so the median is
      (19 + 25) / 2 = 22. Rows below the median go left, the others right:
      ``({2, 0}, {1, 3})``.
    * ``column = gender``: distinct values in order of appearance are
      {female, male}; the first half of those values goes left, the rest
      right: ``({0, 1, 3}, {2})``.

    For columns split by distinct values, the two halves can have very
    different sizes. With a single distinct value the left side is empty and
    the right side holds everything, so "median" is not meaningful there.
    """
    dfp = df[column][partition]

    # Columns in ``categorical`` plus "date" and "hours" are split by their
    # distinct values; every other column is split around its median.
    by_distinct_values = (column in categorical) or (column in ("date", "hours"))

    if not by_distinct_values:
        pivot = dfp.median()
        below = dfp.index[dfp < pivot]
        at_or_above = dfp.index[dfp >= pivot]
        return (below, at_or_above)

    distinct = dfp.unique()
    middle = len(distinct) // 2
    first_half = set(distinct[:middle])
    second_half = set(distinct[middle:])
    return dfp.index[dfp.isin(first_half)], dfp.index[dfp.isin(second_half)]

In [None]:
# Scratch check: with true division (Python 3) 39/2 evaluates to 19.5, not 19
39/2

In [None]:
# Try the split on the whole dataset using the id_item column, then show both halves
dfl,dfr = split(df,df.index,"id_item")
dfl,dfr

In [None]:
# A partition with fewer than k members is not valid, so a split producing it is rejected
def is_k_anonymous(df, partition, sensitive_column, k=5000):
    """Tell whether ``partition`` holds at least ``k`` rows.

    :param df: unused; kept so the function fits the ``is_valid`` callback
    :param partition: collection of row labels (anything supporting ``len``)
    :param sensitive_column: unused; kept for the same reason as ``df``
    :param k: minimum number of rows required
    :return: ``False`` when ``len(partition) < k``, ``True`` otherwise
    """
    return not (len(partition) < k)

In [None]:
def partition_dataset(df, feature_columns, sensitive_column, scale, is_valid):
    """Recursively cut ``df`` into partitions that all satisfy ``is_valid``.

    :param df: the dataframe to partition
    :param feature_columns: the quasi-identifier columns
    :param sensitive_column: the sensitive data we wish to protect
    :param scale: reference spans (the full spans of ``df`` before the first
                  split) used to normalise the spans of each partition
    :param is_valid: callable ``(df, partition, sensitive_column) -> bool``
    :return: list of partitions (index objects) that could not be split further
    """
    finished = []
    # A partition is a set of row labels; start with one partition holding
    # every row of the dataframe.
    pending = [df.index]

    # Each pass takes the oldest pending partition and tries to cut it in two
    # (more or less equal) parts; see ``split`` for why "more or less".
    while pending:
        current = pending.pop(0)

        # Normalised span of every quasi-identifier inside this partition.
        spans = get_spans(df[feature_columns], current, scale)

        # Try the quasi-identifiers from the widest span to the narrowest.
        widest_first = sorted(spans.items(), key=lambda item: -item[1])

        was_split = False
        for column, _span in widest_first:
            left, right = split(df, current, column)
            # The cut is kept only when BOTH halves are valid; otherwise the
            # next quasi-identifier is tried.
            if is_valid(df, left, sensitive_column) and is_valid(df, right, sensitive_column):
                pending.extend((left, right))
                was_split = True
                break

        # No quasi-identifier gave two valid halves: the partition is final.
        if not was_split:
            finished.append(current)
    return finished

In [None]:
# Quasi-identifier columns and the sensitive column passed to partition_dataset
feature_columns=["date", "qty"]
sensitive_column="price"
# Partition the whole dataset; spans are normalised by the full-dataset spans
finished_partitions=partition_dataset(df,feature_columns,sensitive_column, full_spans, is_k_anonymous)

In [None]:
# Number of partitions produced
len(finished_partitions)
#df["date"][finished_partitions[0]]
#finished_partitions[0]
#finished_partitions

In [None]:
# NOTE(review): only the last expression of a cell is displayed, so the value of
# this division is discarded; 305 is presumably the partition count — confirm.
len(df)/305
# Distribution of qty inside the partition at index 5
df["qty"][finished_partitions[5]].value_counts()

In [None]:
"""
Various handlers for .agg() method to call on each column of df.loc[partition] depending on the column type

:param   series: Column Set (from df.loc[partition]) to be processed 
"""

def agg_categorical_column(series):
    """Collapse a categorical column into one comma-separated label.

    :param series: column values of one partition
    :return: one-element list holding the distinct values joined by ``','``
    """
    # str() lets non-string values be joined, and sorting makes the label
    # deterministic (iteration order of a set of strings can differ between
    # interpreter runs).
    distinct_labels = {str(value) for value in series}
    return [','.join(sorted(distinct_labels))]

def agg_numerical_column(series):
    """Summarise a numerical column by its mean.

    :param series: column values of one partition
    :return: one-element list holding ``series.mean()``
    """
    average = series.mean()
    return [average]

def agg_date_column(series):
    """Summarise a date column by its earliest and latest value.

    :param series: date values of one partition
    :return: ``[[earliest, latest]]`` when ``series`` is a pandas Series;
             the placeholder ``["something"]`` for any other input
             (NOTE(review): presumably reached when pandas applies the
             aggregator element by element — confirm).
    """
    if isinstance(series, pd.Series):
        # idxmin/idxmax have to be *called*: indexing with the bound method
        # itself only worked because pandas invokes callable keys.
        earliest = series.loc[series.idxmin()]
        latest = series.loc[series.idxmax()]
        return [[earliest, latest]]
    return ["something"]


In [None]:
def build_anonymized_dataset(df,partitions,feature_columns,sensitive_column,max_partitions=None):
    """Build the anonymised table: generalised features plus sensitive counts.

    :param df: the original dataframe
    :param partitions: iterable of partitions (row labels usable with ``df.loc``)
    :param feature_columns: quasi-identifier columns to aggregate per partition
    :param sensitive_column: column whose values are kept and counted
    :param max_partitions: optional cap on the number of partitions processed
    :return: dataframe with one row per (partition, sensitive value) pair,
             holding the aggregated feature values, the sensitive value and
             its ``count`` inside the partition
    """
    # Pick the aggregator of each feature: numerical columns use the mean,
    # "date" (when categorical) its own handler, other categoricals a join.
    aggregations = {}
    for column in feature_columns:
        if column not in categorical:
            aggregations[column] = agg_numerical_column
        elif column == "date":
            aggregations[column] = agg_date_column
        else:
            aggregations[column] = agg_categorical_column

    rows = []
    for i, partition in enumerate(partitions):
        if i % 100 == 1:
            print("Finished {} partitions ! ".format(i))
        # Stop once max_partitions partitions have been processed; the former
        # `i > max_partitions` test let one extra partition through.
        if max_partitions is not None and i >= max_partitions:
            break

        members = df.loc[partition]
        # One aggregated value per feature column.
        # NOTE(review): the effect of squeeze=False depends on the pandas
        # version in use — confirm before upgrading.
        grouped_columns = members.agg(aggregations, squeeze=False)
        # Occurrences of every sensitive value inside the partition.
        sensitive_counts = members.groupby(sensitive_column).agg({
            sensitive_column : 'count'
        })
        values = grouped_columns.iloc[0].to_dict()
        for sensitive_value, count in sensitive_counts[sensitive_column].items():
            if count == 0:
                continue
            values.update({
                sensitive_column : sensitive_value,
                'count' : count,
            })
            # Copy, because `values` is reused for the next sensitive value.
            rows.append(values.copy())
    return pd.DataFrame(rows)


In [None]:
# Build the anonymised table from the finished partitions
dfn=build_anonymized_dataset(df,finished_partitions,feature_columns,sensitive_column)

In [None]:
# Group the anonymised rows by their aggregated date value
sectors=dfn.groupby("date")
#dfn["date"].value_counts().head(60)

In [None]:
# Export the anonymised table to the current working directory.
# NOTE(review): ':' is not allowed in Windows file names — confirm the target
# platforms before relying on this name.
dfn.to_csv("k_anon:date+qty:price.csv")

In [None]:
# Display the anonymised table
dfn

In [None]:
# Display the date column of the original data
df["date"]

In [None]:
#df["date"].min()-df["date"].max()

In [None]:
# Replace each date by its month number.
# NOTE(review): this overwrites the date column in place, so cells that compare
# df["date"] with a date string, and a second run of this cell, no longer see
# the original dates — confirm this is intended.
df["date"]=pd.DatetimeIndex(df["date"]).month

In [None]:
# Purchases per user on 2010-12-01.
# NOTE(review): if df["date"] has already been replaced by month numbers, this
# comparison with a date string presumably matches nothing — confirm.
df[df["date"]=="2010-12-01"]["id_user"].value_counts()

In [4]:
# Number of distinct items; requires df to be loaded first in the same kernel session
df["id_item"].nunique()

NameError: name 'df' is not defined