In [1]:
import pandas as pd
import numpy as np
import copy as cp
import darc_core
import sys
import hashlib as hs
import base64 as b64
import os
import random as rd
import datetime as dt
from darc_core.metrics import Metrics
from darc_core.preprocessing import round1_preprocessing
from darc_core.utils import check_format_trans_file

In [2]:
%%time
df = pd.read_csv("./ground_truth.csv", parse_dates=["date"])
df["hours"]= pd.to_datetime(df["hours"], format = '%H:%M%S').dt.hour
df["month"]=df["date"].dt.month
df["year"]=df["date"].dt.year
df["id_user"]=df["id_user"].astype(str)
df["price"]=df["price"].astype("float")
df["qty"]=df["qty"].astype("float")
df_dict = df.T.to_dict('list')
df.head()

CPU times: user 23.4 s, sys: 566 ms, total: 24 s
Wall time: 24.5 s


In [3]:
%%time
def mergeIdentical(df,df_dict):
    """ Merge "same" transaction into one line. Assign "DEL" to duplicates
        Transactions are "identical" if their keys are equal
        
        :map: key:[id_user,date,id_item] value:[qty_cumulative,count]
    """
    map={}
    for d in range(0,len(df)):
        if(map.get((df["id_user"][d],df["date"][d],df["id_item"][d]))):
            df_dict[d][0]="DEL"
            df_dict[d][1]=df_dict[d][1].strftime("%Y-%m-01")
            df_dict[d][2]=""
            df_dict[d][3]=""
            df_dict[d][4]=""
            df_dict[d][5]=""
        else:
            map[(df["id_user"][d],df["date"][d],df["id_item"][d])]=[df.loc[d]["qty"],1]
    
    dfn = pd.DataFrame.from_dict(df_dict,orient="index")
    dfn = dfn.rename(columns={0:"id_user",1:"date",2:"hours",3:"id_item",4:"price",5:"qty",6:"month",7:"year"})
    return dfn,map

dfn,map=mergeIdentical(df,df_dict)
save_dfn_merge=dfn.copy()

CPU times: user 2min, sys: 990 ms, total: 2min 1s
Wall time: 2min 5s


In [5]:
def split(df, partition, column): 
    """
        Split df[partition] over column on median 
        :return: List[[IndexRange]]*2 {left ,right}
    """
    dfp = df["id_user"][partition]
    sanitizedPartition = dfp.index[dfp != "DEL"]
    dfp = df[column][sanitizedPartition]
    if (column in categorical):
        values = dfp.unique()
        dfp.sort_values(inplace=True)
        lv=set(values[:len(values)//2])
        rv=set(values[len(values)//2:])
        return dfp.index[dfp.isin(lv)],dfp.index[dfp.isin(rv)]
    else:
        median=dfp.median()
        dfp.sort_values(inplace=True)
        dfl=dfp.index[dfp<median]
        dfr=dfp.index[dfp>=median]
        return(dfl,dfr)

In [6]:
def is_k_anonymous (df, partition, k=1):
    if ((len(partition)<k)):
        return False
    return True

In [24]:
%%time
dfn["id_item"]=dfn["id_item"].astype("category")
categorical = set ({'id_item'})
def partition_dataset(df, is_valid):
    """
        Greedy search
        Split dataframe over {year,month,id_item} 
        :return: List[[IndexRange]] 
    """
    finished_partitions=[]
    partitions=[df.index]
    while partitions:    
        partition = partitions.pop(0)
        columns = ["year","month", "id_item"]
        for column in columns:
            lp, rp = split(df,partition,column)
            if not is_valid(df,lp) or not is_valid(df,rp):
                continue
            partitions.extend((lp,rp))
            break
        else:
            finished_partitions.append(partition)
    return finished_partitions
finished_partitions = partition_dataset(dfn, is_k_anonymous)
save_finished_partitions=finished_partitions.copy()

CPU times: user 11min 47s, sys: 4.45 s, total: 11min 52s
Wall time: 11min 57s


In [29]:
%%time
dfn["id_item"]=dfn["id_item"].astype("category")
categorical = set ({'id_item'})
finished_partitions=save_finished_partitions
def make_l_diverse(df,partitions,l=9):
    """
         DEL partitions with l-diversity < l
    """
    i=0
    filtered=[]
    while(len(partitions)):
        if (df.loc[partitions[0],"id_user"].nunique()<l):
            df.loc[partitions[0],"id_user"]="DEL"
            if(isinstance(df.loc[partitions[0],["date"]].squeeze(),dt.date)):
                df.loc[partitions[0],["date"]]=df.loc[partitions[0],["date"]].squeeze().strftime('%Y-%m-01')
            else:
                df.loc[partitions[0],["date"]]=df.loc[partitions[0],["date"]].squeeze().apply(lambda x: x.strftime('%Y-%m-01'))
            df.loc[partitions[0],["hours","id_item","price","qty",]]="" 
            partitions.pop(0)           
        else:
            filtered.append(partitions.pop(0))       
    return dfn, filtered
dfn, finished_partitions = make_l_diverse(dfn,finished_partitions)
save_diversity=finished_partitions.copy()

CPU times: user 31min 40s, sys: 34.2 s, total: 32min 14s
Wall time: 33min 14s


In [30]:
def shuffleDates(df,partition):
    """
         For same partition, swap {dates,qty} randomly.
    """
    date_array=df.loc[partition,"date"].copy().squeeze()
    qty_array=df.loc[partition,"qty"].copy().squeeze()
    date_array.reset_index(drop=True,inplace=True) 
    qty_array.reset_index(drop=True,inplace=True)
    date_shuffled=[]
    qty_shuffled=[]
    while(len(date_array)):
        i=rd.randrange(len(date_array))
        date_shuffled.append(date_array[i])
        date_array.drop(i,inplace=True)
        date_array.reset_index(drop=True,inplace=True)
        qty_shuffled.append(qty_array[i])
        qty_array.drop(i,inplace=True)
        qty_array.reset_index(drop=True,inplace=True)
    df.loc[partition,"date"]=date_shuffled
    df.loc[partition,"qty"]=qty_shuffled
    return df

In [None]:
%%time
def build_anonymized_dataset(df,partitions):
    """
        Generalization of Columns {dates,qty,hours}
    """
    for p in partitions:
        df.loc[p,["hours"]]=df.loc[p,["hours"]].mean().values[0]
        df=shuffleDates(df,p)
    return df
dfn=build_anonymized_dataset(dfn,finished_partitions)
save_dfn=dfn.copy()

In [None]:
%%time
dfn.id_user=dfn.id_user.astype(str)
dfn_copy=dfn.copy()

def generateSalts():
    """ 
        Generate Salt{256b} Table : Mij = Salt for user i in month j 
    """
    # Set of {1..12} (number of the month)
    ordered_months = list(range(1,13))
    # Set of all unique user's IDs
    ids = dfn_copy["id_user"].unique()
    ids = ids[ids != "DEL"]
    # Generate salt table
    salt_table=pd.DataFrame(columns=ordered_months,index=ids)
    salt_table.set_index(ids,inplace=True)
    for i in ids:
        for j in ordered_months:
            #Â Generates unique b64 values
            salt_table[j][i]=os.urandom(256)
    return salt_table

salt_table = generateSalts()
salt_table.head()

In [None]:
salt_table

In [None]:
def hashme(id_user, month): 
    """
        :return: Hash[user,month] = SHA256(Salt[user,month] + id_user)
    """
    if(id_user=="DEL"):
        return id_user
    else:
        hashme.counter+=1
        percent=(hashme.counter/307054)*100 
        sys.stdout.write("\rProgress %i -- Count : %i / 307054" % (percent,hashme.counter))
        sys.stdout.flush()
        return hs.sha256(salt_table.loc[[id_user],month][0] + id_user.encode()).hexdigest()
#Static Counter for ProgressBar
hashme.counter=0

In [None]:
%%time
def doHash():
    """
        Apply hashme for all rows
    """
    dfn_copy.sort_values(["id_user","month"])
    salt_table.sort_index(inplace=True)
    dfn["id_user"]=dfn_copy.apply(lambda x: hashme(x["id_user"],x["month"]), axis=1)
doHash()
hash_dfn=dfn.copy()

In [None]:
%%time
def hour_to_int(hours):
    if(isinstance(hours,float)):
        return int(hours)
def parse_dates(date):
    if(isinstance(date,dt.date) and not pd.isnull(date)):
        return date.strftime('%Y/%m/%d')
dfn["qty"]=df["qty"].astype("int")
dfn["price"]=df["price"].round(2)
dfn["date"]=dfn.apply(lambda x: parse_dates(x["date"]),axis=1)
dfn["hours"]=dfn.apply(lambda x: hour_to_int(x["hours"]), axis=1)
dfn.drop(["month","year"],axis=1,inplace=True)
dfn.sort_values(["date"],inplace=True)
dfn.to_csv("./atx1.csv",index=False)
dfa=pd.read_csv("./atx1.csv")
dfa

In [None]:
%%time
ground_truth, submission = round1_preprocessing(
    "./ground_truth.csv", "./atx1.csv"
)
check_format_trans_file(ground_truth, submission)
metric = Metrics(ground_truth, submission)
scores = metric.scores()