In [1]:
import pandas as pd
import numpy as np
import copy as cp
import darc_core
import sys
import hashlib as hs
import base64 as b64
import os
import random as rd
import datetime as dt
#from darc_core.metrics import Metrics
from darc_core.utils import M_COL, T_COL, T_COL_IT, NB_GUESS, SIZE_POOL
from darc_core.preprocessing import round1_preprocessing
from darc_core.utils import check_format_trans_file

In [5]:
%%time
df = pd.read_csv("./ground_truth.csv", parse_dates=["date"])
df["hours"]= pd.to_datetime(df["hours"], format = '%H:%M%S').dt.hour
df["month"]=df["date"].dt.month
df["year"]=df["date"].dt.year
df["id_user"]=df["id_user"].astype(str)
df["price"]=df["price"].astype("float")
df["qty"]=df["qty"].astype("float")
df_dict = df.T.to_dict('list')
df.head()

CPU times: user 23.2 s, sys: 228 ms, total: 23.4 s
Wall time: 23.5 s


In [3]:
df.head()

Unnamed: 0,id_user,date,hours,id_item,price,qty,month,year
0,17850,2010-12-01,8,85123A,2.55,6.0,12,2010
1,17850,2010-12-01,8,71053,3.39,6.0,12,2010
2,17850,2010-12-01,8,84406B,2.75,8.0,12,2010
3,17850,2010-12-01,8,84029G,3.39,6.0,12,2010
4,17850,2010-12-01,8,84029E,3.39,6.0,12,2010


In [6]:
%%time
def mergeIdentical(df,df_dict):
    """ Merge "same" transaction into one line. Assign "DEL" to duplicates
        Transactions are "identical" if their keys are equal
        
        :map: key:[id_user,date,id_item] value:[qty_cumulative,count]
    """
    map={}
    for d in range(0,len(df)):
        if(map.get((df["id_user"][d],df["date"][d],df["id_item"][d]))):
            map[(df["id_user"][d],df["date"][d],df["id_item"][d])][0] += df_dict[d][5]
            map[(df["id_user"][d],df["date"][d],df["id_item"][d])][1] += 1
            df_dict[d][0]="DEL"
            df_dict[d][1]=df_dict[d][1].strftime("%Y-%m-01")
            df_dict[d][2]=""
            df_dict[d][3]=""
            df_dict[d][4]=""
            df_dict[d][5]=""
        else:
            map[(df["id_user"][d],df["date"][d],df["id_item"][d])]=[df.loc[d]["qty"],1]
    
    dfn = pd.DataFrame.from_dict(df_dict,orient="index")
    dfn = dfn.rename(columns={0:"id_user",1:"date",2:"hours",3:"id_item",4:"price",5:"qty",6:"month",7:"year"})
    
    for i in map:
        if(map[i][1]>1):
            #map[i][0]=map[i][0]/map[i][1]
            mask= (df["id_user"]==i[0]) & (df["date"]==i[1]) & (df["id_item"]==i[2])
            dfn.loc[mask,["qty"]]=map[i][0]
    
  
    return dfn,map

dfn,map=mergeIdentical(df,df_dict)
save_dfn_merge=dfn.copy()

CPU times: user 12min 25s, sys: 4.8 s, total: 12min 30s
Wall time: 12min 44s


In [7]:
def split(df, partition, column): 
    """
        Split df[partition] over column on median 
        :return: List[[IndexRange]]*2 {left ,right}
    """
    dfp = df["id_user"][partition]
    sanitizedPartition = dfp.index[dfp != "DEL"]
    dfp = df[column][sanitizedPartition]
    if (column in categorical):
        values = dfp.unique()
        dfp.sort_values(inplace=True)
        lv=set(values[:len(values)//2])
        rv=set(values[len(values)//2:])
        return dfp.index[dfp.isin(lv)],dfp.index[dfp.isin(rv)]
    else:
        median=dfp.median()
        dfp.sort_values(inplace=True)
        dfl=dfp.index[dfp<median]
        dfr=dfp.index[dfp>=median]
        return(dfl,dfr)

In [8]:
def is_k_anonymous (df, partition, k=1):
    if ((len(partition)<k)):
        return False
    return True

In [11]:
%%time
dfn["id_item"]=dfn["id_item"].astype("category")
categorical = set ({'id_item'})
def partition_dataset(df, is_valid):
    """
        Greedy search
        Split dataframe over {year,month,id_item} 
        :return: List[[IndexRange]] 
    """
    finished_partitions=[]
    partitions=[df.index]
    while partitions:    
        partition = partitions.pop(0)
        columns = ["year","month", "id_item"]
        for column in columns:
            lp, rp = split(df,partition,column)
            if not is_valid(df,lp) or not is_valid(df,rp):
                continue
            partitions.extend((lp,rp))
            break
        else:
            finished_partitions.append(partition)
    return finished_partitions
finished_partitions = partition_dataset(dfn, is_k_anonymous)
save_finished_partitions=finished_partitions.copy()

CPU times: user 13min 46s, sys: 9.42 s, total: 13min 55s
Wall time: 14min 36s


In [12]:
%%time
dfn["id_item"]=dfn["id_item"].astype("category")
categorical = set ({'id_item'})
finished_partitions=save_finished_partitions
def make_l_diverse(df,partitions,l=3):
    """
         DEL partitions with l-diversity < l
    """
    i=0
    filtered=[]
    while(len(partitions)):
        if (df.loc[partitions[0],"id_user"].nunique()<l):
            df.loc[partitions[0],"id_user"]="DEL"
            if(isinstance(df.loc[partitions[0],["date"]].squeeze(),dt.date)):
                df.loc[partitions[0],["date"]]=df.loc[partitions[0],["date"]].squeeze().strftime('%Y-%m-01')
            else:
                df.loc[partitions[0],["date"]]=df.loc[partitions[0],["date"]].squeeze().apply(lambda x: x.strftime('%Y-%m-01'))
            df.loc[partitions[0],["hours","id_item"]]="" 
            partitions.pop(0)           
        else:
            filtered.append(partitions.pop(0))       
    return dfn, filtered
dfn, finished_partitions = make_l_diverse(dfn,finished_partitions)
save_diversity=finished_partitions.copy()

CPU times: user 13min 29s, sys: 18.3 s, total: 13min 47s
Wall time: 14min 5s


In [13]:
def shuffleDates(df,partition):
    """
         For same partition, swap {dates,qty} randomly.
    """
    date_array=df.loc[partition,"date"].copy().squeeze()
    qty_array=df.loc[partition,"qty"].copy().squeeze()
    date_array.reset_index(drop=True,inplace=True) 
    qty_array.reset_index(drop=True,inplace=True)
    date_shuffled=[]
    qty_shuffled=[]
    while(len(date_array)):
        i=rd.randrange(len(date_array))
        date_shuffled.append(date_array[i])
        date_array.drop(i,inplace=True)
        date_array.reset_index(drop=True,inplace=True)
        qty_shuffled.append(qty_array[i])
        qty_array.drop(i,inplace=True)
        qty_array.reset_index(drop=True,inplace=True)
    df.loc[partition,"date"]=date_shuffled
    df.loc[partition,"qty"]=qty_shuffled
    return df

In [14]:
%%time
def build_anonymized_dataset(df,partitions):
    """
        Generalization of Columns {dates,qty,hours}
    """
    for p in partitions:
        df.loc[p,["hours"]]=df.loc[p,["hours"]].mean().values[0]
        df=shuffleDates(df,p)
    return df
dfn=build_anonymized_dataset(dfn,finished_partitions)
save_dfn=dfn.copy()

CPU times: user 24min 45s, sys: 37.6 s, total: 25min 23s
Wall time: 26min 1s


In [15]:
%%time
dfn.id_user=dfn.id_user.astype(str)
dfn_copy=dfn.copy()

def generateSalts():
    """ 
        Generate Salt{256b} Table : Mij = Salt for user i in month j 
    """
    # Set of {1..12} (number of the month)
    ordered_months = list(range(1,13))
    # Set of all unique user's IDs
    ids = dfn_copy["id_user"].unique()
    ids = ids[ids != "DEL"]
    # Generate salt table
    salt_table=pd.DataFrame(columns=ordered_months,index=ids)
    salt_table.set_index(ids,inplace=True)
    for i in ids:
        for j in ordered_months:
            # Generates unique b64 values
            salt_table[j][i]=os.urandom(256)
    return salt_table

salt_table = generateSalts()
salt_table.head()

CPU times: user 6.88 s, sys: 632 ms, total: 7.52 s
Wall time: 7.72 s


In [16]:
def hashme(id_user, month): 
    """
        :return: Hash[user,month] = SHA256(Salt[user,month] + id_user)
    """
    if(id_user=="DEL"):
        return id_user
    else:
        hashme.counter+=1
        percent=(hashme.counter/307054)*100 
        sys.stdout.write("\rProgress %i -- Count : %i / 307054" % (percent,hashme.counter))
        sys.stdout.flush()
        return hs.sha256(salt_table.loc[[id_user],month][0] + id_user.encode()).hexdigest()
#Static Counter for ProgressBar
hashme.counter=0

In [17]:
%%time
def doHash():
    """
        Apply hashme for all rows
    """
    dfn_copy.sort_values(["id_user","month"])
    salt_table.sort_index(inplace=True)
    dfn["id_user"]=dfn_copy.apply(lambda x: hashme(x["id_user"],x["month"]), axis=1)
doHash()
hash_dfn=dfn.copy()

Progress 93 -- Count : 285896 / 307054CPU times: user 16min 40s, sys: 2min, total: 18min 40s
Wall time: 23min 1s


In [30]:
%%time
def hour_to_int(hours):
    if(isinstance(hours,float)):
        return int(hours)
def parse_dates(date):
    if(isinstance(date,dt.date) and not pd.isnull(date)):
        return date.strftime('%Y/%m/%d')
dfn["qty"]=df["qty"].astype("int")
dfn["price"]=df["price"].round(2)
dfn["date"]=dfn["date"].apply(lambda x: x.strftime('%Y/%m/%d'))
dfn["hours"]=dfn.apply(lambda x: hour_to_int(x["hours"]), axis=1)
dfn.drop(["month","year"],axis=1,inplace=True)
dfn.sort_values(["date"],inplace=True)
dfn.to_csv("./atxtest.csv",index=False)
dfa=pd.read_csv("./atxtest.csv")
dfa

CPU times: user 31.2 s, sys: 414 ms, total: 31.6 s
Wall time: 32.4 s


In [24]:
%%time
ground_truth, submission = round1_preprocessing(
    "./ground_truth.csv", "./atx5.csv"
)
check_format_trans_file(ground_truth, submission)

CPU times: user 14 s, sys: 389 ms, total: 14.4 s
Wall time: 14.6 s


In [31]:
%%time
metric = Metrics(ground_truth, submission)
scores = metric.scores_reid()
scores



CPU times: user 30min 16s, sys: 27.5 s, total: 30min 43s
Wall time: 1h 20s


In [32]:
scores

[0.001156, 0.000267, 0.000178, 0.002046, 0.000178, 0.002224, 0.003439]

In [77]:
scores = metric.scores_util()
scores

[0.8568029966868055,
 0,
 0.7850147308995272,
 0.0,
 0.431553882,
 0.2057,
 0.028296,
 0.000177,
 8.8e-05,
 0.014767,
 8.8e-05,
 0.01291,
 0.00336]

In [4]:
dfn=pd.read_csv("./atx1.csv", parse_dates=["date"])

In [9]:
dfn[dfn["id_user"]=="DEL"]["qty"].value_counts

<bound method IndexOpsMixin.value_counts of 1         24
2         24
4         24
15         4
19         2
27        12
29        48
31        48
33        12
43        12
44        12
50        12
51         1
53         1
57         1
58         1
59         2
60         2
63        12
64         6
67         2
70         1
76        25
77        25
83        25
85        24
92         8
94         1
96         1
97         1
          ..
306763    25
306812    12
306813    12
306815     6
306829     2
306836     4
306846    12
306861    12
306864    12
306867    24
306920    12
306924    24
306947    48
306949    12
306950     4
306951    12
306953    12
306955    12
306957    96
306971     8
306972    12
306973    24
306974    12
306976    12
306993    15
306994    15
307026    20
307035    12
307043    12
307049    12
Name: qty, Length: 63166, dtype: int64>

In [10]:
mask=dfn["id_user"]=="DEL"
dfn.loc[mask,"date"]=""

In [22]:
dfn.loc[mask,["qty","price","hours","id_item"]]=""

In [80]:
def parse_dates(date):
    if(isinstance(date,dt.date)):
        return date.strftime('%Y/%m/%d')
dfn["date"]=dfn.apply(lambda x: parse_dates(x["date"]), axis=1)

In [40]:
dfn.to_csv("./atx2.csv",index=False)

In [87]:
dfn=hash_dfn.copy()

In [29]:
dfn["date"]=dfn["date"].astype("datetime64[ns]")

In [95]:
dfn[dfn["id_user"]=="DEL"]["date"].value_counts()

2011-11-01    3017
2011-10-01    2455
2010-12-01    2154
2011-09-01    1532
2011-05-01    1466
2011-03-01    1441
2011-06-01    1437
2011-08-01    1412
2011-01-01    1384
2011-04-01    1359
2011-02-01    1357
2011-07-01    1352
2011-12-01     792
Name: date, dtype: int64

In [96]:
dfn["date"]

0        2010-12-16
1        2010-12-07
2        2010-12-19
3        2010-12-07
4        2010-12-08
5        2010-12-01
6        2010-12-06
7        2010-12-05
8        2010-12-06
9        2010-12-06
10       2010-12-02
11       2010-12-14
12       2010-12-08
13       2010-12-06
14       2010-12-03
15       2010-12-10
16       2010-12-16
17       2010-12-14
18       2010-12-01
19       2010-12-01
20       2010-12-14
21       2010-12-01
22       2010-12-05
23       2010-12-15
24       2010-12-13
25       2010-12-21
26       2010-12-19
27       2010-12-05
28       2010-12-09
29       2010-12-01
            ...    
307024   2011-12-08
307025   2011-12-06
307026   2011-12-09
307027   2011-11-13
307028   2011-11-17
307029   2011-11-14
307030   2011-11-18
307031   2011-12-02
307032   2011-11-20
307033   2011-11-13
307034   2011-11-28
307035   2011-12-05
307036   2011-11-23
307037   2011-11-11
307038   2011-11-30
307039   2011-11-21
307040   2011-11-06
307041   2011-11-29
307042   2011-11-21


In [22]:
"""
File: metrics.py
Author: Antoine Laurent
Email: laurent.antoine@courrier.uqam.ca
Github: https://github.com/Drayer34
Description: class for all re-identification metrics
"""

import sys
from collections import OrderedDict
import time
import math
from multiprocessing import Pool
from pathos.multiprocessing import ProcessingPool as PPool
from functools import partial

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix

class Metrics(object):

    """Super class Metrics for ReidentificationMetrics and UtilityMetrics. It genreate the S data
    from AT one.

    Attributes are :
        :_users: M table containing all users present in the transaction data T (pandas DataFrame).
        :_ground_truth: T table containing all transaction of all user for one year (pandas DataFrame).
        :_anonymized: S table, the anonymized version of the _ground_truth (pandas DataFrame).
        :_users_t_col: the name of the columns in the csv file M.
        :_gt_t_col: the name of the columns in the csv file T.
        :_current_score: current score calculated by the metric already processed.
    """

    def __init__(self, T, AT, M_col=M_COL, T_col=T_COL, T_col_it=T_COL_IT):
        """
        :_users: M table containing all users present in the transaction data T (pandas DataFrame).
        :_ground_truth: T table containing all transaction of all user for one year (pandas DataFrame).
        :_anonymized: S table, the anonymized version of the _ground_truth (pandas DataFrame).
        :_users_t_col: the name of the columns in the csv file M.
        :_gt_t_col: the name of the columns in the csv file T.
        :_gt_t_col: the name of the columns in the csv file T for iteration.
        :_current_score: current score calculated by the metric already processed.
        """
        self._ground_truth = T
        self._users = pd.DataFrame(self._ground_truth.id_user.drop_duplicates().sort_values(), columns=["id_user"]).reset_index(drop=True)
        self._anon_trans = AT
        self._users_t_col = M_col
        self._gt_t_col = T_col
        self._gt_t_col_it = T_col_it
        self._current_score = []

        #  TODO: Cast all data into a fonction  <07-12-18, yourname> #
        self._ground_truth[self._gt_t_col['id_user']] = self._ground_truth[self._gt_t_col['id_user']].apply(str)
        self._anon_trans[self._gt_t_col['id_user']] = self._anon_trans[self._gt_t_col['id_user']].apply(str)

        self._anonymized = self.generate_S_data()

        self._f_orig = self.f_orig()


    def scores_util(self):
        return [self._e1_metric(), self._e2_metric(), self._e3_metric(), self._e4_metric(), self._e5_metric(), self._e6_metric()]

    def scores_reid(self):
        return [self._s1_metric(), self._s2_metric(), self._s3_metric(), self._s4_metric(), self._s5_metric(), self._s6_metric(), self._s7_metric()]

    def scores(self):
        return self.scores_util()

    def generate_S_data(self):
        """Generate S data from AT data.
        :returns: S

        """
        data = self._anon_trans.copy()
        # Remove NaN value from DataFrame
        data = data.dropna()
        # Remove 'DEL' row in DataFrame
        data = data[data[self._gt_t_col['id_user']] != "DEL"]
        #  TODO:check si il y a une seed pour l'aléa  <30-05-18, yourname> #
        data = data.sample(frac=1).reset_index(drop=True)

        return data

    def month_passed(self, date):
        """ Get the month from a date, month should be between 0 and 11

        :date: a date in format YYYY/MM/DD
        :return: integer between 0 and 11 """
        return 0 if date.split('/')[0] == '2010' else int(date.split('/')[1])

    def f_orig(self):
        """Generate the F file for the original data, to compare it with the F^ file.

        :returns: F file original

        """

        # Initialization
        f_orig = pd.DataFrame(columns=['id_user', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
        f_orig.id_user = self._ground_truth[self._gt_t_col['id_user']].value_counts().index
        f_orig = f_orig.sort_values('id_user').reset_index(drop=True)
        f_orig.loc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]] = "DEL"


        seen = set()
        for row in self._ground_truth.itertuples():
            id_orig = row[self._gt_t_col_it['id_user']]
            month = self.month_passed(row[self._gt_t_col_it['date']])
            id_ano = self._anon_trans.loc[row[0], self._gt_t_col['id_user']]
            item = "{}-{}-{}".format(id_orig, month, id_ano)
            if item not in seen and id_ano != "DEL":
                seen.add(item)
                f_orig.loc[f_orig.id_user == id_orig, month] = id_ano

        return f_orig

    def compare_f_files(self, f_hat):
        """Compare the two F files to compute the difference and thus the score

        :f: the original f file to compare (pandas DataFrame)
        :f_hat: the guessed f file computed by the metric or adversary (pandas DataFrame)

        :returns: score
        """

        #tps1 = time.clock()
        map_error = 0
        score = 0
        count = 0
        total = 0

        #we want the same list of users
        if set(self._f_orig['id_user']).difference(set(f_hat['id_user'])):
            map_error = 1

        if map_error == 0:
            for row in self._f_orig.itertuples():
                # Compare each tuple, if they are egual over all month then gain 12 points
                # One points per similarities
                f_ori_tuple = row[1:]
                f_hat_tuple = tuple(f_hat[f_hat['id_user'] == row[1]].iloc[0])
                for i in range(1, 13):
                    if f_ori_tuple[i] != "DEL":
                        total += 1
                        count += self._compare_row_f_file(f_ori_tuple[i], f_hat_tuple[i])
        #            if f_ori_tuple[i] == f_hat_tuple[i] and f_ori_tuple[i] != "DEL":
        #                count += 1


        if map_error == 0:
            score += round(float(count)/float(total), 6)

        return score

    def _compare_row_f_file(self, row_orig, row_sub):
        """
        Check if row_orig is a substring of row_sub, i.e did the participant submit
        the good id among all id submitted
        """
        row_sub = row_sub.split(':')
        count = 0
        for i in range(len(row_sub)):
            if row_orig in row_sub[i]:
                count = (NB_GUESS - i)/NB_GUESS
        return count

    def _gen_value_id_dic(self, attrs):
        """Generate the dictionaty which associate the value of the attributes attrs in the
        DataFrame to an user ID.

        :attrs: the list of attibutes to check for creating the value
        :return: dictionary of value:id
        """
        value_dic = {}
        for row in self._anonymized.itertuples():
            # Create the value with all the row[attrs] concat with ":"
            value = ':'.join([str(row[elt]) for elt in attrs])
            value_dic[value] = row[self._gt_t_col_it['id_user']]
        return value_dic

    def _guess_inialisation(self):
        """Generate a virgin F^ file with DEL on each column for each id

        :return: the dictionary of id:pseudos
        """
        guess = OrderedDict()
        for row in self._users.itertuples():
            # Fill dic[id] with DEL
            guess[str(row[self._users_t_col['id_user']])] = ['DEL' for i in range(13)]
        return guess

    def _tronc_product_id(self, num):
        """ Tronc the product ID to the number num.
            Example :
                id_prod = ABCDEF and num = 2
                result = AB

        :num: the number of characters to keep.
        """
        col_id_item = self._gt_t_col['id_item']
        self._anonymized.loc[:, col_id_item] = self._anonymized.loc[:, col_id_item]\
                                            .apply(lambda s: s[:min(len(s), num)])
        self._ground_truth.loc[:, col_id_item] = self._ground_truth.loc[:, col_id_item]\
                                            .apply(lambda s: s[:min(len(s), num)])

    def _evaluate(self, attrs):
        """ Evaluate the similtude between T and S on attributs attrs.

        :attrs: attributes to check.
        :return: F^ the guess of Pseudo for each user and each month.
        """

        dic_value_anon = self._gen_value_id_dic(attrs)
        guess = self._guess_inialisation()
        for row in self._ground_truth.itertuples():
            #create the concat of the attributes to watch
            value = ':'.join([str(row[i]) for i in attrs])
            id_user = row[self._gt_t_col_it['id_user']]
            if value in dic_value_anon.keys():
                #recover month of the transaction
                month = self.month_passed(row[self._gt_t_col_it['date']])
                #affect pseudo for id_user where value == value
                guess[id_user][month] = dic_value_anon[value]

        f_hat = pd.DataFrame(guess).transpose()
        f_hat = f_hat.reset_index()
        f_hat = f_hat.rename(columns={'index':'id_user'})

        return f_hat

    def _subset(self, month):
        """
        TODO
        """
        subset1 = time.clock()
        # On définit le masque en fonction du mois, beacoup de cas -> le code est dégeu
        if month != 0 :
            if month < 9:
                start = '2011-0'+str(month)+'-01'
                end = '2011-0'+str(month+1) +'-01'
            if month == 9:
                start = '2011-0'+str(month)+'-01'
                end = '2011-10-01'
            if month == 12:
                start = '2011-'+str(month)+'-01'
                end = '2012-01-01'
            else:
                start = '2011-'+str(month)+'-01'
                end = '2011-'+str(month+1) +'-01'
        else:
            start = '2010-12-01'
            end = '2010-12-31'
        self._ground_truth['date'] = pd.to_datetime(self._ground_truth['date'])
        mask1 = (self._ground_truth['date'] >= start) & (self._ground_truth['date'] < end)
        self._anonymized['date'] = pd.to_datetime(self._anonymized['date'])
        mask2 = (self._anonymized['date'] >= start) & (self._anonymized['date'] < end)

        t_month=self._ground_truth.loc[mask1]
        a_month=self._anonymized.loc[mask2]

        subset2 = time.clock()
        return t_month, a_month, month

    @staticmethod
    def _compute_score(a, t):
        """
        TODO : Find a way to put it in a lambda
        """
        return len(a.intersection(t))/len(t)

    def _find_k_guess(self, t_month, a_month, month):
        """
        TODO
        """
        #k_guess1 = time.clock()

        #Fabrication du vecteur d'item pour chaque utilisateur
        t_month_group=t_month.groupby('id_user')['id_item'].apply(set)
        a_month_group=a_month.groupby('id_user')['id_item'].apply(set)

        # Début du calculs des coefficients de similiarité.
        d_month=dict()
        d_month['month'] = month

        for id_t in t_month_group.index:
            list_guess = list()
            guess = str()
            inter_a_t = a_month_group.apply( lambda x : self._compute_score(x, t_month_group.loc[id_t]))
            top_guess = inter_a_t.nlargest(NB_GUESS)
            for index in top_guess.index :
                 guess += str(index) + ':'
            d_month[id_t] = guess[:-1]
        #k_guess2 = time.clock()
        return d_month

    def _s1_metric(self):
        """Calculate metric S1, comparing date and quantity buy on each row.
        Update the current score value

        :returns: the score on this metric (between 0 and 1)

        """
        date_col = self._gt_t_col_it['date']
        qty_col = self._gt_t_col_it['qty']

        f_hat = self._evaluate([date_col, qty_col])

        score = self.compare_f_files(f_hat)
        # Add the score to the global score for this metric
        self._current_score.append(score)

        return score

    def _s2_metric(self):
        """Calculate metric S2, comparing date and quantity buy on each row.

        :returns: the score on this metric (between 0 and 1)

        """
        id_item_col = self._gt_t_col_it['id_item']
        price_col = self._gt_t_col_it['price']

        f_hat = self._evaluate([id_item_col, price_col])

        score = self.compare_f_files(f_hat)
        # Add the score to the global score for this metric
        self._current_score.append(score)

        return score

    def _s3_metric(self):
        """Calculate metric S3, comparing date and quantity buy on each row.

        :returns: the score on this metric (between 0 and 1)

        """
        id_item_col = self._gt_t_col_it['id_item']
        qty_col = self._gt_t_col_it['qty']

        f_hat = self._evaluate([id_item_col, qty_col])

        score = self.compare_f_files(f_hat)
        # Add the score to the global score for this metric
        self._current_score.append(score)

        return score

    def _s4_metric(self):
        """Calculate metric S4, comparing date and quantity buy on each row.

        :returns: the score on this metric (between 0 and 1)

        """
        date_col = self._gt_t_col_it['date']
        id_item_col = self._gt_t_col_it['id_item']

        f_hat = self._evaluate([date_col, id_item_col])

        score = self.compare_f_files(f_hat)
        # Add the score to the global score for this metric
        self._current_score.append(score)

        return score

    def _s5_metric(self):
        """Calculate metric S5, comparing date and quantity buy on each row.

        :returns: the score on this metric (between 0 and 1)

        """
        id_item_col = self._gt_t_col_it['id_item']
        price_col = self._gt_t_col_it['price']
        qty_col = self._gt_t_col_it['qty']

        f_hat = self._evaluate([id_item_col, price_col, qty_col])

        score = self.compare_f_files(f_hat)
        # Add the score to the global score for this metric
        self._current_score.append(score)

        return score

    def _s6_metric(self):
        """Calculate metric S6, comparing date and quantity buy on each row.

        :returns: the score on this metric (between 0 and 1)

        """
        id_item_col = self._gt_t_col_it['id_item']
        date_col = self._gt_t_col_it['date']
        price_col = self._gt_t_col_it['price']

        f_hat = self._evaluate([id_item_col, date_col, price_col])

        score = self.compare_f_files(f_hat)
        # Add the score to the global score for this metric
        self._current_score.append(score)

        return score

    def _s7_metric(self):
        """
        TODO
        """
        def _reid_multi(month):
            return self._find_k_guess(*self._subset(month))

        Df_list = list()
        F_list = list()
        with PPool(SIZE_POOL) as p:
            F_list = p.map( _reid_multi, [i for i in range(13)])
        F_list = sorted(F_list, key=lambda x : x['month'], reverse = False)
        dtypes = {'id_user': str}
        user_id = pd.DataFrame(sorted(self._ground_truth["id_user"].unique()))
        user_id.columns = ['id_user']
        user_id = user_id.set_index('id_user')
        Df_list.append(user_id)
        for dict_month in F_list:
            col = dict_month.pop("month")
            df = pd.DataFrame.from_dict(dict_month, orient='index')
            df.columns = [col]
            Df_list.append(df)
        F_file = pd.concat(Df_list, axis=1, join_axes=[Df_list[0].index])
        F_file = F_file.reset_index()
        F_file = F_file.fillna('DEL')

        score = self.compare_f_files(F_file)
        self._current_score.append(score)

        return score

    def _calc_sim_mat_dist(self, item_item_dic1, item_item_dic2):
        """ Calcul the distance between two item_item matrix.

        return: the distance between the matrix, max value is 1
        """
        sim_dist = 0
        item_item_dic1_sum = 0

        for item_no, item2_no in item_item_dic1:
            item_item_dic1_sum += item_item_dic1[(item_no, item2_no)]

            if (item_no, item2_no) in item_item_dic2:
                sim_dist += math.fabs(item_item_dic1[(item_no, item2_no)] - item_item_dic2[(item_no, item2_no)])
            else:
                sim_dist += item_item_dic1[(item_no, item2_no)]

        sim_dist /= item_item_dic1_sum

        if sim_dist > 1.0:
            sim_dist = 1.0
        return sim_dist

    def _compare_date_gt_anon(self):
        """Compare each date in T and AT, it's mean to value the transformation applied to the date
        in AT.

        :returns: The total of all the difference between the date in T and AT

        """
        score = 0

        for idx in range(self._anon_trans.shape[0]):
            # Skip this loop if id_user == DEL
            if self._anon_trans.iloc[idx, self._gt_t_col_it['id_user']-1] == "DEL":
                continue

            # Get the date from the data at index idx
            #  TODO: .iloc should be used here for safety reason but iloc does not keep columns
            #  value <12-06-18, Antoine> #
            gt_day = self._ground_truth.loc[idx, self._gt_t_col['date']]
            anon_day = self._anon_trans.loc[idx, self._gt_t_col['date']]

            #  TODO: Mettre ça dans un fichier qui test tous les formats et fait des messages
            #  d'erreurs approprié  <12-06-18, Antoine> #
            gt_day = pd.datetime.strptime(gt_day, '%Y/%m/%d')
            anon_day = pd.datetime.strptime(anon_day, '%Y/%m/%d')

            score += abs((gt_day - anon_day).days)

        score = np.round(float(score)/float(31 * self._anon_trans.shape[0]), 10)

        return score

    def _compare_price_gt_anon(self):
        """Compare each price in T and AT, it's mean to value the transformation applied to the
        price in AT.

        :returns: The total of all the difference between the price in T and AT

        """
        score = 0

        for idx in range(self._anon_trans.shape[0]):
            # Skip this loop if id_user == DEL
            if self._anon_trans.iloc[idx, self._gt_t_col_it['id_user']-1] == "DEL":
                continue

            # Get the date from the data at index idx
            gt_price = float(self._ground_truth.loc[idx, self._gt_t_col['price']])
            anon_price = float(self._anon_trans.loc[idx, self._gt_t_col['price']])

            # Get the difference between date1 and date2 in days
            score += (1 - min(gt_price, anon_price)/max(gt_price, anon_price))

        score = np.round(float(score)/float(self._anon_trans.shape[0]), 10)

        return score

    def _compute_median_qty(self):
        """Compute the median of all qty of item buyed

        :return: median of all qty of item buyed
        """
        # Creating item x user sparse matrix for the ground_truth
        gt_id_item_id_user = self.ground_truth.groupby(
            [self._gt_t_col['id_item'], self._gt_t_col['id_user']]
        )[self._gt_t_col['qty']].sum()

        # Recovering median
        median = gt_id_item_id_user.median()

        return median

    def _collaborative_filtering_item_user(self, data, e2=False):
        """Compute the matrix of cosine similarity between all items

        :data: dataframe from which to compute the collaborative_filtering
        :returns: matrix |col_item|x|col_item|

        """

        id_items_ori = self.ground_truth[self.gt_t_col["id_item"]].unique()

        # Creating item x user sparse matrix for the ground_truth
        data_id_item_id_user = data.groupby(
            [self._gt_t_col['id_item'], self._gt_t_col['id_user']]
        )[self._gt_t_col['qty']].sum()

        if e2:
            # item x user < median
            data_id_item_id_user = data_id_item_id_user[
                data_id_item_id_user < self._compute_median_qty()
            ]

        # Create dataframe matrix
        data_id_item_id_user = data_id_item_id_user.unstack(level=1).to_sparse()

        items_differ = set(id_items_ori) - set(data_id_item_id_user.index)

        if items_differ:
            df_temp = pd.DataFrame(index=items_differ, columns=data_id_item_id_user.columns)
            data_id_item_id_user = data_id_item_id_user.append(df_temp)

        data_id_item_id_user = data_id_item_id_user.sort_index()

        return cosine_similarity(data_id_item_id_user.fillna(0))

    def _e1_metric(self):
        """ Construct a similarity matrix of item buyed (User that have bought this item also bought
        item_i). Here the score is maximized if the quantity is high (calculated by dozen). We
        calculate the difference between the two matrix of item buyed as a score.

        More precisly we construct two matrix M1 and M2, one for the original dataset and one for
        the anonymised one. Both are of size `n x n` where `n` is the number of item. For M_ij
        represent the number of people who have bought the item i and have also bought the item j.

        This procede is called a collaborative filtering
        (https://en.wikipedia.org/wiki/Collaborative_filtering).

        :returns: score of the metric.

        """

        # Copy dataframe
        ground_truth = self._ground_truth.copy()
        anon_trans = self._anon_trans.copy()
        anon_trans = anon_trans.drop(
            anon_trans[anon_trans[self._gt_t_col['id_user']] == 'DEL'].index, axis=0
        ).reset_index(drop=True)

        #  TODO: can be done one time only  <21-12-18, yourname> #
        anon_trans[self._gt_t_col['qty']] = anon_trans[self._gt_t_col['qty']].apply(int)

        # Compute the cosinus similarity item x item
        gt_cos_sim = self._collaborative_filtering_item_user(ground_truth)

        # Compute the cosinus similarity item x item
        at_cos_sim = self._collaborative_filtering_item_user(anon_trans)

        # Compute score
        diff_cos = abs(gt_cos_sim - at_cos_sim)
        score = min(1, diff_cos.sum() / gt_cos_sim.sum())

        # Add the score to the global score for this metric
        self._current_score.append(score)

        return score

    def _e2_metric(self):
        """ Construct a similarity matrix of item buyed (User that have bought this item also bought
        item_i). Here the score is maximized if the quantity is low (<= threshold). We
        calculate the difference between the two matrix of item buyed as a score.

        More precisly we construct two matrix M1 and M2, one for the original dataset and one for
        the anonymised one. Both are of size `n x n` where `n` is the number of item. For M_ij
        represent the number of people who have bought the item i and have also bought the item j.

        This procede is called a collaborative filtering
        (https://en.wikipedia.org/wiki/Collaborative_filtering).

        :returns: score of the metric.

        """

        if True:
            return 0

        # Copy dataframe
        ground_truth = self._ground_truth.copy()
        anon_trans = self._anon_trans.copy()
        anon_trans = anon_trans.drop(
            anon_trans[anon_trans[self._gt_t_col['id_user']] == 'DEL'].index, axis=0
        ).reset_index(drop=True)

        #  TODO: can be done one time only  <21-12-18, yourname> #
        anon_trans[self._gt_t_col['qty']] = anon_trans[self._gt_t_col['qty']].apply(int)

        # Compute the cosinus similarity item x item
        gt_cos_sim = self._collaborative_filtering_item_user(ground_truth, e2=True)

        # Compute the cosinus similarity item x item
        at_cos_sim = self._collaborative_filtering_item_user(anon_trans, e2=True)

        # Compute score
        diff_cos = abs(gt_cos_sim - at_cos_sim)
        score = min(1, diff_cos.sum() / gt_cos_sim.sum())

        # Add the score to the global score for this metric
        self._current_score.append(score)

        return score

    def _e3_metric(self):
        """ Caluclate the difference (as in set difference) and similarity matrix between top-`k`
        items bought from ground truth and anonymised dataset.

        :returns: score of the metric.

        """

        # Copy dataframe
        ground_truth = self._ground_truth.copy()
        anon_trans = self._anon_trans.copy()
        anon_trans = anon_trans.drop(
            anon_trans[anon_trans[self._gt_t_col['id_user']] == 'DEL'].index, axis=0
        ).reset_index(drop=True)

        #  TODO: can be done one time only  <21-12-18, yourname> #
        anon_trans[self._gt_t_col['qty']] = anon_trans[self._gt_t_col['qty']].apply(int)

        # Computing top 5% of most purchased item by customer
        item_count = ground_truth.groupby(self._gt_t_col['id_item']).size()
        gt_top_k = list(
            item_count.sort_values(ascending=False).head(int(item_count.shape[0]*0.05)).index
        )

        # Ground Truth is now with top k items
        ground_truth = ground_truth.set_index(self._gt_t_col['id_item'])
        ground_truth = ground_truth.loc[gt_top_k]
        ground_truth = ground_truth.reset_index()

        # Compute the cosinus similarity item x item
        gt_cos_sim = self._collaborative_filtering_item_user(ground_truth)

        # Computing top 5% of most purchased item by customer
        item_count = anon_trans.groupby(self._gt_t_col['id_item']).size()
        at_top_k = list(
            item_count.sort_values(ascending=False).head(int(item_count.shape[0]*0.05)).index
        )

        # Anonymized File is now with top k items
        anon_trans = anon_trans.set_index(self._gt_t_col['id_item'])
        anon_trans = anon_trans.loc[at_top_k]
        anon_trans = anon_trans.reset_index()

        # Compute the cosinus similarity item x item
        at_cos_sim = self._collaborative_filtering_item_user(anon_trans)

        # Compute score
        diff_cos = min(1, (abs(gt_cos_sim - at_cos_sim).sum()) / gt_cos_sim.sum())
        # Jaccard distance
        diff_top_k = (
            len(set(gt_top_k).union(at_top_k)) - len(set(gt_top_k).intersection(set(at_top_k)))
        )/ len(set(gt_top_k).union(at_top_k))

        score = max(diff_cos, diff_top_k)

        # Add the score to the global score for this metric
        self._current_score.append(score)

        return score

    def _e4_metric(self):
        """ Calculate the mean distance in day between anonymised and ground truth
        transactions.

        :returns: score of the metric.

        """
        score = self._compare_date_gt_anon()
        self._current_score.append(score)

        return score

    def _e5_metric(self):
        """ Calculate the difference, as the ratio, of all item prices.

        :returns: score of the metric.

        """
        score = self._compare_price_gt_anon()
        self._current_score.append(score)

        return score

    def _e6_metric(self):
        """ Calculate the ratio between the number of lines removed in the anonymized table over the
        number of lines in the original dataset.

        :returns: score of the metric.

        """
        score = np.round(float(1 - self._anonymized.shape[0]/self._ground_truth.shape[0]), 4)
        self._current_score.append(score)

        return score

    @property
    def f(self):
        """
        Get the original file F.
        """
        return self._f_orig


    @property
    def users(self):
        """
        Get the users data
        """
        return self._users

    @property
    def ground_truth(self):
        """
        Get the ground_truth data
        """
        return self._ground_truth

    @property
    def anonymized(self):
        """
        Get the anonymized data
        """
        return self._anonymized

    @property
    def users_t_col(self):
        """
        Get the column used in the M csv
        """
        return self._users_t_col

    @property
    def gt_t_col(self):
        """
        Get the column used in the T csv
        """
        return self._gt_t_col

    @property
    def current_score(self):
        """
        Get the current score calculated by the metrics
        """
        return self._current_score

def metric_wrapper(metric, instance, numero):
    """Launch a metric in function of instance metric and the number of the later.

    :metric: a single char wich is 's' or 'e', respectivly for reid and utility metrics.
    :instance: the instance of a Metric class containing methods `metric`.
    :numero: the ieme method of the instance you want to call.

    :returns: Result of the metric method called.

    """
    method = "_{}{}_metric".format(metric, numero)
    return getattr(instance, method)()

def utility_metric(ground_truth, sub):
    """TODO: Docstring for utility_metric.

    :arg1: TODO
    :returns: TODO

    """
    # Initialize utility metrics
    metric = Metrics(ground_truth, sub)

    #Compute utility metrics as subprocesses
    metric_pool = Pool()
    utility_wrapper = partial(metric_wrapper, "e", metric)
    utility_scores = metric_pool.map(utility_wrapper, range(1, 7))

    #Compute reidentification metrics as subprocesses
    #metric_pool = Pool()
    #reid_wrapper = partial(metric_wrapper, "s", metric)
    #reid_scores = metric_pool.map(reid_wrapper, range(1, 8))

    return utility_scores
