In [21]:
import pandas as pd
import numpy as np
import copy as cp
import darc_core
import sys
import hashlib as hs
import base64 as b64
import os
import random as rd
import datetime as dt
from difflib import SequenceMatcher
from darc_core.metrics import Metrics
from darc_core.preprocessing import round1_preprocessing
from darc_core.utils import check_format_trans_file

In [23]:
%%time
# Load the two transaction files and derive calendar "month" / "year" columns
# from the parsed "date" column. `df` is later used as the clear side and
# `dx` as the anonymised side of the matching.
df = pd.read_csv("./ground_truth.csv", parse_dates=["date"])
df = df.assign(month=df["date"].dt.month, year=df["date"].dt.year)

dx = pd.read_csv("./atxtest.csv", parse_dates=["date"])
dx = dx.assign(month=dx["date"].dt.month, year=dx["date"].dt.year)

CPU times: user 1.16 s, sys: 165 ms, total: 1.33 s
Wall time: 1.58 s


In [30]:
def partition(df,y,m,u):
    """Select the items recorded for one user during one calendar month.

    Parameters:
        df: DataFrame with "id_user", "month", "year" and "id_item" columns.
        y: year to keep.
        m: month to keep.
        u: user identifier to keep.

    Returns:
        The "id_item" Series restricted to the rows matching all three
        filters (duplicates and the original row index are preserved).
    """
    same_user = df["id_user"] == u
    same_month = df["month"] == m
    same_year = df["year"] == y
    return df.loc[same_user & same_month & same_year, "id_item"]

def extractListForMonth(df,y,m):
    """Build the per-user item signatures for one calendar month.

    Parameters:
        df: DataFrame with "id_user", "id_item", "month" and "year" columns.
        y: year to extract.
        m: month to extract.

    Returns:
        A list of ``[id_user, unique_items, len(unique_items)]`` entries, one
        per user having at least one row in that month, sorted by increasing
        signature size. Users of equal size stay in order of first appearance
        and items are listed in order of first appearance.

    Notes:
        The month is filtered once and the rows are grouped by user in a
        single pass, instead of re-scanning the whole frame for every user.
        Rows whose "id_user" is missing are ignored by the grouping.
    """
    in_month = df[(df["month"] == m) & (df["year"] == y)]
    if in_month.empty:
        return []

    # sort=False keeps the users in order of first appearance, which is the
    # order Series.unique() would produce.
    items_by_user = in_month.groupby("id_user", sort=False)["id_item"].unique()

    hashed = [
        [user, user_items.tolist(), len(user_items)]
        for user, user_items in zip(items_by_user.index.tolist(), items_by_user)
    ]

    # list.sort is stable, so ties keep the first-appearance order.
    hashed.sort(key=lambda entry: entry[2])
    return hashed



In [32]:
%%time
def generateLists(df):
    """Collect the monthly signature lists for the 13 months of the study.

    Parameters:
        df: DataFrame accepted by ``extractListForMonth``.

    Returns:
        A list of 13 elements: the result of ``extractListForMonth`` for
        December 2010 followed by January to December 2011, in that order.
    """
    periods = [(2010, 12)] + [(2011, month) for month in range(1, 13)]
    return [extractListForMonth(df, year, month) for year, month in periods]

# Monthly signature lists for the clear data (df) and the anonymised data (dx).
# The saved output of this cell reports roughly 10 minutes of wall time.
all_clear = generateLists(df)
all_anon = generateLists(dx)        

CPU times: user 9min 21s, sys: 9.43 s, total: 9min 31s
Wall time: 10min 24s


In [44]:
len(all_anon)

13

In [45]:
%%time
def makeMatches(all_clear,all_anon,seuil=1):
    """Match clear users to anonymised hashes, month by month.

    Parameters:
        all_clear: list of monthly signature lists for the clear data.
        all_anon: list of monthly signature lists for the anonymised data;
            its length drives the number of iterations, and ``all_clear`` is
            indexed with the same positions.
        seuil: score threshold forwarded to ``match_hash_to_user``
            (defaults to 1, the value previously hard-coded here).

    Returns:
        A dict mapping each month position (0-based) to the list returned by
        ``match_hash_to_user`` for that month. Empty input yields ``{}``.
    """
    total = len(all_anon)
    solution = {}

    # A single loop handles every month, including the first one, so empty
    # input no longer fails and the progress counter runs from 1 to total.
    for i, anon_month in enumerate(all_anon):
        solution[i] = match_hash_to_user(all_clear[i], anon_month, seuil)
        print("\n Iteration ", i + 1, "/", total)

    return solution

# Run the month-by-month matching; `solution` is a dict keyed by month
# position (0 = first element of all_anon).
solution = makeMatches(all_clear,all_anon)


 Iteration 0 
 
 
 
 {0: [[13748, [['13748', 1.0], ['hash1', 0], ['hash2', 0]]], [15100, [['15100', 1.0], ['hash1', 0], ['hash2', 0]]], [14045, [['14045', 1.0], ['hash1', 0], ['hash2', 0]]], [15823, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [13145, [['13145', 1.0], ['hash1', 0], ['hash2', 0]]], [17949, [['17949', 1.0], ['hash1', 0], ['hash2', 0]]], [15769, [['15769', 1.0], ['hash1', 0], ['hash2', 0]]], [16353, [['16353', 1.0], ['hash1', 0], ['hash2', 0]]], [14409, [['14409', 1.0], ['hash1', 0], ['hash2', 0]]], [17720, [['17720', 1.0], ['hash1', 0], ['hash2', 0]]], [17870, [['17870', 1.0], ['hash1', 0], ['hash2', 0]]], [14866, [['14866', 1.0], ['hash1', 0], ['hash2', 0]]], [13848, [['13848', 1.0], ['hash1', 0], ['hash2', 0]]], [12755, [['12755', 1.0], ['hash1', 0], ['hash2', 0]]], [15380, [['15380', 1.0], ['hash1', 0], ['hash2', 0]]], [12989, [['12989', 1.0], ['hash1', 0], ['hash2', 0]]], [14608, [['17720', 1.0], ['hash1', 0], ['hash2', 0]]], [16711, [['16711', 1.0], ['hash1', 0], [


 Iteration  1  
 
 
 
 [[17949, [['17949', 1.0], ['hash1', 0], ['hash2', 0]]], [13368, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [14047, [['14047', 1.0], ['hash1', 0], ['hash2', 0]]], [13750, [['13750', 1.0], ['hash1', 0], ['hash2', 0]]], [15100, [['15100', 1.0], ['hash1', 0], ['hash2', 0]]], [17381, [['17381', 1.0], ['hash1', 0], ['hash2', 0]]], [16554, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [16500, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [16593, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [12989, [['12989', 1.0], ['hash1', 0], ['hash2', 0]]], [16210, [['15797', 1.0], ['hash1', 0], ['hash2', 0]]], [15797, [['15797', 1.0], ['hash1', 0], ['hash2', 0]]], [16882, [['16882', 1.0], ['hash1', 0], ['hash2', 0]]], [13094, [['13094', 1.0], ['hash1', 0], ['hash2', 0]]], [12386, [['12386', 0.6666666666666666], ['16473', 0.2], ['13198', 0.04081632653061224]]], [12643, [['12643', 1.0], ['hash1', 0], ['hash2', 0]]], [16042, [['16042', 0.6666666666666666], ['16034', 0.2], ['13534', 0.


 Iteration  2  
 
 
 
 [[13784, [['13784', 1.0], ['hash1', 0], ['hash2', 0]]], [15107, [['15107', 1.0], ['hash1', 0], ['hash2', 0]]], [12559, [['12559', 1.0], ['hash1', 0], ['hash2', 0]]], [14460, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [15885, [['15885', 1.0], ['hash1', 0], ['hash2', 0]]], [16353, [['16353', 1.0], ['hash1', 0], ['hash2', 0]]], [13983, [['13983', 1.0], ['hash1', 0], ['hash2', 0]]], [16998, [['16998', 1.0], ['hash1', 0], ['hash2', 0]]], [17924, [['17924', 1.0], ['hash1', 0], ['hash2', 0]]], [13094, [['13094', 1.0], ['hash1', 0], ['hash2', 0]]], [14477, [['14477', 1.0], ['hash1', 0], ['hash2', 0]]], [13414, [['13414', 1.0], ['hash1', 0], ['hash2', 0]]], [12823, [['12823', 1.0], ['hash1', 0], ['hash2', 0]]], [12553, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [17361, [['17361', 1.0], ['hash1', 0], ['hash2', 0]]], [18087, [['18087', 1.0], ['hash1', 0], ['hash2', 0]]], [17949, [['17949', 1.0], ['hash1', 0], ['hash2', 0]]], [17867, [['hash1', 0], ['hash2', 0], ['hash3


 Iteration  3  
 
 
 
 [[13094, [['13094', 1.0], ['hash1', 0], ['hash2', 0]]], [12980, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [14473, [['14473', 1.0], ['hash1', 0], ['hash2', 0]]], [12753, [['12753', 1.0], ['hash1', 0], ['hash2', 0]]], [15206, [['15206', 1.0], ['hash1', 0], ['hash2', 0]]], [13953, [['13953', 1.0], ['hash1', 0], ['hash2', 0]]], [17194, [['17194', 1.0], ['hash1', 0], ['hash2', 0]]], [14083, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [12931, [['12931', 1.0], ['hash1', 0], ['hash2', 0]]], [17080, [['17080', 1.0], ['hash1', 0], ['hash2', 0]]], [17816, [['17816', 1.0], ['hash1', 0], ['hash2', 0]]], [13983, [['13983', 1.0], ['hash1', 0], ['hash2', 0]]], [13261, [['13261', 1.0], ['hash1', 0], ['hash2', 0]]], [15301, [['13261', 1.0], ['hash1', 0], ['hash2', 0]]], [13631, [['13631', 1.0], ['hash1', 0], ['hash2', 0]]], [15299, [['15299', 1.0], ['hash1', 0], ['hash2', 0]]], [12823, [['12823', 1.0], ['hash1', 0], ['hash2', 0]]], [17929, [['17929', 1.0], ['hash1', 0], ['has


 Iteration  4  
 
 
 
 [[17580, [['17580', 1.0], ['hash1', 0], ['hash2', 0]]], [14680, [['14680', 1.0], ['hash1', 0], ['hash2', 0]]], [18179, [['18179', 1.0], ['hash1', 0], ['hash2', 0]]], [12678, [['12678', 1.0], ['hash1', 0], ['hash2', 0]]], [13984, [['13984', 1.0], ['hash1', 0], ['hash2', 0]]], [12669, [['12669', 1.0], ['hash1', 0], ['hash2', 0]]], [15061, [['15061', 1.0], ['hash1', 0], ['hash2', 0]]], [17045, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [17809, [['17809', 1.0], ['hash1', 0], ['hash2', 0]]], [17268, [['17268', 1.0], ['hash1', 0], ['hash2', 0]]], [18008, [['18008', 1.0], ['hash1', 0], ['hash2', 0]]], [13199, [['13199', 1.0], ['hash1', 0], ['hash2', 0]]], [16652, [['15304', 1.0], ['hash1', 0], ['hash2', 0]]], [13220, [['13220', 1.0], ['hash1', 0], ['hash2', 0]]], [15304, [['15304', 1.0], ['hash1', 0], ['hash2', 0]]], [16163, [['16163', 1.0], ['hash1', 0], ['hash2', 0]]], [13223, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [16356, [['16356', 1.0], ['hash1', 0], ['has


 Iteration  5  
 
 
 
 [[16634, [['16634', 1.0], ['hash1', 0], ['hash2', 0]]], [13026, [['13026', 1.0], ['hash1', 0], ['hash2', 0]]], [15304, [['15304', 1.0], ['hash1', 0], ['hash2', 0]]], [16714, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [15759, [['15759', 1.0], ['hash1', 0], ['hash2', 0]]], [17940, [['17940', 1.0], ['hash1', 0], ['hash2', 0]]], [12888, [['12888', 1.0], ['hash1', 0], ['hash2', 0]]], [15797, [['15797', 1.0], ['hash1', 0], ['hash2', 0]]], [13692, [['13692', 1.0], ['hash1', 0], ['hash2', 0]]], [15381, [['15381', 1.0], ['hash1', 0], ['hash2', 0]]], [17388, [['17388', 1.0], ['hash1', 0], ['hash2', 0]]], [13953, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [16660, [['16660', 1.0], ['hash1', 0], ['hash2', 0]]], [17190, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [14931, [['14931', 1.0], ['hash1', 0], ['hash2', 0]]], [17457, [['17457', 1.0], ['hash1', 0], ['hash2', 0]]], [16998, [['16998', 1.0], ['hash1', 0], ['hash2', 0]]], [13824, [['13824', 1.0], ['hash1', 0], ['hash2


 Iteration  6  
 
 
 
 [[15658, [['15658', 1.0], ['hash1', 0], ['hash2', 0]]], [12779, [['13157', 1.0], ['hash1', 0], ['hash2', 0]]], [16654, [['13955', 0.5], ['14905', 0.5], ['13842', 0.5]]], [15852, [['15852', 1.0], ['hash1', 0], ['hash2', 0]]], [13220, [['16076', 1.0], ['hash1', 0], ['hash2', 0]]], [15809, [['16076', 1.0], ['hash1', 0], ['hash2', 0]]], [16832, [['16832', 1.0], ['hash1', 0], ['hash2', 0]]], [13162, [['13162', 1.0], ['hash1', 0], ['hash2', 0]]], [12594, [['12594', 1.0], ['hash1', 0], ['hash2', 0]]], [13900, [['13900', 1.0], ['hash1', 0], ['hash2', 0]]], [15098, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [15864, [['15864', 1.0], ['hash1', 0], ['hash2', 0]]], [14285, [['14285', 1.0], ['hash1', 0], ['hash2', 0]]], [17865, [['15673', 0.6666666666666666], ['16325', 0.5], ['13940', 0.5]]], [17809, [['17809', 1.0], ['hash1', 0], ['hash2', 0]]], [13953, [['13953', 1.0], ['hash1', 0], ['hash2', 0]]], [15299, [['15299', 1.0], ['hash1', 0], ['hash2', 0]]], [14560, [['14560', 


 Iteration  7  
 
 
 
 [[14338, [['14338', 1.0], ['hash1', 0], ['hash2', 0]]], [16498, [['16498', 0.4], ['12915', 0.2222222222222222], ['16762', 0.16666666666666666]]], [12980, [['12980', 1.0], ['hash1', 0], ['hash2', 0]]], [16146, [['18072', 0.5], ['13265', 0.3333333333333333], ['17716', 0.25]]], [15512, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [17723, [['17723', 1.0], ['hash1', 0], ['hash2', 0]]], [13003, [['13003', 1.0], ['hash1', 0], ['hash2', 0]]], [16998, [['16998', 1.0], ['hash1', 0], ['hash2', 0]]], [13509, [['13509', 1.0], ['hash1', 0], ['hash2', 0]]], [13564, [['13564', 1.0], ['hash1', 0], ['hash2', 0]]], [15298, [['15298', 0.3333333333333333], ['15785', 0.2222222222222222], ['14352', 0.2222222222222222]]], [13750, [['16825', 1.0], ['hash1', 0], ['hash2', 0]]], [13079, [['13079', 1.0], ['hash1', 0], ['hash2', 0]]], [16668, [['16668', 0.5], ['18061', 0.4], ['15713', 0.18181818181818182]]], [16696, [['17723', 1.0], ['hash1', 0], ['hash2', 0]]], [13104, [['16825', 1.0], ['h


 Iteration  8  
 
 
 
 [[15581, [['15581', 1.0], ['hash1', 0], ['hash2', 0]]], [14403, [['14403', 1.0], ['hash1', 0], ['hash2', 0]]], [15187, [['15187', 1.0], ['hash1', 0], ['hash2', 0]]], [14315, [['14315', 1.0], ['hash1', 0], ['hash2', 0]]], [17848, [['17848', 1.0], ['hash1', 0], ['hash2', 0]]], [12584, [['12584', 1.0], ['hash1', 0], ['hash2', 0]]], [12823, [['12823', 1.0], ['hash1', 0], ['hash2', 0]]], [17723, [['17723', 1.0], ['hash1', 0], ['hash2', 0]]], [13145, [['13145', 1.0], ['hash1', 0], ['hash2', 0]]], [14083, [['14083', 1.0], ['hash1', 0], ['hash2', 0]]], [13217, [['13217', 1.0], ['hash1', 0], ['hash2', 0]]], [17252, [['17252', 1.0], ['hash1', 0], ['hash2', 0]]], [16927, [['16927', 1.0], ['hash1', 0], ['hash2', 0]]], [14305, [['14305', 1.0], ['hash1', 0], ['hash2', 0]]], [15797, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [17929, [['17929', 1.0], ['hash1', 0], ['hash2', 0]]], [14765, [['14765', 1.0], ['hash1', 0], ['hash2', 0]]], [12560, [['12560', 1.0], ['hash1', 0], ['h


 Iteration  9  
 
 
 
 [[13631, [['13631', 1.0], ['hash1', 0], ['hash2', 0]]], [16989, [['16989', 1.0], ['hash1', 0], ['hash2', 0]]], [18087, [['18087', 1.0], ['hash1', 0], ['hash2', 0]]], [14560, [['14560', 1.0], ['hash1', 0], ['hash2', 0]]], [13784, [['13784', 1.0], ['hash1', 0], ['hash2', 0]]], [13122, [['13122', 1.0], ['hash1', 0], ['hash2', 0]]], [12461, [['12461', 1.0], ['hash1', 0], ['hash2', 0]]], [16828, [['12461', 1.0], ['hash1', 0], ['hash2', 0]]], [13848, [['13848', 1.0], ['hash1', 0], ['hash2', 0]]], [15400, [['15400', 1.0], ['hash1', 0], ['hash2', 0]]], [16027, [['16027', 1.0], ['hash1', 0], ['hash2', 0]]], [16723, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [16475, [['16475', 1.0], ['hash1', 0], ['hash2', 0]]], [14500, [['14500', 1.0], ['hash1', 0], ['hash2', 0]]], [12901, [['12901', 1.0], ['hash1', 0], ['hash2', 0]]], [16235, [['12461', 1.0], ['hash1', 0], ['hash2', 0]]], [17848, [['17848', 1.0], ['hash1', 0], ['hash2', 0]]], [12823, [['12823', 1.0], ['hash1', 0], ['h


 Iteration  10  
 
 
 
 [[15502, [['15502', 1.0], ['hash1', 0], ['hash2', 0]]], [12457, [['12457', 1.0], ['hash1', 0], ['hash2', 0]]], [12843, [['12843', 1.0], ['hash1', 0], ['hash2', 0]]], [16359, [['16359', 1.0], ['hash1', 0], ['hash2', 0]]], [16989, [['16989', 1.0], ['hash1', 0], ['hash2', 0]]], [16354, [['16354', 1.0], ['hash1', 0], ['hash2', 0]]], [17552, [['17552', 1.0], ['hash1', 0], ['hash2', 0]]], [12381, [['12381', 1.0], ['hash1', 0], ['hash2', 0]]], [17892, [['17892', 1.0], ['hash1', 0], ['hash2', 0]]], [13953, [['13953', 1.0], ['hash1', 0], ['hash2', 0]]], [15187, [['15187', 1.0], ['hash1', 0], ['hash2', 0]]], [17940, [['17940', 1.0], ['hash1', 0], ['hash2', 0]]], [16966, [['16966', 1.0], ['hash1', 0], ['hash2', 0]]], [12798, [['12798', 1.0], ['hash1', 0], ['hash2', 0]]], [17414, [['17414', 1.0], ['hash1', 0], ['hash2', 0]]], [17736, [['17736', 1.0], ['hash1', 0], ['hash2', 0]]], [12657, [['12657', 1.0], ['hash1', 0], ['hash2', 0]]], [13145, [['13145', 1.0], ['hash1', 0], 


 Iteration  11  
 
 
 
 [[12552, [['12691', 1.0], ['hash1', 0], ['hash2', 0]]], [12594, [['12431', 1.0], ['hash1', 0], ['hash2', 0]]], [12431, [['12431', 1.0], ['hash1', 0], ['hash2', 0]]], [13651, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [13759, [['13759', 1.0], ['hash1', 0], ['hash2', 0]]], [13047, [['13047', 1.0], ['hash1', 0], ['hash2', 0]]], [16041, [['16041', 1.0], ['hash1', 0], ['hash2', 0]]], [13983, [['12431', 1.0], ['hash1', 0], ['hash2', 0]]], [12473, [['13047', 1.0], ['hash1', 0], ['hash2', 0]]], [13892, [['13892', 1.0], ['hash1', 0], ['hash2', 0]]], [14522, [['14522', 1.0], ['hash1', 0], ['hash2', 0]]], [14507, [['14507', 1.0], ['hash1', 0], ['hash2', 0]]], [17330, [['17330', 1.0], ['hash1', 0], ['hash2', 0]]], [13079, [['14636', 0.18181818181818182], ['18041', 0.030303030303030304], ['14446', 0.02564102564102564]]], [17425, [['17522', 1.0], ['hash1', 0], ['hash2', 0]]], [15132, [['15473', 0.6666666666666666], ['12993', 0.5], ['12388', 0.3333333333333333]]], [17949, [


 Iteration  12  
 
 
 
 [[12901, [['12901', 0.3333333333333333], ['14544', 0.2], ['17387', 0.16666666666666666]]], [17321, [['16676', 0.6666666666666666], ['14219', 0.5], ['15208', 0.5]]], [14126, [['15573', 0.14285714285714285], ['15351', 0.125], ['17377', 0.10526315789473684]]], [16353, [['16353', 1.0], ['hash1', 0], ['hash2', 0]]], [18144, [['14825', 1.0], ['hash1', 0], ['hash2', 0]]], [16103, [['12476', 0.2222222222222222], ['hash1', 0], ['hash2', 0]]], [14560, [['17961', 1.0], ['hash1', 0], ['hash2', 0]]], [13868, [['15394', 1.0], ['hash1', 0], ['hash2', 0]]], [16989, [['13743', 0.6666666666666666], ['17554', 0.4], ['13199', 0.2857142857142857]]], [17404, [['12384', 1.0], ['hash1', 0], ['hash2', 0]]], [13079, [['13079', 0.6666666666666666], ['12347', 0.4], ['17914', 0.125]]], [14110, [['15694', 0.4], ['17157', 0.4], ['16255', 0.4]]], [16271, [['hash1', 0], ['hash2', 0], ['hash3', 0]]], [17381, [['16478', 0.6666666666666666], ['16656', 0.5], ['12830', 0.5]]], [16513, [['17402', 0.

In [93]:
%%time

def generateF(df, n_months=13):
    """Create the guess table with every cell initialised to "DEL".

    Parameters:
        df: DataFrame with an "id_user" column.
        n_months: number of month columns to create (defaults to 13, the
            value previously hard-coded here).

    Returns:
        A tuple ``(F, F_dict)`` where ``F`` is a DataFrame indexed by the
        unique users (order of first appearance) with columns
        ``1..n_months``, and ``F_dict`` maps each user to the list of that
        user's row values (``n_months`` times "DEL").
    """
    users = df["id_user"].unique().tolist()
    months = list(range(1, n_months + 1))

    # Build the frame already filled: writing through `F[col].values[:]`
    # depends on .values being a writable view of the frame, which pandas
    # does not guarantee (Copy-on-Write).
    F = pd.DataFrame("DEL", index=users, columns=months)
    F_dict = F.T.to_dict('list')

    return F, F_dict

# Build the all-"DEL" guess table for the users of the clear data.
# (The function defined in this cell is generateF; `generate` is not defined.)
F,F_dict = generateF(df)

CPU times: user 241 ms, sys: 3.32 ms, total: 244 ms
Wall time: 247 ms


In [94]:
%%time
def populateF(solution,F_dict):
    """Write the best guess of every (user, month) into the guess table.

    Parameters:
        solution: dict mapping a month position to a list of
            ``[id_user, [[hash, score], [hash, score], [hash, score]]]``
            entries, best candidate first.
        F_dict: dict mapping each user to a list of per-month guesses; it is
            updated in place.

    Returns:
        A DataFrame built from ``F_dict`` with one row per user.

    Notes:
        A best candidate with a score of 0 is the initial placeholder
        ("hash1", ...), not a real hash, so the existing value of the cell
        is kept instead of being overwritten with the placeholder label.
    """
    for month, matches in solution.items():
        for entry in matches:
            best_hash, best_score = entry[1][0][0], entry[1][0][1]
            if best_score > 0:
                F_dict[entry[0]][month] = best_hash

    return pd.DataFrame.from_dict(F_dict, orient="index")

# Fill the guess table from the matching results; this rebinds F to the
# DataFrame built from the updated F_dict.
F = populateF(solution,F_dict)

CPU times: user 12.7 ms, sys: 1.02 ms, total: 13.8 ms
Wall time: 14.9 ms


In [124]:
# Write the guess table to disk (row labels included), then read it back to
# check the file: the unnamed first CSV column holds the user ids and is used
# as the index.
F.to_csv("F_example.csv", index=True)
ex=pd.read_csv("F_example.csv", index_col="Unnamed: 0")
# Display the first rows of the re-loaded file.
ex.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
17850,17850,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL
13047,13047,DEL,13047,13047,DEL,13047,13047,DEL,13047,DEL,13047,13047,DEL
12583,12583,12583,12583,12583,DEL,12583,12583,12583,12583,12583,12583,12583,12583
13748,13748,DEL,DEL,DEL,13748,DEL,DEL,DEL,DEL,13748,DEL,DEL,DEL
15100,15100,15100,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL


In [7]:
def sort_score(hash, score, matching_score):
    """Insert a scored hash into a ranking of best candidates.

    Parameters:
        hash: identifier of the hash whose score was computed.
        score: matching score of that hash.
        matching_score: current ranking, best first, as
            ``[[hash1, score1], [hash2, score2], [hash3, score3]]``.

    Returns:
        A new ranking of the same length. The candidate is inserted before
        the first entry whose score is strictly lower and the last entry is
        dropped; if no entry has a strictly lower score the ranking is
        returned unchanged (ties keep the earlier candidate).

    Notes:
        ``matching_score`` and its inner lists are never modified: every
        ``[hash, score]`` pair is copied before the ranking is updated.
    """
    # Copy each pair, not just the outer list, so the caller's data is safe.
    new_score = [list(entry) for entry in matching_score]

    for position in range(len(new_score)):
        if score > new_score[position][1]:
            new_score.insert(position, [hash, score])
            new_score.pop()  # keep the ranking at its original length
            break

    return new_score


def calcul_matching_score(clear_signature, anon_signature):
    """Compute the matching score between two signatures.

    The score is ``2 * matches / (len(clear) + len(anon))`` where ``matches``
    is the number of elements of ``clear_signature`` that also occur in
    ``anon_signature``.

    Parameters:
        clear_signature: list of items of a user from the clear data.
        anon_signature: list of items of a hash from the anonymised data.

    Returns:
        The score as a float; it lies between 0 and 1 when neither signature
        contains duplicates. Two empty signatures score 0.0 instead of
        raising ZeroDivisionError.
    """
    total_size = len(clear_signature) + len(anon_signature)
    if total_size == 0:
        return 0.0

    try:
        # Set lookup avoids re-scanning anon_signature for every clear item.
        anon_lookup = set(anon_signature)
        nb_matching_item = sum(1 for item in clear_signature if item in anon_lookup)
    except TypeError:
        # Unhashable items: fall back to a linear membership test.
        nb_matching_item = sum(1 for item in clear_signature if item in anon_signature)

    return (2 * nb_matching_item) / total_size


In [8]:


def match_hash_to_user(clear_signature, anon_signature, seuil):
    """Find, for every clear user, the three best-matching anonymised hashes.

    Parameters:
        clear_signature: ``[[id_user, signature, size], ...]`` for the clear
            data.
        anon_signature: ``[[hash, signature, size], ...]`` for the anonymised
            data.
        seuil: score threshold; the scan of ``anon_signature`` for a user
            stops as soon as the best score found reaches this value.

    Returns:
        A list with one ``[id_user, [[hash, score], [hash, score],
        [hash, score]]]`` entry per clear user, in input order, candidates
        ranked best first. Ranks that were never filled keep the
        placeholders "hash1", "hash2", "hash3" with a score of 0.

    Notes:
        Both inputs are produced sorted by increasing signature size, but
        this function does not rely on that ordering. Nothing prevents the
        same hash from being returned for several users.
    """
    results = []

    for clear_entry in clear_signature:
        best = [["hash1", 0], ["hash2", 0], ["hash3", 0]]

        for anon_entry in anon_signature:
            # Stop scanning once the top candidate is good enough.
            if best[0][1] >= seuil:
                break
            score = calcul_matching_score(clear_entry[1], anon_entry[1])
            best = sort_score(anon_entry[0], score, best)

        results.append([clear_entry[0], best])

    return results
