In [4]:
import pandas as pd
import numpy as np
import copy as cp
import sys
import pickle
import hashlib as hs
import base64 as b64
import os
import random as rd
import datetime as dt
import threading
import time 
from difflib import SequenceMatcher
from darc_core.metrics import Metrics
from darc_core.preprocessing import round1_preprocessing
from darc_core.utils import check_format_trans_file

In [2]:
%%time
df = pd.read_csv("./ground_truth.csv", parse_dates=["date"])
df["id_user"]=df["id_user"].astype(str)
df["month"]=df["date"].dt.month
df["year"]=df["date"].dt.year

dx = pd.read_csv("./S_JTM_70.csv", parse_dates=["date"])
dx["month"]=dx["date"].dt.month
dx["year"]=dx["date"].dt.year

CPU times: user 1.11 s, sys: 140 ms, total: 1.25 s
Wall time: 1.27 s


In [3]:
def partition(df,y,m,u):
    return df[(df["id_user"]==u) 
              & (df["month"]==m) 
              & (df["year"]==y)]["id_item"]

def extractListForMonth(df,y,m): 
    users = df[(df["month"]==m) 
               & (df["year"]==y)]["id_user"].unique().tolist()
    months =  list(range(1,13))

    key = []
    hashed = []

    for u in users:
        key=[]
        partitioned = partition(df,y,m,u)
        key=partitioned.unique().tolist()
        hashed.append([u,key,len(key)])

    hashed=sorted(hashed, key = lambda x:x[2])
    return hashed

In [7]:
%%time
def generateLists(df):
    lists_all=[]
    lists_all.append(extractListForMonth(df,2010,12))
    for m in range(1,13):
        lists_all.append(extractListForMonth(df,2011,m))
    return lists_all

all_anon = generateLists(dx)        

CPU times: user 5min 30s, sys: 1.45 s, total: 5min 31s
Wall time: 27min 11s


In [8]:
all_clear=generateLists(df)

In [5]:
with open ('all_clear', 'rb') as ac:
    all_clear = pickle.load(ac)

In [11]:
%%time
def makeMatches(all_clear,all_anon):

    solution = {}
    solution[0] = match_hash_to_user(all_clear[0], all_anon[0], 0.9)
    print("Iteration 1/13 \n")
    
    for i in range(1,(len(all_anon))):
        solution[i] = match_hash_to_user(all_clear[i], all_anon[i], 0.9)
        print("Iteration ", (i+1) , "/13 \n")
        
    return solution

solution = makeMatches(all_clear,all_anon)

Iteration 1/13 

Iteration  2 /13 

Iteration  3 /13 

Iteration  4 /13 

Iteration  5 /13 

Iteration  6 /13 

Iteration  7 /13 

Iteration  8 /13 

Iteration  9 /13 

Iteration  10 /13 

Iteration  11 /13 

Iteration  12 /13 

Iteration  13 /13 

CPU times: user 15min, sys: 5.95 s, total: 15min 6s
Wall time: 17min 33s


In [13]:
%%time

def generateF(df):
    users=df["id_user"].unique().tolist()
    months=list(range(1,14))
    F = pd.DataFrame(index=users,columns=months)
    for col in F.columns:
        F[col].values[:] = "DEL"
    F_dict = F.T.to_dict('list')
    
    return F,F_dict

F,F_dict = generateF(df)

CPU times: user 310 ms, sys: 123 ms, total: 433 ms
Wall time: 434 ms


In [189]:
F_dict["13748"]

['DEL',
 'DEL',
 'DEL',
 'DEL',
 'DEL',
 'DEL',
 'DEL',
 'DEL',
 'DEL',
 'DEL',
 'DEL',
 'DEL',
 'DEL']

In [14]:
%%time
    def populateF(solution,F_dict):

        for month,match in solution.items():
            for m in match: 
                F_dict[str(m[0])][month]=m[1][0][0]


        F = pd.DataFrame.from_dict(F_dict,orient="index")

        return F

F = populateF(solution,F_dict)

CPU times: user 22.9 ms, sys: 2.75 ms, total: 25.7 ms
Wall time: 26.7 ms


In [192]:
F.to_csv("F.csv", index=True)
ex=pd.read_csv("F.csv", index_col="Unnamed: 0")
ex.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
17850,b707fbff42a17f9503f6e59e6c6f10953742325704645a...,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL
13047,f763c320ffb13f7d8de5a20fc71baf2da17617a8220cd8...,DEL,d6eb683af61a075c52e617d11c8311e47d00a337978d3a...,d9dc64f0072e06dbe2b0e3327d057ad9eea73e770066b1...,DEL,bbd1be10c10d4daa8c583197df16a51488ae61f7744362...,f64da5164902791dcbfa0c0a55f83a16b314919d994fd3...,DEL,1e58cc63b9869bf376a52f8982d9afa593d4438ab00c6f...,DEL,9a93225d9f198ed15982ecc0f8bab7624d1467e19344bc...,9d27e55c5dce0f0fa42f5641ead72309304d77b29913c6...,DEL
12583,e581ea29bf72e645a4eba92fd45ce876c2626967c95210...,e04630c03b03e20b7f707010a39d6e4ae68377a17a109a...,73e4cb6a68b315f51912cf3b8f255623eef5c66a6f9753...,3b26fee8740368e3542945230e7c130756ed6c75dc0844...,DEL,8081ae2bb424fbcbfbbecef6810590c3502f191ab294e1...,8236e758025608d258927ea293a8ee66ac1af2c2e48733...,82b3b805d906ade9eded79ddfcdd98b314c2f06abbbae9...,ae502404635d14c40e13fd5871f73bd67d68ec5b8b53f6...,27a1ac205d3ceff16c50a2ae442254967262cde8d65997...,59bbbb68ed2ca8ae0d398a2d71d2433b6286067e95993c...,1ce24bb6a6dddb89a81a9d3e55ea5843a233e26deb2efc...,088e1ca9e18318ee5be052b95e53327aaa49b29df2e1a8...
13748,64728c2373fc5390bf5cc5ab0c51a1cbb22ea5f52c4af6...,DEL,DEL,DEL,8e14ef6a2255666247f4b1506a14b2a7567291bdcf2640...,DEL,DEL,DEL,DEL,9d2dce0ac11d53b6f131e9edb733a8747eb3fa1ba784a8...,DEL,DEL,DEL
15100,c1b5aced20a9108f91c554f900f40f0eaba0b81f7bd85f...,hash1,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL


In [15]:
F_Real = pd.read_csv("F_noise.csv", index_col="Unnamed: 0")

In [16]:
months=list(range(1,14))
month_dict={ i : months[i] for i in range(0, len(months) ) }
F = F.rename(columns=month_dict)  

In [17]:
F_Real=F_Real.rename(columns=month_dict)

In [18]:
F.sort_index(ascending=True, inplace=True)
F_Real.sort_index(ascending=True, inplace=True)

In [19]:
array=F_Real.values==F.values

In [20]:
n=F[1].count()*13
nprime=n
for a in array:
    for boolean in a:
        if not boolean:
            nprime-=1
score_reid = nprime/n
print(score_reid)

0.9974829335265627


In [22]:
F_Real

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
12347,578969f71f0b8a83d2bf1a879603c933a21e39f0655151...,aa69bf31f85a8ad8df44a3fb97adab5f21dbdde91d4504...,DEL,DEL,5e0e95b8986b37aa7a69246b4d752322ff4931df33cc63...,DEL,487a727365aa76996e7ab4115bf565df5f17a91171b948...,DEL,e02b237702cc76ddbead3c9c5cdc672797acabc39a4e2e...,DEL,42efeaaaf958d6c199ab3d23fd8e303091008ff866552c...,DEL,3c57ee096a64ca050d6de789baa8c2f8f306a3167dd7cf...
12348,42b3896824200c55bc5422b289c2fe74c053279d8a7908...,9228ac08de29a21f53e370010efbfc7d38da9ca42ab548...,DEL,DEL,762ed61de08526f1b81604bac780da127f9d2a7677c545...,DEL,DEL,DEL,DEL,5b548347f268148a183a4c0009737bc844956619984d53...,DEL,DEL,DEL
12349,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,3d874e48af2b0240fd7667cdbed1fdfa52657e20a012d4...,DEL
12350,DEL,DEL,492d9af9308dfee5f8e0000866976f0aeabb335b10338f...,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL
12352,DEL,DEL,355a05799fc2709919f2f239a208cc0406980c78b12097...,c815d804f377240820ff5e9e3aa8fd4909cd0e17e698bd...,DEL,DEL,DEL,DEL,DEL,611794f7de27ce515ebe09a3a579a9f30c0cd58de81afe...,DEL,a55447a244a0085a4dd52e18cc04424d811f9331be2b5c...,DEL
12354,DEL,DEL,DEL,DEL,1759b36fdfda0a34d00bbe9ba375f8c7ae344669b72dc6...,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL
12355,DEL,DEL,DEL,DEL,DEL,26c8c97d8bc1988a5a0e30424614677bfdbe19de07bf8d...,DEL,DEL,DEL,DEL,DEL,DEL,DEL
12356,DEL,c9041c5fc649da6f9e562da742ba68440548c82d4f5e81...,DEL,DEL,da43b470bb0a19ba4f011695a3827316680534358fd60d...,DEL,DEL,DEL,DEL,DEL,DEL,f8e58bfa449c5bacb5ff79900bb07ecfa8511f96a05b99...,DEL
12357,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,DEL,7b9ed797dc0098028457a3a0315411bf093f620d2d1a85...,DEL
12358,DEL,DEL,DEL,DEL,DEL,DEL,DEL,a6b572fe4bdb82d10023f5b47331770be6bbd189abf372...,DEL,DEL,DEL,DEL,5d348dc60c68ff3b92303927853e5377daab4b2d6694c6...


In [9]:
def sort_score(hash, score, matching_score):
    new_score = matching_score[:]

    ## score > score1
    if (score > matching_score[0][1]):
        ## remplacement de hash3 et score3 par hash2 et score2
        new_score[2][0] = new_score[1][0]
        new_score[2][1] = new_score[1][1]
        ## remplacement de hash2 et score2 par hash1 et score1
        new_score[1][0] = new_score[0][0]
        new_score[1][1] = new_score[0][1]
        ## remplacement de hash1 et score1 par score et hash
        new_score[0][0] = hash
        new_score[0][1] = score

    ## score > score2
    elif (score > matching_score[1][1]):
        ## remplacement de hash3 et score3 par hash2 et score2
        new_score[2][0] = new_score[1][0]
        new_score[2][1] = new_score[1][1]
        ## remplacement de hash2 et score2 par hash et score
        new_score[1][0] = hash
        new_score[1][1] = score

    ## score > score3
    elif (score > matching_score[2][1]):
        ## remplacement de hash3 et score3 par hash et score
        new_score[2][0] = hash
        new_score[2][1] = score

    return new_score

def calcul_matching_score(clear_signature, anon_signature):
    nb_matching_item = 0
    copy_anon=anon_signature.copy()
    for item in clear_signature:
        i = 0
        while (i < len(copy_anon)):
            if (item == copy_anon[i]):
                nb_matching_item += 1
                copy_anon.pop(i)
                break
            i += 1

    return (2 * nb_matching_item) / (len(clear_signature) + len(anon_signature))


In [10]:
def match_hash_to_user(clear_signature, anon_signature, seuil):
    
    ## initialisation variables
    anon_size = len(anon_signature)
    resultat_matching = []

    ## parcours des differentes 
    ##signatures dans clear_signature
    for signature in clear_signature:
        matching_score = [["hash1", 0], 
                          ["hash2", 0], 
                          ["hash3", 0]]
        i = 0
        while ((i < anon_size) and 
               (matching_score[0][1] < seuil)):
            ## calcul du score de matching 
            #(seulement si le hash n'a pas deja ete atttribue avec certitude)
            score = calcul_matching_score(signature[1]
                                    , anon_signature[i][1])
            matching_score = sort_score(anon_signature[i][0]
                                    , score, matching_score)
            i += 1
        resultat_matching.append([signature[0], 
                                  matching_score])

    return resultat_matching
