In [None]:
!pip install ua_parser
!pip install python-Levenshtein
!pip install tensorflow-addons

In [None]:
# When True, the embeddings cell recomputes the `embeddings_webgl` column from the
# raw `webgl_trace_*` columns; when False that cell is skipped and the column read
# from the dataset file is used as-is.
SHOULD_COMPUTE_EMBEDDINGS = False
# Demo mode: skips check_id_consistence and the per-id filtering, and selects the
# demo train/test split dates in the execution cells.
IS_DEMO = True

In [None]:
###############################################################
###############################################################
############# IMPORTS #########################################
###############################################################

In [None]:
from ua_parser import user_agent_parser
import re
import math as mt
import hashlib
import pandas as pd
import itertools
import numpy as np
import datetime
from Levenshtein import ratio as levenshtein_ratio
from functools import reduce
from sklearn.ensemble import RandomForestClassifier
import joblib 
import uuid
from joblib import Parallel, delayed
import json
from statistics import mean
from numpy import dot
from numpy.linalg import norm

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import layers, losses
import pickle
import tensorflow_addons as tfa

In [None]:
###############################################################
###############################################################
############# MAIN CODE #######################################
###############################################################

In [None]:
class Fingerprint:
    """Namespace of string constants naming the fingerprint attributes.

    The constants are used throughout the notebook as DataFrame column names
    and dictionary keys, so their values must not change.
    """

    # --- Bookkeeping -------------------------------------------------------
    ID = "id"
    COUNTER = "counter"
    CREATION_TIME = "creationdate"
    END_TIME = "enddate"

    # --- HTTP attributes ---------------------------------------------------
    ACCEPT_HTTP = "accepthttp"
    LANGUAGE_HTTP = "languagehttp"
    USER_AGENT_HTTP = "useragenthttp"
    ADDRESS_HTTP = "addresshttp"
    CONNECTION_HTTP = "connectionhttp"
    ENCODING_HTTP = "encodinghttp"
    HOST_HTTP = "hosthttp"

    # --- Attributes derived from the parsed user agent ---------------------
    BROWSER_FAMILY = "browserFamily"
    MINOR_BROWSER_VERSION = "minorBrowserVersion"
    MAJOR_BROWSER_VERSION = "majorBrowserVersion"
    GLOBAL_BROWSER_VERSION = "globalBrowserVersion"
    OS = "os"

    # --- JavaScript attributes ---------------------------------------------
    COOKIES_JS = "cookiesjs"
    RESOLUTION_JS = "resolutionjs"
    TIMEZONE_JS = "timezonejs"
    PLUGINS_JS = "pluginsjs"
    PLUGINS_JS_HASHED = "pluginsjshashed"
    SESSION_JS = "sessionjs"
    DNT_JS = "dntjs"
    IE_DATA_JS = "iedatajs"
    CANVAS_JS_HASHED = "canvasjshashed"
    LOCAL_JS = "localjs"
    PLATFORM_JS = "platformjs"
    AD_BLOCK = "adblock"
    RENDERER = "rendererwebgljs"
    VENDOR = "vendorwebgljs"

    # --- Consistency flags -------------------------------------------------
    PLATFORM_INCONSISTENCY = "platformInconsistency"
    LANGUAGE_INCONSISTENCY = "languageInconsistency"
    

In [None]:
# Probability threshold that generate_replay_scenario hands to infer_fingerprint.
LAMBDA = 0.992 
# When True, WEBGL_ATTRIBUTES are appended to REQUIRED_ATTRIBUTES.
WEBGL = True
# Visit frequency, in days, used for the replay scenario and for scaling the plot.
NB_DAYS = 7

In [None]:


# Attributes hashed together (with a few HTTP headers) into `general_hash`.
JAVASCRIPT_ATTRIBUTES = [Fingerprint.COOKIES_JS, Fingerprint.RESOLUTION_JS, Fingerprint.TIMEZONE_JS,
                             Fingerprint.PLUGINS_JS, Fingerprint.SESSION_JS, Fingerprint.DNT_JS, 
                             Fingerprint.CANVAS_JS_HASHED, Fingerprint.LOCAL_JS, Fingerprint.PLATFORM_JS,
                             Fingerprint.PLATFORM_INCONSISTENCY, Fingerprint.IE_DATA_JS,
                             Fingerprint.PLUGINS_JS_HASHED, Fingerprint.VENDOR, Fingerprint.RENDERER]

# Columns that dataset_to_dataframe keeps from the raw dataset (extended below).
REQUIRED_ATTRIBUTES = [Fingerprint.COUNTER, Fingerprint.COOKIES_JS, Fingerprint.RESOLUTION_JS, Fingerprint.TIMEZONE_JS,
                       Fingerprint.PLUGINS_JS, Fingerprint.SESSION_JS, Fingerprint.DNT_JS, Fingerprint.ACCEPT_HTTP,
                       Fingerprint.CANVAS_JS_HASHED, Fingerprint.LOCAL_JS, Fingerprint.PLATFORM_JS, Fingerprint.ENCODING_HTTP,
                       Fingerprint.IE_DATA_JS, Fingerprint.PLUGINS_JS_HASHED, Fingerprint.VENDOR, Fingerprint.RENDERER,
                      Fingerprint.END_TIME]

# Attributes that must be identical for two fingerprints to be linkable
# (see check_attributes_consistency).
RIGID_ATTRIBUTES = [Fingerprint.LOCAL_JS, 
                    Fingerprint.DNT_JS, 
                    Fingerprint.COOKIES_JS]

# Attributes read by compute_similarity_vector when building the feature vector.
MODEL_ATTRIBUTES = [
    Fingerprint.CANVAS_JS_HASHED,
    Fingerprint.CREATION_TIME,
    Fingerprint.ID,
    Fingerprint.LANGUAGE_HTTP,
    Fingerprint.PLUGINS_JS,
    Fingerprint.RENDERER,
    Fingerprint.RESOLUTION_JS,
    Fingerprint.TIMEZONE_JS,
    Fingerprint.USER_AGENT_HTTP,
]

WEBGL_ATTRIBUTES = ["embeddings_webgl"]
# Singular alias: other code in this notebook refers to `WEBGL_ATTRIBUTE`, which was
# never defined and raised NameError as soon as the WebGL-in-model options were enabled.
WEBGL_ATTRIBUTE = WEBGL_ATTRIBUTES[0]

REQUIRED_ATTRIBUTES += list(set(MODEL_ATTRIBUTES + RIGID_ATTRIBUTES + WEBGL_ATTRIBUTES)) if WEBGL else list(set(MODEL_ATTRIBUTES + RIGID_ATTRIBUTES))

def md5_encode(row, *attrs):
    """Hash the string form of selected attributes of `row` with MD5.

    Args:
        row: Mapping-like object indexable by attribute name.
        *attrs: Attribute names, or lists of attribute names, taken in order.

    Returns:
        Hexadecimal MD5 digest of the concatenated `str()` values.
    """
    pieces = []
    for attr in attrs:
        names = attr if isinstance(attr, list) else [attr]
        pieces.extend(str(row[name]) for name in names)

    return hashlib.md5("".join(pieces).encode()).hexdigest()

# (platform reported by JavaScript, platform derived from the user agent) pairs that
# differ textually but are treated as consistent. Both are 3-letter lowercase prefixes.
_COMPATIBLE_PLATFORM_PAIRS = frozenset([
    ("lin", "and"),
    ("lin", "ubu"),
    ("x64", "win"),
    ("ipa", "ios"),
    ("iph", "ios"),
    ("ipo", "ios"),
])


def check_platform_inconsistencies(row):
    """Tell whether the JS-reported platform contradicts the user-agent OS.

    The first three lowercase characters of the OS (from the parsed user agent)
    and of the JavaScript platform are compared, after a few plugin-extension
    checks and a list of known-compatible combinations.

    The original membership tests were written as `"x" in text > -1`, which
    Python evaluates as a chained comparison and which raised TypeError whenever
    the substring was present. Combined with the catch-all handler this turned
    the "SIM" and "Touch" exemptions into "inconsistent" results; the tests are
    now plain `in` checks. Unreachable duplicate branches were removed.

    Args:
        row: Mapping-like fingerprint (DataFrame row or dict).

    Returns:
        True when the platform information is inconsistent (or could not be
        evaluated), False otherwise.

    Raises:
        ValueError: if the timezone attribute equals "no JS".
    """
    if row[Fingerprint.TIMEZONE_JS] == "no JS":
        raise ValueError("Javascript is not activated")

    try:
        platform_user_agent = row[Fingerprint.OS][0:3].lower()
        platform_js = row[Fingerprint.PLATFORM_JS][0:3].lower()

        # Plugin file extensions that do not belong on the claimed OS.
        if platform_user_agent in ("lin", "ubu", "ios", "and") and ".dll" in row[Fingerprint.PLUGINS_JS]:
            return True
        if platform_user_agent.startswith("ip") and "flash" in row[Fingerprint.PLUGINS_JS].lower():
            return True
        if platform_user_agent in ("win", "mac", "ios") and ".so" in row[Fingerprint.PLUGINS_JS]:
            return True
        if platform_user_agent in ("ubu", "win", "lin") and ".plugin" in row[Fingerprint.PLUGINS_JS]:
            return True

        # Two empty platforms compare equal but carry no information: flag them.
        if platform_js == "" and platform_user_agent == "":
            return True

        if (platform_js, platform_user_agent) in _COMPATIBLE_PLATFORM_PAIRS:
            return False
        if row[Fingerprint.OS] == "Windows Phone" and platform_js == "arm":
            return False
        if platform_js == "arm" and "SIM" in row[Fingerprint.USER_AGENT_HTTP]:
            return False
        if platform_user_agent == "chr" and platform_js == "lin":
            return False
        if "Touch" in row[Fingerprint.USER_AGENT_HTTP] and platform_js == "arm":
            return False
        if platform_user_agent == "oth":
            return False

        return platform_js != platform_user_agent
    except Exception:
        # Best effort, as before: rows with missing or non-string values are
        # reported as inconsistent instead of aborting the whole DataFrame.apply.
        return True
            

def check_id_consistence(fps_df):
    """Return the ids whose fingerprints look consistent over time.

    Each row adds to (or overwrites) a per-id inconsistency score; an id is kept
    when score / number-of-rows is below 0.02 and it is not flagged by the canvas
    check at the end.

    Args:
        fps_df: DataFrame with at least the id, creationdate, browser family, OS,
            platform-inconsistency and hashed-canvas columns read below.

    Returns:
        List of ids considered consistent.
    """
    
    # Various attributes checks
    uid_inconsistencies = dict()
    uid_fps_counter = dict()
    for index, fp in fps_df.iterrows():
        print("Handling index {}".format(index), end="\r")
        uid = fp.id
        try:
            # All fingerprints of this id up to and including the current one.
            fps_user = fps_df[(fps_df.id == uid) & (fps_df.creationdate <= fp.creationdate)]

            browsers = set(fps_user[Fingerprint.BROWSER_FAMILY].values)
            oses = set(fps_user[Fingerprint.OS].values)
            # The large constants below act as disqualifying scores: divided by any
            # realistic row count they stay far above the 0.02 threshold.
            if len(browsers) > 1 or len(oses) > 1:
                uid_inconsistencies[uid] = 100000000
            
            if fp[Fingerprint.OS] in ("Android", "iOS", "Windows Phone", "Firefox OS", "Windows 95"):
                uid_inconsistencies[uid] = 10000000000

            if fp[Fingerprint.BROWSER_FAMILY] in ("Safari", "IE", "Edge", "Googlebot"):
                uid_inconsistencies[uid] = 10000000
            
            # A platform inconsistency adds 5 to whatever score the id already has.
            if fp[Fingerprint.PLATFORM_INCONSISTENCY]:
                if uid in uid_inconsistencies.keys():
                    uid_inconsistencies[uid] += 5
                else:
                    uid_inconsistencies[uid] = 5

            if not uid in uid_inconsistencies.keys():
                uid_inconsistencies[uid] = 0
            
            # Count only rows that were processed without raising.
            if uid in uid_fps_counter.keys():
                uid_fps_counter[uid] += 1
            else:
                uid_fps_counter[uid] = 1
                
        except Exception as e:
            # Any failure on a row is printed and overwrites the id's score.
            print(e)
            uid_inconsistencies[uid] = 1000000
    consistent_uids = []
    for uid, counter in uid_fps_counter.items():
        score = float(uid_inconsistencies[uid]) / float(counter)
        if score < 0.02:
            consistent_uids.append(uid)
    print("Nb consistent", len(consistent_uids))
    # Canvas check
    g = fps_df[[Fingerprint.ID, Fingerprint.CANVAS_JS_HASHED]].groupby("id")
    aggregated_canvas = g.agg(["nunique", "count"])
    # "rapport" = ratio of distinct canvas hashes to number of canvas values per id.
    aggregated_canvas["rapport"] = aggregated_canvas[Fingerprint.CANVAS_JS_HASHED]["nunique"]/aggregated_canvas[Fingerprint.CANVAS_JS_HASHED]["count"]
    
    # Ids with more than 5 canvas values of which over 35% are distinct are dropped.
    poisoner_ids = aggregated_canvas[(aggregated_canvas["rapport"] > 0.35) & (aggregated_canvas[Fingerprint.CANVAS_JS_HASHED]["count"] > 5)]
    consistent_uids = list(filter(lambda x: x not in poisoner_ids.index.values, consistent_uids))
    
    return consistent_uids
         
def dataset_to_dataframe(fingerprint_dataset):
    """Build the working DataFrame from the raw fingerprint dataset.

    Keeps the (lower-cased) REQUIRED_ATTRIBUTES columns, then adds the parsed
    user agent, browser family / versions, OS, the platform-inconsistency flag
    and the two MD5 hashes (`constant_hash`, `general_hash`).

    Args:
        fingerprint_dataset: Anything accepted by `pd.DataFrame(...)` that
            contains the required columns.

    Returns:
        The enriched DataFrame.
    """
    wanted_columns = list({name.lower() for name in REQUIRED_ATTRIBUTES})
    df = pd.DataFrame(fingerprint_dataset)[wanted_columns]

    # User-agent derived columns.
    df["parsed_ua"] = df[Fingerprint.USER_AGENT_HTTP].apply(user_agent_parser.Parse)
    df[Fingerprint.BROWSER_FAMILY] = df.parsed_ua.apply(lambda parsed: parsed["user_agent"]["family"])
    df[Fingerprint.MINOR_BROWSER_VERSION] = df.parsed_ua.apply(lambda parsed: parsed["user_agent"]["minor"])
    df[Fingerprint.MAJOR_BROWSER_VERSION] = df.parsed_ua.apply(lambda parsed: parsed["user_agent"]["major"])
    major_version = df[Fingerprint.MAJOR_BROWSER_VERSION]
    minor_version = df[Fingerprint.MINOR_BROWSER_VERSION]
    df[Fingerprint.GLOBAL_BROWSER_VERSION] = major_version + minor_version
    df[Fingerprint.OS] = df.parsed_ua.apply(lambda parsed: parsed["os"]["family"])

    df[Fingerprint.PLATFORM_INCONSISTENCY] = df.apply(check_platform_inconsistencies, axis=1)

    # Hash of the attributes expected to stay constant for a browser instance.
    constant_hash_attributes = (Fingerprint.OS, Fingerprint.PLATFORM_JS, Fingerprint.BROWSER_FAMILY)
    df["constant_hash"] = df.apply(md5_encode, axis=1, args=constant_hash_attributes)

    # Hash of the whole fingerprint, used for exact matching.
    general_hash_attributes = (
        JAVASCRIPT_ATTRIBUTES,
        Fingerprint.USER_AGENT_HTTP,
        Fingerprint.LANGUAGE_HTTP,
        Fingerprint.ACCEPT_HTTP,
        Fingerprint.ENCODING_HTTP,
    )
    df["general_hash"] = df.apply(md5_encode, axis=1, args=general_hash_attributes)

    return df
            
def _left_closed_date_range(start, end, freq):
    """Return `pd.date_range(start, end, freq)` including `start` and excluding `end`.

    pandas 1.4 introduced `inclusive=` and pandas 2.0 removed the older `closed=`
    keyword, so try the new spelling first and fall back for old installations.
    """
    try:
        return pd.date_range(start, end, freq=freq, inclusive="left")
    except TypeError:
        return pd.date_range(start, end, freq=freq, closed="left")


def generate_replay_sequence_pd(fps_df, visit_frequency):
    """Simulate visits every `visit_frequency` days and return them in time order.

    For every id with more than one fingerprint, the last fingerprint (by
    creation date) is dropped; each remaining fingerprint with a non-null
    `enddate` contributes one simulated visit per `visit_frequency` days up to
    (but excluding) its end date.

    Args:
        fps_df: DataFrame with `id`, `counter`, `creationdate` and `enddate`.
        visit_frequency: Number of days between two simulated visits.

    Returns:
        List of `(counter_str, timestamp)` tuples sorted by timestamp, where
        `counter_str` is "<counter>_i" for the first visit of an id and
        "<counter>_<k>" for the k-th simulated visit of a fingerprint.
    """
    uids = fps_df.id.unique()
    uid_to_sequence = dict()
    # Loop invariants.
    frequency_str = "{}D".format(visit_frequency)
    visit_step = datetime.timedelta(days=visit_frequency)

    for progress, uid in enumerate(uids, start=1):
        list_fps = fps_df[fps_df.id == uid]
        if len(list_fps) > 1:
            list_fps = list_fps.sort_values("creationdate", ascending=True).iloc[:-1]

            first_fp = list_fps.iloc[0]
            last_visit = first_fp.creationdate
            dates = [last_visit]
            sequence = [("{}_i".format(first_fp.counter), last_visit)]

            for _, fp in list_fps.iterrows():
                if pd.isnull(fp["enddate"]):
                    continue
                # Resume one step after the last simulated visit; when the previous
                # range was empty, `last_visit` keeps its earlier value.
                if len(dates) > 0:
                    last_visit = dates[-1] + visit_step

                dates = _left_closed_date_range(last_visit, fp.enddate, frequency_str).tolist()
                sequence += [("{}_{}".format(fp.counter, index), date)
                             for index, date in enumerate(dates)]
            uid_to_sequence[uid] = sequence
        print("Handling uid {}/{}".format(progress, len(uids)), end="\r")

    replay_sequence = list(itertools.chain(*uid_to_sequence.values()))
    print(len(replay_sequence))
    return sorted(replay_sequence, key=lambda x: x[1])


def getTimeDifference(fp1, fp2):
    """Return the absolute time gap between two fingerprints, in days.

    Falls back to the (signed) difference of the counters when the creation
    times cannot be subtracted into a timedelta-like value.

    Args:
        fp1, fp2: Mapping-like fingerprints.

    Returns:
        Fractional number of days as a float, or the counter difference.
    """
    try:
        diff = fp1[Fingerprint.CREATION_TIME] - fp2[Fingerprint.CREATION_TIME]
        return mt.fabs(diff.days + diff.seconds / (3600.0 * 24))
    except Exception:  # for the case where we try to link blink's fingerprints
        # `except Exception` rather than a bare `except` so that KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return fp1[Fingerprint.COUNTER] - fp2[Fingerprint.COUNTER]

def compute_similarity_vector(fp1, fp2, training=True):
    """Build the feature vector comparing two fingerprints.

    Feature layout: time difference, three 0/1 equality flags (timezone,
    resolution, canvas hash), one Levenshtein ratio per remaining model
    attribute, and finally the number of changes.

    Args:
        fp1, fp2: Mapping-like fingerprints holding the MODEL_ATTRIBUTES.
        training: When True the label is derived from the ids; when False,
            pairs with more than 5 changes yield two empty arrays.

    Returns:
        `(features, label)` as numpy arrays; `label` is `[1]` only in training
        mode for fingerprints sharing the same id, `[0]` otherwise.
    """
    label = 1 if (training and fp1[Fingerprint.ID] == fp2[Fingerprint.ID]) else 0

    features = [getTimeDifference(fp1, fp2)]

    exact_match_attributes = (
        Fingerprint.TIMEZONE_JS,
        Fingerprint.RESOLUTION_JS,
        Fingerprint.CANVAS_JS_HASHED,
    )
    for attribute in exact_match_attributes:
        features.append(1 if fp1[attribute] == fp2[attribute] else 0)

    # The count starts at one; every zero among the features gathered so far
    # (the time difference included) counts as a change.
    nb_changes = 1 + sum(1 for value in features if value == 0)

    handled_elsewhere = [
        Fingerprint.ID,
        Fingerprint.CANVAS_JS_HASHED,
        Fingerprint.RESOLUTION_JS,
        Fingerprint.TIMEZONE_JS,
        Fingerprint.CREATION_TIME,
        Fingerprint.COUNTER,
    ]
    for attribute in MODEL_ATTRIBUTES:
        if attribute in handled_elsewhere:
            continue
        if fp1[attribute] != fp2[attribute]:
            nb_changes += 1
        features.append(levenshtein_ratio(fp1[attribute], fp2[attribute]))

    if not training and nb_changes > 5:
        return np.array([]), np.array([])

    features.append(nb_changes)

    return np.array(features), np.array([label])

def random_sample(arr: np.array, avoid: str = None) -> np.array:
    """Draw one element of `arr` uniformly at random.

    Args:
        arr: Numpy array to sample from.
        avoid: Optional value to exclude before sampling. It is compared with
            `is not None`, so falsy values such as 0 or "" are excluded too
            (the previous truthiness test silently ignored them).

    Returns:
        The sampled element.
    """
    if avoid is not None:
        arr = arr[arr != avoid]
    return arr[np.random.choice(len(arr), size=1, replace=False)][0]

def train_ml(train_data, load=True, webgl_in_ml=False, model_path="./my_ml_model"):
    """Load, or train and save, the random-forest linking model.

    For visit frequencies of 1 to 9 days, the training set gets one positive
    pair per consecutive fingerprints of an id and, for every fingerprint, one
    pair with a randomly chosen fingerprint of another id.

    Args:
        train_data: DataFrame of training fingerprints.
        load: When True, return the model stored at `model_path` without training.
        webgl_in_ml: When True, append both WebGL embeddings to each feature
            vector. This path used to fail with NameError because it referred to
            an undefined `WEBGL_ATTRIBUTE`; it now reads `WEBGL_ATTRIBUTES[0]`.
        model_path: Where the model is loaded from / dumped to.

    Returns:
        The fitted (or loaded) classifier.
    """
    if load:
        # NOTE: joblib.load unpickles the file; only load models you trust.
        return joblib.load(model_path)

    webgl_attribute = WEBGL_ATTRIBUTES[0]
    mod_attributes = MODEL_ATTRIBUTES + [webgl_attribute] if webgl_in_ml else MODEL_ATTRIBUTES

    X_train, y_train = [], []
    for visit_frequency in range(1, 10):
        print("Generating dataset for visit frequency of {} days".format(visit_frequency))
        replay_sequence = generate_replay_sequence_pd(train_data, visit_frequency)
        train_counters = list(map(lambda x: int(x[0].split('_')[0]), replay_sequence))

        filtered_train_data = train_data[train_data[Fingerprint.COUNTER].isin(train_counters)]

        list_uids = filtered_train_data.id.unique()
        for uid in list_uids:
            uid_train_data = filtered_train_data[filtered_train_data.id == uid]
            uid_train_data = uid_train_data.sort_values(Fingerprint.CREATION_TIME)

            # Consecutive fingerprints of the same id.
            for i in range(1, len(uid_train_data)):
                fp1 = uid_train_data[mod_attributes].iloc[i]
                fp2 = uid_train_data[mod_attributes].iloc[i-1]
                X, y = compute_similarity_vector(fp1, fp2)
                if webgl_in_ml:
                    X = np.concatenate((X, fp1[webgl_attribute], fp2[webgl_attribute]))
                X_train.append(X)
                y_train.append(y)

            # Each fingerprint against a random fingerprint of another id.
            for _, fp in uid_train_data.iterrows():
                compared_uid = random_sample(list_uids, avoid=fp.id)
                compared_uid_df = filtered_train_data[filtered_train_data.id == compared_uid][mod_attributes]

                compared_fp = compared_uid_df.iloc[random_sample(np.array(range(len(compared_uid_df))))][mod_attributes]
                X, y = compute_similarity_vector(fp, compared_fp)
                if webgl_in_ml:
                    X = np.concatenate((X, fp[webgl_attribute], compared_fp[webgl_attribute]))
                X_train.append(X)
                y_train.append(y)

    print("Generated dataset. Initiating model...", end="\r")
    model = RandomForestClassifier(n_jobs=6, random_state=2)
    print("Initiated model. Training...", end="\r")
    # Labels are collected as 1-element arrays; flatten them to the 1-D shape
    # scikit-learn expects.
    model.fit(X_train, np.ravel(y_train))
    print("Trained model. Saving...", end="\r")
    joblib.dump(model, model_path)
    print("Saved model. Training phase finished.")

    return model

In [None]:
def generate_id():
    """Return a new random (version 4) UUID as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)

def _browser_version_key(fp):
    """Return the browser version of `fp` as an `(major, minor)` tuple of ints.

    Returns None when the major version is missing or not numeric; a missing or
    non-numeric minor version counts as 0.
    """
    try:
        major = int(fp[Fingerprint.MAJOR_BROWSER_VERSION])
    except (KeyError, TypeError, ValueError):
        return None
    try:
        minor = int(fp[Fingerprint.MINOR_BROWSER_VERSION])
    except (KeyError, TypeError, ValueError):
        minor = 0
    return major, minor


def check_attributes_consistency(infered_fp, compared_fp):
    """Check whether `infered_fp` can be a later state of `compared_fp`.

    The browser version used to be compared through the concatenated
    major+minor *string*, which orders lexicographically ("1000" < "990") and
    reported an upgrade from 99 to 100 as a downgrade. Versions are now compared
    numerically; the string comparison is kept only as a fallback when the
    numeric parts are unavailable.

    Args:
        infered_fp: Fingerprint being linked (mapping-like).
        compared_fp: Known fingerprint it is compared against (mapping-like).

    Returns:
        -1 if the browser version decreased, -2 if any of RIGID_ATTRIBUTES
        differs, 0 otherwise.
    """
    infered_version = _browser_version_key(infered_fp)
    compared_version = _browser_version_key(compared_fp)
    if infered_version is not None and compared_version is not None:
        version_decreased = infered_version < compared_version
    else:
        version_decreased = (infered_fp[Fingerprint.GLOBAL_BROWSER_VERSION]
                             < compared_fp[Fingerprint.GLOBAL_BROWSER_VERSION])
    if version_decreased:
        return -1

    # A list (not a generator) so every rigid attribute is read, as before.
    rigid_changes = any([infered_fp[attribute] != compared_fp[attribute]
                         for attribute in RIGID_ATTRIBUTES])
    if rigid_changes:
        return -2

    return 0

def compute_distance_embeddings(embedding_1, embedding_2, dist_type="cosine"):
    """Compare two embedding vectors.

    Args:
        embedding_1, embedding_2: Array-likes of equal length.
        dist_type: "cosine" or "euclidean".

    Returns:
        For "cosine", the cosine similarity (dot product over the product of
        the norms); for "euclidean", the Euclidean distance.

    Raises:
        Exception: for any other `dist_type`.
    """
    if dist_type not in ("cosine", "euclidean"):
        raise Exception("Distance type not supported.")

    if dist_type == "euclidean":
        delta = np.array(embedding_1) - np.array(embedding_2)
        return np.linalg.norm(delta)

    scale = norm(embedding_1) * norm(embedding_2)
    return dot(embedding_1, embedding_2) / scale
        
def infer_fingerprint(infered_fp, fps_df_dict, new_ids_df, threshold,  webgl_in_model=False, drawnapart=True,
                      model=None):
    """Link a fingerprint to an already assigned id, or create a new id.

    Exact matches on `general_hash` win when they all point to one id.
    Otherwise fingerprints sharing `constant_hash` and passing
    check_attributes_consistency are scored by the classifier.

    Args:
        infered_fp: Fingerprint to link (row with attribute access).
        fps_df_dict: Mapping counter -> fingerprint dict.
        new_ids_df: Mapping assigned id -> list of counter strings ("<counter>_<k>").
        threshold: Minimum positive-class probability required to link.
        webgl_in_model: Append both WebGL embeddings to the feature vector.
        drawnapart: When True, link to the best candidate directly if the
            embedding similarity with it exceeds 0.15.
        model: Classifier exposing `predict_proba`. Optional for backward
            compatibility: when omitted, the module-level `model` is used, which
            is what the function silently relied on before.

    Returns:
        The linked id, or a freshly generated one.
    """
    potential_candidates = []
    exact_candidates = []

    for uid in new_ids_df.keys():
        for assigned_counter in new_ids_df[uid]:
            known_counter_nbr = int(assigned_counter.split("_")[0])
            known_fp = fps_df_dict[known_counter_nbr]
            if infered_fp.general_hash == known_fp["general_hash"]:
                exact_candidates.append({
                    "indexed_counter": known_counter_nbr,
                    "linked_id": uid
                })

            elif len(exact_candidates) == 0 and infered_fp.constant_hash == known_fp["constant_hash"]:
                attributes_check = check_attributes_consistency(infered_fp, known_fp)
                if attributes_check == -1:
                    # Version went backwards for this fingerprint only: try the next one.
                    continue
                elif attributes_check == -2:
                    # A rigid attribute differs: give up on this id altogether.
                    break
                potential_candidates.append({
                    "indexed_counter": known_counter_nbr,
                    "linked_id": uid
                })

    if len(exact_candidates) > 0:
        exact_uids = list(set(map(lambda x: x["linked_id"], exact_candidates)))
        if len(exact_uids) == 1:
            return exact_uids[0]

    elif len(potential_candidates) > 0:

        # Inference over machine learning occurs here
        webgl_attribute = WEBGL_ATTRIBUTES[0]  # was the undefined name WEBGL_ATTRIBUTE
        updated_potential_candidates = []
        X_infer = []
        cosine_distances = []
        for potential_candidate in potential_candidates:
            known_fp = fps_df_dict[potential_candidate["indexed_counter"]]
            X, _ = compute_similarity_vector(infered_fp[MODEL_ATTRIBUTES], known_fp,  training=False)

            # An empty vector means the pair had too many changes to be considered.
            if len(X) > 0:
                distance_embeddings = compute_distance_embeddings(infered_fp[webgl_attribute], known_fp[webgl_attribute])
                if webgl_in_model:
                    X = np.concatenate((X, infered_fp[webgl_attribute], known_fp[webgl_attribute]))
                cosine_distances.append(distance_embeddings)
                X_infer.append(X)
                updated_potential_candidates.append(potential_candidate)

        if len(updated_potential_candidates) > 0:
            if model is None:
                model = globals().get("model")
                if model is None:
                    raise NameError("infer_fingerprint needs a model: pass `model=` or define a global `model`")

            inference = model.predict_proba(X_infer)
            inference_positive_class = np.array(list(map(lambda x: x[1], inference)))

            top_three = inference_positive_class.argsort()[::-1][:3]

            if drawnapart:
                if cosine_distances[top_three[0]] > 0.15:
                    return updated_potential_candidates[top_three[0]]["linked_id"]

            # Find the first of the runners-up whose probability differs from the best.
            second_closest_index = 1
            second_closest_probability = None
            for index, closest_index in enumerate(top_three[1:]):
                if inference_positive_class[top_three[0]] != inference_positive_class[closest_index]:
                    second_closest_index = index + 1
                    second_closest_probability = inference_positive_class[closest_index]
                    break

            is_difference_sufficient = True
            if second_closest_probability and (inference_positive_class[top_three[0]] - second_closest_probability) < 0.3:
                is_difference_sufficient = False

            closest_candidates = list(set(map(lambda x: updated_potential_candidates[x]["linked_id"], top_three[:second_closest_index])))
            if is_difference_sufficient and inference_positive_class[top_three[0]] > threshold and len(closest_candidates) == 1:
                return updated_potential_candidates[top_three[0]]["linked_id"]

    return generate_id()


def cleanup_fingerprints(generated_ids, counter_time, timestamp):
    """Drop assigned ids whose latest fingerprint is older than 40 days.

    Args:
        generated_ids: Mapping assigned id -> list of counter strings.
        counter_time: Mapping counter string -> timestamp of that visit.
        timestamp: Current replay time.

    Returns:
        New dict holding only the ids whose last counter is at most 40 days old.
    """
    max_age = pd.Timedelta("40 days")
    return {
        assigned_id: counters
        for assigned_id, counters in generated_ids.items()
        if timestamp - counter_time[counters[-1]] <= max_age
    }


def generate_replay_scenario(test_data, visit_frequency, model, webgl_in_model=False, drawnapart=True,
                             save="./scenario_replay_result.csv"):
    """Replay the simulated visits in time order and assign an id to each.

    Args:
        test_data: DataFrame of test fingerprints (with a counter column).
        visit_frequency: Days between two simulated visits.
        model: Classifier forwarded to infer_fingerprint (previously accepted
            but ignored in favour of a global variable).
        webgl_in_model, drawnapart: Forwarded to infer_fingerprint.
        save: Currently unused; nothing is written to disk.

    Returns:
        Dict mapping each counter string of the replay sequence to its inferred id.
    """
    replay_sequence = generate_replay_sequence_pd(test_data, visit_frequency)
    test_data = test_data.set_index(Fingerprint.COUNTER)

    fps_df_dict = test_data.to_dict('index')

    counter_time = dict((cntr, timestmp) for cntr, timestmp in replay_sequence)
    generated_ids = {}
    linked_ids = {}
    for index, counter_object in enumerate(replay_sequence):
        print("Step {}/{}".format(index, len(replay_sequence)), end="\r")
        counter_str, timestamp = counter_object
        counter_nbr = int(counter_str.split("_")[0])

        fp_to_infer = test_data[test_data.index.values == counter_nbr].iloc[0]
        infered_id = infer_fingerprint(fp_to_infer, fps_df_dict, generated_ids, LAMBDA, webgl_in_model, drawnapart,
                                       model=model)

        linked_ids[counter_str] = infered_id

        if infered_id not in generated_ids.keys():
            generated_ids[infered_id] = []

        # Keep only the previous counter plus the new one for each id.
        generated_ids[infered_id] = generated_ids[infered_id][-1:]
        generated_ids[infered_id].append(counter_str)

        if index % 2000 == 0:
            generated_ids = cleanup_fingerprints(generated_ids, counter_time, timestamp)
    return linked_ids

In [None]:
def evaluate_sequence(linked_ids, fps_df):
    """Summarise, per real id, how its fingerprints were spread over inferred ids.

    Args:
        linked_ids: Mapping counter string -> inferred id.
        fps_df: DataFrame holding the counter and id columns.

    Returns:
        Tuple `(id_to_stats, chain_lengths, avg_chain_lengths, perfect_track)`:
        the raw per-id bookkeeping, the largest and the mean number of
        fingerprints grouped under one inferred id, and the total number of
        replayed fingerprints of the id.
    """
    fps_df = fps_df.set_index(Fingerprint.COUNTER)

    id_to_stats = {}
    for counter_str, infered_id in linked_ids.items():
        counter_nbr = int(counter_str.split("_")[0])
        fp_id = fps_df.loc[counter_nbr][Fingerprint.ID]

        if fp_id not in id_to_stats:
            id_to_stats[fp_id] = {
                "infered_id_set": set(),
                "infered_ids": {},
                "counters": []
            }
        stats = id_to_stats[fp_id]

        stats["infered_id_set"].add(infered_id)
        stats["infered_ids"].setdefault(infered_id, []).append(counter_nbr)
        stats["counters"].append(counter_str)

    chain_lengths, avg_chain_lengths, perfect_track = {}, {}, {}
    for uid, stats in id_to_stats.items():
        tmp_lengths = {}
        for infered_id in stats["infered_id_set"]:
            same_user_counters = [
                counter for counter in stats["infered_ids"][infered_id]
                if fps_df.loc[counter][Fingerprint.ID] == uid
            ]
            tmp_lengths[infered_id] = len(same_user_counters)
        chain_lengths[uid] = max(tmp_lengths.values())
        avg_chain_lengths[uid] = mean(tmp_lengths.values())
        perfect_track[uid] = len(stats["counters"])
    return id_to_stats, chain_lengths, avg_chain_lengths, perfect_track

In [None]:
###############################################################
###############################################################
############# LOADING CODE ####################################
###############################################################

In [None]:
!wget https://github.com/drawnapart/drawnapart/raw/master/bogus_dataset_for_fps.tgz
!tar -zxvf bogus_dataset_for_fps.tgz

In [None]:
# Load the tab-separated dataset extracted by the previous cell.
df = pd.read_csv('bogus_dataset.tsv', sep='\t')
# Parse the two date columns so they can be compared and subtracted later on.
df['creationdate'] = pd.to_datetime(df['creationdate'])  
df['enddate'] = pd.to_datetime(df['enddate'])  
# The embeddings are stored as JSON text; decode each cell into a Python object.
df['embeddings_webgl'] = df['embeddings_webgl'].apply(json.loads)

print("Dataframe length:", df.shape[0])

In [None]:
def get_clf_model():
    """Build and compile the convolutional classifier.

    Architecture: a (32, 32, 1) input, three Conv2D(128, 4x4, relu) -> Dropout ->
    AveragePooling2D stages, Flatten, Dense(256, relu), a linear Dense(256)
    embedding that is L2-normalised, and a Dense(714, softmax) output.

    Returns:
        The compiled `tf.keras.Sequential` model.
    """
    dropout_rate = 0.119510
    keras_layers = tf.keras.layers

    stack = [keras_layers.Input((32, 32, 1))]
    for _ in range(3):
        stack.append(keras_layers.Conv2D(128, (4, 4), activation='relu'))
        stack.append(keras_layers.Dropout(dropout_rate))
        stack.append(keras_layers.AveragePooling2D())

    stack.append(keras_layers.Flatten())
    stack.append(keras_layers.Dense(256, activation='relu'))
    # No activation on the embedding layer; the Lambda L2-normalises the embeddings.
    stack.append(keras_layers.Dense(256, activation=None))
    stack.append(keras_layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1)))
    stack.append(keras_layers.Dense(714, activation='softmax'))

    clf_model = tf.keras.Sequential(stack)
    clf_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

    return clf_model

def get_triplet_model():
    """Build the embedding model trained with a semi-hard triplet loss.

    Reuses every layer of the classifier except its last one, so the output is
    the L2-normalised embedding.

    Returns:
        The compiled `tf.keras` Sequential model.
    """
    backbone_layers = get_clf_model().layers[:-1]

    triplet_model = tf.keras.models.Sequential([tf.keras.layers.InputLayer((32, 32, 1))])
    for backbone_layer in backbone_layers:
        triplet_model.add(backbone_layer)

    triplet_model.compile(optimizer='adam', loss=tfa.losses.TripletSemiHardLoss())

    return triplet_model

In [None]:
# Embeddings

def return_mean_embeddings(row):
    """Average the seven 256-value trace embeddings of a row.

    Args:
        row: Mapping-like object with keys "webgl_trace_0" .. "webgl_trace_6",
            each reshapeable together into a (7, 256) array.

    Returns:
        Numpy array of 256 values: the element-wise mean over the seven traces.
    """
    traces = [row["webgl_trace_{}".format(trace_index)] for trace_index in range(7)]
    stacked = np.array(traces).reshape(7, 256)
    return np.mean(stacked, axis=0)

# In the bogus dataset, the embeddings are already computed
if SHOULD_COMPUTE_EMBEDDINGS:
  SCALER_PATH = './wild_scaler_for_clf_model.pkl'
  WEIGHTS_PATH = './model_triplet_loss'

  file_nm = f'{WEIGHTS_PATH}/weights.ckpt'
  triplet_model = get_triplet_model()
  triplet_model.load_weights(file_nm)

  # NOTE: pickle.load executes code from the file; only use a scaler file you trust.
  with open(SCALER_PATH, 'rb') as fd:
    scaler = pickle.load(fd)
  
  # The loop variable was `i` while the body used the undefined `trace_index`,
  # which raised NameError on the first iteration.
  for trace_index in range(7):
    trace_column = "webgl_trace_{}".format(trace_index)
    # Scale the raw trace, reshape it to the network input, replace it by its embedding.
    df[trace_column] = scaler.transform(np.array(df[trace_column].to_list())).tolist()
    df[trace_column] = df[trace_column].apply(lambda x: np.array(x).reshape(32, 32, 1)) 
    output_network = triplet_model.predict(np.array(df[trace_column].tolist()))
    df[trace_column] = output_network.tolist()
  
  df["embeddings_webgl"] = df.apply(lambda row: return_mean_embeddings(row), axis=1)

In [None]:
###############################################################
###############################################################
############# EXECUTION #######################################
###############################################################

In [None]:
%%time
# Format df

# Lower-case every column name so it matches the Fingerprint constants.
columns = df.columns
lower_case = [name.lower() for name in columns]
rename_dict = dict(zip(columns, lower_case))

df = df.rename(columns=rename_dict)
formatted_df = dataset_to_dataframe(df)

In [None]:
%%time

# In demo mode every id is kept; otherwise only the ids passing the consistency checks.
if IS_DEMO:
  consistent_uids = formatted_df['id'].unique()
else:
  consistent_uids = check_id_consistence(formatted_df)

In [None]:
# Number of non-null collection ids per id, aligned on the shared row index.
formatted_df["cnt_uids"] = df.groupby('id')['collection_id'].transform('count')

if IS_DEMO:
  filtered_df = formatted_df
else:
  # Keep ids with more than 6 fingerprints that passed the consistency checks.
  filtered_df = (
      formatted_df[formatted_df.cnt_uids > 6]
      .loc[lambda frame: frame[Fingerprint.ID].isin(consistent_uids)]
      .sort_values("counter")
  )

In [None]:
#Training

# Split by creation date: training rows are strictly before `train_end`,
# test rows are on or after `test_start`.
if IS_DEMO:
  train_end, test_start = '2020-02-20', '2020-02-20'
else:
  train_end, test_start = '2021-02-07', '2021-05-03'

train_df = filtered_df[filtered_df.creationdate < train_end].sort_values(Fingerprint.CREATION_TIME)
test_df = filtered_df[filtered_df.creationdate >= test_start].sort_values(Fingerprint.CREATION_TIME)
test_df[Fingerprint.COUNTER] = test_df[Fingerprint.COUNTER].astype("int32")

In [None]:
%%time
# load=False: train a new model on train_df (and save it) instead of loading one.
model = train_ml(train_df, False, webgl_in_ml=False)

In [None]:
###### INFERENCE ######

In [None]:
# Number of rows in the training and test sets.
train_df.shape[0], test_df.shape[0]

In [None]:
%%time
# Replay the test fingerprints with one simulated visit every NB_DAYS days;
# `result` maps each counter string to the id inferred for it.
result = generate_replay_scenario(test_df, NB_DAYS, model, webgl_in_model=False, drawnapart=True)

In [None]:
# Per real id: raw bookkeeping, longest chain, average chain and number of replayed fingerprints.
evaluation_obj, chain_result, avg_chain_result, perfect_track = evaluate_sequence(result, test_df)
print("Chain result (max number of elements linked by one id)", json.dumps(chain_result, indent=4, sort_keys=True))

In [None]:
from matplotlib.pyplot import *
import matplotlib.pyplot as plt

def log_10_product(x, pos):
    """Tick-formatter callback that renders a tick value as a plain integer.

    x   -- the tick value; any fractional part is truncated by the format.
    pos -- the tick position; accepted for the callback signature, not used.
    """
    label = '%1i' % x
    return label

def plot_and_save(chain_result, avg_chain_result, nb_days):
    """Plot how long ids stay linked, as two decreasing cumulative curves.

    Despite its name, this function only displays the figure; nothing is
    written to disk.

    Args:
        chain_result: dict whose values are the maximum chain length per key
            (the evaluation cell prints it as "max number of elements linked
            by one id").
        avg_chain_result: dict with one numeric value per key, drawn as the
            second curve (presumably the average chain length — confirm
            against evaluate_sequence).
        nb_days: multiplier turning a chain length into a duration in days
            (the collection frequency).
    """
    # Imported explicitly instead of relying on `from matplotlib.pyplot import *`.
    from matplotlib.ticker import FuncFormatter

    def _survival_curve(chain_lengths):
        """Return (sorted durations, share of entries at or above each one)."""
        durations = nb_days * np.array(sorted(chain_lengths.values()))
        count = len(durations)
        # max(..., 1) keeps a single-entry dict from producing 0/0 = NaN.
        share = 1 - np.arange(count) / max(count - 1, 1)
        return durations, share

    max_durations, max_share = _survival_curve(chain_result)
    avg_durations, avg_share = _survival_curve(avg_chain_result)

    median_max_track = np.median(max_durations)
    median_avg_track = np.median(avg_durations)
    avg_max_track = np.mean(max_durations)

    fig, axes = plt.subplots()
    # Distinct labels plus a legend: the two curves used to share one label.
    axes.plot(max_durations, max_share, color="green", label="Max tracking duration")
    axes.plot(avg_durations, avg_share, color="red", label="Average tracking duration")

    axes.grid(True)
    axes.legend()
    axes.xaxis.set_major_formatter(FuncFormatter(log_10_product))
    # The third figure is the mean of the *max* durations, so it is labelled as such.
    axes.set_title("FPStalker output (median (MAX) {} - median (AVG) {} - mean (MAX) {})".format(
        median_max_track, median_avg_track, avg_max_track))
    axes.set_xlabel('Tracking duration (collect frequency = {} days)'.format(nb_days), fontsize=14)
    # NOTE(review): the plotted values are fractions in [0, 1] although the
    # label says "%" — decide whether to rescale the data or reword the label.
    axes.set_ylabel('Browser instances (%)', fontsize=14)

    plt.show()

# Draw the tracking-duration curves for the replay evaluated above.
plot_and_save(chain_result, avg_chain_result, nb_days=NB_DAYS)

In [None]:
############################################################
############################################################
################## UTILS ###################################
############################################################
############################################################

In [None]:
from sklearn import metrics

def compute_distance_top_left(tpr, fp):
    """Squared Euclidean distance from an ROC point to the corner (fp=0, tpr=1).

    tpr -- true positive rate of the point
    fp  -- false positive rate of the point
    """
    missed = 1 - tpr
    return fp * fp + missed * missed

def _iter_similarity_pairs(data, visit_frequency):
    """Yield ``(X_row, y_row, same_id)`` comparison rows for one replay of ``data``.

    The rows of ``data`` are first restricted to the counters returned by
    generate_replay_sequence_pd for ``visit_frequency``. Then, id by id:

    * every fingerprint is compared with the previous one of the same id in
      creation-time order (``same_id == 1``);
    * every fingerprint is compared with one fingerprint picked through
      random_sample from an id chosen with ``avoid=<its own id>``
      (``same_id == 0``).

    ``X_row`` and ``y_row`` are whatever compute_similarity_vector returns.
    """
    replay_sequence = generate_replay_sequence_pd(data, visit_frequency)
    counters = [int(entry[0].split('_')[0]) for entry in replay_sequence]
    replayed = data[data[Fingerprint.COUNTER].isin(counters)]

    list_uids = replayed.id.unique()
    for uid in list_uids:
        uid_data = replayed[replayed.id == uid].sort_values(Fingerprint.CREATION_TIME)
        # Column selection hoisted out of the pair loop.
        uid_attributes = uid_data[MODEL_ATTRIBUTES]

        for i in range(1, len(uid_attributes)):
            X_row, y_row = compute_similarity_vector(uid_attributes.iloc[i], uid_attributes.iloc[i - 1])
            yield X_row, y_row, 1

        for _, fp in uid_data.iterrows():
            compared_uid = random_sample(list_uids, avoid=fp.id)
            compared_uid_df = replayed[replayed.id == compared_uid][MODEL_ATTRIBUTES]
            compared_index = random_sample(np.array(range(len(compared_uid_df))))
            compared_fp = compared_uid_df.iloc[compared_index]
            X_row, y_row = compute_similarity_vector(fp, compared_fp)
            yield X_row, y_row, 0


def optimize_lambda(train_data, test_data):
    """Fit a random forest on fingerprint pairs and report the best ROC threshold.

    Steps:
      1. Build comparison rows from ``train_data`` for visit frequencies 1..9
         and fit a RandomForestClassifier on them, using the labels returned
         by compute_similarity_vector.
      2. Build comparison rows from ``test_data`` for visit frequencies 1..19
         and score them with the fitted forest; same-id pairs are the
         positive class.
      3. Compute the ROC curve, print the point closest to the top-left
         corner as "fpr, tpr, threshold", save the curve to
         ./figures/lambda_optim.pdf and show it.

    Args:
        train_data: DataFrame of fingerprints used to fit the forest.
        test_data: DataFrame of fingerprints used to draw the ROC curve.

    Returns:
        None. The result is printed and plotted.

    Note:
        The forest is a local variable; it does not replace the notebook's
        global ``model``.
    """
    import os

    train_frequencies = range(1, 10)
    test_frequencies = range(1, 20)

    X, y = [], []
    for visit_frequency in train_frequencies:
        # The total is taken from the range so the message matches the real loop length.
        print("Handling visit frequency {}/{}".format(visit_frequency, train_frequencies[-1]))
        for X_row, y_row, _ in _iter_similarity_pairs(train_data, visit_frequency):
            X.append(X_row)
            y.append(y_row)

    model = RandomForestClassifier(n_jobs=-1)
    print("Training data: %d" % len(X))
    model.fit(X, y)
    print("Finished training")

    y_true = []
    y_scores = []
    for visit_frequency in test_frequencies:
        print("Handling visit frequency {}/{}".format(visit_frequency, test_frequencies[-1]))
        x_rows = []
        for X_row, _, same_id in _iter_similarity_pairs(test_data, visit_frequency):
            x_rows.append(X_row)
            y_true.append(same_id)

        if not x_rows:
            # Nothing replayed at this frequency; predict_proba rejects an empty batch.
            continue
        # Column 1 holds the probability of the second class, as before.
        y_scores.extend(model.predict_proba(x_rows)[:, 1])

    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_scores, pos_label=1)
    # First index with the smallest distance, as the former strict "<" scan selected.
    min_indice = int(np.argmin(compute_distance_top_left(tpr, fpr)))
    print("best point")
    print("%f, %f, %f" % (fpr[min_indice], tpr[min_indice], thresholds[min_indice]))

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=15)
    plt.ylabel('True Positive Rate', fontsize=15)
    # Create the target directory so the long computation is not lost on save.
    os.makedirs("./figures", exist_ok=True)
    plt.savefig("./figures/lambda_optim.pdf")
    plt.show()

In [None]:
# Fit on the training split, draw the ROC curve on the test split.
optimize_lambda(train_data=train_df, test_data=test_df)

In [None]:
###### Running as a loop

def run_loop(test_df, collect_freq=(), da=True):
    """Run the replay, evaluation and plot once per collection frequency.

    For every frequency the replay result, the evaluation object and the
    chain result are pickled under ``outputs/`` (directories are created on
    demand) and the tracking-duration curves are displayed.

    Args:
        test_df: fingerprints to replay.
        collect_freq: iterable of collection frequencies, in days.
        da: forwarded to generate_replay_scenario as ``da``; it also selects
            the "with_da" / "without_da" experiment name used in file names.

    Note:
        The trained ``model`` is read from the notebook's global namespace.
    """
    import os

    def _dump(obj, path):
        """Pickle ``obj`` to ``path``, creating the parent directory if needed."""
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

    for freq in collect_freq:
        if da:
            exp_name = "collect_freq_with_da_3rd_may_{}_days_secondtrial".format(freq)
        else:
            exp_name = "collect_freq_without_da_3rd_may_{}_days_secondtrial".format(freq)
        print("Starting collection experiment with frequency {} days".format(freq))
        # NOTE(review): the inference cell calls generate_replay_scenario with
        # `webgl_in_model=` / `drawnapart=`, while this call uses
        # `top_7_canvas=` / `webgl=` / `da=` / `save=`. Confirm which set of
        # keywords the current signature accepts.
        result_freq = generate_replay_scenario(test_df, freq, model, top_7_canvas=None, webgl=False, da=da,
                            save="./scenario_replay_result.csv")
        _dump(result_freq, 'outputs/fp_result/result_complete_{}.pickle'.format(exp_name))

        evaluation_obj, chain_result, avg_chain_result, _ = evaluate_sequence(result_freq, test_df)
        _dump(evaluation_obj, 'outputs/evaluation/evaluation_object_complete_{}.pickle'.format(exp_name))
        _dump(chain_result, 'outputs/evaluation/chain_result_complete_{}.pickle'.format(exp_name))

        # plot_and_save takes (chain_result, avg_chain_result, nb_days); the
        # experiment name is not one of its parameters.
        plot_and_save(chain_result, avg_chain_result, freq)
            