In [1]:
import pandas as pd

df = pd.read_csv('confirm_7d250.csv', 
    dtype={'id':int, 'Date':object, 'inc':int}, 
    parse_dates=['Date'], 
    infer_datetime_format=True)
df.head(5)

Unnamed: 0,Date,id,inc,inc_id
0,2020-01-23,109,1,0
1,2020-01-23,214,1,0
2,2020-01-23,225,1,0
3,2020-01-23,638,1,0
4,2020-01-23,1275,1,0


In [8]:
def MAE(freq, est_freq):
    return (abs(freq - est_freq)).mean()
def MRE(freq, est_freq):
    return (abs(np.dot(freq, np.arange(d)) - np.dot(est_freq, np.arange(d))))/(np.dot(freq, np.arange(d)) + np.finfo(np.float32).eps)
def jaccard_index(freq, est_freq, k):
    top_k = np.argpartition(-freq, k)[:k]
    est_top_k = np.argpartition(-est_freq, k)[:k]
    return len(np.intersect1d(top_k, est_top_k))/len(np.union1d(top_k, est_top_k))
def MSE(freq, est_freq):
    return (np.square(freq - est_freq)).mean()
def RMSE(freq, est_freq):
    return np.sqrt((np.square(freq - est_freq)).mean())

import numpy as np
from math import exp

from tqdm import trange

n = 3336
episode = 87
d = 100
universe = np.arange(d)

p_mat = np.zeros((d, d))
for i in range(d):
    p_mat[i] = np.abs(universe[i] - universe)*100

w = 87
epsilon_range = np.arange(0.1, 1.1, 0.1)
# [0.04931640625, 
#                 1.8316650390625, 
#                 9.9993896484375, 
#                 14.99908447265625, 
#                 19.9993896484375, 
#                 24.999237060546875, 
#                 29.99908447265625, 
#                 34.99946594238281, 
#                 39.9993896484375, 
#                 44.99931335449219, 
#                 49.999237060546875]
interval = pd.Timedelta('7d')
start_time = pd.Timestamp('2020-01-23')

## Uniform

In [9]:
for epsilon in epsilon_range:
    current_time = start_time

    util = []
    for t in range(episode):
        data = df.loc[df['Date']==current_time]
        private_freq = np.zeros(d)
        true_freq = np.zeros(d)
        for row in data.itertuples():
            true_freq[row.inc_id] += 1
            # randomization
            release_budget = epsilon/w
            exp_prob = np.exp(-release_budget*p_mat[row.inc_id]/2)
            exp_prob /= exp_prob.sum()
            private_inc = np.random.choice(universe, p=exp_prob)
            private_freq[private_inc] += 1
        current_time += interval
        # skip empty timestamp
        if np.all(true_freq == 0):
            continue
        util.append(jaccard_index(true_freq, private_freq, 20))
    print(np.mean(util))

0.7160156930556214
0.7330024164210014
0.7357455411031565
0.7469190690251739
0.7334168379033045
0.7423359943169004
0.7360378566775369
0.7362325802355788
0.7478073890867494
0.7598065916906496


## Local Sample

In [10]:
for epsilon in epsilon_range:
    window = np.zeros((n, episode), dtype=float)
    # historical data
    storage = [[] for _ in range(n)]

    current_time = start_time

    util = []
    for t in range(episode):
        data = df.loc[df['Date']==current_time]
        private_freq = np.zeros(d)
        true_freq = np.zeros(d)
        for row in data.itertuples():
            true_freq[row.inc_id] += 1
            release_budget = epsilon
            remain = epsilon - window[row.id][max(0, t-w+1):t].sum()
            if remain < release_budget:
                release_budget = 0
            if release_budget > 0:
                # randomization
                exp_prob = np.exp(-release_budget*p_mat[row.inc_id]/2)
                exp_prob /= exp_prob.sum()
                private_inc = np.random.choice(universe, p=exp_prob)
                # update window
                window[row.id][t] = release_budget
                storage[row.id].append(private_inc)
            else:
                private_inc = storage[row.id][len(storage[row.id])-1]
            private_freq[private_inc] += 1
        current_time += interval
        # skip empty timestamp
        if np.all(true_freq == 0):
            continue
        util.append(jaccard_index(true_freq, private_freq, 20))
    print(np.mean(util))

0.281149171615703
0.2799731108047538
0.2799731108047538
0.2799731108047538
0.2799731108047538
0.2799731108047538
0.2799731108047538
0.2799731108047538
0.2799731108047538
0.2799731108047538


## Fix-Alloc

In [11]:
for epsilon in epsilon_range:
    window = np.zeros((n, episode), dtype=float)
    # historical data
    storage = [[] for _ in range(n)]

    current_time = start_time
    ratio = 0.55

    util = []
    for t in range(episode):
        data = df.loc[df['Date']==current_time]
        private_freq = np.zeros(d)
        true_freq = np.zeros(d)
        for row in data.itertuples():
            true_freq[row.inc_id] += 1
            # release_flag = np.random.binomial(1, ratio)
            release_budget = epsilon/(w*ratio)
            remain = epsilon - window[row.id][max(0, t-w+1):t].sum()
            if (remain < release_budget): # or (release_flag == False):
                release_budget = 0
            if release_budget > 0:
                # randomization
                exp_prob = np.exp(-release_budget*p_mat[row.inc_id]/2)
                exp_prob /= exp_prob.sum()
                private_inc = np.random.choice(universe, p=exp_prob)
                # update window
                window[row.id][t] = release_budget
                storage[row.id].append(private_inc)
            else:
                private_inc = storage[row.id][len(storage[row.id])-1]
            private_freq[private_inc] += 1
        current_time += interval
        # skip empty timestamp
        if np.all(true_freq == 0):
            continue
        util.append(jaccard_index(true_freq, private_freq, 20))
    print(np.mean(util))

0.7275487330062985
0.7390678467674453
0.7368528904398066
0.7467780928050792
0.7345541214641148
0.745498975322773
0.7666427081719435
0.7660586082409653
0.7120033796695465
0.7777727926653462


## Ada-Alloc

In [12]:
for epsilon in epsilon_range:
    window = np.zeros((n, episode), dtype=float)
    # historical data
    storage = [[] for _ in range(n)]

    current_time = start_time
    ratio = np.full(n, 0.55)
    data_flag = np.zeros((n, episode), dtype=int)
    release_flag = np.zeros((n, episode), dtype=int)
    pid_error = np.zeros((n, episode), dtype=float)

    util = []
    for t in range(episode):
        data = df.loc[df['Date']==current_time]
        private_freq = np.zeros(d)
        true_freq = np.zeros(d)
        for row in data.itertuples():
            true_freq[row.inc_id] += 1
            # release_flag = np.random.binomial(1, ratio)
            release_budget = epsilon/(w*ratio[row.id])
            remain = epsilon - window[row.id][max(0, t-w+1):t].sum()
            if (remain < release_budget): # or (release_flag == False):
                release_budget = 0
            if release_budget > 0:
                # randomization
                exp_prob = np.exp(-release_budget*p_mat[row.inc_id]/2)
                exp_prob /= exp_prob.sum()
                private_inc = np.random.choice(universe, p=exp_prob)
                # update window
                window[row.id][t] = release_budget
                storage[row.id].append(private_inc)
            else:
                private_inc = storage[row.id][len(storage[row.id])-1]
            private_freq[private_inc] += 1
        if t > w:
            data_num = data_flag[:, max(0, t-w+1):t].sum(axis=1)
            release_num = release_flag[:, max(0, t-w+1):t].sum(axis=1)
            pid_error[:, t] = abs((data_num - release_num - 0.01)/w)
            error = np.sign((data_num - release_num - 0.01))*(1.0*pid_error[:, t] + 0.1*pid_error[:, max(0, t-w+1):t].sum(axis=1) + 0.2*(pid_error[:, t] - pid_error[:, t-1]))
            ratio += 1*error
            ratio = np.clip(ratio, 1/w, 1)
        current_time += interval
        # skip empty timestamp
        if np.all(true_freq == 0):
            continue
        util.append(jaccard_index(true_freq, private_freq, 20))
    print(np.mean(util))

0.7363464630468552
0.7351963818265151
0.7419808089929092
0.7423918054637179
0.7521480551000275
0.7506045068263958
0.7521426063155199
0.7393485704330283
0.753973408749469
0.788389037064699


## Local BD

In [13]:
for epsilon in epsilon_range:
    window = np.zeros((n, episode), dtype=float)
    # historical data
    storage = [[] for _ in range(n)]

    current_time = start_time

    util = []
    for t in range(episode):
        data = df.loc[df['Date']==current_time]
        private_freq = np.zeros(d)
        true_freq = np.zeros(d)
        for row in data.itertuples():
            true_freq[row.inc_id] += 1
            # decision
            if storage[row.id]:
                last_release = storage[row.id][len(storage[row.id]) - 1]
                distance = abs(row.inc_id - last_release)
                decision_budget = epsilon/(2*w)
                # add noise to distance
                private_distance = np.random.laplace(0, 1/decision_budget)
                remain = epsilon/2 - window[row.id][max(0, t-w+1):t].sum()
                release_budget = remain/2
                if private_distance > 1/release_budget:
                    # randomization
                    exp_prob = np.exp(-release_budget*p_mat[row.inc_id]/2)
                    exp_prob /= exp_prob.sum()
                    private_inc = np.random.choice(universe, p=exp_prob)
                    # update window
                    window[row.id][t] = release_budget
                    storage[row.id].append(private_inc)
                else:
                    private_inc = storage[row.id][len(storage[row.id])-1]
            # if never released
            else:
                # randomization
                release_budget = epsilon/(2*w)
                exp_prob = np.abs(row.inc_id - universe)
                exp_prob = np.exp(-release_budget*exp_prob/2)
                exp_prob /= exp_prob.sum()
                private_inc = np.random.choice(universe, p=exp_prob)
                # update window
                window[row.id][t] = release_budget
                storage[row.id].append(private_inc)
            private_freq[private_inc] += 1
        current_time += interval
        # skip empty timestamp
        if np.all(true_freq == 0):
            continue
        util.append(jaccard_index(true_freq, private_freq, 20))
    print(np.mean(util))

0.7032967107248933
0.6924738293690914
0.7161114165680849
0.7185525716613115
0.7165050614805815
0.7122960752180626
0.7205065805505625
0.7169801461327279
0.7130713318256972
0.721631664976334


## Local BA

In [14]:
for epsilon in epsilon_range:
    window = np.zeros((n, episode), dtype=float)
    # historical data
    storage = [[] for _ in range(n)]

    current_time = start_time

    util = []
    for t in range(episode):
        data = df.loc[df['Date']==current_time]
        private_freq = np.zeros(d)
        true_freq = np.zeros(d)
        for row in data.itertuples():
            true_freq[row.inc_id] += 1
            # decision
            if storage[row.id]:
                last_release = storage[row.id][len(storage[row.id]) - 1]
                distance = abs(row.inc_id - last_release)
                decision_budget = epsilon/(2*w)
                # add noise to distance
                private_distance = np.random.laplace(0, 1/decision_budget)
                remain = epsilon/2 - window[row.id][max(0, t-w+1):t].sum()
                release_budget = remain
                if release_budget > 0 and private_distance > 1/release_budget:
                    # randomization
                    exp_prob = np.exp(-release_budget*p_mat[row.inc_id]/2)
                    exp_prob /= exp_prob.sum()
                    private_inc = np.random.choice(universe, p=exp_prob)
                    # update window
                    window[row.id][t] = release_budget
                    storage[row.id].append(private_inc)
                else:
                    private_inc = storage[row.id][len(storage[row.id])-1]
            # if never released
            else:
                # randomization
                release_budget = epsilon/(2*w)
                exp_prob = np.abs(row.inc_id - universe)
                exp_prob = np.exp(-release_budget*exp_prob/2)
                exp_prob /= exp_prob.sum()
                private_inc = np.random.choice(universe, p=exp_prob)
                # update window
                window[row.id][t] = release_budget
                storage[row.id].append(private_inc)
            private_freq[private_inc] += 1
        current_time += interval
        # skip empty timestamp
        if np.all(true_freq == 0):
            continue
        util.append(jaccard_index(true_freq, private_freq, 20))
    print(np.mean(util))

0.4418851828368525
0.4292027664176963
0.49740763494284407
0.5111684646795062
0.5543096914220357
0.4646962039951863
0.5034543485230595
0.5104367106165756
0.5025133153809689
0.4638350249320625


## Local-DSFT

In [193]:
for epsilon in epsilon_range:
    threshold = 10
    ratio = 0.5
    window = np.zeros((n, episode), dtype=float)
    # historical data
    storage = [[] for _ in range(n)]

    current_time = start_time

    util = []
    for t in range(episode):
        data = df.loc[df['Date']==current_time]
        private_freq = np.zeros(d)
        true_freq = np.zeros(d)
        for row in data.itertuples():
            true_freq[row.inc_id] += 1
            # decision
            if storage[row.id]:
                last_release = storage[row.id][len(storage[row.id]) - 1]
                distance = abs(row.inc_id - last_release)
                decision_budget = epsilon/(2*w)
                # add noise to distance
                private_distance = np.random.laplace(0, 1/decision_budget)
                remain = epsilon/2 - window[row.id][max(0, t-w+1):t].sum()
                release_budget = epsilon/(2*w*ratio)
                if remain > 0 and remain > release_budget and private_distance > threshold:
                    # randomization
                    exp_prob = np.abs(row.inc_id - universe)
                    exp_prob = np.exp(-release_budget*exp_prob/2)
                    exp_prob /= exp_prob.sum()
                    private_inc = np.random.choice(universe, p=exp_prob)
                    # update window
                    window[row.id][t] = release_budget
                    storage[row.id].append(private_inc)
                else:
                    private_inc = storage[row.id][len(storage[row.id])-1]
            # if never released
            else:
                # randomization
                release_budget = epsilon/(2*w)
                exp_prob = np.abs(row.inc_id - universe)
                exp_prob = np.exp(-release_budget*exp_prob/2)
                exp_prob /= exp_prob.sum()
                private_inc = np.random.choice(universe, p=exp_prob)
                # update window
                window[row.id][t] = release_budget
                storage[row.id].append(private_inc)
            private_freq[private_inc] += 1
        current_time += interval
        # skip empty timestamp
        if np.all(true_freq == 0):
            continue
        util.append(MSE(true_freq, private_freq))
    print(np.mean(util))

58234.320689655186
58197.58252873563
58025.62298850576
57952.11862068966
57770.80551724138
57663.61931034484
57653.30206896552
57478.08160919541
57259.97586206897
57142.65425287357


In [9]:
from xxhash import xxh32

def OLH(data, epsilon):
    global d
    g = int(round(exp(epsilon))) + 1
    p = exp(epsilon)/(exp(epsilon) + g - 1)
    q = 1/(exp(epsilon) + g - 1)
    aggregated_data = np.zeros(d)
    for i, item in enumerate(data):
        private_data = (xxh32(str(item), seed=i).intdigest() % g)
        if np.random.random_sample() > p - q:
            private_data = np.random.randint(0, g)
        for j in range(d):
            if private_data == (xxh32(str(j), seed=i).intdigest() % g):
                aggregated_data[j] +=  1
    return (g*aggregated_data - len(data))/(p*g - 1)

for epsilon in epsilon_range:
    current_time = start_time

    util = []
    for t in range(episode):
        data = df.loc[df['Date']==current_time]
        true_freq = np.zeros(d)
        for row in data.itertuples():
            true_freq[row.inc_id] += 1
        private_freq = OLH(data['inc_id'].to_numpy(), 2.8878509998321533)
        current_time += interval
        # skip empty timestamp
        if np.all(true_freq == 0):
            continue
        util.append(RMSE(true_freq, private_freq))
    print(np.mean(util))

25.1315464937389


KeyboardInterrupt: 

In [149]:
true_freq

array([145., 145., 117., 122.,  96., 106., 104.,  90.,  83.,  99.,  78.,
        61.,  65.,  57.,  67.,  51.,  54.,  46.,  47.,  49.,  49.,  36.,
        36.,  29.,  34.,  32.,  31.,  37.,  33.,  20.,  27.,  21.,  24.,
        20.,  25.,  22.,  17.,  23.,  24.,  18.,  13.,  16.,  17.,  12.,
        12.,  21.,  15.,  18.,   8.,  18.,  14.,  13.,   7.,  17.,   8.,
         7.,  16.,  13.,  15.,  14.,   5.,  12.,  11.,  12.,   7.,   9.,
         3.,   5.,   4.,   6.,   7.,   8.,   8.,   4.,   8.,   7.,   5.,
        10.,   6.,   3.,   4.,   3.,   4.,   7.,   6.,   6.,   6.,   3.,
         7.,   2.,   6.,   4.,   6.,   4.,   5.,   5.,   4.,   3.,   1.,
       382.])

In [306]:
window[1]

array([0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.1, 0. , 9.9, 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])