In [1]:
import pandas as pd

# Read the report table with explicit column types; `date_time` is taken in as
# text and then converted to timestamps through `parse_dates`.
df = pd.read_csv(
    'xy_day.csv',
    dtype=dict(taxi_id=int, location_id=int, date_time=object, x=float, y=float),
    parse_dates=['date_time'],
    infer_datetime_format=True,
)
# Read the location table; its `x` and `y` columns are used further down as the
# candidate coordinates of the randomizer.
loc = pd.read_csv('xy_loc.csv')
def mean_absolute_error(freq, est_freq):
    """Return the mean of the element-wise absolute difference.

    Args:
        freq: array of reference values.
        est_freq: array of estimated values, broadcast-compatible with `freq`.

    Returns:
        The average of ``|freq - est_freq|`` over all elements.
    """
    deviation = abs(freq - est_freq)
    return deviation.mean()
def mean_relative_error(freq, est_freq):
    """Return the mean relative error between two normalized frequency vectors.

    Each input is rescaled to sum to 1. The element-wise absolute difference is
    then divided by the normalized reference frequency (plus float32 machine
    epsilon, so zero entries do not divide by zero) and averaged.

    Args:
        freq: array of reference counts/frequencies.
        est_freq: array of estimated counts/frequencies, same shape as `freq`.

    Returns:
        The mean relative error over all elements.

    The inputs are not modified. If either input sums to zero the result is
    NaN/inf, because the normalization divides by that sum.
    """
    # Normalize out of place: an in-place `/=` would overwrite the caller's
    # arrays (corrupting any metric computed on them afterwards) and raises a
    # casting error for integer-dtype arrays.
    freq_norm = freq / freq.sum()
    est_norm = est_freq / est_freq.sum()
    return (abs(freq_norm - est_norm) / (freq_norm + np.finfo(np.float32).eps)).mean()
def jaccard_index(freq, est_freq, k):
    """Return the Jaccard index of the top-k index sets of two arrays.

    Args:
        freq: array of reference values.
        est_freq: array of estimated values.
        k: number of largest entries to select from each array; it is also
            passed to ``np.argpartition`` as the partition position.

    Returns:
        ``|A & B| / |A | B|`` where A and B are the index sets of the k
        largest entries of `freq` and `est_freq` respectively.
    """
    # Partitioning the negated values places the k largest entries first.
    true_leaders, est_leaders = (
        np.argpartition(-values, k)[:k] for values in (freq, est_freq)
    )
    shared = np.intersect1d(true_leaders, est_leaders)
    combined = np.union1d(true_leaders, est_leaders)
    return len(shared) / len(combined)
def MSE(freq, est_freq):
    """Return the mean squared error between two arrays.

    Args:
        freq: array of reference values.
        est_freq: array of estimated values, broadcast-compatible with `freq`.

    Returns:
        The average of ``(freq - est_freq) ** 2`` over all elements.
    """
    residual = freq - est_freq
    squared = np.square(residual)
    return squared.mean()

import numpy as np
from math import exp

from tqdm import trange

# Length of the sliding budget window, counted in timestamps.
w = 30
# Default privacy budget; the sweep loop further down rebinds this name.
epsilon = 1

# Number of rows in the per-taxi tables (they are indexed by taxi_id - 1).
n = 10357
# Number of timestamps replayed in one run.
episode = 1440
# Number of candidate locations, i.e. the length of each frequency vector.
d = 900
# Indices 0..d-1 of the candidate locations the randomizer samples from.
universe = np.arange(d)

In [3]:
# Privacy-budget sweep: for every epsilon, replay `episode` one-minute
# timestamps, perturb each eligible report with a distance-based exponential
# mechanism, and print the mean per-timestamp MSE between the true and the
# privatized location histograms.
# NOTE(review): no random seed is set in the visible cells, so the printed
# values vary slightly from run to run.

# Loop-invariant inputs, computed once instead of once per row / per epsilon.
loc_x = loc['x'].to_numpy()
loc_y = loc['y'].to_numpy()
interval = pd.Timedelta('1min')
start_time = pd.Timestamp('2008-02-02 13:30:00')
# The reports seen at a given timestamp do not depend on epsilon, so the frame
# is sliced once per timestamp and the slices are reused for every budget.
reports_by_step = [
    df.loc[df['date_time'] == start_time + step * interval][['location_id', 'taxi_id', 'x', 'y']]
    for step in range(episode)
]

for epsilon in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    # window[i][t] holds the budget spent by taxi (i + 1) at timestamp index t
    window = np.zeros((n, episode), dtype=float)
    # current window pointer (index of the timestamp being processed)
    cwp = 0
    # historical data: every privatized location released so far, per taxi
    storage = [[] for _ in range(n)]

    util = []
    for t in trange(episode):
        data = reports_by_step[t]
        private_freq = np.zeros(d)
        true_freq = np.zeros(d)
        for row in data.itertuples():
            taxi = row.taxi_id - 1
            true_freq[row.location_id] += 1
            # randomization: a release costs the full epsilon, so it is only
            # allowed when nothing was spent in the previous w-1 timestamps
            release_budget = epsilon
            remain = epsilon - window[taxi][max(0, cwp-w+1):cwp].sum()
            if remain < release_budget:
                release_budget = 0
            if release_budget:
                # Exponential mechanism over all candidate locations, scored
                # by Euclidean distance to the true coordinates.
                exp_prob = np.sqrt(np.square(row.x - loc_x) + np.square(row.y - loc_y))
                exp_prob = np.exp(-release_budget*exp_prob/2)
                exp_prob /= exp_prob.sum()
                private_loc = np.random.choice(universe, p=exp_prob)
                # update window
                window[taxi][cwp] = release_budget
                storage[taxi].append(private_loc)
            else:
                # No budget left in the window: re-publish the last release.
                private_loc = storage[taxi][-1]
            private_freq[private_loc] += 1
        cwp += 1
        # skip empty timestamp
        if np.all(true_freq == 0):
            continue
        # util.append(mean_relative_error(true_freq, private_freq))
        # util.append(jaccard_index(true_freq, private_freq, 50))
        util.append(MSE(true_freq, private_freq))
    print(np.mean(util))

100%|██████████| 1440/1440 [00:31<00:00, 45.87it/s]


1.761446009389671


100%|██████████| 1440/1440 [00:30<00:00, 46.99it/s]


1.7451721439749608


100%|██████████| 1440/1440 [00:30<00:00, 47.00it/s]


1.7406823161189358


100%|██████████| 1440/1440 [00:30<00:00, 47.13it/s]


1.7377261345852897


100%|██████████| 1440/1440 [00:30<00:00, 47.15it/s]


1.7367949921752739


100%|██████████| 1440/1440 [00:30<00:00, 47.53it/s]


1.7356306729264477


100%|██████████| 1440/1440 [00:30<00:00, 47.68it/s]


1.7344225352112677


100%|██████████| 1440/1440 [00:30<00:00, 47.19it/s]


1.734793427230047


100%|██████████| 1440/1440 [00:30<00:00, 47.05it/s]


1.7341455399061032


100%|██████████| 1440/1440 [00:30<00:00, 47.03it/s]

1.734039123630673



