In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# Load the taxi trace: `date_time` is parsed into timestamps and both id
# columns are read as integers.
df = pd.read_csv(
    'taxi_day.csv',
    parse_dates=['date_time'],
    infer_datetime_format=True,
    dtype=dict(taxi_id=int, date_time=object, location_id=int),
)
# Preview the last five rows as the cell output.
df.tail(5)


Unnamed: 0,date_time,taxi_id,location_id
1660683,2008-02-03 13:29:00,10322,560
1660684,2008-02-03 13:29:00,10333,666
1660685,2008-02-03 13:29:00,10336,274
1660686,2008-02-03 13:29:00,10341,387
1660687,2008-02-03 13:29:00,10344,366


In [2]:
from geopy.distance import geodesic

# Regular grid over the study area: a (latitude_min, longitude_min) corner
# plus latitude_num x longitude_num cells of the given step sizes.
latitude_min = 39.81
longitude_min = 116.245
latitude_step = 0.0075 # cell height, in the same unit as latitude_min
longitude_step = 0.0100 # cell width, in the same unit as longitude_min
latitude_num = 30
longitude_num = 30

latitude_max = latitude_min + latitude_num*latitude_step
longitude_max = longitude_min + longitude_num*longitude_step

# d is the number of grid cells. Cell i lies in row i // longitude_num and
# column i % longitude_num; coordinate[i] is the (latitude, longitude) of its centre.
d = latitude_num*longitude_num
coordinate = np.zeros((d, 2))
for i in range(d):
    # The parentheses around the modulo matter: `longitude_step*i%longitude_num`
    # parses as `(longitude_step*i) % longitude_num`, which makes the longitude
    # grow with the flat index instead of with the column number.
    coordinate[i] = (
        latitude_min + latitude_step*(i // longitude_num) + latitude_step/2,
        longitude_min + longitude_step*(i % longitude_num) + longitude_step/2,
    )
# Pairwise geodesic distance between cell centres, in metres (`.m`).
# The distance is symmetric and zero from a point to itself, so only the
# upper triangle is solved and mirrored; the diagonal keeps its initial 0.
dist_mat = np.zeros((d, d))
for i in range(d):
    for j in range(i + 1, d):
        dist_mat[i][j] = dist_mat[j][i] = geodesic(coordinate[i], coordinate[j]).m

# Budget Allocation

In [None]:
from tqdm import trange

# Experiment settings, read as module-level globals by the functions below:
#   w         - length of the Window buffer; also divides alpha into per-step shares
#   alpha     - overall budget that the allocation strategies split up
#   exp_round - number of timestamps each experiment iterates over
w, alpha, exp_round = 5, 1, 100

class Window():
    """Circular buffer holding the budgets recorded for the last `w` updates.

    Attributes:
        w: number of slots in the buffer.
        alpha: total budget allowed across the buffer.
        window: array of length `w` with the recorded budgets (initially zeros).
        cwp: current window pointer, i.e. the slot the next update overwrites.
    """

    def __init__(self, w, alpha):
        self.w, self.alpha = w, alpha
        self.window = np.zeros(w)
        self.cwp = 0 # current window pointer

    def update(self, budget):
        """Record `budget` in the current slot and advance the pointer, wrapping at `w`."""
        slot = self.cwp
        self.window[slot] = budget
        self.cwp = (slot + 1) % self.w

    def remain(self):
        """Return the budget still available for the slot about to be overwritten.

        The value in the current slot is added back because the next update
        replaces it.
        """
        recorded = self.window.sum()
        expiring = self.window[self.cwp]
        return self.alpha - recorded + expiring

def utility(freq, est_freq, k=100, metric='relative_error'):
    """Score an estimated histogram against the true one.

    Neither input array is modified. Callers keep reusing their arrays
    (e.g. a previously released estimate) after scoring, so normalizing them
    in place would silently change what later rounds compare against.

    Args:
        freq: true per-cell counts (1-D array-like).
        est_freq: estimated per-cell counts, same length as `freq`.
        k: number of top cells compared when `metric='jaccard'`; must be
            smaller than the number of cells.
        metric: 'relative_error' (default) or 'jaccard'.

    Returns:
        For 'relative_error': the mean over cells of
        |p - q| / (p + eps), where p and q are `freq` and `est_freq` each
        divided by its own sum and eps is the float32 machine epsilon.
        A histogram whose sum is zero is left unnormalized instead of
        producing NaN.
        For 'jaccard': |A & B| / |A | B| for the index sets A, B of the `k`
        largest entries of `freq` and `est_freq`.

    Raises:
        ValueError: if `metric` is not one of the supported names.
    """
    true_hist = np.asarray(freq, dtype=float)
    est_hist = np.asarray(est_freq, dtype=float)

    if metric == 'jaccard':
        # argpartition on the negated counts puts the k largest entries first.
        true_top_k = np.argpartition(-true_hist, k)[:k]
        est_top_k = np.argpartition(-est_hist, k)[:k]
        shared = len(np.intersect1d(true_top_k, est_top_k))
        return shared/len(np.union1d(true_top_k, est_top_k))
    if metric != 'relative_error':
        raise ValueError("metric must be 'relative_error' or 'jaccard'")

    # Out-of-place division: builds normalized copies and leaves the inputs alone.
    true_total = true_hist.sum()
    est_total = est_hist.sum()
    true_dist = true_hist/true_total if true_total != 0 else true_hist
    est_dist = est_hist/est_total if est_total != 0 else est_hist
    relative_error = (abs(true_dist - est_dist)/(true_dist + np.finfo(np.float32).eps)).mean()
    return relative_error

def EM(data, alpha):
    """Randomly re-map each reported cell and return the perturbed histogram.

    For a true cell `c`, a reported cell `r` is drawn with probability
    proportional to exp(-alpha * dist_mat[c][r] / 2); the function name
    suggests this is meant as the exponential mechanism. Reads the
    module-level `d` (number of cells) and `dist_mat` (d x d distances).

    Args:
        data: iterable of integer cell indices, one per record.
        alpha: budget scaling the exponent; 0 makes every cell equally likely.

    Returns:
        Length-`d` float array counting how often each cell was reported.
    """
    # Unnormalized weights, then scale every row so it sums to one.
    weights = np.exp(-alpha*dist_mat/2)
    for row in weights:
        row /= row.sum()  # `row` is a view, so this updates `weights` in place

    candidates = np.arange(d)
    reported_counts = np.zeros(d)
    for true_cell in data:
        reported_cell = np.random.choice(candidates, p=weights[true_cell])
        reported_counts[reported_cell] += 1
    return reported_counts

## Baseline

In [None]:
def uniform(data_stream):
    """Baseline: perturb every timestamp with an equal `alpha/w` share of the budget.

    Reads the module-level `d`, `w`, `alpha` and `exp_round`.

    Args:
        data_stream: DataFrame with a `date_time` column and an integer
            `location_id` column.

    Returns:
        Mean of `utility(true histogram, perturbed histogram)` over
        `exp_round` consecutive one-minute timestamps.
    """
    start_time = pd.Timestamp('2008-02-02 13:30:00')
    one_step = pd.Timedelta('1min')
    scores = []
    for step in trange(exp_round):
        # Round `step` looks at the (step + 1)-th minute after the start time.
        now = start_time + (step + 1)*one_step
        at_now = data_stream['date_time'] == now
        locations = data_stream.loc[at_now, 'location_id'].values
        # True histogram: add one per record, repeated indices included.
        true_counts = np.zeros(d)
        np.add.at(true_counts, locations, 1)
        noisy_counts = EM(locations, alpha/w)
        scores.append(utility(true_counts, noisy_counts))
    return np.mean(scores)

def sample(data_stream):
    """Baseline: spend the whole budget `alpha` once every `w` timestamps.

    The perturbed histogram is refreshed on rounds 0, w, 2w, ... and the
    rounds in between are scored against that same, reused release.
    Reads the module-level `d`, `w`, `alpha` and `exp_round`.

    Args:
        data_stream: DataFrame with a `date_time` column and an integer
            `location_id` column.

    Returns:
        Mean of `utility(true histogram, current release)` over
        `exp_round` consecutive one-minute timestamps.
    """
    start_time = pd.Timestamp('2008-02-02 13:30:00')
    one_step = pd.Timedelta('1min')
    scores = []
    for step in trange(exp_round):
        # Round `step` looks at the (step + 1)-th minute after the start time.
        now = start_time + (step + 1)*one_step
        at_now = data_stream['date_time'] == now
        locations = data_stream.loc[at_now, 'location_id'].values
        # True histogram: add one per record, repeated indices included.
        true_counts = np.zeros(d)
        np.add.at(true_counts, locations, 1)
        if step % w == 0:
            release = EM(locations, alpha)
        scores.append(utility(true_counts, release))
    return np.mean(scores)

## Centered method

In [None]:
def serial_composition(t, data_stream, l=0.5):
    """Centered method: publish a fresh histogram only when the data has drifted.

    The budget `alpha` is split into a similarity part (`l*alpha`, spent as
    `l*alpha/w` at every timestamp) and a publication part (`(1 - l)*alpha`,
    spent in chunks of `2*(1 - l)*alpha/w`). At each timestamp a cheap
    similarity estimate is compared with the last release; if the mean
    absolute difference exceeds `t` and the sliding window still has room for
    one publication chunk, a new release is produced, otherwise the last
    release is blended with the similarity estimate.

    Reads the module-level `d`, `w`, `alpha` and `exp_round`.

    Args:
        t: threshold on the mean absolute difference that triggers an update.
        data_stream: DataFrame with a `date_time` column and an integer
            `location_id` column.
        l: fraction of `alpha` reserved for the similarity estimates.

    Returns:
        Mean of `utility(true histogram, current release)` over `exp_round`
        timestamps (the initial one plus `exp_round - 1` loop rounds).
    """
    current_time = pd.Timestamp('2008-02-02 13:30:00')
    interval = pd.Timedelta('1min')

    sim_alpha = l*alpha/w
    col_alpha = (1 - l)*alpha
    publish_alpha = 2*col_alpha/w
    window = Window(w, col_alpha)

    def _snapshot(timestamp):
        # Records observed at `timestamp` and their true per-cell histogram.
        data = data_stream.loc[data_stream['date_time']==timestamp]['location_id'].values
        freq = np.zeros(d)
        for loc in data:
            freq[loc] += 1
        return data, freq

    # The first timestamp is always published.
    data, freq = _snapshot(current_time)
    last_freq = EM(data, publish_alpha)
    window.update(publish_alpha)
    # utility() gets a copy so that scoring can never rescale the release that
    # the next round's distance and blending are computed from.
    util = [utility(freq, last_freq.copy())]
    for i in trange(1, exp_round):
        current_time += interval
        data, freq = _snapshot(current_time)
        sim_freq = EM(data, sim_alpha)
        distance = abs(sim_freq - last_freq).mean()
        spent = 0
        if distance > t:
            # Tolerant comparison: remain() is a float sum/difference, so an
            # exact floor division can round a full chunk down to zero.
            if window.remain() >= publish_alpha*(1 - 1e-9):
                last_freq = l*sim_freq + (1 - l)*EM(data, publish_alpha)
                spent = publish_alpha
            else:
                # NOTE(review): `distance` is not bounded by 1, so these blend
                # weights can fall outside [0, 1] - confirm this is intended.
                last_freq = (1 - distance)*last_freq + distance*sim_freq
        # Slide the window on every timestamp, recording 0 when nothing was
        # published. Otherwise old publications never leave the window and,
        # once the budget is used up, no release can ever happen again.
        window.update(spent)
        util.append(utility(freq, last_freq.copy()))
    return np.mean(util)

## Distributed method