In [4]:
import pandas as pd
import numpy as np
import math
from operator import itemgetter
from tqdm import tqdm_notebook, tqdm
tqdm.pandas()
np.random.seed(2)

In [6]:
csv_location = "./final_data.csv"
df = pd.read_csv(csv_location)
df['TIMESTAMP'] = pd.to_datetime(df.TIMESTAMP, unit='s')

In [7]:
df[:10]

Unnamed: 0,TAXI_ID,TIMESTAMP,CAMERA_ID
0,0,2013-08-13 20:00:18,0
1,0,2013-08-13 20:00:33,0
2,0,2013-08-13 20:03:03,1
3,0,2013-08-13 20:03:18,1
4,0,2013-08-13 20:03:33,2
5,0,2013-08-13 20:03:48,2
6,0,2013-08-13 20:04:03,3
7,0,2013-08-13 20:04:18,3
8,0,2013-08-13 20:04:33,3
9,0,2013-08-13 20:04:48,3


In [8]:
HEAD_LIMIT = 10 # for quick testing... -1 to look at all data.. 10 to look at first 10 rows
chunk_size = 15 # only seconds!
max_row = 3

# based on 99th percentile! these are in terms of seconds
rhos = {0: 135, 1: 90, 2: 90, 3: 240, 4: 210, 5: 120, 6: 150, 7: 180, 8: 300, 9: 90, 10: 45, 11: 255, 12: 150, 13: 180, 14: 165, 15: 150, 16: 165, 17: 210, 18: 165, 19: 210, 20: 135, 21: 150, 22: 210, 23: 120, 24: 90, 25: 180, 26: 135, 27: 195, 28: 195, 29: 300, 30: 155, 31: 135, 32: 75, 33: 120, 34: 120, 35: 90, 36: 135, 37: 75, 38: 90, 39: 150, 40: 60, 41: 90, 42: 210, 43: 210, 44: 195, 45: 165, 46: 60, 47: 105, 48: 90, 49: 135, 50: 75, 51: 135, 52: 150, 53: 180, 54: 195, 55: 210, 56: 90, 57: 90, 58: 15, 59: 15, 60: 45, 61: 120, 62: 225, 63: 195, 64: 105, 65: 210, 66: 135, 67: 195, 68: 315, 69: 135, 70: 45, 71: 323, 72: 315, 73: 450, 74: 210, 75: 225, 76: 135, 77: 180, 78: 120, 79: 255, 80: 315, 81: 105, 82: 240, 83: 525, 84: 165, 85: 276, 86: 210, 87: 210, 88: 150, 89: 483, 90: 135, 91: 105, 92: 135, 93: 180, 94: 225, 95: 210, 96: 15, 97: 180, 98: 212, 99: 247, 100: 210, 101: 97, 102: 135, 103: 285, 104: 138}

max_num_unique_taxis = 300
num_days = None # set later in the code

In [10]:
def get_table_for_camera(CAMERA_ID, privid=True):
    if not privid:
        cam_df = df[df['CAMERA_ID'] == CAMERA_ID][["TAXI_ID", "TIMESTAMP"]]
        temp_cam_df = cam_df.sort_values("TIMESTAMP")[:HEAD_LIMIT] 
        return temp_cam_df.copy()
    cam_df = df[df['CAMERA_ID'] == CAMERA_ID][["TAXI_ID", "TIMESTAMP"]]
    temp_cam_df = cam_df.sort_values("TIMESTAMP")[:HEAD_LIMIT] 
    freq = str(chunk_size) + "s"
    f = temp_cam_df.groupby(pd.Grouper(key="TIMESTAMP", freq=freq))['TAXI_ID'].progress_apply(lambda e : list(np.unique(e))[:max_row])
    g = f.explode()
    y = g[g.notnull()]
    y = y.reset_index()
    return y.copy()

def get_durs(y, upper, privid=True):
    y = y.drop_duplicates(["TIMESTAMP", "TAXI_ID"])
    z = y.groupby([y['TIMESTAMP'].dt.date, "TAXI_ID"])["TIMESTAMP"].agg([np.min, np.max])
    z['dur'] = z['amax'] - z['amin']
    z['dur'] = z['dur'].apply(lambda x : x.total_seconds())
    n = z.reset_index()
    if privid:
        avg_per_day = (n.groupby("TIMESTAMP")["dur"].sum()/max_num_unique_taxis).clip(upper=upper*3600)
        avg_avg = avg_per_day.sum()/num_days/3600
    else:
        avg_per_day = n.groupby("TIMESTAMP")["dur"].mean()
        avg_avg = avg_per_day.mean()/3600
    return avg_avg

def get_taxi_counts_per_day(y, privid=True):
    temp = y.groupby('date')["TAXI_ID"].nunique()
    if privid:
        avg = np.sum(temp)/num_days
    else:
        avg = np.mean(temp)
    return avg

def get_per_table_sensitivity(rho):
    return (math.ceil(rho/chunk_size)+1) * max_row

def prep_table_for_taxi_counts(h):
    h['date'] = h["TIMESTAMP"].dt.date
    h = h.drop_duplicates(["date", "TAXI_ID"])
    h = h.drop(columns=["TIMESTAMP"])
    return h
    
def get_union_table_sensitivity(s1, s2):
    return s1 + s2

def get_intersection_table_sensitivity(s1, s2):
    return s1 + s2

# def get_noise_scale(tot_sensitivity, upper, denom):
#     return (tot_sensitivity * upper)/denom

def get_acc_and_std(truth, privid_prenoise, noise_scale, num_samples=1000):
    samples = []
    for i in range(num_samples):
        noise = np.random.laplace(loc=0, scale=noise_scale, size=1)[0]
        samples.append((1 - abs(privid_prenoise+noise-truth) / truth) * 100)
    print("%.2f%% +/- %.2f%%" % (np.mean(samples), np.std(samples)))

In [20]:
cam27table = get_table_for_camera(27)
cam10table = get_table_for_camera(10)
gt_cam27table = get_table_for_camera(27, privid=False)
gt_cam10table = get_table_for_camera(10, privid=False)

# union em!
merged = pd.concat([cam27table, cam10table])
gt_merged = pd.concat([gt_cam27table, gt_cam10table])

num_days = (merged["TIMESTAMP"].dt.date.max()-merged["TIMESTAMP"].dt.date.min()).days + 1

100%|██████████| 429/429 [00:00<00:00, 10808.63it/s]
100%|██████████| 1545/1545 [00:00<00:00, 10096.32it/s]


In [12]:
cam27table

Unnamed: 0,TIMESTAMP,TAXI_ID
0,2013-07-01 00:14:15,257
1,2013-07-01 00:14:30,257
2,2013-07-01 00:28:00,257
3,2013-07-01 00:28:30,257
4,2013-07-01 00:28:45,257
5,2013-07-01 00:29:00,257
6,2013-07-01 00:29:15,257
7,2013-07-01 02:00:45,158
8,2013-07-01 02:01:00,158
9,2013-07-01 02:01:15,158


In [29]:
union_rho_sum

54

In [13]:
# Avg Working Hours of Taxi... Via Union of 2 Cameras

upper = 16

avg_duration = get_durs(merged, upper)
avg_duration_gt = get_durs(merged, None, privid=False)
rho1 = get_per_table_sensitivity(rhos[27])
rho2 = get_per_table_sensitivity(rhos[10])
union_rho_sum = get_union_table_sensitivity(rho1, rho2)
noise_scale = (union_rho_sum * upper)/(max_num_unique_taxis * num_days)
noise = round(np.random.laplace(loc=0, scale=noise_scale, size=1)[0], 5) # todo: set seed

  avg_avg = avg_per_day.sum()/num_days/3600


ZeroDivisionError: division by zero

In [119]:
avg_duration_gt, avg_duration, noise, round(noise_scale, 5), num_days

(5.862617862371253, 6.206143734737484, -0.00108, 0.00791, 364)

In [120]:
# min(avg_duration+noise, avg_duration_gt)/max(avg_duration+noise, avg_duration_gt)
get_acc_and_std(avg_duration_gt, avg_duration, noise_scale)
# 94.14% +/- 0.18%

94.14% +/- 0.18%


In [123]:
# Avg Num Taxis Present in Both Cameras Each

table1 = prep_table_for_taxi_counts(cam27table.copy())
table2 = prep_table_for_taxi_counts(cam10table.copy())

gt_table1 = prep_table_for_taxi_counts(gt_cam27table.copy())
gt_table2 = prep_table_for_taxi_counts(gt_cam10table.copy())

# intersect
y = pd.merge(table1,table2,on=['TAXI_ID', 'date'])
gt_y = pd.merge(gt_table1,gt_table2,on=['TAXI_ID', 'date'])

avg_count = get_taxi_counts_per_day(y)
gt_avg_count = get_taxi_counts_per_day(gt_y, False)

rho1 = get_per_table_sensitivity(rhos[27])
rho2 = get_per_table_sensitivity(rhos[10])
intersect_rho_sum = get_intersection_table_sensitivity(rho1, rho2)
noise_scale = intersect_rho_sum * 1 / num_days
noise = round(np.random.laplace(loc=0, scale=noise_scale, size=1)[0], 5) # todo: set seed

In [124]:
gt_avg_count, avg_count, noise, round(noise_scale, 5), num_days

(131.25753424657535, 131.48901098901098, -0.09335, 0.14835, 364)

In [125]:
# min(avg_count+noise, gt_avg_count)/max(avg_count+noise, gt_avg_count)
get_acc_and_std(gt_avg_count, avg_count, noise_scale)
# 99.80% +/- 0.13%

99.80% +/- 0.13%


In [127]:
# arxmax query
results = dict()
for CAMERA_ID in tqdm(df['CAMERA_ID'].unique()):
    cam_df = df[df['CAMERA_ID'] == CAMERA_ID][["TAXI_ID", "TIMESTAMP"]].copy()
    temp_cam_df = cam_df.sort_values("TIMESTAMP")[:HEAD_LIMIT] 
    temp_cam_df['date'] = temp_cam_df['TIMESTAMP'].dt.date
    temp_cam_df = temp_cam_df.drop(columns=['TIMESTAMP'])
    temp_cam_df = temp_cam_df.drop_duplicates(['TAXI_ID', 'date'])
    avg = np.mean(np.array(temp_cam_df.groupby(["date"]).nunique()['TAXI_ID']))
    results[CAMERA_ID] = int(avg)
chunk_size=15
final = dict()
for k, v in gt.items():
    ns = (math.ceil(rhos[k]/chunk_size) + 1)*max_row/365
    final[k] = int(v +  np.random.laplace(loc=0, scale=ns, size=1)[0])
print(sorted(final.items(), key=itemgetter(1))[::-1][:5])
print(sorted(gt.items(), key=itemgetter(1))[::-1][:5])

100%|██████████| 105/105 [00:16<00:00,  6.24it/s]


[(0, 254), (20, 253), (27, 239), (11, 215), (6, 212)]
[(20, 254), (0, 254), (27, 239), (11, 216), (6, 213)]


In [128]:
(df['TIMESTAMP'].max() - df['TIMESTAMP'].min())

Timedelta('364 days 23:57:57')

In [30]:
(math.ceil(rhos[k]/chunk_size) + 1)*max_row/365

NameError: name 'k' is not defined