In [1]:
import pandas as pd
import os

# --- Path Settings ---
# Directory that holds the input CSV files; the error-rate export is written
# into the same directory.
DIR_PREFIX = "./data/SF/"
# Cluster centres with a "traffic-light" label per "cluster-id".
CENTERS_LABELED_FILENAME = "centers_100_labeled.csv"
# Individual waiting-position reports, each tagged with a "cluster-id".
WAITING_POSITIONS_FILENAME = "waiting_positions.csv"
# File name for the per-id error-rate statistics.
ERROR_RATE_STATISTICS_FILENAME = "error_rates.csv"

# --- Test Settings ---
# NOTE(review): GROUP_NUM and CLIENTS_NUM_MAX_LIMIT are not referenced by the
# cells visible here; presumably later cells consume them -- confirm.
GROUP_NUM = 3
CLIENTS_NUM_MAX_LIMIT = 15
# When True, identical (id, traffic-light) rows are collapsed before the
# statistics are computed.
DROP_DUPLICATE_REPORTS = False
# Sanity check: the client limit must exceed twice the number of groups.
assert CLIENTS_NUM_MAX_LIMIT > 2 * GROUP_NUM

In [2]:
# Read the labelled cluster centres, index them by "cluster-id", and keep only
# the "traffic-light" label column as a Series (cluster id -> label).
centers = pd.read_csv(
    os.path.join(DIR_PREFIX, CENTERS_LABELED_FILENAME),
    index_col="cluster-id",
)["traffic-light"]
centers.describe()

count    100.00
mean       0.55
std        0.50
min        0.00
25%        0.00
50%        1.00
75%        1.00
max        1.00
Name: traffic-light, dtype: float64

In [3]:
# Read the raw waiting-position reports; the preview below shows the columns
# time, latitude, longitude, id and cluster-id.
waiting_positions = pd.read_csv(
    os.path.join(DIR_PREFIX, WAITING_POSITIONS_FILENAME)
)
waiting_positions.head()

Unnamed: 0,time,latitude,longitude,id,cluster-id
0,1211020534,37.77764,-122.43995,abmuyawm,0
1,1211020595,37.77763,-122.43996,abmuyawm,0
2,1211021066,37.77922,-122.43444,abmuyawm,1
3,1211022336,37.77364,-122.43808,abmuyawm,0
4,1211022569,37.77393,-122.43041,abmuyawm,0


In [4]:
# Attach each report's cluster label in a single vectorized lookup.
# `centers` is a Series indexed by cluster id, so Series.map returns the
# matching "traffic-light" value per row and NaN for reports whose cluster has
# no entry in `centers`. This replaces a Python loop that rebuilt a full
# boolean mask over the frame once per cluster.
waiting_positions["traffic-light"] = waiting_positions["cluster-id"].map(centers)

# Keep only labelled reports: NaN >= 0 evaluates to False, so rows without a
# label are dropped exactly like the former -1 sentinel rows were.
# .copy() makes the filtered result an independent frame so later cells can
# modify it without pandas warning about writes to a slice.
waiting_positions = waiting_positions[waiting_positions["traffic-light"] >= 0].copy()
waiting_positions.describe()

Unnamed: 0,time,latitude,longitude,cluster-id,traffic-light
count,7169.0,7169.0,7169.0,7169.0,7169.0
mean,1212012000.0,37.761167,-122.435798,282.382062,0.617938
std,588492.2,0.021653,0.02707,286.447924,0.485925
min,1211020000.0,37.7014,-122.49997,3.0,0.0
25%,1211512000.0,37.7485,-122.45592,63.0,0.0
50%,1211999000.0,37.76606,-122.43011,172.0,1.0
75%,1212516000.0,37.77918,-122.41477,402.0,1.0
max,1213086000.0,37.79741,-122.4,1409.0,1.0


In [5]:
# Only the reporting `id` and the cluster label are needed for the per-id
# statistics computed next.
waiting_positions = waiting_positions[["id", "traffic-light"]]
if DROP_DUPLICATE_REPORTS:
    # Rebind instead of using inplace=True: the frame above was derived by
    # indexing, and in-place modification of such a frame can raise
    # SettingWithCopyWarning. The resulting rows are the same.
    waiting_positions = waiting_positions.drop_duplicates()

In [6]:
# Per-id statistics over the labelled reports.
labels_by_id = waiting_positions.groupby("id")["traffic-light"]

# Sum of the labels per id (the number of label-1 reports, assuming the labels
# are 0/1 as the `centers` summary suggests).
true_detections_n = labels_by_id.sum()
# Total number of reports per id.
total_detections_n = labels_by_id.count()
total_detections_n.name = "total-times"

statistics = pd.concat([true_detections_n, total_detections_n], axis=1)
# The column must be named "error-rate" (hyphen): the export cell selects
# columns=["error-rate"], and the other columns use hyphenated names too.
# The previous "error_rate" spelling made that export fail with a KeyError.
statistics["error-rate"] = 1.0 - (statistics["traffic-light"] / statistics["total-times"])
statistics.describe()

Unnamed: 0,traffic-light,total-times,error-rate
count,99.0,99.0,99.0
mean,44.747475,72.414141,0.356746
std,36.691306,48.556723,0.191069
min,2.0,4.0,0.0
25%,26.0,43.5,0.209914
50%,40.0,64.0,0.326087
75%,55.0,88.5,0.467967
max,325.0,333.0,0.913978


In [7]:
# Write the per-id error rate (index plus the "error-rate" column only) next to
# the input data. The file name comes from the configuration constant instead
# of a duplicated string literal, so it is defined in one place.
statistics.to_csv(
    os.path.join(DIR_PREFIX, ERROR_RATE_STATISTICS_FILENAME),
    columns=["error-rate"],
)