In [1]:
import pandas as pd
import os

# --- Test Settings ---
# Number of groups. Presumably has to be odd (e.g. so a majority always
# exists) -- TODO confirm against the consumer of this setting.
GROUP_NUM = 3
# Parity must be tested with `%`: the previous `GROUP_NUM // 2 == 1` is a floor
# division and, combined with `>= 3`, accepted the single value 3 only.
assert GROUP_NUM >= 3 and GROUP_NUM % 2 == 1, "GROUP_NUM must be an odd integer >= 3"
# Upper bound on how many clients (the ones with most records) are kept; also
# embedded in the output directory name below.
CLIENTS_NUM_MAX_LIMIT = 100
# NOTE(review): not read by any cell visible in this notebook section.
CLIENTS_RANDOM_SELECT = False
ERROR_RATE = 0.0 # truth error rate: probability that a claim is reverted
assert 0 <= ERROR_RATE <= 1, "ERROR_RATE must be a probability in [0, 1]"
# Written verbatim to the `delay_seconds` column of every exported event file.
INTERVAL = 0
assert INTERVAL >= 0, "INTERVAL must be non-negative"

# --- Path Settings ---
DIR_PREFIX = "./data/SF/"
CENTERS_LABELED_FILENAME = "centers_100_labeled.csv"
WAITING_POSITIONS_FILENAME = "waiting_positions.csv"
# Alternative naming scheme keyed by the error rate instead of the client cap:
# EVENT_DIR_NAME = "event_SF_{0}".format(str(ERROR_RATE)[2:4])
EVENT_DIR_NAME = f"event_SF_{CLIENTS_NUM_MAX_LIMIT}"

In [2]:
# Read the labeled cluster centers from the data directory and preview them.
centers_csv_path = os.path.join(DIR_PREFIX, CENTERS_LABELED_FILENAME)
centers = pd.read_csv(centers_csv_path)
centers.head()

Unnamed: 0,cluster-id,latitude,longitude,latitude-std,longitude-std,report-num,range,traffic-light,item
0,3,37.779615,-122.427175,0.000528,0.000373,53,55.209462,1,
1,6,37.792272,-122.443238,7.1e-05,8.8e-05,47,9.621562,0,
2,7,37.738951,-122.414804,7e-05,0.000291,48,25.542693,0,
3,17,37.72427,-122.40203,0.00032,0.000264,39,35.40501,0,加油站
4,20,37.771823,-122.404737,0.00049,0.000528,45,61.505712,0,窄


In [3]:
# Ground-truth lookup: the "traffic-light" value of each center, indexed by
# its "cluster-id". Copied so later edits never touch `centers`.
groundtruth = centers.set_index("cluster-id")["traffic-light"].copy()

In [4]:
# Read the raw waiting-position records from the data directory and preview them.
waiting_positions_csv_path = os.path.join(DIR_PREFIX, WAITING_POSITIONS_FILENAME)
waiting_positions = pd.read_csv(waiting_positions_csv_path)
waiting_positions.head()

Unnamed: 0,time,latitude,longitude,id,cluster-id
0,1211020534,37.77764,-122.43995,abmuyawm,0
1,1211020595,37.77763,-122.43996,abmuyawm,0
2,1211021066,37.77922,-122.43444,abmuyawm,1
3,1211022336,37.77364,-122.43808,abmuyawm,0
4,1211022569,37.77393,-122.43041,abmuyawm,0


In [5]:
# Step 1 -- keep only records whose cluster is one of the labeled centers.
# SQL equivalent:
#   select * from waiting_positions
#   where waiting_positions.cluster-id in set(centers.cluster-id)
# Boolean masks are used instead of rebuilding the frame row by row with
# iterrows() + concat: they keep the original column dtypes (the row-wise
# rebuild turned every column into `object`), avoid a per-row Python loop, and
# do not raise when no row matches.
cluster_ids = set(centers["cluster-id"].to_list())
waiting_positions = waiting_positions[waiting_positions["cluster-id"].isin(cluster_ids)].copy()

# Step 2 -- keep only the CLIENTS_NUM_MAX_LIMIT clients with the most records.
ids = waiting_positions.groupby("id")["time"].count().sort_values(ascending=False)[:CLIENTS_NUM_MAX_LIMIT].index.to_list()
ids = set(ids)
# `id_filter` stays in the frame because downstream cells display it.
waiting_positions["id_filter"] = waiting_positions["id"].isin(ids)
waiting_positions = waiting_positions[waiting_positions["id_filter"]].copy()

# Step 3 -- keep only clusters that still have at least one record with a
# non-null time after the client filter.
times = waiting_positions.groupby("cluster-id")["time"].count()
qualified_cluster_ids = times[times >= 1].index.to_list()
waiting_positions = waiting_positions[waiting_positions["cluster-id"].isin(qualified_cluster_ids)].copy()
waiting_positions = waiting_positions.sort_values("cluster-id")

# Number of distinct clients and clusters that survived the filters.
# (describe().loc["unique"] only exists for object columns, so count directly.)
waiting_positions[["id", "cluster-id"]].nunique()

id             99
cluster-id    100
Name: unique, dtype: object

In [6]:
# Expose each record's cluster id under the name "identifier", the column
# name used when the event files are exported.
waiting_positions = waiting_positions.assign(identifier=waiting_positions["cluster-id"])
waiting_positions.head()

Unnamed: 0,time,latitude,longitude,id,cluster-id,id_filter,identifier
7,1211121763,37.78016,-122.42733,abmuyawm,3,True,3
54579,1212207204,37.78015,-122.42755,oljnek,3,True,3
32248,1211506432,37.7791,-122.42679,iacbyb,3,True,3
32249,1211506491,37.77909,-122.42681,iacbyb,3,True,3
32250,1211506668,37.77911,-122.42673,iacbyb,3,True,3


In [7]:
import random

def revert(claim):
    """Return the opposite of ``claim``.

    A claim equal to 0 or 1 is flipped arithmetically (0 -> 1, 1 -> 0); any
    other value is negated with ``not`` and therefore yields a bool.
    """
    is_binary = claim == 1 or claim == 0
    return (claim + 1) % 2 if is_binary else not claim

def bool_to_string(value):
    """Render the truthiness of ``value`` as the lowercase text "true"/"false"."""
    if value:
        return "true"
    return "false"


# Prepare an empty output directory for the per-client event files.
events_path = os.path.join(DIR_PREFIX, EVENT_DIR_NAME)
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(events_path, exist_ok=True)
# Remove files left over from a previous run. Only regular files directly
# inside events_path are deleted: the earlier os.walk-based loop also visited
# sub-directories but joined every file name to events_path itself, so a
# nested file produced a non-existent path and os.remove raised.
for stale_name in os.listdir(events_path):
    stale_path = os.path.join(events_path, stale_name)
    if os.path.isfile(stale_path):
        os.remove(stale_path)

# Write one CSV per client. Each file has one row per cluster identifier with:
#   claim         - "true"/"false": the ground-truth value, reverted with
#                   probability ERROR_RATE
#   owned         - "true" if this client has at least one record in the cluster
#   delay_seconds - the constant INTERVAL
#
# The noisy claim is drawn exactly once per (client, cluster). Drawing it per
# waiting-position row and OR-reducing afterwards (the previous behaviour)
# makes the effective error rate depend on how many rows a cluster has and on
# the truth value, instead of being ERROR_RATE. With ERROR_RATE == 0 the
# output is unchanged.
# NOTE: `random` is not seeded in the visible cells, so runs with
# ERROR_RATE > 0 are not reproducible.
ids = waiting_positions["id"].drop_duplicates().to_list()
ownership = waiting_positions[["id", "identifier"]]
output_dir = os.path.join(DIR_PREFIX, EVENT_DIR_NAME)

for client_id in ids:
    # True for every identifier in which this client has at least one record;
    # groupby sorts the identifiers and names the index "identifier".
    owned = (ownership["id"] == client_id).groupby(ownership["identifier"]).any()

    claims = []
    for identifier in owned.index:
        truth = groundtruth[identifier]
        claim = truth if random.random() >= ERROR_RATE else revert(truth)
        claims.append(bool_to_string(claim))

    events = pd.DataFrame(index=owned.index)
    events["claim"] = claims
    events["owned"] = owned.map(bool_to_string)
    events["delay_seconds"] = INTERVAL
    events.to_csv(os.path.join(output_dir, "{}.csv".format(client_id)))