# Read Dataset

In [1]:
SF_DIR_PATH = './data/cabspottingdata'
FILE_PREFIX = "new_"

DIR_PREFIX = "./data/SF/"
CENTERS_LABELED_FILENAME = "centers_100_labeled.csv"
WAITING_POSITIONS_FILENAME = "waiting_positions.csv"
ERROR_RATES_FILENAME = 'error_rates_for_each_client.csv'

In [3]:
import pandas as pd
from tqdm.notebook import tqdm
import os

# Bypass Counter

In [3]:
taxi_dataset = dict()

for _root, _dirs, files in os.walk(SF_DIR_PATH):
    for filename in files:
        # pass the none-data file
        if not filename.startswith(FILE_PREFIX):
            continue
        
        # extract data from csv file
        filepath = os.path.join(SF_DIR_PATH, filename)
        taxi_data = pd.read_csv(
            filepath,
            sep=' ',
            index_col=False,
            names=["latitude", "longitude", "time"],
            usecols=[0, 1, 3])
        taxi_data.set_index('time', inplace=True)
        taxi_data.sort_index(inplace=True, ascending=True)
        # taxi_data["time"] = pd.to_datetime(taxi_data["time"], unit="s")
        taxi_id = os.path.splitext(filename)[0][4::]

        taxi_dataset[taxi_id] = taxi_data

print("The number of taxis:", len(taxi_dataset))

The number of taxis: 536


In [4]:
taxi_dataset["abboip"].head()

Unnamed: 0_level_0,latitude,longitude
time,Unnamed: 1_level_1,Unnamed: 2_level_1
1211033530,37.75153,-122.39447
1211033614,37.75149,-122.39447
1211033674,37.75149,-122.39447
1211033735,37.75149,-122.39446
1211035303,37.75144,-122.39449


Check if there exists nan

In [5]:
any([data.isnull().any().any() for data in taxi_dataset.values()])

False

In [6]:
centers = pd.read_csv(os.path.join(DIR_PREFIX, CENTERS_LABELED_FILENAME))
centers.set_index('cluster-id',inplace=True)
centers.head()

Unnamed: 0_level_0,latitude,longitude,latitude-std,longitude-std,report-num,range,traffic-light,item
cluster-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,37.779615,-122.427175,0.000528,0.000373,53,55.209462,1,
6,37.792272,-122.443238,7.1e-05,8.8e-05,47,9.621562,0,
7,37.738951,-122.414804,7e-05,0.000291,48,25.542693,0,
17,37.72427,-122.40203,0.00032,0.000264,39,35.40501,0,加油站
20,37.771823,-122.404737,0.00049,0.000528,45,61.505712,0,窄


In [7]:
cluster_id_set = set(centers.index)
len(cluster_id_set)

100

In [60]:
def in_circle(a, b, b_range):
    return a <= b + b_range and a >= b - b_range

def in_center_range(x_center, y_center, x_center_std, y_center_std, x_point, y_point):
    return in_circle(x_point, x_center, x_center_std) and in_circle(y_point, y_center, y_center_std)

bypass_stat = dict()
_KEYS = ['latitude', 'longitude', 'latitude-std', 'longitude-std']
def into_cluster_info(cluster_info):
    cluster_id = cluster_info[0]
    cluster_info = {k: cluster_info[_KEYS.index(k) + 1] for k in _KEYS}
    return cluster_id, cluster_info


for taxi_name, trajectory in tqdm(taxi_dataset.items(), desc='taxi'):
    bypass_stat[taxi_name] = dict()

    for cluster_info in zip(centers.index, *[centers[k] for k in _KEYS]):
        cluster_id, cluster_info = into_cluster_info(cluster_info)

        bypass_counter = 0
        # counted = False
        x_center, y_center = cluster_info['latitude'], cluster_info['longitude']
        x_center_range, y_center_range = cluster_info['latitude-std'], cluster_info['longitude-std']
        for time, x, y in zip(trajectory.index, trajectory['latitude'], trajectory['longitude']):
            # if counted:
            #     continue

            in_range = in_center_range(x_center, y_center, x_center_range, y_center_range, x, y)
            if in_range:
                bypass_counter += 1
            # if not counted and in_range:
            #     counted = True
            #     bypass_counter += 1
            # elif not in_range:
            #     counted = False
            # else: # counted
            #     pass
        bypass_stat[taxi_name][cluster_id] = bypass_counter

taxi:   0%|          | 0/536 [00:00<?, ?it/s]

In [61]:
max([max([i for i in bypass.values()]) for bypass in bypass_stat.values()])

1047

In [62]:
[ bypass for bypass in bypass_stat.values()]

[{3: 5,
  6: 0,
  7: 0,
  17: 5,
  20: 3,
  21: 2,
  23: 19,
  26: 14,
  30: 8,
  33: 8,
  34: 54,
  36: 27,
  44: 21,
  46: 5,
  52: 7,
  56: 4,
  58: 3,
  59: 11,
  63: 14,
  69: 4,
  74: 0,
  80: 6,
  86: 4,
  87: 1,
  95: 3,
  100: 1,
  101: 4,
  104: 1,
  109: 1,
  120: 1,
  125: 0,
  126: 1,
  130: 16,
  132: 1,
  135: 0,
  137: 6,
  142: 0,
  155: 0,
  157: 10,
  161: 8,
  165: 6,
  169: 0,
  172: 1,
  176: 2,
  181: 1,
  189: 0,
  198: 11,
  217: 6,
  222: 5,
  226: 1,
  234: 3,
  242: 7,
  252: 6,
  259: 1,
  273: 0,
  275: 1,
  276: 0,
  280: 10,
  290: 0,
  309: 3,
  326: 7,
  361: 1,
  368: 0,
  379: 1,
  381: 4,
  384: 8,
  389: 47,
  399: 2,
  402: 1,
  425: 10,
  461: 8,
  467: 0,
  470: 3,
  493: 1,
  502: 3,
  514: 0,
  518: 0,
  519: 2,
  524: 0,
  546: 0,
  557: 0,
  562: 6,
  569: 4,
  576: 0,
  611: 0,
  617: 0,
  618: 0,
  644: 3,
  646: 3,
  647: 0,
  770: 2,
  785: 0,
  804: 0,
  845: 0,
  937: 0,
  953: 0,
  1085: 0,
  1172: 0,
  1281: 0,
  1409: 0},
 {3: 3,
  

# Waiting Counter

In [53]:
waiting_stat = pd.read_csv(os.path.join(DIR_PREFIX, WAITING_POSITIONS_FILENAME))
# waiting_stat.reset_index('time', inplace=True)
waiting_stat.drop_duplicates(inplace=True)
waiting_stat = waiting_stat[['id', 'cluster-id']]
waiting_stat['#waiting'] = 1
waiting_counter = waiting_stat.groupby(['id', 'cluster-id']).count()

In [37]:
waiting_counter

Unnamed: 0_level_0,Unnamed: 1_level_0,#waiting
id,cluster-id,Unnamed: 2_level_1
abmuyawm,-1,49
abmuyawm,0,45
abmuyawm,1,620
abmuyawm,2,3
abmuyawm,3,1
...,...,...
uvburki,1289,4
uvburki,1324,1
uvburki,1402,4
uvburki,1416,1


# Error rate for each client(taxi)

In [63]:
error_rates = waiting_counter.copy()
error_rates['#bypass'] = 0
for taxi_name in tqdm(taxi_dataset):
    for cluster_id in centers.index:
        if taxi_name in bypass_stat and cluster_id in bypass_stat[taxi_name]:
            error_rates['#bypass'][taxi_name, cluster_id] = bypass_stat[taxi_name][cluster_id]

  0%|          | 0/536 [00:00<?, ?it/s]

In [64]:
error_rates['#bypass'] += error_rates['#waiting']
error_rates['error_rate'] = 1.0 - error_rates['#waiting'] / error_rates['#bypass']

In [65]:
error_rates.describe()

Unnamed: 0,#waiting,#bypass,error_rate
count,9853.0,9853.0,9853.0
mean,7.254034,9.824622,0.145563
std,40.097629,43.38122,0.300989
min,1.0,1.0,0.0
25%,1.0,1.0,0.0
50%,2.0,2.0,0.0
75%,3.0,5.0,0.0
max,848.0,866.0,0.974359


In [70]:
error_rates_for_each_client =  error_rates.groupby('id')[['error_rate']].mean()

In [71]:
error_rates_for_each_client

Unnamed: 0_level_0,error_rate
id,Unnamed: 1_level_1
abmuyawm,0.153819
acgerl,0.190685
acpegho,0.116606
adpaifo,0.146584
agcowktu,0.172095
...,...
uthomoov,0.138035
uthsyis,0.127829
utjanre,0.124472
utwoab,0.145351


In [72]:
error_rates_for_each_client.to_csv(os.path.join(DIR_PREFIX, ERROR_RATES_FILENAME))

In [4]:
error_rates_for_each_client = pd.read_csv(os.path.join(DIR_PREFIX, ERROR_RATES_FILENAME))

In [5]:
error_rates_for_each_client.describe()

Unnamed: 0,error_rate
count,100.0
mean,0.141537
std,0.031016
min,0.0
25%,0.129662
50%,0.145968
75%,0.16231
max,0.207228
