In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
from sklearn.metrics.pairwise import haversine_distances
from dask.diagnostics import ProgressBar

In [2]:
from tqdm import tqdm

In [3]:
trajectories = dd.read_parquet('data/trajectory/statuses_wo_charging_resting')
cs = pd.read_csv('data/charging_station/ChargeLocation201706_wgs84.csv')

In [4]:
cs_wg = pd.read_csv('data/charging_station/ChargeLocation201706', names=['idx', 'name', 'lng', 'lat', 'online', 'charger_num'])

In [5]:
def find_nearest_station(traj, cs_info=None):
    dis = haversine_distances(np.radians(traj), np.radians(cs_info))
    return dis.argmin(axis=1), dis.min(axis=1)
result = trajectories[['latitude', 'longitude']].map_partitions(find_nearest_station, cs_info=cs_wg[['lat', 'lng']], meta={0: int, 1: float})

with ProgressBar():
    # num_workers=7 is the best, whole distance calculation canbe done at about 25 min, 
    # and the memory is used appropriately
    nearest_info = result.compute(scheduler='processes', num_workers=7)

[########################################] | 100% Completed | 16min 23.8s


In [6]:
df = pd.DataFrame(nearest_info.tolist(), index=nearest_info.index, columns=['s_idx', 's_dis'])
s_idx = [element for list_ in tqdm(df['s_idx'].values) for element in list_]
s_dis = [element for list_ in tqdm(df['s_dis'].values) for element in list_]

100%|██████████| 21/21 [01:25<00:00,  4.05s/it]
100%|██████████| 21/21 [01:26<00:00,  4.13s/it]


In [7]:
traj = pd.read_parquet('data/trajectory/statuses_wo_charging_resting')
traj['s_idx'], traj['s_dis'] = pd.Series(s_idx), pd.Series(s_dis) * 6371008.8
traj.to_parquet('data/trajectory/trajectories_w_statuses_wgcs')

In [10]:
traj.iloc[100:150]

Unnamed: 0,license,longitude,latitude,timestamp,speed,occupied,occupied_from_od,dis2pre,dur2pre,big_dur,valid,stop,s_idx,s_dis
100,粤B001ZD,113.8815,22.584299,2017-06-01 06:18:09,11,False,False,0.293961,49.0,False,True,False,49,2301.518022
101,粤B001ZD,113.883797,22.581699,2017-06-01 06:18:59,19,False,False,0.373094,50.0,False,True,False,49,2027.460056
102,粤B001ZD,113.881798,22.577499,2017-06-01 06:19:50,33,False,False,0.510128,51.0,False,True,False,49,1545.972807
103,粤B001ZD,113.878601,22.574699,2017-06-01 06:20:40,0,False,False,0.45242,50.0,False,True,True,49,1267.034315
104,粤B001ZD,113.878601,22.574699,2017-06-01 06:20:40,0,False,False,0.0,0.0,False,True,True,49,1267.034315
105,粤B001ZD,113.876602,22.572901,2017-06-01 06:21:30,23,False,False,0.286529,50.0,False,True,False,49,1145.50366
106,粤B001ZD,113.871498,22.5748,2017-06-01 06:22:20,11,False,False,0.565,50.0,False,True,False,19,1442.231306
107,粤B001ZD,113.869102,22.576099,2017-06-01 06:23:10,0,False,False,0.285279,50.0,False,True,True,19,1161.540084
108,粤B001ZD,113.871803,22.573999,2017-06-01 06:23:59,0,False,True,0.362541,49.0,False,True,True,19,1504.733652
109,粤B001ZD,113.871803,22.573999,2017-06-01 06:24:50,0,False,True,0.0,51.0,False,True,True,19,1504.733652
