In [1]:
import os
import yaml
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import haversine_distances

from utils import display
from s1_preprocessing.hotspot.hotpots_discovery_utils import cube_to_coordinate

In [2]:
def find_nearest_cube(targets, candidates):
    """Return, for every target cube, the position of its nearest candidate cube.

    Both inputs are converted with ``cube_to_coordinate(..., to_geodetic=True,
    m=100, n=200)`` and compared by haversine distance.

    Parameters
    ----------
    targets : array-like
        Cubes to find a neighbour for.
    candidates : array-like
        Cubes the neighbour is chosen from.

    Returns
    -------
    numpy.ndarray
        One entry per target: the positional index (into ``candidates``) of the
        candidate with the smallest haversine distance.
    """
    def _lat_lng_radians(cubes):
        # haversine_distances takes (latitude, longitude) pairs in radians, so
        # the latitude column goes first.
        lng, lat = cube_to_coordinate(cubes, to_geodetic=True, m=100, n=200)
        lat_lng = np.hstack((lat.reshape(-1, 1), lng.reshape(-1, 1)))
        return np.radians(lat_lng)

    pairwise = haversine_distances(_lat_lng_radians(targets), _lat_lng_radians(candidates))
    # Row i holds the distances from target i to every candidate.
    return pairwise.argmin(axis=1)

In [3]:
def matrix_completion(missing_cube, raw_mat):
    """Append a row for every missing cube, copied from its nearest existing cube.

    Parameters
    ----------
    missing_cube : array-like
        Cubes that have no row in ``raw_mat``. May be empty.
    raw_mat : pandas.DataFrame
        Matrix whose index holds the cubes that already have a row.

    Returns
    -------
    pandas.DataFrame
        ``raw_mat`` followed by one row per missing cube. Each added row is a
        copy of the row of the nearest existing cube (as chosen by
        ``find_nearest_cube``) and is labelled with the missing cube cast to
        ``int``. ``raw_mat`` itself is not modified.
    """
    missing_cube = np.asarray(missing_cube)
    if missing_cube.size == 0:
        # Nothing to add. Returning early also avoids running the nearest-cube
        # search on an empty array, which the distance computation is expected
        # to reject.
        return raw_mat.copy()

    nearest_rows = find_nearest_cube(missing_cube, np.array(raw_mat.index.tolist()))
    # Relabel on a new frame instead of mutating the iloc selection in place.
    supplement_cube = raw_mat.iloc[nearest_rows].set_index(missing_cube.astype(dtype=int))
    return pd.concat([raw_mat, supplement_cube])

In [4]:
# Read the project configuration and switch the working directory to the
# project root path stored in it.
# NOTE(review): the config path is relative to the current working directory,
# so re-running this cell after the chdir below resolves it from a different
# place -- restart the kernel before re-running.
config_path = "../../config.yaml"
with open(config_path, mode="r", encoding="utf8") as config_file:
    conf = yaml.load(config_file, Loader=yaml.FullLoader)
project_root = conf["project_path"]
os.chdir(project_root)

In [5]:
# Load the two incomplete probability matrices; the first CSV column becomes
# the row index.
xgb_paths = conf['mobility']['transition']['utility_xgboost']
p2d_raw_prob_mat = pd.read_csv(xgb_paths['p2d']['prob_mat_incomplete'], index_col=0)
d2p_raw_prob_mat = pd.read_csv(xgb_paths['d2p']['prob_mat_incomplete'], index_col=0)

# Collect the labels on each axis of each matrix. Column labels are cast to
# int so they can be compared with the row labels of the other matrix.
p_in_p2d = set(p2d_raw_prob_mat.index.tolist())
d_in_p2d = set(p2d_raw_prob_mat.columns.astype(dtype=int).tolist())
d_in_d2p = set(d2p_raw_prob_mat.index.tolist())
p_in_d2p = set(d2p_raw_prob_mat.columns.astype(dtype=int).tolist())

# Labels that appear as columns of one matrix but have no row in the other
# get a row filled in from their nearest existing cube.
# NOTE(review): the labels take a round trip through float32, which is exact
# only up to 2**24 -- confirm cube ids never exceed that.
missing_p = np.array(list(p_in_d2p.difference(p_in_p2d)), dtype=np.float32)
p2d_prob_mat = matrix_completion(missing_p, p2d_raw_prob_mat)

missing_d = np.array(list(d_in_p2d.difference(d_in_d2p)), dtype=np.float32)
d2p_prob_mat = matrix_completion(missing_d, d2p_raw_prob_mat)

In [6]:
# Row/column positions of every non-zero entry in each completed matrix.
orig_idx_a, dest_idx_a = np.nonzero(p2d_prob_mat.to_numpy())
orig_idx_b, dest_idx_b = np.nonzero(d2p_prob_mat.to_numpy())

# Turn positions into labels: the row label is the origin cube, the column
# label (cast to int) is the destination cube.
orig_cube_a = p2d_prob_mat.index[orig_idx_a]
dest_cube_a = p2d_prob_mat.columns[dest_idx_a].astype(dtype=int)
orig_cube_b = d2p_prob_mat.index[orig_idx_b]
dest_cube_b = d2p_prob_mat.columns[dest_idx_b].astype(dtype=int)

# Stack both matrices' pairs as (N, 1) columns, join them side by side, and
# keep each (origin, destination) pair once with a fresh 0..N-1 index.
orig_cube = np.concatenate((orig_cube_a, orig_cube_b)).reshape(-1, 1)
dest_cube = np.concatenate((dest_cube_a, dest_cube_b)).reshape(-1, 1)
transition_cost = pd.DataFrame(np.hstack((orig_cube, dest_cube)), columns=['o_cube', 'd_cube'])
transition_cost = transition_cost.drop_duplicates(ignore_index=True)

In [22]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from multiprocessing.dummy import Pool as ThreadPool
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Add coordinate columns for the origin and destination cube of every pair.
origin_lng, origin_lat = cube_to_coordinate(transition_cost['o_cube'], to_geodetic=True, m=100, n=200)
transition_cost['o_lng'], transition_cost['o_lat'] = origin_lng, origin_lat
dest_lng, dest_lat = cube_to_coordinate(transition_cost['d_cube'], to_geodetic=True, m=100, n=200)
transition_cost['d_lng'], transition_cost['d_lat'] = dest_lng, dest_lat

# First 100 pairs only, used to try the routing request on a small sample.
temp_df = transition_cost.head(100)
dd_tran_cost = dd.from_pandas(temp_df, npartitions=4)
import json
import urllib.request
def fetch_cost(transition):
    """Request one driving route from the public OSRM server and return its distance.

    Parameters
    ----------
    transition : pandas.Series
        Row providing the ``o_lng``, ``o_lat``, ``d_lng`` and ``d_lat`` entries.

    Returns
    -------
    The ``distance`` field of the first route in the response (OSRM documents
    this value as metres).

    Raises
    ------
    urllib.error.URLError
        If the HTTP request fails.
    KeyError, IndexError
        If the response carries no route.
    """
    o_lng, o_lat, d_lng, d_lat = transition[['o_lng', 'o_lat', 'd_lng', 'd_lat']]
    url = "http://router.project-osrm.org/route/v1/driving/{},{};{},{}?overview=false".format(
        o_lng, o_lat, d_lng, d_lat)
    # Close the HTTP response deterministically instead of leaking the socket.
    with urllib.request.urlopen(url) as response:
        payload = response.read().decode('utf8')
    # Parse the body unmodified: rewriting ' to " corrupts valid JSON whenever a
    # string value (for example a street name) contains an apostrophe.
    first_route = json.loads(payload)['routes'][0]
    return first_route['distance']
# Dask-based alternative, currently disabled:
# with ProgressBar():
#     dd_tran_cost.apply(fetch_cost, axis=1, meta=pd.Series(dtype='float', name='test')).compute()
# Call fetch_cost on every row of the sample frame with a tqdm progress bar;
# the resulting Series is displayed as the cell output.
temp_df.progress_apply(fetch_cost, axis=1)

100%|██████████| 100/100 [00:46<00:00,  2.17it/s]


0     2733.0
1      605.9
2     1540.8
3     1791.0
4     2286.5
       ...  
95    3231.1
96    2130.6
97    1516.1
98    3071.3
99    1716.2
Length: 100, dtype: float64

In [24]:
%%time
from multiprocessing.dummy import Pool as ThreadPool
# Coordinate columns for the origin and destination cube of every pair
# (the same computation as in the sampling cell earlier in the notebook).
grid_kwargs = dict(to_geodetic=True, m=100, n=200)
origin_coords = cube_to_coordinate(transition_cost['o_cube'], **grid_kwargs)
transition_cost['o_lng'], transition_cost['o_lat'] = origin_coords
dest_coords = cube_to_coordinate(transition_cost['d_cube'], **grid_kwargs)
transition_cost['d_lng'], transition_cost['d_lat'] = dest_coords

import json
import urllib.request
def make_url(transition):
    """Build the OSRM route-request URL for one origin/destination row.

    Parameters
    ----------
    transition : pandas.Series
        Row providing the ``o_lng``, ``o_lat``, ``d_lng`` and ``d_lat`` entries.

    Returns
    -------
    str
        Request URL with the coordinates inserted as
        ``o_lng,o_lat;d_lng,d_lat`` and ``overview=false``.
    """
    o_lng, o_lat, d_lng, d_lat = transition[['o_lng', 'o_lat', 'd_lng', 'd_lat']]
    template = "http://router.project-osrm.org/route/v1/driving/{},{};{},{}?overview=false"
    return template.format(o_lng, o_lat, d_lng, d_lat)
def _fetch_first_route(url):
    """Download one OSRM response and return its first route object."""
    # Close the response explicitly and parse the body unmodified: replacing
    # ' with " would break valid JSON that contains an apostrophe.
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read().decode('utf8'))['routes'][0]


# One request URL per (origin, destination) pair, in row order.
ar = transition_cost.apply(make_url, axis=1).to_list()
# Fetch with 10 worker threads; `map` keeps the results in the order of `ar`,
# and the context manager shuts the workers down once every request has returned.
with ThreadPool(10) as pool:
    results = pool.map(_fetch_first_route, ar)

Wall time: 40 s


In [30]:
# Turn the list of route objects into a frame once, pull out both cost
# columns, then attach them to transition_cost (pandas aligns them on the
# index labels).
route_frame = pd.DataFrame(results)
distance_col, duration_col = route_frame['distance'], route_frame['duration']
transition_cost['distance'], transition_cost['duration'] = distance_col, duration_col