In [1]:
import os
import yaml
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, normalize
from utils import display, od_utils
from s1_preprocessing.hotspot.hotpots_discovery_utils import generate_cube_index
from s2_mobility.transit_prediction.s2_utility_XGBoost_train import kl

In [2]:
# Read the project configuration and switch the working directory to the
# project root, so that the paths stored in the config can be opened as-is.
# NOTE(review): the config path is relative to the current working directory
# and the chdir below changes it, so re-running this cell in the same kernel
# may fail to find the file — confirm against the project layout.
# NOTE(review): yaml.load with FullLoader is acceptable for a trusted local
# config; do not point this at externally supplied YAML.
with open("../../config.yaml", "r", encoding="utf8") as config_file:
    conf = yaml.load(config_file, Loader=yaml.FullLoader)
os.chdir(conf["project_path"])
display.configure_pandas()

# Load the raw trip records, pass them through the project's bounding-box
# filter and attach the cube indices (grid parameters m=100, n=200).
# Presumably generate_cube_index adds the 'original_cube' and
# 'destination_cube' columns that the sampling loop groups by — TODO confirm.
od = generate_cube_index(
    od_utils.filter_in_bbox(pd.read_parquet(conf['od']['raw1706_pqt'])),
    m=100,
    n=200,
)

# Reference demand per origin/destination pair; the sampling loop reads its
# 'original_cube', 'destination_cube' and 'demand_17_et' columns.
od_pairs_demand = pd.read_csv(
    conf['mobility']['transition']['utility_xgboost']['p2d']['result']
)

# For every sampling fraction from 1% to 99%, draw a random subsample of the
# trips, count the sampled trips per origin/destination cube pair and compare,
# per origin cube, the L1-normalised reference demand ('demand_17_et') with the
# L1-normalised sampled counts using the project's `kl` helper. The mean over
# all origin cubes is stored for the plotting cell.

# Base seed for the subsampling. Without a seed every run of this cell draws
# different samples and the resulting curve cannot be reproduced.
SAMPLE_SEED = 0

percent_list = []  # sampling fractions (0.01 .. 0.99), x values of the plot
kl_list = []       # mean KL value over origin cubes for each fraction

for percentage_int in range(1, 100, 1):
    percentage = percentage_int / 100

    # A fixed but distinct seed per fraction keeps the draws independent of
    # each other (as in the unseeded version) while making them repeatable.
    od_sample = od.sample(frac=percentage,
                          random_state=SAMPLE_SEED + percentage_int)
    demand = (od_sample
              .groupby(['original_cube', 'destination_cube'])
              .size()
              .rename('sample'))

    # The left join keeps every pair of the reference table. Pairs that do not
    # occur in the sample have no match and receive 0.0001 instead of NaN,
    # presumably so the divergence stays finite — TODO confirm.
    # Note that fillna is applied to the whole merged frame, so missing values
    # in any other column are replaced by 0.0001 as well.
    all_and_sample = pd.merge(od_pairs_demand, demand,
                              how='left',
                              left_on=['original_cube', 'destination_cube'],
                              right_index=True).fillna(0.0001)

    # Only the two columns read by the lambda are handed to apply; this leaves
    # the result unchanged and avoids the pandas deprecation warning about
    # apply operating on the grouping column.
    kl_sample = (
        all_and_sample
        .groupby("original_cube")[["demand_17_et", "sample"]]
        .apply(
            lambda group: kl(
                normalize(group["demand_17_et"].to_numpy().reshape(1, -1), norm='l1'),
                normalize(group["sample"].to_numpy().reshape(1, -1), norm='l1'))
        )
    )

    # Compute the mean once and reuse it for logging and for the result list.
    mean_kl = kl_sample.mean()
    print(percentage, mean_kl)
    percent_list.append(percentage)
    kl_list.append(mean_kl)

0.01 3.010456053087922
0.02 2.7316930315679735
0.03 2.5094032533545825
0.04 2.2873189176619446
0.05 2.05800415300236
0.06 1.8763544344174203
0.07 1.7650737516938644
0.08 1.6052279604936017
0.09 1.4905850967239467
0.1 1.3766550823672727
0.11 1.2919310644501505
0.12 1.1854184381619832
0.13 1.1250213900044064
0.14 0.999368366401149
0.15 0.9894993608875974
0.16 0.9115013322693538
0.17 0.8909573485801372
0.18 0.8151217752087035
0.19 0.7659345447807703
0.2 0.7532601680240706
0.21 0.7172117775235208


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
from brokenaxes import brokenaxes
import numpy as np

# Plot the mean KL value against the sampling fraction and add two hard-coded
# reference points, then write the figure to 'kldiv_comparison.pdf'.

# Font used for the legend; the axis labels use the same family.
font = {'family': 'Palatino Linotype',
        'weight': 'normal',
        'size': 15,
        }

# Hard-coded reference points as (x, y).
# NOTE(review): these values are not computed in this notebook; presumably
# they come from a separate analysis of the 2014 data — confirm the source.
EV_ONLY_2014_POINT = (664/10000, 20.83)
ALL_TAXIS_2014_POINT = (0.99, 0.21)

fig = plt.figure(figsize=(5.5, 4), dpi=600)

# The y axis is broken into (-0.05, 3) and (20.5, 21) so that the reference
# point at 20.83 fits on the same figure as the sampled curve.
# hspace is the distance between the two broken segments.
# left/bottom are presumably figure margins forwarded to the underlying
# grid layout, see https://github.com/bendichter/brokenaxes/issues/20
ax = brokenaxes(xlims=((-0.01, 1.01),),
                ylims=((-0.05, 3), (20.5, 21)),
                hspace=0.15, left=0.175, bottom=0.21)

ax.plot(percent_list, kl_list, label='Sampled data of taxis at 2017')
ax.plot(EV_ONLY_2014_POINT[0], EV_ONLY_2014_POINT[1], '.',
        label='Data of only EV taxis at 2014')
ax.plot(ALL_TAXIS_2014_POINT[0], ALL_TAXIS_2014_POINT[1], '.',
        label='Data of taxis at 2014')
ax.legend(prop=font)

# NOTE(review): the x label refers to 2016 while the legend and the input data
# refer to 2017 — confirm which year is intended before publishing.
ax.set_xlabel('# of taxis / # of taxis at 2016', labelpad=0, size=16, family='Palatino Linotype')
ax.set_ylabel('KL-Divergence', labelpad=0, size=16, family='Palatino Linotype')
ax.tick_params(labelsize=12)

# Save before showing: Figure.show() only warns on the non-GUI backend used by
# notebooks, and writing the file first guarantees it is produced regardless
# of how the backend handles the figure after display.
fig.savefig('kldiv_comparison.pdf')
plt.show()