In [3]:
import datetime
import xgboost
import numpy as np
import pandas as pd
import utils.data_loader as data_loader
import utils.display as display
from scipy import stats
from sklearn.metrics.pairwise import haversine_distances
import sklearn.metrics
from hotspot.hotpots_discovery_utils import generate_cube_index

In [2]:
display.configure_logging()
display.configure_pandas()

In [3]:
df_od = data_loader.load_od(scale='full', common=False)
# 筛选出处于bbox中的points
df_od['in_bbox'] = ((113.764635 < df_od['destination_log'])
                     & (df_od['destination_log'] < 114.608972)
                     & (22.454727 < df_od['destination_lat'])
                     & (df_od['destination_lat'] < 22.842654)
                     & (113.764635 < df_od['original_log'])
                     & (df_od['original_log'] < 114.608972)
                     & (22.454727 < df_od['original_lat'])
                     & (df_od['original_lat'] < 22.842654))
df_od = df_od.loc[df_od.in_bbox].reset_index(drop=True)
df_od = generate_cube_index(df_od, m=100, n=200)

demand = df_od.groupby(['original_cube', 'destination_cube']).size().reset_index()
demand = demand.rename(columns={0: 'demand'})

demand = demand.loc[demand['demand'] > 10].reset_index()

14-Jul-20 20:02:31 - Loading C:\Users\hkrept\PycharmProjects\ElectricVehicleMobility\data\transaction_201407.csv


In [4]:
df_et_od = data_loader.load_od(scale='full', common=True)
# 筛选出处于bbox中的points
df_et_od['in_bbox'] = ((113.764635 < df_et_od['destination_log'])
                     & (df_et_od['destination_log'] < 114.608972)
                     & (22.454727 < df_et_od['destination_lat'])
                     & (df_et_od['destination_lat'] < 22.842654)
                     & (113.764635 < df_et_od['original_log'])
                     & (df_et_od['original_log'] < 114.608972)
                     & (22.454727 < df_et_od['original_lat'])
                     & (df_et_od['original_lat'] < 22.842654))
df_et_od = df_et_od.loc[df_et_od.in_bbox].reset_index(drop=True)
df_et_od = generate_cube_index(df_et_od, m=100, n=200)

et_demand = df_et_od.groupby(['original_cube', 'destination_cube']).size().reset_index()
et_demand = et_demand.rename(columns={0: 'demand'})

14-Jul-20 20:03:01 - Loading C:\Users\hkrept\PycharmProjects\ElectricVehicleMobility\data/transaction_common_201407.csv


In [5]:
df_demands = pd.merge(demand, et_demand, how='left', on=['original_cube', 'destination_cube'], suffixes=('_all', '_et'))
df_demands = df_demands.fillna(0)
df_demands['rate'] = df_demands['demand_et'] / df_demands['demand_all']

In [6]:
temp = df_od.drop_duplicates(['original_cube', 'destination_cube'])
df_od_pairs = pd.merge(df_demands[['original_cube', 'destination_cube']],
                       df_od[['original_cube', 'destination_cube', 'original_log', 'original_lat', 
                              'destination_log', 'destination_lat']].drop_duplicates(['original_cube', 'destination_cube']),
                       left_on=['original_cube', 'destination_cube'], 
                       right_on=['original_cube', 'destination_cube'])

df_cs, date = data_loader.load_cs(scale='part', date=datetime.datetime(2014, 7, 1))
df_cs = df_cs.loc[~df_cs['cs_name'].isin(['LJDL', 'E04', 'BN0002', 'F11', 'S1', 'S2', 'F12', 'F13', 'F15'])]
df_cs.reset_index(drop=True, inplace=True)

# select drop location and CS location as two array
original_locations = df_od_pairs[['original_lat', 'original_log']].to_numpy()
destination_locations = df_od_pairs[['destination_lat', 'destination_log']].to_numpy()
cs_location = df_cs[['Latitude', 'Longitude']].to_numpy()

# earth radius(km)
AVG_EARTH_RADIUS = 6371.0088

# calculate distance between drop location and CS location, and midian, min, max, mean
original_distances_to_cs = haversine_distances(np.radians(original_locations), np.radians(cs_location)) * AVG_EARTH_RADIUS
destination_distances_to_cs = haversine_distances(np.radians(destination_locations), np.radians(cs_location)) * AVG_EARTH_RADIUS
df_original_distances_to_cs = pd.DataFrame(original_distances_to_cs)
df_destination_distances_to_cs = pd.DataFrame(destination_distances_to_cs)

14-Jul-20 20:03:03 - Loading 'C:\Users\hkrep\PycharmProjects\ChargingEventsExtraction\data\ChargingStation'


In [7]:
def mape_vectorized_v2(a, b): 
    mask = a != 0
    return (np.fabs(a - b)/a)[mask].mean()

In [8]:
def evaluate(n=10):
    original_capacity = np.repeat(df_cs['chg_points'].values.reshape(1, -1), df_od_pairs.shape[0], axis=0)
    
    a = df_original_distances_to_cs.values
    a.sort(axis=1)
    a = pd.DataFrame(a, df_original_distances_to_cs.index, df_original_distances_to_cs.columns)
    a = a.iloc[:, :n]
    o_dissorted_capacity = np.take_along_axis(original_capacity, df_original_distances_to_cs.values.argsort(axis=1), axis=1)
    o_dissorted_capacity = pd.DataFrame(o_dissorted_capacity)
    o_dissorted_capacity = o_dissorted_capacity.iloc[:, :n]
    
    b = df_destination_distances_to_cs.values
    b.sort(axis=1)
    b = pd.DataFrame(b, df_destination_distances_to_cs.index, df_destination_distances_to_cs.columns)
    b = b.iloc[:, :n]
    d_dissorted_capacity = np.take_along_axis(original_capacity, df_destination_distances_to_cs.values.argsort(axis=1), axis=1)
    d_dissorted_capacity = pd.DataFrame(d_dissorted_capacity)
    d_dissorted_capacity = d_dissorted_capacity.iloc[:, :n]
    
    train_x = pd.concat([a, o_dissorted_capacity, b, d_dissorted_capacity], axis=1).iloc[:int(0.7*a.shape[0])]
    test_x = pd.concat([a, o_dissorted_capacity, b, d_dissorted_capacity], axis=1).iloc[int(0.7*a.shape[0]):]
    train_y = df_demands['rate'].iloc[:int(0.7*a.shape[0])]
    test_y = df_demands['rate'].iloc[int(0.7*a.shape[0]):]
#     print(train_x.iloc[:3], train_y.iloc[:3])
#     print(test_x.iloc[:3], test_y.iloc[:3])
    gbm = xgboost.XGBRegressor(verbosity=1, max_depth=10, learning_rate=0.05, n_estimators=500, scale_pos_weight=2) #这行会有个提示，不用管
    gbm.fit(train_x.values, train_y.values)
    predict_y = gbm.predict(test_x.values)
    
    print(sklearn.metrics.mean_absolute_error(test_y, predict_y), sklearn.metrics.mean_squared_error(test_y, predict_y))
    print(mape_vectorized_v2(test_y, predict_y))

print(df_od_pairs.shape[0], 'sample')
for i in range(1, 24):
    print(i, 'nearest cs as feature:')
    evaluate(n=i)

106788 sample
1 nearest cs as feature:
0.03986787878145368 0.0030707246156373227
0.5130642375794054
2 nearest cs as feature:
0.03951798126982014 0.002931755808633905
0.496760343910151
3 nearest cs as feature:
0.0404078168934603 0.0028932540690789856
0.5159218848383474
4 nearest cs as feature:
0.04072224289728923 0.0029544486074978117
0.5238698955755167
5 nearest cs as feature:
0.045270584206721594 0.0034302777112109758
0.6179571665356273
6 nearest cs as feature:
0.04390052259172482 0.0032466943156795693
0.5858621264542101
7 nearest cs as feature:
0.043715177914717455 0.0032192182829961775
0.587664423245331
8 nearest cs as feature:
0.043553875081353784 0.003155401089737722
0.5846863814080752
9 nearest cs as feature:
0.04287185209869326 0.0031119952320785844
0.5680124509405862
10 nearest cs as feature:
0.04465659364153703 0.0032990106654289257
0.6126721846546823
11 nearest cs as feature:
0.043736652105987636 0.0031931422753202375
0.5924421813005173
12 nearest cs as feature:
0.04442070202