In [17]:
import datetime
import xgboost
import numpy as np
import pandas as pd
import utils.data_loader as data_loader
import utils.display as display
import utils.vector_haversine_distances as vec_hs_dis
from scipy import stats
from sklearn.metrics.pairwise import haversine_distances
import sklearn.metrics
from hotspot.hotpots_discovery_utils import generate_cube_index

In [18]:
display.configure_logging()
display.configure_pandas()

In [19]:
def mape_vectorized_v2(a, b):
    mask = a != 0
    return (np.fabs(a - b)/a)[mask].mean()

In [20]:
df_od = data_loader.load_od(scale='full', common=False)
# 筛选出处于bbox中的points
df_od['in_bbox'] = ((113.764635 < df_od['destination_log'])
                     & (df_od['destination_log'] < 114.608972)
                     & (22.454727 < df_od['destination_lat'])
                     & (df_od['destination_lat'] < 22.842654)
                     & (113.764635 < df_od['original_log'])
                     & (df_od['original_log'] < 114.608972)
                     & (22.454727 < df_od['original_lat'])
                     & (df_od['original_lat'] < 22.842654))
df_od = df_od.loc[df_od.in_bbox].reset_index(drop=True)
df_od = generate_cube_index(df_od, m=100, n=200)

demand = df_od.groupby(['original_cube', 'destination_cube']).size().reset_index()
demand = demand.rename(columns={0: 'demand'})

demand = demand.loc[demand['demand'] > 10].reset_index()

17-Jul-20 16:32:53 - Loading data/transaction_201407.csv


In [21]:
df_et_od = data_loader.load_od(scale='full', common=True)
# 筛选出处于bbox中的points
df_et_od['in_bbox'] = ((113.764635 < df_et_od['destination_log'])
                     & (df_et_od['destination_log'] < 114.608972)
                     & (22.454727 < df_et_od['destination_lat'])
                     & (df_et_od['destination_lat'] < 22.842654)
                     & (113.764635 < df_et_od['original_log'])
                     & (df_et_od['original_log'] < 114.608972)
                     & (22.454727 < df_et_od['original_lat'])
                     & (df_et_od['original_lat'] < 22.842654))
df_et_od = df_et_od.loc[df_et_od.in_bbox].reset_index(drop=True)
df_et_od = generate_cube_index(df_et_od, m=100, n=200)

et_demand = df_et_od.groupby(['original_cube', 'destination_cube']).size().reset_index()
et_demand = et_demand.rename(columns={0: 'demand'})

17-Jul-20 16:33:22 - Loading data/transaction_common_201407.csv


In [22]:
df_demands = pd.merge(demand, et_demand, how='left', on=['original_cube', 'destination_cube'], suffixes=('_all', '_et'))
df_demands = df_demands.fillna(0)
df_demands['rate'] = df_demands['demand_et'] / df_demands['demand_all']

In [23]:
df_od_pairs = pd.merge(df_demands[['original_cube', 'destination_cube']],
                       df_od[['original_cube', 'destination_cube', 'original_log', 'original_lat', 'destination_log',
                              'destination_lat']].drop_duplicates(['original_cube', 'destination_cube']),
                       left_on=['original_cube', 'destination_cube'], 
                       right_on=['original_cube', 'destination_cube'])

df_cs, date = data_loader.load_cs(scale='part', date=datetime.datetime(2014, 7, 1))
df_cs = df_cs.loc[~df_cs['cs_name'].isin(['LJDL', 'E04', 'BN0002', 'F11', 'S1', 'S2', 'F12', 'F13', 'F15'])]
df_cs.reset_index(drop=True, inplace=True)

# select drop location and CS location as two array
original_locations = df_od_pairs[['original_lat', 'original_log']].to_numpy()
destination_locations = df_od_pairs[['destination_lat', 'destination_log']].to_numpy()
cs_location = df_cs[['Latitude', 'Longitude']].to_numpy()

# earth radius(km)
AVG_EARTH_RADIUS = 6371.0088

# calculate distance between original/destination location and CS location
original_distances_to_cs = haversine_distances(np.radians(original_locations), np.radians(cs_location)) \
                           * AVG_EARTH_RADIUS
destination_distances_to_cs = haversine_distances(np.radians(destination_locations), np.radians(cs_location)) \
                              * AVG_EARTH_RADIUS
# calculate distance between od locations
od_dis = vec_hs_dis.haversine_np(df_od_pairs['original_log'], df_od_pairs['original_lat'],
                                 df_od_pairs['destination_log'], df_od_pairs['destination_lat'])

df_original_distances_to_cs = pd.DataFrame(original_distances_to_cs)
df_destination_distances_to_cs = pd.DataFrame(destination_distances_to_cs)
df_od_dis = pd.DataFrame(od_dis)

17-Jul-20 16:33:25 - Loading 'C:\Users\hkrep\PycharmProjects\ChargingEventsExtraction\data\ChargingStation'


In [24]:
def evaluate(n=10):
    original_capacity = np.repeat(df_cs['chg_points'].values.reshape(1, -1), df_od_pairs.shape[0], axis=0)
    
    a = df_original_distances_to_cs.values
    a.sort(axis=1)
    a = pd.DataFrame(a, df_original_distances_to_cs.index, df_original_distances_to_cs.columns)
    a = a.iloc[:, :n]
    o_dissorted_capacity = np.take_along_axis(original_capacity, df_original_distances_to_cs.values.argsort(axis=1),
                                              axis=1)
    o_dissorted_capacity = pd.DataFrame(o_dissorted_capacity)
    o_dissorted_capacity = o_dissorted_capacity.iloc[:, :n]
    
    b = df_destination_distances_to_cs.values
    b.sort(axis=1)
    b = pd.DataFrame(b, df_destination_distances_to_cs.index, df_destination_distances_to_cs.columns)
    b = b.iloc[:, :n]
    d_dissorted_capacity = np.take_along_axis(original_capacity, df_destination_distances_to_cs.values.argsort(axis=1),
                                              axis=1)
    d_dissorted_capacity = pd.DataFrame(d_dissorted_capacity)
    d_dissorted_capacity = d_dissorted_capacity.iloc[:, :n]
    
    train_x = pd.concat([od_dis, a, o_dissorted_capacity, b, d_dissorted_capacity], axis=1).iloc[:int(0.7*a.shape[0])]
    test_x = pd.concat([od_dis, a, o_dissorted_capacity, b, d_dissorted_capacity], axis=1).iloc[int(0.7*a.shape[0]):]
    train_y = df_demands['rate'].iloc[:int(0.7*a.shape[0])]
    test_y = df_demands['rate'].iloc[int(0.7*a.shape[0]):]
#     print(train_x.iloc[:3], train_y.iloc[:3])
#     print(test_x.iloc[:3], test_y.iloc[:3])
    gbm = xgboost.XGBRegressor(verbosity=1, max_depth=10, learning_rate=0.05, n_estimators=500, scale_pos_weight=2) #这行会有个提示，不用管
    gbm.fit(train_x.values, train_y.values)
    predict_y = gbm.predict(test_x.values)
    
    print(sklearn.metrics.mean_absolute_error(test_y, predict_y), sklearn.metrics.mean_squared_error(test_y, predict_y))
    print(mape_vectorized_v2(test_y, predict_y))

print(df_od_pairs.shape[0], 'sample')
for i in range(1, 10):
    print(i, 'nearest cs as feature:')
    evaluate(n=i)

106788 sample
1 nearest cs as feature:
0.040236326338838946 0.0030538080092436023
0.5100674802903681
2 nearest cs as feature:
0.03942032365447463 0.0028470197120359607
0.49327796068378804
3 nearest cs as feature:


KeyboardInterrupt: 