In [6]:
import datetime
import xgboost
import numpy as np
import pandas as pd
import utils.data_loader as data_loader
import utils.display as display
import utils.vector_haversine_distances as vec_hs_dis
from scipy import stats
from sklearn.metrics.pairwise import haversine_distances
from sklearn.metrics import mean_squared_error, mean_absolute_error
import sklearn
from hotspot.hotpots_discovery_utils import generate_cube_index, cube_to_coordinate
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import make_scorer

In [7]:
display.configure_logging()
display.configure_pandas()

In [8]:
def mape_vectorized_v2(a, b):
    b = b.reshape(1, -1)
    mask = a != 0
    a = a[mask]
    b = b[mask]
    return (np.fabs(a - b)/a).mean()

In [9]:
df_od = data_loader.load_od(scale='full', common=False)
# 筛选出处于bbox中的points
df_od['in_bbox'] = ((113.764635 < df_od['destination_log'])
                     & (df_od['destination_log'] < 114.608972)
                     & (22.454727 < df_od['destination_lat'])
                     & (df_od['destination_lat'] < 22.842654)
                     & (113.764635 < df_od['original_log'])
                     & (df_od['original_log'] < 114.608972)
                     & (22.454727 < df_od['original_lat'])
                     & (df_od['original_lat'] < 22.842654))
df_od = df_od.loc[df_od.in_bbox].reset_index(drop=True)
df_od = generate_cube_index(df_od, m=100, n=200)

demand = df_od.groupby(['original_cube', 'destination_cube']).size().reset_index()
demand = demand.rename(columns={0: 'demand'})

demand = demand.loc[demand['demand'] > 10].reset_index()

21-Jul-20 19:31:59 - Loading data/transaction_201407.csv


In [10]:
df_et_od = data_loader.load_od(scale='full', common=True)
# 筛选出处于bbox中的points
df_et_od['in_bbox'] = ((113.764635 < df_et_od['destination_log'])
                     & (df_et_od['destination_log'] < 114.608972)
                     & (22.454727 < df_et_od['destination_lat'])
                     & (df_et_od['destination_lat'] < 22.842654)
                     & (113.764635 < df_et_od['original_log'])
                     & (df_et_od['original_log'] < 114.608972)
                     & (22.454727 < df_et_od['original_lat'])
                     & (df_et_od['original_lat'] < 22.842654))
df_et_od = df_et_od.loc[df_et_od.in_bbox].reset_index(drop=True)
df_et_od = generate_cube_index(df_et_od, m=100, n=200)

et_demand = df_et_od.groupby(['original_cube', 'destination_cube']).size().reset_index()
et_demand = et_demand.rename(columns={0: 'demand'})

21-Jul-20 19:32:30 - Loading data/transaction_common_201407.csv


In [11]:
df_demands = pd.merge(demand, et_demand, how='left', on=['original_cube', 'destination_cube'], suffixes=('_all', '_et'))
df_demands = df_demands.fillna(0)
df_demands['rate'] = df_demands['demand_et'] / df_demands['demand_all']

In [12]:
df_od['duration'] = (df_od['end_time'] - df_od['begin_time']).dt.total_seconds()
df_od = df_od[['original_cube', 'destination_cube', 'original_log', 'original_lat', 'destination_log',
               'destination_lat', 'duration']].groupby(['original_cube', 'destination_cube']).mean()
df_od.reset_index(inplace=True)

In [13]:
df_od_pairs = pd.merge(df_demands[['original_cube', 'destination_cube']],
                       df_od[['original_cube', 'destination_cube', 'original_log', 'original_lat', 'destination_log',
                              'destination_lat', 'duration']],
                       left_on=['original_cube', 'destination_cube'],
                       right_on=['original_cube', 'destination_cube'])

# df_od_pairs = df_od_pairs.sample(frac=1).reset_index(drop=True)

df_cs, date = data_loader.load_cs(scale='part', date=datetime.datetime(2014, 7, 1))
df_cs = df_cs.loc[~df_cs['cs_name'].isin(['LJDL', 'E04', 'BN0002', 'F11', 'S1', 'S2', 'F12', 'F13', 'F15'])]
df_cs.reset_index(drop=True, inplace=True)

# select drop location and CS location as two array
original_locations = df_od_pairs[['original_lat', 'original_log']].to_numpy()
destination_locations = df_od_pairs[['destination_lat', 'destination_log']].to_numpy()
cs_location = df_cs[['Latitude', 'Longitude']].to_numpy()

# earth radius(km)
AVG_EARTH_RADIUS = 6371.0088

# calculate distance between original/destination location and CS location
original_distances_to_cs = haversine_distances(np.radians(original_locations), np.radians(cs_location)) \
                           * AVG_EARTH_RADIUS
destination_distances_to_cs = haversine_distances(np.radians(destination_locations), np.radians(cs_location)) \
                              * AVG_EARTH_RADIUS
# calculate distance between od locations
od_dis = vec_hs_dis.haversine_np(df_od_pairs['original_log'], df_od_pairs['original_lat'],
                                 df_od_pairs['destination_log'], df_od_pairs['destination_lat'])

df_original_distances_to_cs = pd.DataFrame(original_distances_to_cs)
df_destination_distances_to_cs = pd.DataFrame(destination_distances_to_cs)
df_od_dis = pd.DataFrame(od_dis)

21-Jul-20 19:32:33 - Loading 'C:\Users\hkrep\PycharmProjects\ChargingEventsExtraction\data\ChargingStation'


In [14]:
def evaluate(n=10):
    original_capacity = np.repeat(df_cs['chg_points'].values.reshape(1, -1), df_od_pairs.shape[0], axis=0)

    a = df_original_distances_to_cs.values
    a.sort(axis=1)
    a = pd.DataFrame(a, df_original_distances_to_cs.index, df_original_distances_to_cs.columns)
    a = a.iloc[:, :n]
    o_dissorted_capacity = np.take_along_axis(original_capacity, df_original_distances_to_cs.values.argsort(axis=1),
                                              axis=1)
    o_dissorted_capacity = pd.DataFrame(o_dissorted_capacity)
    o_dissorted_capacity = o_dissorted_capacity.iloc[:, :n]

    b = df_destination_distances_to_cs.values
    b.sort(axis=1)
    b = pd.DataFrame(b, df_destination_distances_to_cs.index, df_destination_distances_to_cs.columns)
    b = b.iloc[:, :n]
    d_dissorted_capacity = np.take_along_axis(original_capacity, df_destination_distances_to_cs.values.argsort(axis=1),
                                              axis=1)
    d_dissorted_capacity = pd.DataFrame(d_dissorted_capacity)
    d_dissorted_capacity = d_dissorted_capacity.iloc[:, :n]

    # train_x, val_x, train_y, val_y = train_test_split(
    #     pd.concat([df_od_pairs['duration'], od_dis, a, o_dissorted_capacity, b, d_dissorted_capacity],axis=1).values,
    #     df_demands['rate'].values, test_size=0.2)

    train_x, val_x, train_y, val_y = train_test_split(
    pd.concat([df_od_pairs['duration'], od_dis, a, o_dissorted_capacity], axis=1).values,
    df_demands['rate'].values, test_size=0.2)

    scaler = sklearn.preprocessing.StandardScaler()
    train_x = scaler.fit_transform(train_x)
    val_x = scaler.transform(val_x)

    gbm = xgboost.XGBRegressor(verbosity=0, n_estimators=100, validate_parameters=2, 
                               learning_rate=0.05, min_child_weight=5, max_depth=8)

    gbm.fit(train_x, train_y)
    predict_y = gbm.predict(val_x)

    print('mae:', mean_absolute_error(val_y, predict_y),
          'mse:', mean_squared_error(val_y, predict_y),
          'rmse:', mean_squared_error(val_y, predict_y, squared=False))
    print('mape:', mape_vectorized_v2(val_y.reshape(1, -1), predict_y))
    print('R2:', gbm.score(val_x, val_y))
    print(val_y[30:36], )
    print(predict_y[30:36])

print(df_od_pairs.shape[0], 'sample')
for i in range(3, 4):
    print(i, 'nearest cs as feature:')
    evaluate(n=i)

106788 sample
3 nearest cs as feature:
mae: 0.03102669470782013 mse: 0.001859535837341607 rmse: 0.04312233571296442
mape: 0.46498741034637703
R2: 0.23435227072533316
[0.         0.01818182 0.         0.         0.         0.        ]
[0.02439219 0.02746636 0.00523108 0.05107847 0.00451407 0.02001056]


In [15]:
exit(0)
# This cell is used to search parameters.
def evaluate(n=10):
    original_capacity = np.repeat(df_cs['chg_points'].values.reshape(1, -1), df_od_pairs.shape[0], axis=0)

    a = df_original_distances_to_cs.values
    a.sort(axis=1)
    a = pd.DataFrame(a, df_original_distances_to_cs.index, df_original_distances_to_cs.columns)
    a = a.iloc[:, :n]
    o_dissorted_capacity = np.take_along_axis(original_capacity, df_original_distances_to_cs.values.argsort(axis=1),
                                              axis=1)
    o_dissorted_capacity = pd.DataFrame(o_dissorted_capacity)
    o_dissorted_capacity = o_dissorted_capacity.iloc[:, :n]

    b = df_destination_distances_to_cs.values
    b.sort(axis=1)
    b = pd.DataFrame(b, df_destination_distances_to_cs.index, df_destination_distances_to_cs.columns)
    b = b.iloc[:, :n]
    d_dissorted_capacity = np.take_along_axis(original_capacity, df_destination_distances_to_cs.values.argsort(axis=1),
                                              axis=1)
    d_dissorted_capacity = pd.DataFrame(d_dissorted_capacity)
    d_dissorted_capacity = d_dissorted_capacity.iloc[:, :n]

    gbm = xgboost.XGBRegressor(verbosity=0, validate_parameters=2,)

    param_grid = {
        'n_estimators': [50, 100],
        'scale_pos_weight': [1, 3],
        'max_depth': [5, 10],
        'learning_rate': [0.05, 0.1],
        'min_child_weight': [1, 3],
        'gamma': [0, 5,],
        'max_delta_step': [0, 3],
        'subsample': [0.7, 1],
        'colsample_bytree': [0.7, 1]
    }
    gs = GridSearchCV(gbm, param_grid=param_grid, cv=5)

    train_x, val_x, train_y, val_y = train_test_split(
        pd.concat([df_od_pairs['duration'], od_dis, a, o_dissorted_capacity, b, d_dissorted_capacity],axis=1).values,
        df_demands['rate'].values, test_size=0.2)
    print(train_x.shape, train_y.shape)
    print(val_x.shape, val_y.shape)
    scaler = sklearn.preprocessing.StandardScaler()
    train_x = scaler.fit_transform(train_x)
    val_x = scaler.transform(val_x)

    gs.fit(train_x, train_y)

    print(gs.best_params_)
    print(gs.best_score_)
    print(gs.best_estimator_)
    print(gs.best_index_)

    gs.best_estimator_.fit(train_x, train_y)
    predict_y = gs.best_estimator_.predict(val_x)

    print('mape:', mape_vectorized_v2(val_y.reshape(-1), predict_y))
    print('xgb score (rmse in xgb doc):', gs.best_estimator_.score(val_x, val_y))
    print('gs score (neg rmse):', gs.score(val_x, val_y))
    print('sample_gt:', val_y[10:15])
    print('sample_pred:', predict_y[10:15])

print(df_od_pairs.shape[0], 'sample')
for i in range(1, 10):
    print(i, 'nearest cs as feature:')
    evaluate(n=i)

106788 sample
1 nearest cs as feature:
(85430, 6) (85430,)
(21358, 6) (21358,)


KeyboardInterrupt: 