In [1]:
import datetime
import os
import pickle
import statistics

import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
from tqdm import tqdm
import xgboost as xgb

import utils.data_loader as data_loader
import utils.display as display

# Project helper from utils.display; presumably sets pandas display options -- see utils/display.py to confirm.
display.configure_pandas()
# Project helper from utils.display; presumably configures the logging format used by the loaders -- TODO confirm.
display.configure_logging()
# Register tqdm's pandas integration so that `progress_apply` is available on
# DataFrame / groupby objects (used when filtering the groups further down).
tqdm.pandas()

  from pandas import Panel


In [2]:
# Trip table loaded with the hotspot option (version v4).
od = data_loader.load_od(version='v4', with_hotpots=True)

# Columns retained from the full-scale table that carries the engineered features.
cols = [
    'Licence',
    'begin_time',
    'end_time',
    'traveled_after_charged',
    'to_charge',
    'seeking_duration',
    'min_dis',
    'mid_dis',
    'mean_dis',
    'max_dis',
]
# Index the loader result directly so no extra reference to the full frame is kept around.
df_od = data_loader.load_od(with_feature=True, scale='full')[cols]

22-Jul-20 12:48:48 - Loading data/od/full_od_with_hotpots_v4.csv
22-Jul-20 12:48:48 - Loading data/od/od_with_traveled_v5.csv


In [3]:
# Discard rows without a 'traveled_after_charged' value and renumber the index from 0.
df_od = (
    df_od
    .dropna(axis=0, subset=['traveled_after_charged'])
    .reset_index(drop=True)
)

# A new group begins at every non-charging row whose previous row was a charging row,
# or whose previous row does not exist (the very first row). The running sum of those
# start markers is the group id; rows before the first marker therefore get id 0.
# NOTE(review): shift() is applied to the whole frame, not per 'Licence' -- confirm that
# the row order guarantees groups cannot run across two vehicles.
charging_flag = df_od['to_charge']
previous_flag = charging_flag.shift()
group_start = (charging_flag == False) & ((previous_flag == True) | previous_flag.isna())
df_od['grp'] = group_start.cumsum()
# Exploratory density plots, currently disabled:
# df_od.loc[df_od['to_charge'], 'traveled_after_charged'].plot.kde(ind=range(0, 300, 20))
# (df_od.loc[df_od['to_charge'], 'seeking_duration'] / np.timedelta64(1, 'm')).plot.kde(ind=range(0, 90))

In [4]:
def process_grp(od, min_traveled=80, max_traveled=160, max_seeking=np.timedelta64(60, 'm')):
    """Flag a group for removal by setting its 'grp' column to 0.

    Only the LAST row of the group is inspected. The group is flagged when that row's
    'traveled_after_charged' is below ``min_traveled`` or above ``max_traveled``, or when
    its 'seeking_duration' exceeds ``max_seeking``. A missing (NaT) seeking duration
    never triggers the flag because the comparison evaluates to False.

    Parameters
    ----------
    od : pandas.DataFrame
        One non-empty group with the columns 'traveled_after_charged',
        'seeking_duration' and 'grp'.
    min_traveled, max_traveled : number, default 80 and 160
        Accepted range (inclusive) for the last row's 'traveled_after_charged'.
    max_seeking : timedelta-like, default 60 minutes
        Largest accepted 'seeking_duration' of the last row.

    Returns
    -------
    pandas.DataFrame
        ``od`` itself when the group is kept, otherwise a copy whose 'grp' column is 0.
        The input frame is never modified: pandas documents mutating the object handed
        to a groupby UDF as unsupported, so a new frame is returned instead.
    """
    last_label = od.index[-1]
    traveled = od.at[last_label, 'traveled_after_charged']
    seeking = od.at[last_label, 'seeking_duration']

    out_of_range = traveled > max_traveled or traveled < min_traveled
    if out_of_range or seeking > max_seeking:
        return od.assign(grp=0)
    return od

# Run the per-group check (with a tqdm progress bar), then keep only rows whose
# group id was not reset to 0.
df_od = df_od.groupby('grp').progress_apply(process_grp)
kept_rows = df_od['grp'] != 0
df_od = df_od.loc[kept_rows].reset_index(drop=True)

# Cast the label to int and express the trip's end time as fractional hours of the day.
trip_end = df_od['end_time'].dt
df_od = df_od.assign(
    to_charge=df_od['to_charge'].astype(int),
    time_of_day=trip_end.hour + trip_end.minute / 60,
)
data = df_od

100%|██████████| 26603/26603 [00:22<00:00, 1163.63it/s]


In [6]:
scaler = preprocessing.StandardScaler()

# Numeric inputs that are standardised; the list is written once and reused for the
# column labels of the scaled frame.
# NOTE(review): the scaler is fit on every row of df_od, i.e. before func() draws its
# train/test split -- confirm this is intended.
scaled_columns = ['traveled_after_charged', 'min_dis', 'mid_dis', 'mean_dis', 'max_dis', 'time_of_day']
tranformed_data = pd.DataFrame(
    scaler.fit_transform(df_od[scaled_columns]),
    columns=scaled_columns,
)

# Carry the unscaled identifier, timestamp and label columns over from df_od.
for passthrough in ('begin_time', 'Licence', 'to_charge'):
    tranformed_data[passthrough] = df_od[passthrough]

data = tranformed_data

In [7]:
def func(od, train_frac=0.8, seed=None):
    """Fit an XGBoost classifier for 'to_charge' on a random split and report test metrics.

    Each row is assigned to the training set independently with probability
    ``train_frac``; the remaining rows form the test set. Recall, precision and
    accuracy on the test set are printed.

    Parameters
    ----------
    od : pandas.DataFrame
        Must contain the six feature columns listed below plus the 'to_charge' label.
    train_frac : float, default 0.8
        Probability that a row lands in the training set.
    seed : int or None, default None
        Seed for the split. ``None`` keeps the previous behaviour of drawing from
        NumPy's global random state; an int makes the split reproducible.

    Returns
    -------
    tuple
        ``(test_y, predict_y, gbm)``: the test labels, the predicted labels for the
        test rows, and the fitted ``xgb.XGBClassifier``.
    """
    feature_columns_to_use = ['time_of_day', 'min_dis', 'max_dis', 'mean_dis', 'mid_dis', 'traveled_after_charged']

    # A dedicated RandomState keeps a seeded split independent of the global generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    msk = rng.rand(len(od)) < train_frac
    train_set = od[msk]
    test_set = od[~msk]

    train_X = train_set[feature_columns_to_use].to_numpy()
    train_y = train_set['to_charge']
    test_X = test_set[feature_columns_to_use].to_numpy()
    test_y = test_set['to_charge']

    # scale_pos_weight up-weights the positive class (to_charge == 1).
    gbm = xgb.XGBClassifier(verbosity=1, max_depth=10, learning_rate=0.05, n_estimators=500, scale_pos_weight=6.7)
    gbm.fit(train_X, train_y)
    predict_y = gbm.predict(test_X)

    recall = metrics.recall_score(test_y, predict_y)
    precision = metrics.precision_score(test_y, predict_y)
    accuracy = metrics.accuracy_score(test_y, predict_y)

    print("recall: %.2f, precision: %.2f, accuray: %.2f" % (recall, precision, accuracy))
    return test_y, predict_y, gbm

# Train on the standardised frame; keep the test labels, the test predictions and the
# fitted classifier (the classifier is pickled in the final cell).
test_y, predict_y, gbm = func(data)

recall: 0.80, precision: 0.41, accuray: 0.84


In [10]:
# Build the output location with os.path.join instead of a backslash literal: sequences
# such as '\w', '\m' and '\S' are invalid string escapes, and on non-Windows systems the
# backslashes would become part of a single file name rather than directory separators.
model_dir = os.path.join('charging_behavior', 'whether_to_charge')
os.makedirs(model_dir, exist_ok=True)  # make sure the target directory exists

# save whether charge prediction model
with open(os.path.join(model_dir, 'model_80train.pickle'), 'wb') as f:
    pickle.dump(gbm, f)

# save StandardScaler model
with open(os.path.join(model_dir, 'StandardScaler.pickle'), 'wb') as f:
    pickle.dump(scaler, f)