In [1]:
import datetime
import logging

import pandas as pd

In [None]:
def find_potential_ce(traj_df, with_status=True, range_threshold=200):
    """Collapse consecutive trajectory rows into potential charging events (pce).

    Rows are split into runs; a new run starts whenever, compared with the
    previous row, any of the following changes: ``occupied_from_od``,
    ``license``, the ``in_range`` flag, or (while in range) ``s_idx``.
    Each run becomes one row of the returned frame.

    Args:
        traj_df: Trajectory rows. The columns read here are ``s_idx``,
            ``s_dis``, ``occupied_from_od``, ``license``, ``timestamp`` and
            ``valid``. The caller's frame is not modified (the merge below
            produces a new frame).
        with_status: Not used by this function; kept so existing callers
            keep working.
        range_threshold: A row is flagged ``in_range`` when its ``s_dis`` is
            strictly below this value. Defaults to 200, the value that was
            previously hard-coded. NOTE(review): same unit as ``s_dis``;
            the output file name suggests metres -- confirm.

    Returns:
        A DataFrame with one row per run and the columns ``license``,
        ``begin_time``, ``begin_time_index``, ``end_time``,
        ``end_time_index``, ``charging_duration``, ``consecutive``, ``grp``,
        ``s_idx``, ``in_range``, ``status`` and ``valid``, on a fresh
        0..n-1 index.
    """
    logging.info('Finding pce')

    # NOTE(review): `cs_ce_info` is read from module scope and is not defined
    # in the visible part of this file -- it must exist before this is called.
    # The merged `max_distance` column is not used by the threshold below;
    # the merge is kept because it may affect the row set / index.
    traj_df = traj_df.merge(cs_ce_info['max_distance'] * 1.1, left_on='s_idx', right_index=True, how='left')

    traj_df['in_range'] = traj_df['s_dis'] < range_threshold

    # A run boundary is any row that differs from its predecessor in status,
    # vehicle, or in-range flag, or that switches station while in range.
    # The cumulative sum of the boundary flags gives each run its own id.
    status_changed = traj_df['occupied_from_od'] != traj_df['occupied_from_od'].shift()
    license_changed = traj_df['license'] != traj_df['license'].shift()
    range_changed = traj_df['in_range'] != traj_df['in_range'].shift()
    station_changed = traj_df['in_range'] & (traj_df['s_idx'] != traj_df['s_idx'].shift())
    traj_df['grp'] = (status_changed | license_changed | range_changed | station_changed).cumsum()

    # Build the GroupBy once and reuse it for every aggregate.
    runs = traj_df.groupby('grp')

    # First/last row label of each run, taken with vectorised first()/last()
    # instead of a Python-level apply. Grouping by the raw `grp` values is
    # positional, so duplicate row labels cannot cause misalignment.
    # Assumes row labels are non-null (first()/last() skip missing values).
    row_labels = traj_df.index.to_series().groupby(traj_df['grp'].values)

    begin_time = runs['timestamp'].first()
    end_time = runs['timestamp'].last()

    df_pce = pd.DataFrame({
        'license': runs['license'].first(),
        'begin_time': begin_time,
        'begin_time_index': row_labels.first(),
        'end_time': end_time,
        'end_time_index': row_labels.last(),
        'charging_duration': end_time - begin_time,
        'consecutive': runs.size(),
        'grp': runs['grp'].first(),
        's_idx': runs['s_idx'].first(),
        'in_range': runs['in_range'].first(),
        'status': runs['occupied_from_od'].first(),
        'valid': runs['valid'].all(),
    }).reset_index(drop=True)

    return df_pce


def filter_pce(t_df_pce, with_status=True, min_duration=pd.Timedelta(minutes=30)):
    """Select the charging events (ce) among potential charging events (pce).

    A pce is a ce when it is ``in_range``, its ``status`` is False and its
    ``charging_duration`` is strictly longer than ``min_duration``.

    Each ce closes a block made of itself and the non-ce rows of the same
    ``license`` that directly precede it. The returned ``valid`` flag is
    True only if every row of that block is valid (a ce row itself always
    counts as valid); all other returned columns come from the ce row.

    Args:
        t_df_pce: Frame with the columns ``license``, ``begin_time``,
            ``begin_time_index``, ``end_time``, ``end_time_index``,
            ``charging_duration``, ``grp``, ``s_idx``, ``in_range``,
            ``status`` and ``valid``. It is modified in place: ``valid`` is
            overwritten and the helper columns ``ce``, ``grp_end_flag`` and
            ``with_pre_grp`` are added.
        with_status: Not used by this function; kept so existing callers
            keep working.
        min_duration: Minimum (exclusive) duration for a pce to count as a
            ce. Defaults to 30 minutes, the value that was previously
            hard-coded.

    Returns:
        A DataFrame with one row per ce and the columns ``license``,
        ``begin_time``, ``begin_time_index``, ``end_time``,
        ``end_time_index``, ``charging_duration``, ``s_idx`` and ``valid``,
        on a fresh 0..n-1 index.
    """
    logging.info('Filtering pce')

    # Mark whether each pce qualifies as a ce.
    t_df_pce['ce'] = (t_df_pce['in_range']
                      & (~t_df_pce['status'])
                      & (t_df_pce['charging_duration'] > min_duration)
                      )

    # A ce row is treated as valid regardless of its own `valid` flag.
    t_df_pce['valid'] = t_df_pce['ce'] | t_df_pce['valid']

    # A block ends at the last row of a license and at every ce row.
    t_df_pce['grp_end_flag'] = (t_df_pce['license'] != t_df_pce['license'].shift(-1))
    t_df_pce.loc[t_df_pce['ce'], 'grp_end_flag'] = True

    # Cumulative sum from the bottom up: every row gets the id of the block
    # whose closing row is the nearest end flag at or below it.
    t_df_pce['with_pre_grp'] = t_df_pce.loc[::-1, 'grp_end_flag'].cumsum()[::-1]

    # Build the GroupBy once; sort=False keeps blocks in order of appearance.
    blocks = t_df_pce.groupby('with_pre_grp', sort=False)

    # The closing row of a block carries the values to report. This includes
    # `charging_duration`, which was previously filled from `s_idx` by mistake.
    closing_row_columns = ['license', 'begin_time', 'begin_time_index', 'end_time',
                           'end_time_index', 'charging_duration', 'grp', 'ce', 's_idx']
    columns = {name: blocks[name].last() for name in closing_row_columns}
    columns['begin_time_index'] = columns['begin_time_index'].astype(int)
    columns['end_time_index'] = columns['end_time_index'].astype(int)
    columns['valid'] = blocks['valid'].all()

    df_result = pd.DataFrame(columns)

    # Keep only blocks that are closed by a ce, then drop the helper columns.
    df_result = (df_result.loc[df_result['ce']]
                 .drop(['ce', 'grp'], axis=1)
                 .reset_index(drop=True))
    return df_result

# Driver: load the inputs, extract potential charging events, keep the ones
# that qualify as charging events, and write the result to parquet.
# NOTE(review): `cs` is not referenced again in the visible code, while
# find_potential_ce reads a module-level `cs_ce_info` that is not defined
# here -- presumably it is derived from `cs` elsewhere; confirm.
cs = pd.read_csv('data/charging_station/ChargeLocation201706_wgs84.csv')
trajectories = pd.read_parquet('data/trajectory/trajectories_w_statuses')
print('Data loaded')
# One row per run of consecutive trajectory rows (see find_potential_ce).
df_pce = find_potential_ce(trajectories)
print('PCE extracted')
# filter_pce also adds helper columns to `df_pce` in place.
result = filter_pce(df_pce)
print('Filtered')
result.to_parquet('data/charging_event/charging_event_30min_200m_79station.parquet')