In [43]:
import pandas as pd
import datetime
import sys
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import os
sys.path.append('..')
from beb_chargers.gtfs_beb import GTFSData, to_datetime_safe

# Updated overview
## Connecting realtime data to trip IDs


## On-Time Performance

## Energy Consumption

# Testing on April 1st Data

In [5]:
# Load the April 1st realtime feed (pickle provided by Zack) with a fresh 0..n-1 index.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
rt_df = (
    pd.read_pickle('../beb_chargers/data/realtime/metro/2024_04_01.pkl')
    .reset_index(drop=True)
    .astype({'vehicle_id': str, 'scheduleDeviation': int})
)
# locationtime is parsed as epoch seconds in UTC, then expressed in Pacific time
rt_df['locationtime'] = (
    pd.to_datetime(rt_df['locationtime'].astype(int), unit='s', utc=True)
    .dt.tz_convert('US/Pacific')
)

In [6]:
# Load the vehicle performance export downloaded from ChargePoint; bus names are kept as strings
vc_df = pd.read_csv('../beb_chargers/data/viriciti/apr1data_v1.csv').astype({'Name': str})
# Parse the timestamps and express them in Pacific time
# (tz_convert requires the parsed values to be timezone-aware)
vc_df['ISO Time'] = pd.to_datetime(vc_df['ISO Time']).dt.tz_convert('US/Pacific')

# Keep only the ChargePoint columns that have at least one non-missing value
na_sums = vc_df.isna().sum()
used_cols = na_sums.index[na_sums != len(vc_df)].tolist()
vc_df = vc_df[used_cols]

In [7]:
# Restrict the realtime feed to buses that also appear in the ChargePoint export
vids = vc_df['Name'].unique()
in_chargepoint = rt_df['vehicle_id'].isin(vids)
rt_df = rt_df.loc[in_chargepoint]

In [8]:
# Read in static GTFS data
gtfs = GTFSData.from_dir('../beb_chargers/data/gtfs/metro_mar24')
# Cast the ID columns to str so they can be merged on later
# NOTE(review): assumes the realtime trip_id values are strings too -- confirm
gtfs.trips_df = gtfs.trips_df.astype({'trip_id': str, 'route_id': str})

### Connect vehicle number and time to trips

In [10]:
# First and last realtime observation for every (vehicle, trip) pair
trip_gb = rt_df.sort_values(by='locationtime').groupby(['vehicle_id', 'trip_id'])
trip_times_df = trip_gb['locationtime'].agg(min_time='min', max_time='max')
# Order each vehicle's trips chronologically by their first observation
trip_times_df = trip_times_df.sort_values(by=['vehicle_id', 'min_time'])

In [11]:
# pseudo_max_time is the first observation of the same vehicle's next trip;
# a vehicle's final trip falls back to its own last observation (max_time)
next_trip_start = trip_times_df.groupby('vehicle_id')['min_time'].shift(-1)
trip_times_df['pseudo_max_time'] = next_trip_start.fillna(trip_times_df['max_time'])

Okay, now we have a nice list of start and end times (including a "cleaned" version of end times). We want to use this to map all `(vehicle_id, datetime)` tuples to the appropriate trip.

In [12]:
trip_times_df

In [14]:
time_to_trip_id = trip_times_df.reset_index()

In [15]:
# Example lookup: trips of one vehicle that started at or before a given time.
# vid_test / dt_test were not defined in any cell, so this cell raised NameError on a
# fresh kernel. The sample values are the vehicle and timestamp this notebook later
# passes to get_time_neighborhood; change them to probe other vehicles/times.
vid_test = '4703'
dt_test = pd.to_datetime('2024-04-01 05:09:30-07:00')
time_to_trip_id[
    (time_to_trip_id['vehicle_id'] == vid_test)
    & (time_to_trip_id['min_time'] <= dt_test)
]

In [None]:
### Test for a single vehicle

In [16]:
trip_times_df

In [17]:
# Slice each table down to bus 4700 for a single-vehicle test
rt_4700 = rt_df.loc[rt_df['vehicle_id'] == '4700']
vc_4700 = vc_df.loc[vc_df['Name'] == '4700']
times_4700 = trip_times_df.reset_index().loc[lambda d: d['vehicle_id'] == '4700']

Can we do this backwards relative to my original plan?

In [18]:
# Index bus 4700's trips by trip_id, ordered by first observation.
# NOTE(review): not re-runnable as-is -- once trip_id is the index, set_index('trip_id')
# no longer finds it as a column.
times_4700 = times_4700.set_index('trip_id').sort_values(by='min_time')

In [19]:
trip_times_df

In [20]:
# Per-trip energy use. For every (vehicle, trip) pair, take the ChargePoint rows of that
# vehicle whose 'ISO Time' lies between the trip's first and last realtime observation,
# and use the spread (max - min) of the energy and distance columns over that window.
# NOTE(review): this rebinds `vids`, which an earlier cell set to the ChargePoint bus names.
vids = list()
tids = list()
kwhs = list()
mis = list()
cons = list()

# Split the ChargePoint data by bus once, instead of re-filtering the full frame
# (and re-resetting the trip index) on every iteration
vc_by_vid = {name: grp for name, grp in vc_df.groupby('Name')}
no_vc_rows = vc_df.iloc[0:0]

trip_windows = zip(
    trip_times_df.index, trip_times_df['min_time'], trip_times_df['max_time']
)
for (vid, tid), t_start, t_end in trip_windows:
    vid_vc = vc_by_vid.get(vid, no_vc_rows)
    tid_df = vid_vc[vid_vc['ISO Time'].between(t_start, t_end)]

    kwh_used = tid_df['Energy used (kWh)'].max() - tid_df['Energy used (kWh)'].min()
    mi_driven = tid_df['Distance driven (mi)'].max() - tid_df['Distance driven (mi)'].min()

    vids.append(vid)
    tids.append(tid)
    kwhs.append(kwh_used)
    mis.append(mi_driven)
    # A trip with no recorded distance has no meaningful consumption rate: store NaN
    # rather than dividing by zero (which yields inf and distorts the histograms below)
    cons.append(kwh_used / mi_driven if mi_driven > 0 else float('nan'))


In [21]:
# Assemble the per-trip lists into one table, one row per (vehicle, trip)
df_by_trip = pd.DataFrame(
    dict(vehicle_id=vids, trip_id=tids, kwh=kwhs, miles=mis, kwh_per_mi=cons)
)

In [22]:
# Attach each trip's route_id from static GTFS.
# merge defaults to an inner join, so trips missing from gtfs.trips_df are dropped.
trip_routes = gtfs.trips_df[['trip_id', 'route_id']]
df_by_trip = pd.merge(df_by_trip, trip_routes, on='trip_id')

In [23]:
# The first two characters of the vehicle ID are used as the vehicle type label
df_by_trip['veh_type'] = df_by_trip['vehicle_id'].str.slice(stop=2)

In [24]:
df_by_trip

In [25]:
# Total energy and distance per (vehicle type, route), plus the aggregate consumption rate
veh_route_df = (
    df_by_trip.groupby(['veh_type', 'route_id'])[['kwh', 'miles']]
    .sum()
    .assign(kwh_per_mi=lambda totals: totals['kwh'] / totals['miles'])
)

In [27]:
# Swap route_id for the rider-facing route_short_name in the index.
# NOTE(review): this rebinds veh_route_df, so the cell cannot simply be re-run.
route_names = (
    gtfs.routes_df.reset_index()[['route_id', 'route_short_name']]
    .astype({'route_id': str})
)
veh_route_df = (
    veh_route_df.reset_index()
    .merge(route_names, on='route_id')
    .set_index(['veh_type', 'route_short_name'])
)

In [33]:
# Total energy and distance per vehicle type, plus the aggregate consumption rate.
# NOTE(review): the column is 'kwh_per_mile' here but 'kwh_per_mi' in the other summaries.
bus_type_totals = df_by_trip.groupby('veh_type')[['kwh', 'miles']].sum()
bus_type_df = bus_type_totals.assign(
    kwh_per_mile=bus_type_totals['kwh'] / bus_type_totals['miles']
)
bus_type_df

In [35]:
# Total energy and distance per individual vehicle, plus the aggregate consumption rate
veh_summary = (
    df_by_trip.groupby('vehicle_id')[['kwh', 'miles']]
    .sum()
    .assign(kwh_per_mi=lambda totals: totals['kwh'] / totals['miles'])
)

In [36]:
len(veh_summary.index.tolist())

In [38]:
veh_summary

In [40]:
# Bar chart of aggregate energy consumption per vehicle. Drawn on an explicit Axes with
# a title and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.bar(x=veh_summary.index.tolist(), height=veh_summary['kwh_per_mi'].to_numpy())
ax.set_xlabel('Vehicle ID')
ax.set_ylabel('Energy consumption (kWh/mi)')
ax.set_title('Energy consumption by vehicle')
# Vehicle IDs are long tick labels; rotate them so they do not overlap
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [58]:
# Overlaid histograms of per-trip consumption, one trace per vehicle type
fig = go.Figure(
    data=[
        go.Histogram(x=type_trips['kwh_per_mi'], name=type_label)
        for type_label, type_trips in df_by_trip.groupby('veh_type')
    ]
)
fig.update_layout(barmode='overlay')
# Semi-transparent bars keep overlapping distributions visible
fig.update_traces(opacity=0.75)
fig.show()


## Schedule deviations: how can we get these for each trip?
For each trip, identify the row with the minimum time (i.e., the start of the trip). Take the corresponding `scheduleDeviation` value.

In [59]:
# Earliest realtime observation of each trip (across all vehicles reporting that trip_id)
trip_min_times = rt_df.groupby('trip_id')['locationtime'].agg('min')

In [60]:
trip_min_times

In [61]:
# Look up the scheduleDeviation reported at each trip's first observation
first_obs = trip_min_times.to_frame().reset_index()
deviation_rows = rt_df[['trip_id', 'locationtime', 'scheduleDeviation']]
trip_delays = first_obs.merge(deviation_rows, on=['trip_id', 'locationtime'])
# scheduleDeviation is treated as seconds; subtracting it from the observed time gives
# the scheduled time implied by the realtime feed
deviation = pd.to_timedelta(trip_delays['scheduleDeviation'], unit='s')
trip_delays['scheduled_start_inferred'] = trip_delays['locationtime'] - deviation

In [65]:
trip_delays.head()

In [66]:
# Convert GTFS arrival_time values to datetimes using 2024-04-01 as the reference date.
# NOTE(review): presumably to_datetime_safe copes with GTFS times past 24:00:00 -- confirm
# in beb_chargers.gtfs_beb. The column is overwritten in place, so re-running this cell
# without reloading gtfs may not be safe.
gtfs.stop_times_df['arrival_time'] = to_datetime_safe(
    gtfs.stop_times_df['arrival_time'], datetime.datetime(2024, 4, 1)
)

In [67]:
gtfs.stop_times_df['trip_id'] = gtfs.stop_times_df['trip_id'].astype(str)

In [68]:
min_stop_times = gtfs.stop_times_df.groupby('trip_id')['arrival_time'].min()

In [69]:
# Add each trip's earliest scheduled arrival from static GTFS as scheduled_start_gtfs
gtfs_starts = min_stop_times.reset_index().rename(
    columns={'arrival_time': 'scheduled_start_gtfs'}
)
trip_delays = trip_delays.merge(gtfs_starts, on='trip_id')

In [70]:
trip_delays = trip_delays.set_index('trip_id')

In [71]:
rt_df.set_index('trip_id').loc['664463016']

In [72]:
# Difference between the start time implied by realtime data and the GTFS schedule.
# The GTFS times are naive, so they are localized to Pacific time before subtracting.
gtfs_start_local = trip_delays['scheduled_start_gtfs'].dt.tz_localize('US/Pacific')
start_diffs = trip_delays['scheduled_start_inferred'].sub(gtfs_start_local)
start_diffs

In [73]:
# Trips whose inferred and scheduled starts disagree by more than 300 s, in seconds
diff_seconds = start_diffs.dt.total_seconds()
diff_seconds[diff_seconds.abs() > 300]

**What have we learned from this?**
The scheduleDeviation value, combined with the trip start time, is usually well aligned with the static GTFS data. The exceptions to this trend point to some clear outliers -- for example, when there is a random blip in the data where we have a single point recorded way too early for a trip (which may cause problems elsewhere when we try to detect when trips start/end), or only a single point for the whole trip!

**What should we do about it?**
I think we should do some early cleanup for every trip, removing points that seem like they don't belong as part of that trip (maybe even reassigning them to another trip?)

# Cleaning up trip IDs in realtime data

In [74]:
rt_cleaned = rt_df.copy()

In [75]:
# Scan a window of 10 trips (starting at `offset`) for gaps of more than 300 s between
# consecutive observations of the same trip.
# NOTE(review): get_time_neighborhood is defined in a later cell; on a fresh kernel that
# cell has to be run before this one.
all_tids = rt_df['trip_id'].unique()
offset = 0
problem_dfs = list()
full_bad_dfs = list()
for tid in all_tids[offset:offset+10]:
    trip_df = rt_df.loc[rt_df['trip_id'] == tid].sort_values(by='locationtime')
    trip_df = trip_df.assign(next_time=trip_df['locationtime'].shift(-1))
    trip_df = trip_df.assign(
        gap_to_next=(trip_df['next_time'] - trip_df['locationtime']).dt.total_seconds()
    )
    # Observation times that are followed by a long silence
    bad_times = trip_df.loc[trip_df['gap_to_next'] > 300, 'locationtime']
    if not bad_times.empty:
        full_bad_dfs.append(trip_df)
    # Use the first vehicle seen on this trip
    vid = trip_df['vehicle_id'].iloc[0]
    for bt in bad_times:
        # TODO: this isn't always getting at the issue; often we're just identifying a change in trips
        window_start = bt - datetime.timedelta(minutes=2)
        problem_dfs.append(get_time_neighborhood(rt_df, vid, window_start))

In [394]:
len(full_bad_dfs)

In [380]:
len(problem_dfs)

This still isn't giving us the full picture, because we need all trips done by the vehicle.

In [397]:
full_bad_dfs[0][full_bad_dfs[0]['gap_to_next'] > 300]

### Why haven't I been trying to use `nextStop` all along??
I think that pulling in the next stop information can help us identify when a trip has been included mistakenly.

In [76]:
# Raw GTFS stop_times, with the join keys cast to strings
stop_times_df = pd.read_csv(
    '../beb_chargers/data/gtfs/metro_mar24/stop_times.txt'
).astype({'trip_id': str, 'stop_id': str})

In [77]:
# Attach the scheduled arrival/departure at each observation's nextStop.
# merge defaults to an inner join, so observations whose (trip_id, nextStop) has no
# matching stop_times row are dropped. rt_df is rebound, so re-running this cell merges
# against the already-merged frame.
stop_schedule = stop_times_df[['trip_id', 'stop_id', 'arrival_time', 'departure_time']]
rt_df = pd.merge(
    rt_df,
    stop_schedule,
    left_on=['trip_id', 'nextStop'],
    right_on=['trip_id', 'stop_id'],
)

In [78]:
rt_df[rt_df['vehicle_id'] == '4715'].sort_values(by='locationtime')

In [80]:
def get_time_neighborhood(realtime_df, vehicle_id, time_cutoff, n_rows=10):
    """
    Return the first observations of one vehicle at or after a cutoff time.

    :param realtime_df: DataFrame with 'vehicle_id' and 'locationtime' columns
    :param vehicle_id: value matched against the 'vehicle_id' column
    :param time_cutoff: earliest 'locationtime' to include (inclusive)
    :param n_rows: maximum number of rows to return (default 10)
    :return: up to n_rows matching rows of realtime_df, ordered by 'locationtime'
    """
    in_window = (realtime_df['vehicle_id'] == vehicle_id) & (
        realtime_df['locationtime'] >= time_cutoff)
    # Sort before truncating so the result is the chronological neighborhood of the
    # cutoff even when realtime_df is not time-ordered; mergesort is stable, so rows
    # with equal timestamps keep their original relative order.
    return realtime_df[in_window].sort_values(
        by='locationtime', kind='mergesort').head(n_rows)
    

In [81]:
get_time_neighborhood(rt_df, '4703', pd.to_datetime('2024-04-01 05:09:30-07:00'))

**Documenting issues found**
- Trip 635436106: one extra measurement an hour and a half after the trip has seemingly ended, when there are already 2 hours of downtime between trips (`get_time_neighborhood(rt_df, '4703', pd.to_datetime('2024-04-01 05:09:30-07:00'))`)

In [316]:
trips_gb = rt_cleaned.groupby('trip_id')
for tid, trip_df in trips_gb:
    # TODO: the per-trip cleanup logic was never written. The loop previously had no
    # body, which made this cell a SyntaxError; keep an explicit no-op until it exists.
    pass

One problem... what we really want to model is trip duration, not delay at the start of the next trip. This might be challenging with the arbitrary nature of trip_id value changes.

In [194]:
trip_delays['scheduleDeviation'].describe()

### Up next: bring in GTFS schedule data to get trip details

In [133]:
gtfs = GTFSData.from_dir('../beb_chargers/data/gtfs/metro_mar24')

## Testing XALT

In [229]:
xalt_df = pd.read_csv('../beb_chargers/data/viriciti/xalt_params_test.csv')

In [235]:
# Drop XALT columns that are entirely missing, then index the data by Pacific-time timestamp.
# NOTE(review): xalt_df is rebound and 'ISO Time' becomes the index, so this cell cannot
# be re-run without reloading the CSV.
xalt_na_sums = xalt_df.isna().sum()
xalt_used_cols = xalt_na_sums.index[xalt_na_sums != len(xalt_df)].tolist()
xalt_df = xalt_df[xalt_used_cols]
iso_local = pd.to_datetime(xalt_df['ISO Time']).dt.tz_convert('US/Pacific')
xalt_df = xalt_df.assign(**{'ISO Time': iso_local}).set_index('ISO Time')

In [246]:
xalt_df[xalt_df['Name'] == 4700]['AUX INVERTER POWER (kW)'].sort_index().resample('1min').mean().plot()

In [255]:
# xalt_df[xalt_df['Name'] == 4700]['BMU State of Charge'].sort_index().resample('1min').mean().plot(label='SOC')
# xalt_df[xalt_df['Name'] == 4700]['BMU Total Battery Voltage'].sort_index().resample('1min').mean().plot(label='Voltage')
# xalt_df[xalt_df['Name'] == 4700]['BMU Total Battery Current'].sort_index().resample('1min').mean().plot(label='Current')
xalt_df[xalt_df['Name'] == 4700]['Ambient Temperature'].sort_index().resample('1min').mean().plot(label='Temperature')
plt.legend()

In [248]:
xalt_used_cols

In [256]:
vc_df

# Designing the next ViriCiti downloads

### Which data fields do we want to include?
- distance driven per day
- filtered SoC
- energy used in service per day
- energy used per day
- energy used not in service per day
- energy consumed per day (not yet sure what the difference is)
- energy charged per day
- driven time per day

### What time range should be covered?
- To start, let's try all the dates Zack shared realtime data from

### What frequency should we use?
- Let's try 10 seconds for now

In [261]:
vc_df[vc_df['Name']=='4700'].sort_values(by='ISO Time')

# Testing on all data

In [166]:
# Load every realtime file in the directory (in sorted filename order), apply the same
# dtype and time zone handling as the single-day load, and stack the results.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
prefix = '../beb_chargers/data/realtime/kcm_realtime_new'
rt_files = sorted(os.listdir(prefix))
rt_dfs = list()
for pkl_name in rt_files:
    temp_df = (
        pd.read_pickle(f'{prefix}/{pkl_name}')
        .reset_index(drop=True)
        .astype({'vehicle_id': str, 'scheduleDeviation': int})
    )
    # locationtime is parsed as epoch seconds in UTC, then expressed in Pacific time
    temp_df['locationtime'] = (
        pd.to_datetime(temp_df['locationtime'].astype(int), unit='s', utc=True)
        .dt.tz_convert('US/Pacific')
    )
    rt_dfs.append(temp_df)
# Each file keeps its own 0..n-1 index, so index labels repeat in the combined frame
full_rt_df = pd.concat(rt_dfs)

In [180]:
rt_files

In [181]:
used_cols

In [173]:
full_rt_df.head()

In [169]:
# Keep only buses present in the ChargePoint data. `vids` is not used here because the
# per-trip energy cell rebinds it to a per-trip list of vehicle IDs, so in a
# top-to-bottom run it no longer holds the ChargePoint bus names.
full_rt_df = full_rt_df[full_rt_df['vehicle_id'].isin(vc_df['Name'].unique())]

In [170]:
len(full_rt_df)

In [179]:
full_rt_df.groupby('trip_id')['vehicle_id'].nunique().sort_values()