# Plot stop-to-stop segments with speed and delay

In [1]:
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import pandas as pd

from segment_speed_utils import helpers, gtfs_schedule_wrangling
from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date

# Config file handed to helpers.get_parameters below; the path is relative,
# so it resolves against the directory the notebook is run from.
CONFIG_PATH = "./scripts/config.yml"



In [2]:
# Look up the "stop_segments" entry of the config. The result is indexed with
# 'stop_delay_diagnostics' below to build the parquet file name.
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [4]:
# Quick check: load only the trip identifiers and the timestamp column with
# pandas so the dates present in the file can be inspected in the next cell.
df = pd.read_parquet(
    f"{SEGMENT_GCS}{STOP_SEG_DICT['stop_delay_diagnostics']}_{analysis_date}.parquet", 
    columns = ["gtfs_dataset_key", "trip_id", 
               "max_time",
              ]
)

In [8]:
# Derive the calendar date of every max_time timestamp, then tabulate how
# many rows fall on each date.
df = df.assign(
    date = lambda frame: pd.to_datetime(frame["max_time"]).dt.date
)

df["date"].value_counts()

2023-02-15    832134
Name: date, dtype: int64

In [21]:
# Read the same parquet again, this time with dask and with the columns the
# delay and speed summaries below draw from.
# NOTE: this rebinds `df`; the pandas frame loaded earlier is replaced.
df = dd.read_parquet(
    f"{SEGMENT_GCS}{STOP_SEG_DICT['stop_delay_diagnostics']}_{analysis_date}.parquet", 
    columns = ["gtfs_dataset_key", "_gtfs_dataset_name", "trip_id", 
               "shape_array_key", "feed_key", "name", 
               "stop_id", "stop_sequence", 
               "max_time",
               "actual_minus_scheduled_sec", "speed_mph",
              ]
)

In [40]:
def get_distribution(df):
    """
    Summarize actual_minus_scheduled_sec for every trip.

    Rows are grouped by (gtfs_dataset_key, trip_id) and the minimum,
    maximum and mean of actual_minus_scheduled_sec are taken in a single
    aggregation pass.

    Parameters
    ----------
    df : dask or pandas DataFrame
        Must contain the columns gtfs_dataset_key, trip_id and
        actual_minus_scheduled_sec.

    Returns
    -------
    pandas.DataFrame
        One row per (gtfs_dataset_key, trip_id) with the columns
        gtfs_dataset_key, trip_id, min_delay, max_delay, mean_delay.
    """
    group_cols = ["gtfs_dataset_key", "trip_id"]
    col = "actual_minus_scheduled_sec"

    # A single groupby yields all three statistics at once, so there are
    # no per-statistic frames to build and no merges to stitch them together.
    stats = (
        df.groupby(group_cols)[col]
        .agg(["min", "max", "mean"])
        .rename(columns = {
            "min": "min_delay",
            "max": "max_delay",
            "mean": "mean_delay",
        })
        .reset_index()
    )

    # A dask result still has to be materialized; a pandas result is
    # already in memory and has no .compute().
    if hasattr(stats, "compute"):
        stats = stats.compute()

    return stats

In [41]:
# Per-trip min / max / mean delay; get_distribution returns a computed frame.
delay = get_distribution(df)

In [42]:
# One row per (gtfs_dataset_key, trip_id): two key columns plus three statistics.
delay.shape

(24952, 5)

In [48]:
# How many trips have a max_delay of at least 1, 2, ..., 11 hours?
for hours in range(1, 12):
    threshold_sec = hours * 60 * 60
    late_trips = delay.loc[delay["max_delay"] >= threshold_sec]
    print(f"max_delay is over {hours} hr: {len(late_trips)}")

max_delay is over 1 hr: 604
max_delay is over 2 hr: 173
max_delay is over 3 hr: 121
max_delay is over 4 hr: 81
max_delay is over 5 hr: 37
max_delay is over 6 hr: 13
max_delay is over 7 hr: 10
max_delay is over 8 hr: 8
max_delay is over 9 hr: 4
max_delay is over 10 hr: 1
max_delay is over 11 hr: 1


In [10]:
def get_trip_departure_hour(df: "dd.DataFrame | pd.DataFrame") -> pd.DataFrame:
    """
    Approximate each trip's departure hour from its earliest max_time.

    In the future, we would want to get trip departure hour based
    off of scheduled trips, not RT.

    Parameters
    ----------
    df : dask or pandas DataFrame
        Must contain gtfs_dataset_key, trip_id and a datetime-like
        max_time column (the .dt accessor is used on it).

    Returns
    -------
    pd.DataFrame
        An in-memory frame with one row per (gtfs_dataset_key, trip_id)
        and the columns gtfs_dataset_key, trip_id, departure_hour.
    """
    trip_cols = ["gtfs_dataset_key", "trip_id"]
    timestamp_col = "max_time"

    # Earliest timestamp per trip, reduced to its hour component.
    trip_df = (
        df.groupby(trip_cols)[timestamp_col]
        .min()
        .dt.hour
        .reset_index()
    )

    # A dask result still has to be materialized; a pandas result is
    # already in memory and has no .compute().
    if hasattr(trip_df, "compute"):
        trip_df = trip_df.compute()

    return trip_df.rename(columns = {timestamp_col: "departure_hour"})

In [11]:
# Trip-level departure hour; the helper calls .compute(), so this is an
# in-memory frame rather than a dask collection.
trip_departure = get_trip_departure_hour(df)

# Attach departure_hour to every row of df through the trip keys, then drop
# max_time, which the stop-level aggregation below does not use.
df2 = df.merge(
    trip_departure,
    on = ["gtfs_dataset_key", "trip_id"],
    how = "inner"
).drop(columns = "max_time")

In [12]:
# Grouping keys: one output row per operator / feed / stop / stop_sequence /
# departure_hour combination.
stop_cols = [
    "gtfs_dataset_key",
    "_gtfs_dataset_name",
    #"shape_array_key",
    "feed_key",
    "name",
    "stop_id",
    "stop_sequence",
    "departure_hour",
]

# Average delay and speed per group, plus the delay converted from seconds
# to minutes (rounded to one decimal), built as a single chain.
stop_metrics = (
    df2.groupby(stop_cols)
    .agg({
        "actual_minus_scheduled_sec": "mean",
        "speed_mph": "mean",
    })
    .reset_index()
    .assign(
        actual_minus_scheduled_min = lambda frame: (
            frame.actual_minus_scheduled_sec.divide(60).round(1)
        ),
    )
)

In [13]:
# Scheduled stops for the analysis date, limited to the join keys and geometry.
stops = helpers.import_scheduled_stops(
    analysis_date,
    columns = ["feed_key", "stop_id", "geometry"],
)

# Attach stop geometry to the metrics, materialize the result, and wrap it
# in a GeoDataFrame.
stop_metrics_with_geom = gpd.GeoDataFrame(
    gtfs_schedule_wrangling.attach_stop_geometry(stop_metrics, stops).compute()
)

In [14]:
# Value of _gtfs_dataset_name used below to single out one operator for the
# map and the charts.
one_operator = "Big Blue Bus VehiclePositions"

In [15]:
# Keep only the rows that belong to the selected operator.
is_selected_operator = stop_metrics_with_geom["_gtfs_dataset_name"] == one_operator
gdf = stop_metrics_with_geom.loc[is_selected_operator]

In [16]:
# Interactive map of the operator's stops, colored by delay in minutes.
gdf.explore(
    column = "actual_minus_scheduled_min",
    tiles = "CartoDB Positron",
)

In [17]:
import altair as alt

In [18]:
def make_chart(df, stop):
    """
    Bar chart of delay in minutes by departure hour for one stop.

    Parameters
    ----------
    df : DataFrame or GeoDataFrame
        Must contain the columns stop_id, departure_hour,
        actual_minus_scheduled_min and geometry.
    stop : value compared against the stop_id column
        The stop to chart.

    Returns
    -------
    alt.Chart
        Bars with departure_hour on x and actual_minus_scheduled_min on y,
        titled with the stop so that several charts shown one after
        another can be told apart.
    """
    # Filter first so only this stop's rows are carried forward. The geometry
    # column is dropped before charting -- presumably so Altair receives a
    # plain table; confirm if geometry is ever needed here.
    stop_df = df[df.stop_id==stop].drop(columns = "geometry")

    chart = (
        alt.Chart(stop_df)
        .mark_bar()
        .encode(
            x=alt.X("departure_hour:Q", title="Hour"),
            y=alt.Y("actual_minus_scheduled_min:Q",
                    title="Minutes Delayed"),
        )
        .properties(title=f"Stop {stop}")
    )

    return chart

In [None]:
# Show the delay-by-hour chart for the first ten distinct stops of the operator.
for stop_id in gdf["stop_id"].unique()[:10]:
    display(make_chart(gdf, stop_id))