In [None]:
import os
from pathlib import Path

from cryptography.fernet import Fernet
import json

import numpy as np
import pandas as pd
import geopandas as gpd
import zipfile

import plotly.express as px

# Transit comparison

Stop-to-stop flows for transit relations are available from a traffic model for the city of Munich. They are stored in encrypted form in `transit_reference.json.enc` and can only be read with the matching encryption key. To use the data, place a file called `encryption_key` in this folder so that the reference data can be decrypted. You can obtain the encryption key from TU Munich.

In [None]:
# Abort early when the key file needed to decrypt the reference data is missing
key_file = Path("encryption_key")

if not key_file.exists():
    raise RuntimeError("The encryption key is not accessible")

## Settings

In [None]:
# Directory that holds the inputs read by the cells below
# (the GTFS archive "latest.zip" and "stop_traversals.csv")
input_path = Path("/home/shoerl/tum/output")
# Simulated counts are divided by this factor in the comparison below
# (presumably the sampled share of the population -- TODO confirm)
input_sampling_factor = 0.01

In [None]:
# Verify that every required input is present before loading anything

# GTFS archive
gtfs_path = input_path / "latest.zip"
assert gtfs_path.exists()

# Stop traversal data from the simulation
assert (input_path / "stop_traversals.csv").exists()

# Zone data, resolved relative to the working directory
assert Path("zones.gpkg").exists()

## Load data

In [None]:
# Read the raw key bytes from disk
encryption_key = Path("encryption_key").read_bytes()

# Decrypt the reference data and parse the resulting JSON document
encrypted_reference = Path("transit_reference.json.enc").read_bytes()
decrypted_reference = Fernet(encryption_key).decrypt(encrypted_reference)
reference = json.loads(decrypted_reference.decode())

In [None]:
# Zoning data: read all zones, then keep only the rows whose zone_id is "mvv"
df_zones = gpd.read_file("zones.gpkg").loc[
    lambda zones: zones["zone_id"] == "mvv"
]

In [None]:
# Load and clean GTFS data: read the stops table straight from the archive
with zipfile.ZipFile(gtfs_path) as archive, archive.open("stops.txt") as stops_file:
    df_stops = pd.read_csv(stops_file, sep = ",")

# Build point geometries from the lon/lat columns (EPSG:4326) and project
# them to EPSG:25832
stop_points = gpd.points_from_xy(df_stops["stop_lon"], df_stops["stop_lat"])
df_stops = gpd.GeoDataFrame(df_stops.assign(geometry = stop_points), crs = "EPSG:4326")
df_stops = df_stops.to_crs("EPSG:25832")

# Keep only the stops that lie within one of the selected zones
df_stops = gpd.sjoin(df_stops, df_zones, predicate = "within")

In [None]:
# Load the simulated stop traversals (semicolon-separated CSV); the comparison
# below reads its area_id, departure_time, arrival_time, person_id, trip_index
# and leg_index columns
df_simulation = pd.read_csv(input_path / "stop_traversals.csv", sep = ";")

## Match stop identifiers

In [None]:
# Map every stop name used in the reference relations to the GTFS parent
# station that its stops belong to. Names without a match, or with more than
# one candidate parent, are reported and left out of the mapping.
names = {relation[key] for relation in reference for key in ("from", "to")}

area_mapping = {}

# Only stops that carry a parent station can be mapped; this mask does not
# depend on the name, so compute it once outside the loop
has_parent = df_stops["parent_station"].notna()

# Iterate in sorted order so that the diagnostics printed below are
# reproducible between runs (iterating the set directly depends on string
# hash randomization)
for name in sorted(names, key = str):
    is_candidate = has_parent & (df_stops["stop_name"] == name)
    parents = df_stops.loc[is_candidate, "parent_station"].unique()

    if len(parents) == 0:
        print("No match for", name)
    elif len(parents) > 1:
        print("Multiple parents for", name, ":", parents)
    else:
        area_mapping[name] = parents[0]

In [None]:
# Attach the matched area identifiers to every reference relation.
#
# Stop names that could not be matched to exactly one parent station are
# absent from area_mapping. Check for them up front so that all offending
# names are reported at once and no relation is left half-updated.
unmatched = sorted(
    {
        relation[key]
        for relation in reference
        for key in ("from", "to")
        if relation[key] not in area_mapping
    },
    key = str,
)

if unmatched:
    raise KeyError("No unique stop area found for: {}".format(", ".join(map(str, unmatched))))

for relation in reference:
    relation["from_area_id"] = area_mapping[relation["from"]]
    relation["to_area_id"] = area_mapping[relation["to"]]

## Perform comparison

In [None]:
# Build comparison data frame
#
# For every time slot and reference relation, count the simulated legs that
# traverse both the origin and the destination area, scale the count by the
# sampling factor and store it next to the reference value.
slots = ["total", "at7", "at8", "at16", "at17"]

# The per-area subsets of the traversals do not depend on the slot, so select
# them once per relation instead of once per (slot, relation) pair
relation_traversals = [
    (
        relation,
        df_simulation[df_simulation["area_id"] == float(relation["from_area_id"])],
        df_simulation[df_simulation["area_id"] == float(relation["to_area_id"])],
    )
    for relation in reference
]

records = []

for slot in slots:
    # "at<H>" slots restrict the comparison to one hour; "total" is unfiltered
    hour = int(slot.replace("at", "")) if slot.startswith("at") else None

    for relation, df_from, df_to in relation_traversals:
        if hour is not None:
            # Departure at the origin from <hour> onwards and arrival at the
            # destination before the next hour (times presumably in seconds
            # since midnight -- TODO confirm)
            df_from = df_from[df_from["departure_time"] >= hour * 3600]
            df_to = df_to[df_to["arrival_time"] < (hour + 1) * 3600]

        # Pair origin and destination traversals that belong to the same leg.
        # NOTE(review): the merge does not check that the origin is traversed
        # before the destination -- confirm whether direction matters here.
        df_joint = pd.merge(df_from, df_to, on = ["person_id", "trip_index", "leg_index"])

        # Scale the sampled count up to the full population
        simulation = len(df_joint) / input_sampling_factor

        records.append({
            "relation": "{} - {}".format(relation["from"], relation["to"]),
            "reference": relation[slot], "simulation": simulation,
            "type": relation["type"], "slot": slot
        })

df_comparison = pd.DataFrame.from_records(records)

# Round before converting: a plain astype(int) truncates, which can be off by
# one when the sampling factor does not divide the count evenly
df_comparison["simulation"] = df_comparison["simulation"].round().astype(int)

# Relative deviation of the simulation from the reference
df_comparison["delta"] = (df_comparison["simulation"] - df_comparison["reference"]) / df_comparison["reference"]

In [None]:
# Show comparison table (one row per time slot and reference relation)
df_comparison

In [None]:
# Plot comparison data: reference value (x) against the scaled simulated
# count (y), colored by relation type; rows of all time slots share this plot
px.scatter(df_comparison, x = "reference", y = "simulation", color = "type")