In [1]:
import multiprocessing
import psutil
import sys

import geopandas as gpd
import shapely.geometry
import pandas as pd
import plotly.express as px


from railrailrail.network.stage import Stage
from railrailrail.railgraph import RailGraph

from analysis.analysis_helpers import get_stage_journeys

# Lift every pandas display limit so DataFrames are rendered without truncation.
for display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(display_option, None)

# Warn on stderr when the currently available RAM is below the threshold (in GB).
minimum_available_ram_GB = 3
available_ram_bytes = psutil.virtual_memory().available
if available_ram_bytes < minimum_available_ram_GB * 1e9:
    low_ram_warning = (
        "Warning: Less than %dGB RAM available. This program may crash with an Out-Of-Memory error."
        % minimum_available_ram_GB
    )
    print(low_ram_warning, file=sys.stderr)


def make_stage_journeys_dataframe(stage: str, journeys: dict):
    """Build a DataFrame of journeys for one stage, ordered by circuity.

    Args:
        stage: Identifier of the stage. It is attached to the returned frame
            as the plain attribute ``.name``.
        journeys: Mapping whose keys are ``(start, end)`` pairs and whose
            values are 6-item sequences ``(path_distance, haversine_distance,
            nodes, edges, costs, total_cost)``. May be empty.

    Returns:
        A DataFrame with one row per journey and the columns
        ``path_distance``, ``haversine_distance``, ``nodes``, ``edges``,
        ``costs``, ``total_cost``, ``start``, ``end`` and ``circuity``
        (``path_distance / haversine_distance``), sorted by ``circuity``,
        ``haversine_distance``, ``path_distance``, ``start``, ``end``.
        A zero ``haversine_distance`` is not special-cased, so such rows get
        whatever the division yields (inf or NaN).
    """
    df = (
        pd.DataFrame.from_dict(
            journeys,
            orient="index",
            columns=[
                "path_distance",
                "haversine_distance",
                "nodes",
                "edges",
                "costs",
                "total_cost",
            ],
        )
        .reset_index()
        .rename(columns={"index": "journey"})
    )

    # Naming the two endpoint columns explicitly keeps this frame two columns
    # wide even when there are no journeys; without it an empty input produced
    # a zero-column frame and the assignment below raised ValueError.
    endpoints = pd.DataFrame(
        df["journey"].tolist(), index=df.index, columns=["start", "end"]
    )
    df[["start", "end"]] = endpoints
    df = df.drop(columns=["journey"])

    df["circuity"] = df["path_distance"] / df["haversine_distance"]

    df = df.sort_values(
        by=["circuity", "haversine_distance", "path_distance", "start", "end"],
    )

    # Set last: a plain attribute is not carried over to derived frames.
    df.name = stage
    return df


def get_mean_distances_per_station(
    df: pd.DataFrame, rail_graph: RailGraph
) -> pd.DataFrame:
    """Average circuity and distances of the journeys starting at each station.

    Args:
        df: Journeys frame with at least the columns ``start``, ``total_cost``,
            ``circuity``, ``path_distance`` and ``haversine_distance``.
        rail_graph: Graph whose ``station_code_to_station`` mapping supplies
            the ``station_name`` for every ``start`` code.

    Returns:
        One row per ``start`` station with the columns ``start``,
        ``station_name``, ``mean_path_distance``, ``mean_haversine_distance``
        and ``mean_circuity``.
    """
    # Zero-cost rows are trips between stations of the same interchange; drop them.
    costed_journeys = df.query("total_cost != 0")

    station_means = (
        costed_journeys.groupby("start")
        .agg(
            mean_circuity=("circuity", "mean"),
            mean_path_distance=("path_distance", "mean"),
            mean_haversine_distance=("haversine_distance", "mean"),
        )
        .reset_index()
    )

    def lookup_station_name(station_code):
        return rail_graph.station_code_to_station[station_code].station_name

    station_means["station_name"] = station_means["start"].apply(lookup_station_name)

    output_columns = [
        "start",
        "station_name",
        "mean_path_distance",
        "mean_haversine_distance",
        "mean_circuity",
    ]
    return station_means[output_columns]


def contains_point(
    polygon: shapely.geometry.polygon.Polygon, lat: float, lon: float
) -> bool:
    """Return whether ``polygon`` contains the location given by ``lat`` and ``lon``.

    The point is constructed as ``Point(lon, lat)``, i.e. ``lon`` is passed
    first and ``lat`` second.
    """
    location = shapely.geometry.Point(lon, lat)
    return polygon.contains(location)


def _first_table_cell_text(description: str) -> str:
    """Return the stripped text between the first ``<td>`` and the next ``</td>``.

    If a tag is absent, the corresponding split leaves the text unchanged.
    """
    after_opening_tag = description.split("<td>", 1)[-1]
    before_closing_tag = after_opening_tag.split("</td>", 1)[0]
    return before_closing_tag.strip()


# Load the planning-area polygons and derive each area's name from its
# HTML "Description" field.
planning_areas = gpd.read_file("MasterPlan2019PlanningAreaBoundaryNoSea.geojson")
planning_areas["Planning Area"] = planning_areas["Description"].apply(
    _first_table_cell_text
)


# Stages to process, plus the per-stage result containers filled in by the
# worker-pool loop: the journeys frame with its rail graph, and the
# per-station mean distances.
stages = [*Stage.stages]
journeys_by_stage: dict[str, dict[str, pd.DataFrame | RailGraph]] = {}
mean_distances_per_station_by_stage: dict[str, pd.DataFrame] = {}


# Pair every planning-area name with its polygon once. The previous code called
# ``planning_areas.iterrows()`` inside the per-station lookup, which rebuilt a
# Series for every area, for every station, for every stage.
planning_area_shapes = list(
    zip(planning_areas["Planning Area"], planning_areas["geometry"])
)


def find_planning_area(rail_graph: RailGraph, station_code: str) -> str | None:
    """Return the name of the first planning area containing the station.

    Areas are checked in the row order of ``planning_areas``. Returns None
    when no polygon contains the station's coordinates.
    """
    coordinates = rail_graph.station_coordinates[station_code]
    return next(
        (
            area_name
            for area_name, area_polygon in planning_area_shapes
            if contains_point(
                area_polygon, coordinates.latitude, coordinates.longitude
            )
        ),
        None,
    )


with multiprocessing.get_context("fork").Pool(
    None
) as pool:  # https://bugs.python.org/issue33725 https://stackoverflow.com/a/72605750
    # Results arrive in completion order, not in the order of ``stages``.
    for stage, journeys, rail_graph in pool.imap_unordered(get_stage_journeys, stages):
        stage_journeys_df = make_stage_journeys_dataframe(stage, journeys)
        journeys_by_stage[stage] = {
            "df": stage_journeys_df,
            "rail_graph": rail_graph,
        }

        # Stations within interchanges have different geographic coordinates.
        # This causes them to have different mean circuity. For example, see BP1/NS4/JS1 Choa Chu Kang.
        station_means = get_mean_distances_per_station(
            df=stage_journeys_df,
            rail_graph=rail_graph,
        )
        mean_distances_per_station_by_stage[stage] = station_means

        # Tag each station with the planning area whose polygon contains it
        # (None if it falls outside every polygon).
        station_means["planning_area"] = station_means["start"].apply(
            lambda station_code: find_planning_area(rail_graph, station_code)
        )

In [2]:
# Planning areas listed under the region they are assigned to.
_planning_areas_by_region = {
    "CENTRAL": [
        "BISHAN",
        "BUKIT MERAH",
        "BUKIT TIMAH",
        "DOWNTOWN CORE",
        "GEYLANG",
        "KALLANG",
        "MARINA EAST",
        "MARINA SOUTH",
        "MARINE PARADE",
        "MUSEUM",
        "NEWTON",
        "NOVENA",
        "ORCHARD",
        "OUTRAM",
        "QUEENSTOWN",
        "RIVER VALLEY",
        "ROCHOR",
        "SINGAPORE RIVER",
        "SOUTHERN ISLANDS",
        "STRAITS VIEW",
        "TANGLIN",
        "TOA PAYOH",
    ],
    "EAST": [
        "BEDOK",
        "CHANGI",
        "CHANGI BAY",
        "PASIR RIS",
        "PAYA LEBAR",
        "TAMPINES",
    ],
    "NORTH": [
        "CENTRAL WATER CATCHMENT",
        "LIM CHU KANG",
        "MANDAI",
        "SEMBAWANG",
        "SIMPANG",
        "SUNGEI KADUT",
        "WOODLANDS",
        "YISHUN",
    ],
    "NORTH-EAST": [
        "ANG MO KIO",
        "HOUGANG",
        "NORTH-EASTERN ISLANDS",
        "PUNGGOL",
        "SELETAR",
        "SENGKANG",
        "SERANGOON",
    ],
    "WEST": [
        "BOON LAY",
        "BUKIT BATOK",
        "BUKIT PANJANG",
        "CHOA CHU KANG",
        "CLEMENTI",
        "JURONG EAST",
        "JURONG WEST",
        "PIONEER",
        "TENGAH",
        "TUAS",
        "WESTERN ISLANDS",
        "WESTERN WATER CATCHMENT",
    ],
}

# Flatten to planning area -> region, with keys in alphabetical order.
planning_area_to_region = dict(
    sorted(
        (area, region)
        for region, areas in _planning_areas_by_region.items()
        for area in areas
    )
)

# Same mapping as a two-column frame, ready to be merged on "planning_area".
planning_area_to_region_df = pd.DataFrame(
    list(planning_area_to_region.items()),
    columns=["planning_area", "region"],
)

In [9]:
# Average the per-station metrics of stage "tel_4" within each planning area,
# order the areas by mean path distance, then attach each area's region.
stage_station_means = mean_distances_per_station_by_stage["tel_4"]
planning_area_means = (
    stage_station_means.groupby(by="planning_area")
    .mean(numeric_only=True)
    .reset_index()
    .sort_values(by=["mean_path_distance"])
)
kdf = planning_area_means.merge(
    planning_area_to_region_df, on="planning_area", how="left"
)
print("Number of active planning areas:", len(kdf["planning_area"]))
print(kdf.head(5))

# Human-readable axis/legend labels and a fixed colour per region.
axis_labels = {
    "mean_path_distance": "Average Path Distance",
    "mean_circuity": "Average Circuity",
    "planning_area": "Planning Area",
    "region": "Region",
}
region_colours = {
    "CENTRAL": "#ffa15a",
    "WEST": "#636efa",
    "EAST": "#00cc96",
    "NORTH-EAST": "fuchsia",
    "NORTH": "#ef553b",
}

# One labelled point per planning area; the x axis is logarithmic.
fig = px.scatter(
    kdf,
    x="mean_path_distance",
    y="mean_circuity",
    title="Average Circuity against Average Path Distance to all other stations (Averaged By Planning Area)",
    labels=axis_labels,
    color="region",
    color_discrete_map=region_colours,
    text="planning_area",
    hover_name="planning_area",
    size=None,
    size_max=10,
    log_x=True,
    log_y=False,
    width=1500,
    height=800,
)
# Centre the title and suffix the x-axis tick labels with "m".
fig.update_layout(title={"x": 0.5}, xaxis={"tickformat": "", "ticksuffix": "m"})
fig.update_traces(textposition="top center", textfont_size=10)
fig.show()

Number of active planning areas: 41
     planning_area  mean_path_distance  mean_haversine_distance  \
0           NEWTON        10001.637216              7973.126369   
1           ROCHOR        10194.051959              8085.013272   
2           MUSEUM        10260.895622              8313.172155   
3          ORCHARD        10615.232975              8295.278307   
4  SINGAPORE RIVER        10730.741859              8656.832768   

   mean_circuity   region  
0       1.288694  CENTRAL  
1       1.337619  CENTRAL  
2       1.268227  CENTRAL  
3       1.313227  CENTRAL  
4       1.307112  CENTRAL  


In [None]:
# Display the per-station mean distances stored under the "bplrt" stage key.
# With the display limits lifted at the top of the notebook, this renders the full frame.
mean_distances_per_station_by_stage["bplrt"]