In [None]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, LineString

# Base directory containing GTFS feeds (one sub-directory per feed)
base_dir = "gtfs"

# Collect every sub-directory of base_dir. os.listdir() returns entries in
# arbitrary order, so the result is sorted to make the processing order (and
# therefore the order of features in the output) reproducible across runs.
sub_dirs = sorted(
    path
    for path in (os.path.join(base_dir, entry) for entry in os.listdir(base_dir))
    if os.path.isdir(path)
)

# Accumulators filled by the per-feed loop: one dict per stop / per shape
stops_features = []
routes_features = []

def _read_str_columns(path, required, optional=()):
    """Read selected columns of a GTFS table as strings.

    Columns listed in ``optional`` may be absent from the file; they are then
    added as all-missing columns so downstream code can rely on them.

    Raises:
        ValueError: if any column in ``required`` is absent from the file.
    """
    wanted = [*required, *optional]
    table = pd.read_csv(path, usecols=lambda name: name in wanted, dtype=str)
    missing = [name for name in required if name not in table.columns]
    if missing:
        raise ValueError(f"{path} is missing required column(s): {missing}")
    # reindex() both orders the columns and creates the absent optional ones.
    return table.reindex(columns=wanted)


for sub_dir in sub_dirs:
    feed_name = os.path.basename(sub_dir)
    print(f"Processing feed: {feed_name}")

    # ---------------------
    # Load routes.txt (route metadata)
    # ---------------------
    routes_file = os.path.join(sub_dir, "routes.txt")
    if os.path.exists(routes_file):
        routes_df = _read_str_columns(routes_file, ["route_id"], ["route_color"])
        # Rows without an id cannot be joined; duplicate ids would multiply
        # rows in the merges below, so only the first occurrence is kept.
        routes_df = routes_df.dropna(subset=["route_id"]).drop_duplicates(subset="route_id")
        # Normalise to "#RRGGBB"; a missing colour becomes black.
        routes_df = routes_df.assign(
            route_color="#" + routes_df["route_color"].fillna("000000").astype(str).str.zfill(6)
        )
    else:
        print(f"Missing routes.txt in {sub_dir}")
        routes_df = pd.DataFrame(columns=["route_id", "route_color"])

    # route_id -> colour lookup used for the stops below
    route_colors = dict(zip(routes_df["route_id"], routes_df["route_color"]))

    # ---------------------
    # Load trips.txt (trip_id → route_id mapping)
    # ---------------------
    trips_file = os.path.join(sub_dir, "trips.txt")
    if os.path.exists(trips_file):
        trips_df = _read_str_columns(trips_file, ["trip_id", "route_id"], ["shape_id"])
        # Only trip_id/route_id are needed for the stop mapping. Trips with
        # no shape_id are kept so their stops still get a route.
        trips_df = trips_df.dropna(subset=["trip_id", "route_id"])
    else:
        print(f"Missing trips.txt in {sub_dir}")
        trips_df = pd.DataFrame(columns=["trip_id", "route_id", "shape_id"])

    # ---------------------
    # Process Routes (shapes.txt)
    # ---------------------
    shapes_file = os.path.join(sub_dir, "shapes.txt")
    if os.path.exists(shapes_file):
        shapes_df = pd.read_csv(
            shapes_file,
            usecols=["shape_id", "shape_pt_lat", "shape_pt_lon", "shape_pt_sequence"],
            dtype={"shape_id": str},
        )
        shapes_df = shapes_df.dropna(subset=["shape_id", "shape_pt_lat", "shape_pt_lon"])

        # Order the points along each shape explicitly instead of trusting
        # the row order of the file.
        shapes_df = shapes_df.sort_values(["shape_id", "shape_pt_sequence"])

        # Aggregate the points of each shape (one row per shape_id)
        shapes_grouped = shapes_df.groupby("shape_id").agg({
            "shape_pt_lon": list,
            "shape_pt_lat": list,
        }).reset_index()

        # A LineString needs at least two points
        shapes_grouped = shapes_grouped[shapes_grouped["shape_pt_lon"].map(len) >= 2]

        # Attach one route per shape (the first trip that uses it). Joining
        # on the aggregated table rather than on the raw points avoids
        # repeating every point once per trip.
        shape_routes = (
            trips_df.dropna(subset=["shape_id"])
            .drop_duplicates(subset="shape_id")[["shape_id", "route_id"]]
        )
        shapes_grouped = shapes_grouped.merge(shape_routes, on="shape_id", how="left")

        # Convert grouped points into LineStrings
        shapes_grouped["geometry"] = [
            LineString(list(zip(lons, lats)))
            for lons, lats in zip(shapes_grouped["shape_pt_lon"], shapes_grouped["shape_pt_lat"])
        ]

        # Assign route colors; fill each column with its own placeholder so
        # that a colour value never ends up in route_id.
        shapes_grouped = shapes_grouped.merge(routes_df, on="route_id", how="left")
        shapes_grouped["route_id"] = shapes_grouped["route_id"].fillna("")
        shapes_grouped["route_color"] = shapes_grouped["route_color"].fillna("#000000")

        # Add feed name
        shapes_grouped["feed"] = feed_name

        # Convert to dictionary format
        routes_features.extend(shapes_grouped.to_dict(orient="records"))
    else:
        print(f"Missing shapes.txt in {sub_dir}")

    # ---------------------
    # Process Stops (stops.txt)
    # ---------------------
    stops_file = os.path.join(sub_dir, "stops.txt")
    if not os.path.exists(stops_file):
        print(f"Missing stops.txt in {sub_dir}")
        continue
    stops_df = pd.read_csv(
        stops_file,
        dtype={"stop_id": str, "stop_name": str, "stop_lat": float, "stop_lon": float},
    )

    # ---------------------
    # Load stop_times.txt (stop_id → trip_id mapping)
    # ---------------------
    stop_times_file = os.path.join(sub_dir, "stop_times.txt")
    if not os.path.exists(stop_times_file):
        print(f"Missing stop_times.txt in {sub_dir}")
        continue
    stop_times_df = pd.read_csv(
        stop_times_file,
        usecols=["stop_id", "trip_id"],
        dtype={"stop_id": str, "trip_id": str},
    ).dropna().drop_duplicates()  # one row per (stop, trip) is enough

    # Join stop_times with trips to get route_id per stop. The inner join
    # drops visits whose trip is unknown, so one unmatched trip no longer
    # blanks the whole route list of a stop.
    stop_routes_df = stop_times_df.merge(
        trips_df[["trip_id", "route_id"]], on="trip_id", how="inner"
    )

    # ---------------------
    # Aggregate all routes per stop
    # ---------------------
    stop_routes_grouped = stop_routes_df.groupby("stop_id")["route_id"].unique().reset_index()

    # Convert route_id list to a sorted, comma-separated string
    stop_routes_grouped["routes"] = stop_routes_grouped["route_id"].apply(
        lambda ids: ",".join(sorted(map(str, ids)))
    )
    stop_routes_grouped = stop_routes_grouped.drop(columns=["route_id"])

    # Merge stops with route data
    stops_merged_df = stops_df.merge(stop_routes_grouped, on="stop_id", how="left")

    # Stops without coordinates cannot be turned into points; skip them
    # before the blanket fillna("") below would replace NaN with "".
    has_coords = stops_merged_df["stop_lat"].notna() & stops_merged_df["stop_lon"].notna()
    if not has_coords.all():
        print(f"Skipping {int((~has_coords).sum())} stop(s) without coordinates in {sub_dir}")
    stops_merged_df = stops_merged_df[has_coords].fillna("")

    # Assign route colors
    def get_primary_route_color(route_list):
        """Return the colour of the first route in a comma-separated list.

        Falls back to "#000000" for an empty list or an unknown route.
        """
        first_route = route_list.split(",")[0]
        return route_colors.get(first_route, "#000000")

    stops_merged_df["route_color"] = stops_merged_df["routes"].apply(get_primary_route_color)

    # Build point geometries (x = longitude, y = latitude)
    stops_merged_df["geometry"] = [
        Point(lon, lat)
        for lon, lat in zip(stops_merged_df["stop_lon"], stops_merged_df["stop_lat"])
    ]
    stops_merged_df["feed"] = feed_name

    # Convert to dictionary format
    stops_features.extend(stops_merged_df.to_dict(orient="records"))

# ---------------------
# Export GeoJSON files
# ---------------------
# Writes the accumulated route features to routes.geojson in WGS84.
# NOTE(review): only routes_features is written here; stops_features is
# collected by the loop above but not exported by this cell.
if not routes_features:
    print("No route data found.")
else:
    routes_gdf = gpd.GeoDataFrame(routes_features, geometry="geometry", crs="EPSG:4326")
    routes_gdf.to_file("routes.geojson", driver="GeoJSON")
    print("Created routes.geojson")


In [None]:
# Show the last five rows of trips_df as left by the final loop iteration
trips_df.tail(n=5)
