In [43]:
import time
import urllib.request
from datetime import datetime
import os

from google.transit import gtfs_realtime_pb2

import pandas as pd

In [29]:
# Route IDs that process_file keeps when filtering vehicle entities.
# Each ID is paired with the label noted next to it in the original notebook
# (presumably the public-facing route number -- confirm against the GTFS feed).
SHINJUKU_ROUTE_IDS = [
    route_id
    for route_id, _label in (
        ("010", "CH01"),
        ("122", "白61"),
        ("135", "宿74"),
        ("138", "宿75"),
        ("147", "早77"),
        ("148", "王78"),
        ("158", "宿91"),
        ("169", "品97"),
    )
]

In [33]:
def format_time(ts):
    """Render a UNIX timestamp as a ``YYYY-MM-DD HH:MM:SS`` string.

    The conversion uses ``datetime.fromtimestamp`` without a timezone, so the
    result is expressed in the local time of the machine running the notebook.

    Args:
        ts: Seconds since the epoch. Any falsy value (``None``, ``0``) is
            treated as "no timestamp available".

    Returns:
        The formatted timestamp, or the placeholder ``"N/A"`` for falsy input.
    """
    if not ts:
        return "N/A"
    moment = datetime.fromtimestamp(ts)
    return moment.strftime("%Y-%m-%d %H:%M:%S")

In [39]:
def process_file(fn: str) -> pd.DataFrame:
    """Extract the vehicle positions of the tracked routes from one feed file.

    The file is read as a serialized GTFS-realtime ``FeedMessage``. Every
    entity that carries a ``vehicle`` payload and whose trip ``route_id`` is
    listed in ``SHINJUKU_ROUTE_IDS`` becomes one row of the result.

    Args:
        fn: Path of the binary protobuf file to parse.

    Returns:
        A DataFrame with the columns ``Vehicle ID``, ``Trip ID``, ``Route ID``,
        ``Direction``, ``Schedule Relationship``, ``Current Stop Sequence``,
        ``Current Stop ID``, ``Timestamp`` (formatted by ``format_time``) and
        ``Location`` (``"<latitude> <longitude>"``). The frame has those
        columns and zero rows when nothing in the file matches.
    """
    feed = gtfs_realtime_pb2.FeedMessage()
    with open(fn, "rb") as f:
        feed.ParseFromString(f.read())

    columns = [
        "Vehicle ID",
        "Trip ID",
        "Route ID",
        "Direction",
        "Schedule Relationship",
        "Current Stop Sequence",
        "Current Stop ID",
        "Timestamp",
        "Location",
    ]

    # Collect plain dicts and build the frame once at the end. Concatenating a
    # one-row frame per vehicle is quadratic, and seeding that concat with an
    # empty frame relies on pandas' deprecated handling of empty entries.
    rows = []
    for entity in feed.entity:
        if not entity.HasField("vehicle"):
            continue

        v = entity.vehicle
        trip = v.trip
        if trip.route_id not in SHINJUKU_ROUTE_IDS:
            continue

        position = v.position
        rows.append(
            {
                "Vehicle ID": v.vehicle.id,
                "Trip ID": trip.trip_id,
                "Route ID": trip.route_id,
                "Direction": trip.direction_id,
                "Schedule Relationship": trip.schedule_relationship,
                "Current Stop Sequence": v.current_stop_sequence,
                "Current Stop ID": v.stop_id,
                "Timestamp": format_time(v.timestamp),
                "Location": f"{position.latitude} {position.longitude}",
            }
        )

    # Passing `columns` keeps the schema (and column order) stable even when
    # `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [49]:
def process_date(date: str, data_dir: str = os.path.join(".", "data")) -> None:
    """Merge every feed file of one day into ``<date>.csv``.

    All files under ``data_dir`` (searched recursively) whose name starts with
    ``date`` are parsed with ``process_file`` and their rows are concatenated.
    The result is written to ``<date>.csv`` in the current working directory,
    including the default integer index column. When no file matches, or no
    file contributes any row, a CSV containing only the header is written.

    Args:
        date: File-name prefix selecting the day, e.g. ``"20250409"``.
        data_dir: Root directory holding the feed files. The default is built
            with ``os.path.join`` so it resolves on every platform; the
            previous literal ``".\\data"`` is only a path on Windows.
    """
    columns = [
        "Vehicle ID",
        "Trip ID",
        "Route ID",
        "Direction",
        "Schedule Relationship",
        "Current Stop Sequence",
        "Current Stop ID",
        "Timestamp",
        "Location",
    ]

    frames = []
    for root, _dirs, files in os.walk(data_dir, topdown=False):
        # os.walk gives no ordering guarantee for `files`. Select by prefix
        # instead of assuming the day's files are contiguous, and sort so the
        # row order within a directory is deterministic.
        for name in sorted(files):
            if not name.startswith(date):
                continue
            frame = process_file(os.path.join(root, name))
            # Skip empty frames: they add no rows, and feeding them to concat
            # triggers pandas' deprecated empty-entry dtype handling.
            if not frame.empty:
                frames.append(frame)

    if frames:
        # One concat at the end instead of one per file (which is quadratic).
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)

    df.to_csv(f"{date}.csv")

In [50]:
# Export one CSV per day for the dates 20250409 through 20250419
# (the upper bound of range() is exclusive).
for day in range(9, 20):
    date = f"202504{day:02d}"
    process_date(date)
    print(f"Finished {date}")

Finished 20250418
Finished 20250419
