In [1]:
import copy
import csv
import io
import json
import logging
import os
import shutil
import tempfile
import zipfile
from pathlib import Path

import pandas as pd
import py7zr
import requests
from dotenv import load_dotenv
from google.protobuf.message import DecodeError
from google.transit import gtfs_realtime_pb2

# Let python-dotenv populate the process environment, then read the two API
# keys from it (each one is None when its variable is not set).
load_dotenv()
KODA_KEY = os.environ.get("API_KODA_KEY")
GTFS_RT_KEY = os.environ.get("API_GTFS_RT_KEY")

In [2]:
# Read the API keys from the environment again (same variables as in the
# first cell); each key is None when its variable is not set.
KODA_KEY = os.environ.get("API_KODA_KEY")
GTFS_RT_KEY = os.environ.get("API_GTFS_RT_KEY")

In [3]:
# API calls
def call_koda_api(base_url, date, operator="sl", endpoint=""):
    """Download one KoDa (Trafiklab) resource for a given day.

    Args:
        base_url: API family segment of the URL (the callers in this notebook
            use "gtfs-rt" and "gtfs-static").
        date: Day to fetch, sent as the `date` query parameter.
        operator: Operator segment of the URL (defaults to "sl").
        endpoint: Optional trailing URL segment (e.g. "TripUpdates"). It is
            left out of the URL when empty or None.

    Returns:
        The `requests.Response` of the GET call.

    Raises:
        requests.HTTPError: when the server answers with a 4xx/5xx status.
        requests.RequestException: on connection errors or when the 30 s
            timeout expires.
    """
    segments = [base_url, operator]
    if endpoint:
        segments.append(endpoint)
    api_url = "https://api.koda.trafiklab.se/KoDa/api/v2/" + "/".join(
        str(segment) for segment in segments
    )

    params = {
        "date": date,
        "key": KODA_KEY,
    }

    response = requests.get(api_url, params=params, timeout=30)
    print(api_url)
    print(response)
    print(len(response.content), "BYTES")

    # Fail here, with the HTTP status in the message, instead of letting an
    # error body reach the archive readers and fail there with an unrelated
    # "bad archive" error.
    response.raise_for_status()

    return response

def call_koda_history_api(date):
    """Fetch the "gtfs-rt" / "TripUpdates" resource for `date` via call_koda_api."""
    return call_koda_api("gtfs-rt", date, endpoint="TripUpdates")

def call_koda_reference_api(date):
    """Fetch the "gtfs-static" resource for `date` via call_koda_api."""
    return call_koda_api("gtfs-static", date)

In [4]:
# Opens the archive and reads one day of history, batch by batch
def _extract_koda_batch(archive_bytes, names, dest_dir):
    """Extract `names` from the in-memory 7z archive into `dest_dir`.

    The whole batch is tried first; if that fails, every member is retried on
    its own so that one bad member does not cost the rest of the batch.

    Returns:
        (extracted, failures): `extracted` is the list of member names that are
        now expected on disk, `failures` a list of `(name, reason)` tuples for
        members that could not be extracted.
    """
    try:
        archive_bytes.seek(0)
        with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
            z.extract(path=dest_dir, targets=names)
        return list(names), []
    except Exception:
        # The batch failed as a whole: fall back to one member at a time,
        # for this batch only.
        extracted, failures = [], []
        for name in names:
            try:
                archive_bytes.seek(0)
                with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
                    z.extract(path=dest_dir, targets=[name])
                extracted.append(name)
            except Exception as single_error:
                failures.append((name, f"ExtractError: {single_error!r}"))
        return extracted, failures


def read_koda_history_day(request, items_by_batch=400):
    """Parse every `.pb` member of a KoDa 7z history archive.

    Args:
        request: Object whose `.content` holds the raw bytes of the 7z archive
            (the response returned by the history API call).
        items_by_batch: Number of archive members extracted to disk per batch.

    Returns:
        (history_entities, bad_files): `history_entities` is a list with a deep
        copy of every entity of every successfully parsed feed message;
        `bad_files` is a list of `(member_name, reason)` tuples for members
        that could not be extracted, read or decoded.

    Raises:
        ValueError: if `items_by_batch` is not a positive number.
    """
    if items_by_batch < 1:
        raise ValueError("items_by_batch must be a positive integer")

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(message)s",
        datefmt="%H:%M:%S"
    )

    logger = logging.getLogger("gtfs")

    archive_bytes = io.BytesIO(request.content)

    history_entities = []
    bad_files = []

    BATCH = items_by_batch

    # A single message object is reused for every file; entities are
    # deep-copied out of it before the next Clear().
    feed = gtfs_realtime_pb2.FeedMessage()

    # 1) list the candidate members
    archive_bytes.seek(0)
    with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
        candidates = [n for n in z.getnames() if n.lower().endswith(".pb")]

    print("Nb fichiers .pb:", len(candidates))

    # Temporary files: created only once the archive is known to be readable,
    # and always removed, even if an unexpected error interrupts the loop.
    tmpdir = tempfile.mkdtemp(prefix="koda_")
    tmp = Path(tmpdir)

    try:
        # 2) extract + parse, batch by batch
        for i in range(0, len(candidates), BATCH):
            batch = candidates[i:i+BATCH]

            logger.info(
                "Batch %d–%d / %d (%.1f%%)",
                i + 1,
                min(i + BATCH, len(candidates)),
                len(candidates),
                100 * (i + len(batch)) / len(candidates)
            )

            extracted, failures = _extract_koda_batch(archive_bytes, batch, tmpdir)
            bad_files.extend(failures)

            # Parse everything that reached the disk, including members
            # recovered by the one-by-one fallback.
            for name in extracted:
                p = tmp / name
                try:
                    raw = p.read_bytes()
                    feed.Clear()
                    feed.ParseFromString(raw)
                    for entity in feed.entity:
                        history_entities.append(copy.deepcopy(entity))
                except DecodeError as de:
                    bad_files.append((name, f"DecodeError: {de!r}"))
                except Exception as e:
                    bad_files.append((name, f"Read/ParseError: {e!r}"))
                finally:
                    # Free disk space as soon as a file has been handled.
                    p.unlink(missing_ok=True)
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

    print("✅ Total entités enregistrées:", len(history_entities))
    print("⚠️ Fichiers ignorés:", len(bad_files))
    print("Exemples:", bad_files[:5])

    return history_entities, bad_files

# Reads one reference .txt file from the static archive
def read_koda_reference_data(request, file_name):
    """Load one CSV table from a zipped reference archive.

    Args:
        request: Object whose `.content` holds the raw bytes of the zip archive.
        file_name: Name of the table without extension; the member
            `"<file_name>.txt"` is read from the archive.

    Returns:
        A list with one dict per CSV row, keyed by the header line.

    Raises:
        zipfile.BadZipFile: if `request.content` is not a zip archive.
        KeyError: if the archive has no `"<file_name>.txt"` member.
    """
    archive_bytes = io.BytesIO(request.content)

    with zipfile.ZipFile(archive_bytes, "r") as z:
        with z.open(f"{file_name}.txt") as f:
            # "utf-8-sig" drops a leading byte-order mark when there is one
            # (otherwise it would be glued to the first column name) and
            # behaves like plain UTF-8 when there is none. newline="" is what
            # the csv module asks for so quoted line breaks survive intact.
            text = io.TextIOWrapper(f, encoding="utf-8-sig", newline="")
            reader = csv.DictReader(text)
            return list(reader)

In [5]:
# Builds a lookup table keyed on one column
def corr_array_creation(reference_data, id_key, ref_fields:tuple):
    """
    Build a lookup table so later merges are plain dict lookups.

    `reference_data`: iterable of row dicts,
    `id_key`: name of the column whose value becomes the lookup key,
    `ref_fields`: names of the columns to keep. ('name_1', "name_2",...)

    Rows whose `id_key` value is missing or falsy are skipped. When several
    rows share a key, the last one wins. Requested fields absent from a row
    are stored as None.
    """
    keyed_rows = ((row.get(id_key), row) for row in reference_data)
    return {
        key: {field: row.get(field) for field in ref_fields}  # keep only what is needed
        for key, row in keyed_rows
        if key
    }

In [6]:
# Defines the output structure
def flatten_history_entity_koda(history_items, trips_corr):
    """Yield one flat dict per stop-time update of a history entity.

    Nothing is yielded when the entity has no `trip_update`, or when its
    trip_id is not a key of `trips_corr` (this is where the merge with the
    reference data and the filtering happen).

    Each yielded dict has the keys trip_id, direction_id, start_date,
    vehicle_id, timestamp, stop_sequence, arrival_delay, arrival_time,
    departure_delay and departure_time, in that order. Optional parts that are
    not set on the message are reported as None. `route_id` is currently not
    part of the output.
    """
    if not history_items.HasField("trip_update"):
        return  # generator: returning without yielding produces no rows

    update = history_items.trip_update
    descriptor = update.trip
    trip_id = descriptor.trip_id

    matched = trips_corr.get(trip_id)
    if matched is None:
        return  # trip not selected: skip the entity

    direction = matched.get("direction_id")

    start_date = None
    if descriptor.HasField("start_date"):
        start_date = descriptor.start_date

    feed_timestamp = None
    if update.HasField("timestamp"):
        feed_timestamp = update.timestamp

    vehicle_id = None
    if update.HasField("vehicle"):
        vehicle_id = update.vehicle.id

    # Values shared by every row of this entity, in output column order.
    shared = {
        "trip_id": trip_id,
        "direction_id": direction,
        "start_date": start_date,
        "vehicle_id": vehicle_id,
        "timestamp": feed_timestamp,
    }

    for stop_update in update.stop_time_update:
        row = dict(shared)
        row["stop_sequence"] = stop_update.stop_sequence

        has_arrival = stop_update.HasField("arrival")
        row["arrival_delay"] = stop_update.arrival.delay if has_arrival else None
        row["arrival_time"] = stop_update.arrival.time if has_arrival else None

        has_departure = stop_update.HasField("departure")
        row["departure_delay"] = stop_update.departure.delay if has_departure else None
        row["departure_time"] = stop_update.departure.time if has_departure else None

        yield row


---


In [None]:
# Run the API calls: download the history archive and the reference archive
# for the same day. Both responses are kept so the readers below can parse
# them without downloading again.
date_day = "2025-03-17"
r_history = call_koda_history_api(date_day)
r_reference = call_koda_reference_api(date_day)

https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-rt/sl/TripUpdates
<Response [200]>
70434472 BYTES
https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-static/sl
<Response [200]>
52437883 BYTES


In [None]:
# Data loading
# Read the history data batch by batch (500 archive members per batch)
history_entities, bad_files = read_koda_history_day(r_history, 500)

# Read the routes and trips reference tables
reference_routes = read_koda_reference_data(r_reference, "routes")
reference_trips = read_koda_reference_data(r_reference, "trips")

16:57:50 | INFO | Batch 1–500 / 6039 (8.3%)


Nb fichiers .pb: 6039


16:57:53 | INFO | Batch 501–1000 / 6039 (16.6%)
16:57:54 | INFO | Batch 1001–1500 / 6039 (24.8%)
16:57:57 | INFO | Batch 1501–2000 / 6039 (33.1%)
16:58:08 | INFO | Batch 2001–2500 / 6039 (41.4%)
16:58:22 | INFO | Batch 2501–3000 / 6039 (49.7%)
16:58:35 | INFO | Batch 3001–3500 / 6039 (58.0%)
16:58:47 | INFO | Batch 3501–4000 / 6039 (66.2%)
16:59:05 | INFO | Batch 4001–4500 / 6039 (74.5%)
16:59:30 | INFO | Batch 4501–5000 / 6039 (82.8%)
16:59:51 | INFO | Batch 5001–5500 / 6039 (91.1%)
17:00:10 | INFO | Batch 5501–6000 / 6039 (99.4%)
17:00:16 | INFO | Batch 6001–6039 / 6039 (100.0%)


✅ Total entités enregistrées: 4954694
⚠️ Fichiers ignorés: 0
Exemples: []


In [None]:
# Choice of the bus number
bus_number = "541"

# Routes whose short name is the chosen bus number (there may be one or several)
routes_bus_chosed = [
    r for r in reference_routes
    if str(r.get("route_short_name")) == bus_number
]

# route_id of every selected route
route_ids_chosed = [route["route_id"] for route in routes_bus_chosed]

# Trips of the selected routes, grouped route by route in the order of
# route_ids_chosed. Comprehensions are used so that no loop variable (in
# particular one named `id`, which would hide the builtin for every later
# cell) is left behind in the notebook namespace.
trip_bus_chosed = [
    trip
    for route_id in route_ids_chosed
    for trip in reference_trips
    if trip["route_id"] == route_id
]


In [None]:
# Columns of trips.txt kept in the lookup table
REF_TRIPS_CHOOSED_FIELDS = ("route_id", "direction_id")
# Flat rows (one per stop-time update) for the selected bus; reset on every run
final_data = []

# Build a lookup table from the selected trips, keyed by trip_id
ref_trips_choosed_corr = corr_array_creation(trip_bus_chosed, "trip_id", REF_TRIPS_CHOOSED_FIELDS)
valid_trip_ids = set(ref_trips_choosed_corr.keys())

# Keep only the entities whose trip_id belongs to the selected routes.
# flatten_history_entity_koda checks the trip_id against the lookup table
# again, so this pre-filter mainly limits how many entities are passed to it.
filtered_history = [
    e for e in history_entities
    if getattr(e, "trip_update", None)
    and getattr(e.trip_update, "trip", None)
    and e.trip_update.trip.trip_id in valid_trip_ids
]

# Flatten every kept entity into rows and collect them in final_data
for e in filtered_history:
    final_data.extend(flatten_history_entity_koda(e, ref_trips_choosed_corr))


In [None]:
# Look at the data (quick check that the pipeline works)
df = pd.DataFrame(final_data)
# Cap the sample size at the number of rows so a short or empty frame (e.g. a
# bus number with no match) does not raise, and fix the seed so the preview is
# the same on every run.
df.sample(n=min(20, len(df)), random_state=0)

Unnamed: 0,trip_id,direction_id,start_date,vehicle_id,timestamp,stop_sequence,arrival_delay,arrival_time,departure_delay,departure_time
11576,14010000670453960,1,20250317,9031001003003541,1742168422,7,23,1742168849,23,1742168849
183423,14010000644155129,1,20250317,9031001003000112,1742197568,12,189,1742198216,198,1742198225
108416,14010000612338660,1,20250317,9031001003003575,1742192416,26,23,1742192355,56,1742192388
326236,14010000670426666,0,20250317,9031001003003568,1742211441,28,126,1742211411,144,1742211429
772297,14010000598814787,1,20250317,9031001003003550,1742246713,22,14,1742246127,14,1742246127
93605,14010000612338274,1,20250317,9031001003007398,1742191289,16,72,1742190945,83,1742190956
529765,14010000670433773,0,20250317,9031001003000111,1742227007,20,-63,1742227638,-57,1742227644
666359,14010000644173800,1,20250317,9031001003003515,1742236313,29,-14,1742235868,16,1742235898
164507,14010000670421699,0,20250317,9031001003003570,1742196117,36,-5,1742196115,-5,1742196115
508051,14010000670432115,0,20250317,9031001003003516,1742225595,28,106,1742225868,108,1742225870


In [15]:
# Export to JSON to keep a trace of the data in case the run crashes again
# (temporary artefact, not meant to be kept).
export_path = Path("../data/transport_koda_sample_one_bus.json")
# Create the target folder when it is missing, so the export does not fail
# after the long download/parse steps.
export_path.parent.mkdir(parents=True, exist_ok=True)

with open(export_path, "w", encoding="utf-8") as f:
    json.dump(final_data, f, ensure_ascii=False, indent=2)