In [1]:
import pandas as pd
import requests
import os
import io
import py7zr
import zipfile
import csv
import tempfile
from pathlib import Path
from dotenv import load_dotenv
import shutil
import copy
from google.transit import gtfs_realtime_pb2
from google.protobuf.message import DecodeError
import logging
from datetime import datetime, timezone, timedelta
from collections import defaultdict
import random
import gc

# Load environment variables through python-dotenv, then read the two API
# keys from the process environment (each one is None when not defined).
load_dotenv()
KODA_KEY = os.environ.get("API_KODA_KEY")
GTFS_RT_KEY = os.environ.get("API_GTFS_RT_KEY")

In [2]:
# Reads the same two API keys from the environment as the first cell does
# (None when a variable is not defined).
KODA_KEY = os.environ.get("API_KODA_KEY")
GTFS_RT_KEY = os.environ.get("API_GTFS_RT_KEY")

In [3]:
#Appel aux APIs
def call_koda_api(base_url, date, operator = "sl", endpoint=""):
    """
    Send a GET request to the KoDa v2 API and return the raw response.

    `base_url`: API family path segment (the callers in this notebook use
        "gtfs-rt" and "gtfs-static"),
    `date`: value sent as the `date` query parameter,
    `operator`: operator path segment, "sl" by default,
    `endpoint`: optional trailing path segment, left out of the URL when empty.

    The URL, the response object and the payload size are printed. The
    response is returned as-is: no status check is performed here.
    """
    url_parts = ["https://api.koda.trafiklab.se/KoDa/api/v2", base_url, operator]
    if endpoint != "":
        url_parts.append(endpoint)
    api_url = "/".join(str(part) for part in url_parts)

    # The API key travels as a query parameter next to the requested date.
    response = requests.get(
        api_url,
        params={"date": date, "key": KODA_KEY},
        timeout=30,
    )

    print(api_url)
    print(response)
    print(len(response.content), "BYTES")

    return response

def call_koda_history_api(date):
    """Fetch the "gtfs-rt" `TripUpdates` archive for `date` and return the response."""
    return call_koda_api("gtfs-rt", date, endpoint="TripUpdates")

def call_koda_reference_api(date):
    """Fetch the "gtfs-static" archive for `date` and return the response."""
    return call_koda_api("gtfs-static", date)

In [8]:
#Ouvre les fichiers et lit par batch une journée d'historique
def read_koda_history_day_stream(request, items_by_batch=400):
    """
    Stream the GTFS-Realtime entities stored in one day's history archive.

    `request`: response whose `.content` holds a 7z archive of `.pb` files,
    `items_by_batch`: number of archive members extracted to disk at a time.

    Returns `(entities, bad_files)`:
    - `entities` is a generator yielding every entity of every `.pb` member,
      following the order of the archive's name listing;
    - `bad_files` is a list of `(name, reason)` tuples for the members that
      could not be extracted, read or parsed. It is filled while the generator
      is consumed, so it is only complete once the generator is exhausted.

    Extracted files live in a temporary directory that is removed when the
    generator finishes (or is closed).
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(message)s",
        datefmt="%H:%M:%S"
    )
    logger = logging.getLogger("gtfs")
    bad_files = []

    def _iter_entities():
        archive_bytes = io.BytesIO(request.content)
        tmpdir = tempfile.mkdtemp(prefix="koda_")
        tmp = Path(tmpdir)

        try:
            # First pass: only list the members we care about.
            archive_bytes.seek(0)
            with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
                candidates = [n for n in z.getnames() if n.lower().endswith(".pb")]

            for i in range(0, len(candidates), items_by_batch):

                batch = candidates[i:i+items_by_batch]
                logger.info(
                    "Batch %d–%d / %d (%.1f%%)",
                    i + 1,
                    min(i + items_by_batch, len(candidates)),
                    len(candidates),
                    100 * (i + len(batch)) / len(candidates)
                )
                try:
                    # The archive is reopened for every batch so that only
                    # `items_by_batch` files sit on disk at any time.
                    archive_bytes.seek(0)
                    with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
                        z.extract(path=tmpdir, targets=batch)
                except Exception as e:
                    # A failed extraction invalidates the whole batch.
                    for name in batch:
                        bad_files.append((name, f"ExtractError: {e!r}"))
                    continue

                for name in batch:
                    p = tmp / name
                    try:
                        raw = p.read_bytes()
                        # Each file gets its own message: the entities yielded
                        # below are sub-messages of `feed`, so clearing and
                        # re-parsing a shared message would touch the parent
                        # of objects the caller may still be holding.
                        feed = gtfs_realtime_pb2.FeedMessage()
                        feed.ParseFromString(raw)

                        for entity in feed.entity:
                            yield entity   # streamed to the caller one by one

                    except Exception as e:
                        bad_files.append((name, f"Read/ParseError: {e!r}"))
                    finally:
                        # Free disk space as soon as the file has been handled.
                        p.unlink(missing_ok=True)

        finally:
            shutil.rmtree(tmpdir, ignore_errors=True)
            print("⚠️ Fichiers ignorés:", len(bad_files))

    return _iter_entities(), bad_files


# Lit les fichiers de référence .txt
def read_koda_reference_data(request, file_name):
    """
    Read one `.txt` table from a zipped reference archive.

    `request`: response whose `.content` holds a zip archive,
    `file_name`: member name without the `.txt` extension (e.g. "routes").

    Returns the table as a list of dicts, one per CSV row, keyed by the
    header names. Raises `KeyError` when the member is not in the archive.
    """
    archive_bytes = io.BytesIO(request.content)

    with zipfile.ZipFile(archive_bytes, "r") as z:
        with z.open(f"{file_name}.txt") as f:
            # "utf-8-sig" drops a leading byte-order mark so it does not end
            # up glued to the first column name; files without a BOM decode
            # exactly as with "utf-8". `newline=""` hands line endings to the
            # csv module untouched, as its documentation requires.
            text = io.TextIOWrapper(f, encoding="utf-8-sig", newline="")
            reader = csv.DictReader(text)
            return list(reader)

In [5]:
# Crée un tableau de correspondance basé sur une colonne
def corr_array_creation(reference_data, id_key, ref_fields:tuple):
    """
    Build a lookup table keyed on one column, so that later merges are plain
    dict lookups.

    `reference_data`: iterable of dict-like rows,
    `id_key`: name of the column whose value becomes the lookup key,
    `ref_fields`: names of the columns to keep for each row, e.g. ('name_1', 'name_2', ...).

    Rows whose `id_key` value is missing or empty are skipped. When several
    rows share a key, the last one wins. A kept column that a row does not
    have is stored as None.
    """
    return {
        row.get(id_key): {field: row.get(field) for field in ref_fields}
        for row in reference_data
        if row.get(id_key)
    }

In [6]:
#Choisit la structure de sortie
def _stop_event_delay(stop_time_update, event_name):
    """Return the delay carried by the `event_name` event, or None when absent.

    `event_name` is "arrival" or "departure". None is returned both when the
    event itself is missing and when the event exists but has no `delay`
    (protobuf would otherwise hand back the default 0 for the unset field,
    which cannot be told apart from a vehicle that is exactly on time).
    """
    if not stop_time_update.HasField(event_name):
        return None
    event = getattr(stop_time_update, event_name)
    return event.delay if event.HasField("delay") else None


def flatten_history_entity_koda(history_items, trips_corr):
    """
    Flatten one feed entity into one dict per stop-time update.

    `history_items`: a feed entity exposing `HasField("trip_update")` and a
        `trip_update` attribute,
    `trips_corr`: mapping trip_id -> dict. It acts as the filter (entities
        whose trip_id is not a key yield nothing) and as the source of the
        "direction_id" value copied onto every row.

    Yields dicts with the keys "timestamp", "trip_id", "direction_id",
    "stop_sequence", "arrival_delay" and "departure_delay". "timestamp" is
    None when the trip update has none; a delay is None when it is not set.
    """
    # Entities without a trip update produce no rows.
    if not history_items.HasField("trip_update"):
        return

    tu = history_items.trip_update
    tid = tu.trip.trip_id

    # Merge + filter: only the trips present in the lookup table are kept.
    corr = trips_corr.get(tid)
    if corr is None:
        return

    direction_id = corr.get("direction_id")
    feed_ts = tu.timestamp if tu.HasField("timestamp") else None

    for stu in tu.stop_time_update:
        yield {
            "timestamp": feed_ts,
            "trip_id": tid,
            "direction_id": direction_id,
            "stop_sequence": stu.stop_sequence,
            "arrival_delay": _stop_event_delay(stu, "arrival"),
            "departure_delay": _stop_event_delay(stu, "departure"),
        }


---


In [9]:
# --- Run configuration (everything a reader may want to tune) ---------------
date_begin = "2025-03-15"
date_end = "2025-03-17"

# Bus line to extract, matched against `route_short_name` in routes.txt.
bus_number = "541"
# Maximum number of rows kept per (hour, direction_id, stop_sequence), per day.
MAX_PER_GROUP = 2
# Columns of trips.txt carried into the trip lookup table.
REF_TRIPS_CHOOSED_FIELDS = ("route_id", "direction_id")
# Seed of the sampling shuffle, so that re-running the cell keeps the same rows.
SAMPLING_SEED = 42

start = datetime.strptime(date_begin, "%Y-%m-%d")
end = datetime.strptime(date_end, "%Y-%m-%d")

rng = random.Random(SAMPLING_SEED)
all_datas = []

# --- One iteration per day, both bounds included ----------------------------
current = start
while current <= end:
    current_string = current.strftime("%Y-%m-%d")

    # New day
    print(current_string)

    r_history = call_koda_history_api(current_string)
    r_reference = call_koda_reference_api(current_string)
    # Fail with an explicit HTTP error instead of a confusing "bad archive"
    # error further down when the API answered with a 4xx/5xx status.
    r_history.raise_for_status()
    r_reference.raise_for_status()

    # Lazy stream over the day's history; nothing is extracted until iterated.
    history_entities, bad_files = read_koda_history_day_stream(r_history, 500)

    # Reference tables for routes and trips.
    reference_routes = read_koda_reference_data(r_reference, "routes")
    reference_trips = read_koda_reference_data(r_reference, "trips")

    # All route_ids (one or several) published under the chosen bus number.
    route_ids_chosed = {
        r["route_id"]
        for r in reference_routes
        if str(r.get("route_short_name")) == bus_number
    }

    # Trips belonging to those routes (single pass, set membership).
    trip_bus_chosed = [
        t for t in reference_trips
        if t["route_id"] in route_ids_chosed
    ]

    # Lookup table built from trips.txt; it also acts as the trip filter
    # inside flatten_history_entity_koda.
    ref_trips_choosed_corr = corr_array_creation(trip_bus_chosed, "trip_id", REF_TRIPS_CHOOSED_FIELDS)

    final_data = []
    skipped_no_timestamp = 0

    # Entities are flattened as they are streamed, so no entity is kept
    # around after the reader has moved on to the next file.
    for e in history_entities:
        for row in flatten_history_entity_koda(e, ref_trips_choosed_corr):
            ts = row.get("timestamp")
            if ts is None:
                # Without a timestamp the row cannot be assigned to an hour.
                skipped_no_timestamp += 1
                continue

            row.pop("trip_id", None)

            # Hour rounding (cf. @Nadège): round the feed timestamp to the
            # nearest full hour, then expose it as an ISO datetime and as the
            # hour of day, both in UTC.
            ts_hour = ((ts + 1800) // 3600) * 3600
            row["datetime_rounded"] = datetime.fromtimestamp(ts_hour, tz=timezone.utc).isoformat()
            row["hour"] = (ts_hour // 3600) % 24

            # Bus number kept on each row so a model can be trained on several
            # lines; to be made dynamic if several lines are extracted.
            row['bus_nbr'] = bus_number

            final_data.append(row)

    if skipped_no_timestamp:
        print("⚠️ Lignes sans timestamp ignorées:", skipped_no_timestamp)

    # Shuffle so the kept rows are not simply the first ones of the day.
    rng.shuffle(final_data)

    counts = defaultdict(int)
    for row in final_data:
        # Keep at most MAX_PER_GROUP rows per hour / direction / stop.
        key = (
            row.get("hour"),
            row.get("direction_id"),
            row.get("stop_sequence"),
        )

        if counts[key] < MAX_PER_GROUP:
            all_datas.append(row)
            counts[key] += 1

    # Release the day's large objects before downloading the next day.
    del r_history, r_reference
    del history_entities, bad_files
    del reference_routes, reference_trips
    del route_ids_chosed, trip_bus_chosed
    del ref_trips_choosed_corr
    del final_data, counts

    gc.collect()
    # Move on to the next day
    current += timedelta(days=1)

2025-03-15
https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-rt/sl/TripUpdates
<Response [200]>
41448769 BYTES
https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-static/sl
<Response [200]>
52438156 BYTES


17:31:58 | INFO | Batch 1–500 / 6031 (8.3%)
17:32:00 | INFO | Batch 501–1000 / 6031 (16.6%)
17:32:01 | INFO | Batch 1001–1500 / 6031 (24.9%)
17:32:03 | INFO | Batch 1501–2000 / 6031 (33.2%)
17:32:05 | INFO | Batch 2001–2500 / 6031 (41.5%)
17:32:08 | INFO | Batch 2501–3000 / 6031 (49.7%)
17:32:13 | INFO | Batch 3001–3500 / 6031 (58.0%)
17:32:19 | INFO | Batch 3501–4000 / 6031 (66.3%)
17:32:25 | INFO | Batch 4001–4500 / 6031 (74.6%)
17:32:32 | INFO | Batch 4501–5000 / 6031 (82.9%)
17:32:39 | INFO | Batch 5001–5500 / 6031 (91.2%)
17:32:46 | INFO | Batch 5501–6000 / 6031 (99.5%)
17:32:53 | INFO | Batch 6001–6031 / 6031 (100.0%)


⚠️ Fichiers ignorés: 0
2025-03-16
https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-rt/sl/TripUpdates
<Response [200]>
39325833 BYTES
https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-static/sl
<Response [200]>
52451928 BYTES


17:33:33 | INFO | Batch 1–500 / 6048 (8.3%)
17:33:35 | INFO | Batch 501–1000 / 6048 (16.5%)
17:33:36 | INFO | Batch 1001–1500 / 6048 (24.8%)
17:33:38 | INFO | Batch 1501–2000 / 6048 (33.1%)
17:33:40 | INFO | Batch 2001–2500 / 6048 (41.3%)
17:33:44 | INFO | Batch 2501–3000 / 6048 (49.6%)
17:33:49 | INFO | Batch 3001–3500 / 6048 (57.9%)
17:33:54 | INFO | Batch 3501–4000 / 6048 (66.1%)
17:34:01 | INFO | Batch 4001–4500 / 6048 (74.4%)
17:34:08 | INFO | Batch 4501–5000 / 6048 (82.7%)
17:34:15 | INFO | Batch 5001–5500 / 6048 (90.9%)
17:34:22 | INFO | Batch 5501–6000 / 6048 (99.2%)
17:34:30 | INFO | Batch 6001–6048 / 6048 (100.0%)


⚠️ Fichiers ignorés: 0
2025-03-17
https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-rt/sl/TripUpdates
<Response [200]>
70434472 BYTES
https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-static/sl
<Response [200]>
52437883 BYTES


17:35:21 | INFO | Batch 1–500 / 6039 (8.3%)
17:35:22 | INFO | Batch 501–1000 / 6039 (16.6%)
17:35:24 | INFO | Batch 1001–1500 / 6039 (24.8%)
17:35:26 | INFO | Batch 1501–2000 / 6039 (33.1%)
17:35:32 | INFO | Batch 2001–2500 / 6039 (41.4%)
17:35:40 | INFO | Batch 2501–3000 / 6039 (49.7%)
17:35:47 | INFO | Batch 3001–3500 / 6039 (58.0%)
17:35:56 | INFO | Batch 3501–4000 / 6039 (66.2%)
17:36:07 | INFO | Batch 4001–4500 / 6039 (74.5%)
17:36:21 | INFO | Batch 4501–5000 / 6039 (82.8%)
17:36:34 | INFO | Batch 5001–5500 / 6039 (91.1%)
17:36:46 | INFO | Batch 5501–6000 / 6039 (99.4%)
17:36:48 | INFO | Batch 6001–6039 / 6039 (100.0%)


⚠️ Fichiers ignorés: 0


In [None]:
# Total number of sampled rows collected over all the processed days.
len(all_datas)

9210

In [None]:
# Build a DataFrame from the sampled rows (one row per dict) and preview
# up to the first 20 rows.
df = pd.DataFrame(all_datas)
df.head(20)

Unnamed: 0,timestamp,direction_id,stop_sequence,arrival_delay,departure_delay,datetime_rounded,hour,bus_nbr
0,1742063355,0,30,107,118,2025-03-15T18:00:00+00:00,18,541
1,1741999573,0,27,-25,-25,2025-03-15T01:00:00+00:00,1,541
2,1742020939,0,11,61,62,2025-03-15T07:00:00+00:00,7,541
3,1742024441,1,36,-228,-224,2025-03-15T08:00:00+00:00,8,541
4,1742059478,1,26,101,108,2025-03-15T17:00:00+00:00,17,541
5,1742068390,1,19,32,35,2025-03-15T20:00:00+00:00,20,541
6,1742062036,1,34,-3,4,2025-03-15T18:00:00+00:00,18,541
7,1742024222,0,26,0,0,2025-03-15T08:00:00+00:00,8,541
8,1742066847,0,22,-30,-26,2025-03-15T19:00:00+00:00,19,541
9,1742064115,0,29,-100,0,2025-03-15T19:00:00+00:00,19,541


In [None]:
# Dump the sampled rows to JSON as a safety copy in case the extraction
# crashes again; this file is a temporary trace and is not meant to be kept.
import json

export_path = Path("../data/transport_koda_one_bus_any_days.json")
# The extraction above runs for several minutes: create the target folder if
# needed rather than failing at write time.
export_path.parent.mkdir(parents=True, exist_ok=True)

with export_path.open("w", encoding="utf-8") as f:
    json.dump(all_datas, f, ensure_ascii=False, indent=2)