In [81]:
import pandas as pd
import requests
import os
import io
import py7zr
import zipfile
import csv
import tempfile
from pathlib import Path
from dotenv import load_dotenv
import shutil
import copy
from google.transit import gtfs_realtime_pb2
from google.protobuf.message import DecodeError
import logging
from datetime import datetime, timezone
from collections import defaultdict
import random

load_dotenv()
KODA_KEY = os.getenv("API_KODA_KEY")
GTFS_RT_KEY = os.getenv("API_GTFS_RT_KEY")

In [2]:
KODA_KEY = os.getenv("API_KODA_KEY")
GTFS_RT_KEY = os.getenv("API_GTFS_RT_KEY")

In [3]:
#Appel aux APIs
def call_koda_api(base_url, date, operator = "sl", endpoint=""):
    if endpoint != "":
        api_url = f"https://api.koda.trafiklab.se/KoDa/api/v2/{base_url}/{operator}/{endpoint}"
    else:
        api_url = f"https://api.koda.trafiklab.se/KoDa/api/v2/{base_url}/{operator}"
        
    params = {
        "date": date, 
        "key": KODA_KEY
    }
    
    request = requests.get(f"{api_url}", params=params, timeout=30)
    print(api_url)
    print(request)
    print(len(request.content), "BYTES")

    return request

def call_koda_history_api(date):
    request = call_koda_api("gtfs-rt", date, endpoint="TripUpdates")
    return request

def call_koda_reference_api(date):
    request = call_koda_api("gtfs-static", date)
    return request

In [4]:
#Ouvre les fichiers et lit par batch une journée d'historique
def read_koda_history_day(request, items_by_batch=400):
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(message)s",
        datefmt="%H:%M:%S"
    )

    logger = logging.getLogger("gtfs")

    archive_bytes = io.BytesIO(request.content)
    
    # FICHIER TEMPORAIRES
    tmpdir = tempfile.mkdtemp(prefix="koda_")
    tmp = Path(tmpdir)

    history_entities = []
    bad_files = []

    BATCH = items_by_batch

    feed = gtfs_realtime_pb2.FeedMessage()

    # 1) liste des candidats
    archive_bytes.seek(0)
    with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
        candidates = [n for n in z.getnames() if n.lower().endswith(".pb")]

    print("Nb fichiers .pb:", len(candidates))

    # 2) extraction + parse par batch
    for i in range(0, len(candidates), BATCH):
        batch = candidates[i:i+BATCH]

        logger.info(
            "Batch %d–%d / %d (%.1f%%)",
            i + 1,
            min(i + BATCH, len(candidates)),
            len(candidates),
            100 * (i + len(batch)) / len(candidates)
        )

        try:
            archive_bytes.seek(0)
            with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
                z.extract(path=tmpdir, targets=batch)
        except Exception as e:
            # si un batch plante, on retombe en mode "un par un" juste pour ce batch
            for name in batch:
                try:
                    archive_bytes.seek(0)
                    with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
                        z.extract(path=tmpdir, targets=[name])
                except Exception as e2:
                    bad_files.append((name, f"ExtractError: {e2!r}"))
            continue

        for name in batch:
            p = tmp / name
            try:
                raw = p.read_bytes()
                feed.Clear()
                try:
                    feed.ParseFromString(raw)
                except DecodeError as de:
                    bad_files.append((name, f"DecodeError: {de!r}"))
                    continue

                for entity in feed.entity:
                    history_entities.append(copy.deepcopy(entity))

            except Exception as e:
                bad_files.append((name, f"Read/ParseError: {e!r}"))
            finally:
                p.unlink(missing_ok=True)

    shutil.rmtree(tmpdir, ignore_errors=True)

    print("✅ Total entités enregistrées:", len(history_entities))
    print("⚠️ Fichiers ignorés:", len(bad_files))
    print("Exemples:", bad_files[:5])

    return history_entities, bad_files

# Lit les fichier de référence .txt
def read_koda_reference_data(request, file_name):
    archive_bytes = io.BytesIO(request.content)

    with zipfile.ZipFile(archive_bytes, "r") as z:
        with z.open(f"{file_name}.txt") as f:
            text = io.TextIOWrapper(f, encoding="utf-8")
            reader = csv.DictReader(text)
            return list(reader)

In [5]:
# Crée une tableu de correspondance basé sur une colonne
def corr_array_creation(reference_data, id_key, ref_fields:tuple):
    """
    Création d'un tableau de correspondance pour gagner du temps au merge des dict().
    `reference_data`: ,
    `id_key`: nom de la column à mettre en avant,
    `ref_fields`: liste des colonnes à prendre. ('nom_1', "nom_2",...)
    """
    ref = {}
    for r in reference_data:
        tid = r.get(id_key)
        if not tid:
            continue
        # on stocke seulement ce dont on a besoin
        ref[tid] = {k: r.get(k) for k in ref_fields}
    
    return ref

In [28]:
#Choisi la structure de sortie
def flatten_history_entity_koda(history_items, trips_corr):
    # si pas de trip_update -> rien
    if not history_items.HasField("trip_update"):
        return  # stop (fonction generator: pas de yield)

    tu = history_items.trip_update
    trip = tu.trip
    tid = trip.trip_id

    # MERGE + FILTRE ici
    corr = trips_corr.get(tid)
    if corr is None:
        return  # trip_id pas choisi => on skip direct

    #route_id = corr.get("route_id")
    direction_id = corr.get("direction_id")

    start_date = trip.start_date if trip.HasField("start_date") else None
    feed_ts = tu.timestamp if tu.HasField("timestamp") else None
    vehicle_id = tu.vehicle.id if tu.HasField("vehicle") else None

    for stu in tu.stop_time_update:
        yield {
            "timestamp": feed_ts,
            "trip_id": tid,
            #"route_id": route_id,
            "direction_id": direction_id,
            #"start_date": start_date,
            #"vehicle_id": vehicle_id,
            "stop_sequence": stu.stop_sequence,
            #"arrival_time": stu.arrival.time if stu.HasField("arrival") else None,
            #"departure_time": stu.departure.time if stu.HasField("departure") else None,
            "arrival_delay": stu.arrival.delay if stu.HasField("arrival") else None,
            "departure_delay": stu.departure.delay if stu.HasField("departure") else None,
        }


---


In [7]:
# Execute les calls d'APIs
date_day = "2025-03-17"
r_history = call_koda_history_api(date_day)
r_reference = call_koda_reference_api(date_day)

https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-rt/sl/TripUpdates
<Response [200]>
70434472 BYTES
https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-static/sl
<Response [200]>
52437883 BYTES


In [8]:
#Récupération des données
#Lit les données d'historique par batch
history_entities, bad_files = read_koda_history_day(r_history, 500)

#Lit les données de références de route et trips
reference_routes = read_koda_reference_data(r_reference, "routes")
reference_trips = read_koda_reference_data(r_reference, "trips")

12:54:04 | INFO | Batch 1–500 / 6039 (8.3%)


Nb fichiers .pb: 6039


12:54:06 | INFO | Batch 501–1000 / 6039 (16.6%)
12:54:07 | INFO | Batch 1001–1500 / 6039 (24.8%)
12:54:10 | INFO | Batch 1501–2000 / 6039 (33.1%)
12:54:20 | INFO | Batch 2001–2500 / 6039 (41.4%)
12:54:31 | INFO | Batch 2501–3000 / 6039 (49.7%)
12:54:42 | INFO | Batch 3001–3500 / 6039 (58.0%)
12:54:53 | INFO | Batch 3501–4000 / 6039 (66.2%)
12:55:10 | INFO | Batch 4001–4500 / 6039 (74.5%)
12:55:35 | INFO | Batch 4501–5000 / 6039 (82.8%)
12:55:53 | INFO | Batch 5001–5500 / 6039 (91.1%)
12:56:09 | INFO | Batch 5501–6000 / 6039 (99.4%)
12:56:15 | INFO | Batch 6001–6039 / 6039 (100.0%)


✅ Total entités enregistrées: 4954694
⚠️ Fichiers ignorés: 0
Exemples: []


In [29]:
#Choix du numéro de bus
bus_number = "541"
route_ids_chosed = []
trip_bus_chosed = []

routes_bus_chosed = [
    r for r in reference_routes
    if str(r.get("route_short_name")) == bus_number
]

#Récupère tous les route_id de bus choisis, un ou plusieurs
for route in routes_bus_chosed:
    route_ids_chosed.append(route['route_id'])

# Récupère les trips id des routes choisies
for id in route_ids_chosed:
    current_route = [t for t in reference_trips if t["route_id"] == id]
    trip_bus_chosed.extend(current_route)


In [None]:
REF_TRIPS_CHOOSED_FIELDS = ("route_id", "direction_id")
final_data = []

#Fait un tableau de corrélation avec trips.txt
ref_trips_choosed_corr = corr_array_creation(trip_bus_chosed, "trip_id", REF_TRIPS_CHOOSED_FIELDS)
valid_trip_ids = set(ref_trips_choosed_corr.keys())

# Filtrer les données pour n'avoir que les trip_id correpondant au routes choisies
filtered_history = [
    e for e in history_entities
    if getattr(e, "trip_update", None)
    and getattr(e.trip_update, "trip", None)
    and e.trip_update.trip.trip_id in valid_trip_ids
]

for e in filtered_history:
    final_data.extend(flatten_history_entity_koda(e, ref_trips_choosed_corr))

for row in final_data:
    row.pop("trip_id", None)


    #### Ce qu'il y a en dessous c'est pour arrondir l'heure @Nadège
    ts = row.get("timestamp")

    ts_hour = ((ts + 1800) // 3600) * 3600
    row["timestamp_hour"] = ts_hour

    row["datetime_rounded"] = datetime.fromtimestamp(ts_hour, tz=timezone.utc).isoformat()
    row["hour"] = (row["timestamp_hour"] // 3600) % 24
    row.pop("timestamp_hour", None)
    #### Ce qu'il y a au-dessus c'est pour arrondir l'heure @Nadège


    # Mettre le numéro du bus pour entrainer le modèle avec plusieurs potentiel num de bus, à rendre dynamique si plusieurs
    row['bus_nbr'] = bus_number

In [None]:
filtered_data = []
MAX_PER_GROUP = 2

counts = defaultdict(int)

#Pour randomiser et ne pas prendre que les deux première data de la journée
random.shuffle(final_data)

for row in final_data:
    # Filtrer pour deux bus par heure et pas plus 
    key = (
        row.get("hour"),
        row.get("direction_id"),
        row.get("stop_sequence"),
    )

    if counts[key] < MAX_PER_GROUP:
        filtered_data.append(row)
        counts[key] += 1

In [85]:
df_filtred = pd.DataFrame(filtered_data)
df_filtred.head(20)

Unnamed: 0,timestamp,direction_id,stop_sequence,arrival_delay,departure_delay,datetime_rounded,hour,bus_nbr
0,1742236682,1,8,283,324,2025-03-17T19:00:00+00:00,19,541
1,1742236429,1,16,110,123,2025-03-17T19:00:00+00:00,19,541
2,1742215110,0,31,-64,-60,2025-03-17T13:00:00+00:00,13,541
3,1742192940,1,12,69,76,2025-03-17T06:00:00+00:00,6,541
4,1742230796,0,1,-120,-168,2025-03-17T17:00:00+00:00,17,541
5,1742236958,1,21,384,394,2025-03-17T19:00:00+00:00,19,541
6,1742191051,0,25,-265,-258,2025-03-17T06:00:00+00:00,6,541
7,1742203793,0,25,-93,-86,2025-03-17T09:00:00+00:00,9,541
8,1742228289,1,19,232,235,2025-03-17T16:00:00+00:00,16,541
9,1742235908,1,22,81,81,2025-03-17T18:00:00+00:00,18,541


In [51]:
#Observer les données (pour moi voir si ça fonctionne)
df = pd.DataFrame(final_data)
df.head(20)

Unnamed: 0,timestamp,direction_id,stop_sequence,arrival_delay,departure_delay,datetime_rounded,hour,bus_nbr
0,1742165813,1,27,43,43,2025-03-16T23:00:00+00:00,23,541
1,1742165813,1,28,42,42,2025-03-16T23:00:00+00:00,23,541
2,1742165813,1,29,-28,-28,2025-03-16T23:00:00+00:00,23,541
3,1742165813,1,30,-43,-43,2025-03-16T23:00:00+00:00,23,541
4,1742165813,1,31,-51,-51,2025-03-16T23:00:00+00:00,23,541
5,1742165813,1,32,-54,-54,2025-03-16T23:00:00+00:00,23,541
6,1742165813,1,33,-60,-60,2025-03-16T23:00:00+00:00,23,541
7,1742165813,1,34,-73,-73,2025-03-16T23:00:00+00:00,23,541
8,1742165813,1,35,-73,-73,2025-03-16T23:00:00+00:00,23,541
9,1742165813,1,36,-189,-189,2025-03-16T23:00:00+00:00,23,541


In [66]:
group_cols = ["hour", "direction_id", "stop_sequence"]

df_limited = (
    df
    .groupby(group_cols, group_keys=False)
    .sample(n=2, replace=False, random_state=42)
)
df_limited

Unnamed: 0,timestamp,direction_id,stop_sequence,arrival_delay,departure_delay,datetime_rounded,hour,bus_nbr
9514,1742167816,0,5,-5,-5,2025-03-17T00:00:00+00:00,0,541
9555,1742167835,0,5,-5,-5,2025-03-17T00:00:00+00:00,0,541
9636,1742167854,0,6,-10,10,2025-03-17T00:00:00+00:00,0,541
9794,1742167914,0,6,-10,10,2025-03-17T00:00:00+00:00,0,541
9516,1742167816,0,7,11,28,2025-03-17T00:00:00+00:00,0,541
...,...,...,...,...,...,...,...,...
8985,1742167556,1,34,-133,-133,2025-03-16T23:00:00+00:00,23,541
120,1742165985,1,35,-45,-45,2025-03-16T23:00:00+00:00,23,541
807615,1742250640,1,35,80,81,2025-03-17T23:00:00+00:00,23,541
808117,1742250670,1,36,-139,-59,2025-03-17T23:00:00+00:00,23,541


In [None]:
# exporter en json pour avoir un traces des données si jamais ça replante encore (mais à ne pas conserver)
import json

with open("../data/transport_koda_one_bus.json", "w", encoding="utf-8") as f:
    json.dump(final_data, f, ensure_ascii=False, indent=2)

In [87]:
# exporter en json pour avoir un traces des données filtrée si jamais ça replante encore (mais à ne pas conserver)
import json

with open("../data/transport_koda_one_bus_filtred_by_hours.json", "w", encoding="utf-8") as f:
    json.dump(filtered_data, f, ensure_ascii=False, indent=2)

In [76]:
final_data

[{'timestamp': 1742165813,
  'direction_id': '1',
  'stop_sequence': 27,
  'arrival_delay': 43,
  'departure_delay': 43,
  'datetime_rounded': '2025-03-16T23:00:00+00:00',
  'hour': 23,
  'bus_nbr': '541'},
 {'timestamp': 1742165813,
  'direction_id': '1',
  'stop_sequence': 28,
  'arrival_delay': 42,
  'departure_delay': 42,
  'datetime_rounded': '2025-03-16T23:00:00+00:00',
  'hour': 23,
  'bus_nbr': '541'},
 {'timestamp': 1742165813,
  'direction_id': '1',
  'stop_sequence': 29,
  'arrival_delay': -28,
  'departure_delay': -28,
  'datetime_rounded': '2025-03-16T23:00:00+00:00',
  'hour': 23,
  'bus_nbr': '541'},
 {'timestamp': 1742165813,
  'direction_id': '1',
  'stop_sequence': 30,
  'arrival_delay': -43,
  'departure_delay': -43,
  'datetime_rounded': '2025-03-16T23:00:00+00:00',
  'hour': 23,
  'bus_nbr': '541'},
 {'timestamp': 1742165813,
  'direction_id': '1',
  'stop_sequence': 31,
  'arrival_delay': -51,
  'departure_delay': -51,
  'datetime_rounded': '2025-03-16T23:00:00+00

In [78]:
from collections import defaultdict

MAX_PER_GROUP = 2

counts = defaultdict(int)
filtered_data = []

for row in final_data:
    key = (
        row.get("hour"),
        row.get("direction_id"),
        row.get("stop_sequence"),
    )

    if counts[key] < MAX_PER_GROUP:
        filtered_data.append(row)
        counts[key] += 1
