# Import

In [None]:
import pandas as pd
import requests
import json
import os
import io
import py7zr
import zipfile
import csv
import tempfile
from datetime import datetime
from pathlib import Path
from itertools import islice
from collections import defaultdict
from dotenv import load_dotenv
import shutil
import copy
from google.transit import gtfs_realtime_pb2
from google.protobuf.message import DecodeError
import logging

load_dotenv()

True

---

# Variables d'environnement

In [2]:
# API keys read from environment variables (load_dotenv() is called in the import cell).
# os.getenv returns None when the variable is not set.
KODA_KEY = os.getenv("API_KODA_KEY")
# Not referenced by any of the visible cells.
GTFS_RT_KEY = os.getenv("API_GTFS_RT_KEY")

---

# Fonctions

In [3]:
# API calls
def call_koda_api(base_url, date, operator = "sl", endpoint=""):
    """Send a GET request to the KoDa v2 API and return the response.

    Args:
        base_url: First path segment after `/api/v2/` (the callers in this
            notebook pass "gtfs-rt" or "gtfs-static").
        date: Value sent as the `date` query parameter.
        operator: Operator path segment (defaults to "sl").
        endpoint: Optional trailing path segment (e.g. "TripUpdates");
            omitted from the URL when empty.

    Returns:
        The `requests.Response` returned by `requests.get`.

    Raises:
        requests.HTTPError: when the server answers with a 4xx/5xx status.
    """
    api_url = f"https://api.koda.trafiklab.se/KoDa/api/v2/{base_url}/{operator}"
    if endpoint != "":
        api_url = f"{api_url}/{endpoint}"

    # The key is sent as a query parameter, so it is never part of the printed URL.
    params = {
        "date": date,
        "key": KODA_KEY
    }

    response = requests.get(api_url, params=params, timeout=30)
    print(api_url)
    print(response)
    print(len(response.content), "BYTES")

    # Fail here, with the HTTP status, instead of letting an error body reach
    # the archive readers where it would surface as an obscure archive error.
    response.raise_for_status()

    return response

def call_koda_history_api(date):
    """Request the "gtfs-rt" / "TripUpdates" resource for `date` through call_koda_api."""
    return call_koda_api("gtfs-rt", date, endpoint="TripUpdates")

def call_koda_reference_api(date):
    """Request the "gtfs-static" resource for `date` through call_koda_api."""
    return call_koda_api("gtfs-static", date)

In [4]:
# Opens the archive and reads one day of history, batch by batch
def read_koda_history_day(request, items_by_batch=400):
    """Parse every `.pb` member of a 7z archive into GTFS-RT feed entities.

    The archive bytes are taken from `request.content`. Members are extracted
    into a temporary directory `items_by_batch` files at a time, parsed as a
    `FeedMessage`, and deleted right after parsing. If extracting a whole
    batch fails, its members are extracted one by one so that only the
    faulty files are lost.

    Args:
        request: Object exposing the 7z archive bytes as `.content`.
        items_by_batch: Number of files extracted per batch (must be >= 1).

    Returns:
        A tuple `(history_entities, bad_files)`: the list of copied feed
        entities, and a list of `(file_name, error_description)` tuples for
        the files that could not be extracted, read or decoded.

    Raises:
        ValueError: if `items_by_batch` is smaller than 1.
    """
    if items_by_batch < 1:
        raise ValueError("items_by_batch must be >= 1")

    # basicConfig does nothing when the root logger is already configured,
    # so calling it on every invocation is harmless.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(message)s",
        datefmt="%H:%M:%S"
    )

    logger = logging.getLogger("gtfs")

    archive_bytes = io.BytesIO(request.content)

    history_entities = []
    bad_files = []

    BATCH = items_by_batch

    # A single message object is reused (cleared) for every file.
    feed = gtfs_realtime_pb2.FeedMessage()

    # 1) list the candidate files
    archive_bytes.seek(0)
    with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
        candidates = [n for n in z.getnames() if n.lower().endswith(".pb")]

    print("Nb fichiers .pb:", len(candidates))

    # TEMPORARY FILES
    tmpdir = tempfile.mkdtemp(prefix="koda_")
    tmp = Path(tmpdir)

    try:
        # 2) extraction + parsing, batch by batch
        for i in range(0, len(candidates), BATCH):
            batch = candidates[i:i+BATCH]

            logger.info(
                "Batch %d–%d / %d (%.1f%%)",
                i + 1,
                min(i + BATCH, len(candidates)),
                len(candidates),
                100 * (i + len(batch)) / len(candidates)
            )

            # Names that are actually on disk and must be parsed.
            extracted = batch
            try:
                archive_bytes.seek(0)
                with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
                    z.extract(path=tmpdir, targets=batch)
            except Exception:
                # If a batch fails, fall back to "one by one" for this batch
                # only, and keep the files that could be extracted so they
                # are still parsed below.
                extracted = []
                for name in batch:
                    try:
                        archive_bytes.seek(0)
                        with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
                            z.extract(path=tmpdir, targets=[name])
                    except Exception as e2:
                        bad_files.append((name, f"ExtractError: {e2!r}"))
                    else:
                        extracted.append(name)

            for name in extracted:
                p = tmp / name
                try:
                    raw = p.read_bytes()
                    feed.Clear()
                    try:
                        feed.ParseFromString(raw)
                    except DecodeError as de:
                        bad_files.append((name, f"DecodeError: {de!r}"))
                        continue

                    # Copy each entity because `feed` is cleared and reused
                    # for the next file.
                    for entity in feed.entity:
                        history_entities.append(copy.deepcopy(entity))

                except Exception as e:
                    bad_files.append((name, f"Read/ParseError: {e!r}"))
                finally:
                    # Free disk space as soon as the file has been handled.
                    p.unlink(missing_ok=True)
    finally:
        # Always remove the temporary directory, even when an error escapes.
        shutil.rmtree(tmpdir, ignore_errors=True)

    print("✅ Total entités enregistrées:", len(history_entities))
    print("⚠️ Fichiers ignorés:", len(bad_files))
    print("Exemples:", bad_files[:5])

    return history_entities, bad_files

In [5]:
def read_koda_reference_data(request, file_name):
    """Read one CSV member of a zip archive into a list of dicts.

    Args:
        request: Object exposing the zip archive bytes as `.content`.
        file_name: Member name without extension; `"{file_name}.txt"` is opened.

    Returns:
        One dict per data row, keyed by the header row (csv.DictReader rows).

    Raises:
        KeyError: if `"{file_name}.txt"` is not a member of the archive.
    """
    archive_bytes = io.BytesIO(request.content)

    with zipfile.ZipFile(archive_bytes, "r") as z:
        with z.open(f"{file_name}.txt") as f:
            # "utf-8-sig" drops a leading byte-order mark, which would
            # otherwise end up glued to the first column name; files without
            # a BOM decode exactly as with "utf-8".
            # newline="" lets the csv module handle line endings itself.
            text = io.TextIOWrapper(f, encoding="utf-8-sig", newline="")
            reader = csv.DictReader(text)
            return list(reader)

In [6]:
def corr_array_creation(reference_data, id_key, ref_fields:tuple):
    """
    Build a lookup table keyed by `id_key` to speed up merging of dicts.

    `reference_data`: iterable of dict rows,
    `id_key`: name of the column whose value becomes the lookup key,
    `ref_fields`: names of the columns to keep. ('name_1', "name_2",...)

    Rows whose `id_key` value is missing or empty are skipped. When several
    rows share the same key, the last one wins. A requested column that a
    row does not have is stored as None.
    """
    keyed_rows = ((row.get(id_key), row) for row in reference_data)

    # only the requested columns are stored
    return {
        key: {field: row.get(field) for field in ref_fields}
        for key, row in keyed_rows
        if key
    }

---

# Code

In [10]:
# Day to fetch; this string is sent as the `date` query parameter of the API calls
date_day = "2025-03-17"

In [11]:
# Download the "gtfs-rt" / "TripUpdates" archive for the chosen day
r_history = call_koda_history_api(date_day)

https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-rt/sl/TripUpdates
<Response [200]>
70434472 BYTES


In [12]:
# Download the "gtfs-static" archive for the chosen day
r_reference = call_koda_reference_api(date_day)

https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-static/sl
<Response [200]>
52437883 BYTES


In [13]:
# Parse the history archive, extracting 500 files per batch
history_entities, bad_files = read_koda_history_day(r_history, 500)

16:25:26 | INFO | Batch 1–500 / 6039 (8.3%)


Nb fichiers .pb: 6039


16:25:28 | INFO | Batch 501–1000 / 6039 (16.6%)
16:25:29 | INFO | Batch 1001–1500 / 6039 (24.8%)
16:25:32 | INFO | Batch 1501–2000 / 6039 (33.1%)
16:25:43 | INFO | Batch 2001–2500 / 6039 (41.4%)
16:25:58 | INFO | Batch 2501–3000 / 6039 (49.7%)
16:26:12 | INFO | Batch 3001–3500 / 6039 (58.0%)
16:26:24 | INFO | Batch 3501–4000 / 6039 (66.2%)
16:26:43 | INFO | Batch 4001–4500 / 6039 (74.5%)
16:27:08 | INFO | Batch 4501–5000 / 6039 (82.8%)
16:27:27 | INFO | Batch 5001–5500 / 6039 (91.1%)
16:27:44 | INFO | Batch 5501–6000 / 6039 (99.4%)
16:27:50 | INFO | Batch 6001–6039 / 6039 (100.0%)


✅ Total entités enregistrées: 4954694
⚠️ Fichiers ignorés: 0
Exemples: []


In [14]:
# Sanity check that something was actually received (for my own use):
# print the first 20 entities
for e in history_entities[:20]:
    print(e)

id: "14010516479723637"
trip_update {
  trip {
    trip_id: "14010000668271329"
    start_date: "20250316"
    schedule_relationship: SCHEDULED
  }
  vehicle {
    id: "9031001004302576"
  }
  stop_time_update {
    stop_sequence: 25
    stop_id: "9022001006071004"
    arrival {
      delay: -62
      time: 1742165338
      uncertainty: 0
    }
    departure {
      delay: 5
      time: 1742165405
      uncertainty: 0
    }
  }
  stop_time_update {
    stop_sequence: 26
    stop_id: "9022001006081002"
    arrival {
      delay: -58
      time: 1742165642
      uncertainty: 0
    }
    departure {
      delay: 14
      time: 1742165714
      uncertainty: 0
    }
  }
  stop_time_update {
    stop_sequence: 27
    stop_id: "9022001006091002"
    arrival {
      delay: -49
      time: 1742165951
      uncertainty: 0
    }
    departure {
      delay: 0
      time: 1742166000
    }
  }
  stop_time_update {
    stop_sequence: 28
    stop_id: "9022001006101001"
    arrival {
      delay: 0
  

In [15]:
# Load "routes.txt" from the static archive as a list of dicts and display it
reference_routes = read_koda_reference_data(r_reference, "routes")
reference_routes

[{'route_id': '9011001000100000',
  'agency_id': '14010000000001001',
  'route_short_name': '1',
  'route_long_name': '',
  'route_type': '700',
  'route_desc': 'blåbuss'},
 {'route_id': '9011001000200000',
  'agency_id': '14010000000001001',
  'route_short_name': '2',
  'route_long_name': '',
  'route_type': '700',
  'route_desc': 'blåbuss'},
 {'route_id': '9011001000300000',
  'agency_id': '14010000000001001',
  'route_short_name': '3',
  'route_long_name': '',
  'route_type': '700',
  'route_desc': 'blåbuss'},
 {'route_id': '9011001000400000',
  'agency_id': '14010000000001001',
  'route_short_name': '4',
  'route_long_name': '',
  'route_type': '700',
  'route_desc': 'blåbuss'},
 {'route_id': '9011001000600000',
  'agency_id': '14010000000001001',
  'route_short_name': '6',
  'route_long_name': '',
  'route_type': '700',
  'route_desc': 'blåbuss'},
 {'route_id': '9011001000700000',
  'agency_id': '14010000000001001',
  'route_short_name': '7',
  'route_long_name': 'Spårväg city',
 

In [16]:
# Choice of the bus number
bus_number = "541"

# Keep the routes whose route_short_name equals the chosen bus number
routes_bus_choosed = [
    r for r in reference_routes
    if str(r.get("route_short_name")) == bus_number
]

In [17]:
# Number of routes matching the chosen bus number
len(routes_bus_choosed)

1

In [18]:
# Display the matching route rows
routes_bus_choosed

[{'route_id': '9011001054100000',
  'agency_id': '14010000000001001',
  'route_short_name': '541',
  'route_long_name': '',
  'route_type': '700',
  'route_desc': ''}]

In [19]:
# Collect the route_id of every selected route, in the same order
route_ids_choosed = [route['route_id'] for route in routes_bus_choosed]

route_ids_choosed

['9011001054100000']

In [20]:
# Load "trips.txt" from the static archive as a list of dicts and display it
reference_trips = read_koda_reference_data(r_reference, "trips")
reference_trips

[{'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664276697',
  'trip_headsign': '',
  'direction_id': '1',
  'shape_id': '1014010000482329256'},
 {'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664277107',
  'trip_headsign': '',
  'direction_id': '1',
  'shape_id': '1014010000482329256'},
 {'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664278217',
  'trip_headsign': '',
  'direction_id': '1',
  'shape_id': '1014010000482329256'},
 {'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664283844',
  'trip_headsign': '',
  'direction_id': '1',
  'shape_id': '1014010000482329256'},
 {'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664284724',
  'trip_headsign': '',
  'direction_id': '1',
  'shape_id': '1014010000482329256'},
 {'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664287226',
  'trip_headsign': '',
  'direction_id': '1',


In [21]:
# Trips of the selected routes, grouped route by route in the order of
# route_ids_choosed. The comprehension keeps its loop variables local, so no
# kernel-wide name (in particular the built-in `id`) gets rebound by this cell.
trip_bus_choosed = [
    trip
    for route_id in route_ids_choosed
    for trip in reference_trips
    if trip["route_id"] == route_id
]

print(len(trip_bus_choosed))
print(trip_bus_choosed)

353
[{'route_id': '9011001054100000', 'service_id': '1', 'trip_id': '14010000598814991', 'trip_headsign': '', 'direction_id': '1', 'shape_id': '1014010000581930482'}, {'route_id': '9011001054100000', 'service_id': '1', 'trip_id': '14010000598815299', 'trip_headsign': '', 'direction_id': '1', 'shape_id': '1014010000581930482'}, {'route_id': '9011001054100000', 'service_id': '1', 'trip_id': '14010000598815401', 'trip_headsign': '', 'direction_id': '1', 'shape_id': '1014010000581930482'}, {'route_id': '9011001054100000', 'service_id': '9', 'trip_id': '14010000598821844', 'trip_headsign': '', 'direction_id': '1', 'shape_id': '1014010000581930482'}, {'route_id': '9011001054100000', 'service_id': '9', 'trip_id': '14010000598822048', 'trip_headsign': '', 'direction_id': '1', 'shape_id': '1014010000581930482'}, {'route_id': '9011001054100000', 'service_id': '9', 'trip_id': '14010000598822153', 'trip_headsign': '', 'direction_id': '1', 'shape_id': '1014010000581930482'}, {'route_id': '901100105

In [22]:
# Columns of the trip rows kept in the lookup table
REF_TRIPS_CHOOSED_FIELDS = ("route_id", "direction_id")

# Lookup table: trip_id -> {"route_id": ..., "direction_id": ...} for the selected trips
ref_trips_choosed_corr = corr_array_creation(trip_bus_choosed, "trip_id", REF_TRIPS_CHOOSED_FIELDS)
ref_trips_choosed_corr

{'14010000598814991': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000598815299': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000598815401': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000598821844': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000598822048': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000598822153': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000612338984': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000612339172': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000598814787': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000598814888': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000598815607': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000598821948': {'route_id': '9011001054100000', 'direction_id': '1'},
 '14010000602771155': {'route_id': '9011001054100000', 'direction_id': '1'},

In [None]:
# ATTENTION : HISTORY DATA N'AFFICHE PAS ROUTES ET TRIPS, il n'affiche que les données brutes
# Récupérer 2 bus par heure, c'est suffisant (même numéro de bus)
# Et faire pour 365 jours et pas que 1 jour

In [25]:
# Set of the selected trip_ids, used for membership tests when filtering the history
valid_trip_ids = set(ref_trips_choosed_corr.keys())

In [None]:
# Keep only the entities whose trip_update refers to one of the selected trip_ids.
# NOTE(review): the two getattr(...) guards presumably never exclude anything for
# protobuf messages (sub-message attributes always exist) — confirm; the effective
# filter looks like it is the trip_id membership test alone.
filtered_history = [
    e for e in history_entities
    if getattr(e, "trip_update", None)
    and getattr(e.trip_update, "trip", None)
    and e.trip_update.trip.trip_id in valid_trip_ids
]
filtered_history

In [28]:
# Number of history entities kept for the selected trips
len(filtered_history)

36483

In [40]:
def _set_subfield_or_none(message, field, subfield):
    """Return `message.<field>.<subfield>` when both are explicitly set, else None."""
    if not message.HasField(field):
        return None
    sub_message = getattr(message, field)
    return getattr(sub_message, subfield) if sub_message.HasField(subfield) else None


def flatten_history_entity_koda(history_items, trips_corr):
    """Yield one flat dict per stop_time_update of a selected trip.

    Args:
        history_items: A single feed entity (must support `HasField` and
            expose `trip_update`).
        trips_corr: Mapping trip_id -> dict holding at least "direction_id";
            entities whose trip_id is not a key are skipped.

    Yields:
        Dicts with the keys trip_id, direction_id, start_date, vehicle_id,
        timestamp, stop_sequence, arrival_delay, arrival_time,
        departure_delay and departure_time. A delay or time is None when the
        arrival/departure event, or that field inside it, is not set, so a
        missing value cannot be mistaken for a real 0.
    """
    # no trip_update -> nothing to yield
    if not history_items.HasField("trip_update"):
        return  # stop (generator function: no yield)

    tu = history_items.trip_update
    trip = tu.trip
    tid = trip.trip_id

    # MERGE + FILTER happen here
    corr = trips_corr.get(tid)
    if corr is None:
        return  # trip_id was not selected => skip right away

    direction_id = corr.get("direction_id")

    start_date = trip.start_date if trip.HasField("start_date") else None
    feed_ts = tu.timestamp if tu.HasField("timestamp") else None
    vehicle_id = tu.vehicle.id if tu.HasField("vehicle") else None

    for stu in tu.stop_time_update:
        yield {
            "trip_id": tid,
            "direction_id": direction_id,
            "start_date": start_date,
            "vehicle_id": vehicle_id,
            "timestamp": feed_ts,
            "stop_sequence": stu.stop_sequence,
            # Reading an unset optional scalar returns its default (0), so
            # presence is checked on the inner field as well as on the event.
            "arrival_delay": _set_subfield_or_none(stu, "arrival", "delay"),
            "arrival_time": _set_subfield_or_none(stu, "arrival", "time"),
            "departure_delay": _set_subfield_or_none(stu, "departure", "delay"),
            "departure_time": _set_subfield_or_none(stu, "departure", "time"),
        }


In [41]:
# Flatten every filtered entity into one row per stop_time_update
final_data = [
    row
    for entity in filtered_history
    for row in flatten_history_entity_koda(entity, ref_trips_choosed_corr)
]


In [44]:
# Build the DataFrame from the flattened rows and display a random sample of 20 rows.
# No random_state is passed, so the sampled rows change on every run.
df = pd.DataFrame(final_data)
df.sample(20)

Unnamed: 0,trip_id,direction_id,start_date,vehicle_id,timestamp,stop_sequence,arrival_delay,arrival_time,departure_delay,departure_time
693169,14010000670438373,0,20250317,9031001003003507,1742239070,20,164,1742238965,164,1742238965
150200,14010000670421996,1,20250317,9031001003003528,1742195184,12,322,1742194703,334,1742194715
299733,14010000644158677,1,20250317,9031001003003568,1742208833,30,280,1742208429,300,1742208449
618168,14010000670437228,0,20250317,9031001003003547,1742232485,2,20,1742232571,26,1742232577
491153,14010000670431703,1,20250317,9031001003003514,1742224507,26,170,1742225232,179,1742225241
535998,14010000670433773,0,20250317,9031001003000111,1742227422,33,-77,1742228614,-67,1742228624
433746,14010000670430125,1,20250317,9031001003000111,1742220958,13,129,1742221696,139,1742221706
344525,14010000670426854,0,20250317,9031001003003565,1742213097,31,39,1742212642,60,1742212663
86354,14010000612338274,1,20250317,9031001003007398,1742190697,8,-52,1742190428,25,1742190505
464992,14010000670431092,1,20250317,9031001003003583,1742222921,34,211,1742225106,218,1742225113
