In [1]:
import pandas as pd
import requests
import json
import os
import io
import py7zr
import zipfile
import csv
import tempfile
from datetime import datetime
from pathlib import Path
from itertools import islice
from collections import defaultdict
from dotenv import load_dotenv
import shutil
import copy
from google.transit import gtfs_realtime_pb2
from google.protobuf.message import DecodeError
import logging

load_dotenv()

True

In [2]:
KODA_KEY = os.getenv("API_KODA_KEY")
GTFS_RT_KEY = os.getenv("API_GTFS_RT_KEY")  # clé GTFS Regional Realtime

In [3]:
def call_koda_api(base_url, date, operator = "sl", endpoint=""):
    if endpoint != "":
        api_url = f"https://api.koda.trafiklab.se/KoDa/api/v2/{base_url}/{operator}/{endpoint}"
    else:
        api_url = f"https://api.koda.trafiklab.se/KoDa/api/v2/{base_url}/{operator}"
        
    params = {
        "date": date, 
        "key": KODA_KEY
    }
    
    request = requests.get(f"{api_url}", params=params, timeout=30)
    print(api_url)
    print(request)
    print(len(request.content), "BYTES")

    return request

def call_koda_history_api(date):
    request = call_koda_api("gtfs-rt", date, endpoint="TripUpdates")
    return request

def call_koda_reference_api(date):
    request = call_koda_api("gtfs-static", date)
    return request

In [4]:
date_day = "2025-03-17"

In [5]:
r_history = call_koda_history_api(date_day)

https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-rt/sl/TripUpdates
<Response [200]>
70434472 BYTES


In [6]:
r_reference = call_koda_reference_api(date_day)

https://api.koda.trafiklab.se/KoDa/api/v2/gtfs-static/sl
<Response [200]>
52437883 BYTES


In [7]:
def read_koda_history_day(request, items_by_batch=400):
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(message)s",
        datefmt="%H:%M:%S"
    )

    logger = logging.getLogger("gtfs")

    archive_bytes = io.BytesIO(request.content)
    
    # FICHIER TEMPORAIRES
    tmpdir = tempfile.mkdtemp(prefix="koda_")
    tmp = Path(tmpdir)

    history_entities = []
    bad_files = []

    BATCH = items_by_batch

    feed = gtfs_realtime_pb2.FeedMessage()

    # 1) liste des candidats
    archive_bytes.seek(0)
    with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
        candidates = [n for n in z.getnames() if n.lower().endswith(".pb")]

    print("Nb fichiers .pb:", len(candidates))

    # 2) extraction + parse par batch
    for i in range(0, len(candidates), BATCH):
        batch = candidates[i:i+BATCH]

        logger.info(
            "Batch %d–%d / %d (%.1f%%)",
            i + 1,
            min(i + BATCH, len(candidates)),
            len(candidates),
            100 * (i + len(batch)) / len(candidates)
        )

        try:
            archive_bytes.seek(0)
            with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
                z.extract(path=tmpdir, targets=batch)
        except Exception as e:
            # si un batch plante, on retombe en mode "un par un" juste pour ce batch
            for name in batch:
                try:
                    archive_bytes.seek(0)
                    with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
                        z.extract(path=tmpdir, targets=[name])
                except Exception as e2:
                    bad_files.append((name, f"ExtractError: {e2!r}"))
            continue

        for name in batch:
            p = tmp / name
            try:
                raw = p.read_bytes()
                feed.Clear()
                try:
                    feed.ParseFromString(raw)
                except DecodeError as de:
                    bad_files.append((name, f"DecodeError: {de!r}"))
                    continue

                for entity in feed.entity:
                    history_entities.append(copy.deepcopy(entity))

            except Exception as e:
                bad_files.append((name, f"Read/ParseError: {e!r}"))
            finally:
                p.unlink(missing_ok=True)

    shutil.rmtree(tmpdir, ignore_errors=True)

    print("✅ Total entités enregistrées:", len(history_entities))
    print("⚠️ Fichiers ignorés:", len(bad_files))
    print("Exemples:", bad_files[:5])

    return history_entities, bad_files

In [8]:
history_entities, bad_files = read_koda_history_day(r_history, 500)

14:42:12 | INFO | Batch 1–500 / 6039 (8.3%)


Nb fichiers .pb: 6039


14:42:14 | INFO | Batch 501–1000 / 6039 (16.6%)
14:42:15 | INFO | Batch 1001–1500 / 6039 (24.8%)
14:42:18 | INFO | Batch 1501–2000 / 6039 (33.1%)
14:42:28 | INFO | Batch 2001–2500 / 6039 (41.4%)
14:42:39 | INFO | Batch 2501–3000 / 6039 (49.7%)
14:42:50 | INFO | Batch 3001–3500 / 6039 (58.0%)
14:43:01 | INFO | Batch 3501–4000 / 6039 (66.2%)
14:43:19 | INFO | Batch 4001–4500 / 6039 (74.5%)
14:43:44 | INFO | Batch 4501–5000 / 6039 (82.8%)
14:44:04 | INFO | Batch 5001–5500 / 6039 (91.1%)
14:44:20 | INFO | Batch 5501–6000 / 6039 (99.4%)
14:44:26 | INFO | Batch 6001–6039 / 6039 (100.0%)


✅ Total entités enregistrées: 4954694
⚠️ Fichiers ignorés: 0
Exemples: []


In [9]:
for e in history_entities[:20]:
    print(e)

id: "14010516479723637"
trip_update {
  trip {
    trip_id: "14010000668271329"
    start_date: "20250316"
    schedule_relationship: SCHEDULED
  }
  vehicle {
    id: "9031001004302576"
  }
  stop_time_update {
    stop_sequence: 25
    stop_id: "9022001006071004"
    arrival {
      delay: -62
      time: 1742165338
      uncertainty: 0
    }
    departure {
      delay: 5
      time: 1742165405
      uncertainty: 0
    }
  }
  stop_time_update {
    stop_sequence: 26
    stop_id: "9022001006081002"
    arrival {
      delay: -58
      time: 1742165642
      uncertainty: 0
    }
    departure {
      delay: 14
      time: 1742165714
      uncertainty: 0
    }
  }
  stop_time_update {
    stop_sequence: 27
    stop_id: "9022001006091002"
    arrival {
      delay: -49
      time: 1742165951
      uncertainty: 0
    }
    departure {
      delay: 0
      time: 1742166000
    }
  }
  stop_time_update {
    stop_sequence: 28
    stop_id: "9022001006101001"
    arrival {
      delay: 0
  

In [10]:
len(history_entities)

4954694

In [11]:
def read_koda_reference_data(request, file_name):
    archive_bytes = io.BytesIO(request.content)

    with zipfile.ZipFile(archive_bytes, "r") as z:
        with z.open(f"{file_name}.txt") as f:
            text = io.TextIOWrapper(f, encoding="utf-8")
            reader = csv.DictReader(text)
            return list(reader)

In [12]:
reference_trips = read_koda_reference_data(r_reference, "trips")
reference_trips

[{'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664276697',
  'trip_headsign': '',
  'direction_id': '1',
  'shape_id': '1014010000482329256'},
 {'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664277107',
  'trip_headsign': '',
  'direction_id': '1',
  'shape_id': '1014010000482329256'},
 {'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664278217',
  'trip_headsign': '',
  'direction_id': '1',
  'shape_id': '1014010000482329256'},
 {'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664283844',
  'trip_headsign': '',
  'direction_id': '1',
  'shape_id': '1014010000482329256'},
 {'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664284724',
  'trip_headsign': '',
  'direction_id': '1',
  'shape_id': '1014010000482329256'},
 {'route_id': '9011001000100000',
  'service_id': '1',
  'trip_id': '14010000664287226',
  'trip_headsign': '',
  'direction_id': '1',


In [13]:
reference_routes = read_koda_reference_data(r_reference, "routes")
reference_routes

[{'route_id': '9011001000100000',
  'agency_id': '14010000000001001',
  'route_short_name': '1',
  'route_long_name': '',
  'route_type': '700',
  'route_desc': 'blåbuss'},
 {'route_id': '9011001000200000',
  'agency_id': '14010000000001001',
  'route_short_name': '2',
  'route_long_name': '',
  'route_type': '700',
  'route_desc': 'blåbuss'},
 {'route_id': '9011001000300000',
  'agency_id': '14010000000001001',
  'route_short_name': '3',
  'route_long_name': '',
  'route_type': '700',
  'route_desc': 'blåbuss'},
 {'route_id': '9011001000400000',
  'agency_id': '14010000000001001',
  'route_short_name': '4',
  'route_long_name': '',
  'route_type': '700',
  'route_desc': 'blåbuss'},
 {'route_id': '9011001000600000',
  'agency_id': '14010000000001001',
  'route_short_name': '6',
  'route_long_name': '',
  'route_type': '700',
  'route_desc': 'blåbuss'},
 {'route_id': '9011001000700000',
  'agency_id': '14010000000001001',
  'route_short_name': '7',
  'route_long_name': 'Spårväg city',
 

In [27]:
def corr_array_creation(reference_data, id_key, ref_fields:tuple):
    """
    Création d'un tableau de correspondance pour gagner du temps au merge des dict().
    `reference_data`: ,
    `id_key`: nom de la column à mettre en avant,
    `ref_fields`: liste des colonnes à prendre. ('nom_1', "nom_2",...)
    """
    ref = {}
    for r in reference_data:
        tid = r.get(id_key)
        if not tid:
            continue
        # on stocke seulement ce dont on a besoin
        ref[tid] = {k: r.get(k) for k in ref_fields}
    
    return ref

In [28]:
REF_TRIPS_FIELDS = ("route_id", "direction_id")
#ATTENTION ERREUR PAS DE CONTENU
ref_trips_corr = corr_array_creation(reference_trips, "trip_id", REF_TRIPS_FIELDS)
ref_trips_corr

{'14010000664276697': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664277107': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664278217': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664283844': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664284724': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664287226': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664282006': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664309301': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664308746': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664315774': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664304117': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664317010': {'route_id': '9011001000100000', 'direction_id': '1'},
 '14010000664321553': {'route_id': '9011001000100000', 'direction_id': '1'},

In [30]:
REF_ROUTES_FIELDS = ("route_short_name", "route_type")

ref_routes_corr = corr_array_creation(reference_routes, "route_id", REF_ROUTES_FIELDS)
ref_routes_corr

{'9011001000100000': {'route_short_name': '1', 'route_type': '700'},
 '9011001000200000': {'route_short_name': '2', 'route_type': '700'},
 '9011001000300000': {'route_short_name': '3', 'route_type': '700'},
 '9011001000400000': {'route_short_name': '4', 'route_type': '700'},
 '9011001000600000': {'route_short_name': '6', 'route_type': '700'},
 '9011001000700000': {'route_short_name': '7', 'route_type': '900'},
 '9011001001000000': {'route_short_name': '10', 'route_type': '401'},
 '9011001001100000': {'route_short_name': '11', 'route_type': '401'},
 '9011001001200000': {'route_short_name': '12', 'route_type': '900'},
 '9011001001300000': {'route_short_name': '13', 'route_type': '401'},
 '9011001001400000': {'route_short_name': '14', 'route_type': '401'},
 '9011001001700000': {'route_short_name': '17', 'route_type': '401'},
 '9011001001800000': {'route_short_name': '18', 'route_type': '401'},
 '9011001001900000': {'route_short_name': '19', 'route_type': '401'},
 '9011001002100000': {'rou

In [18]:
def flatten_history_entity_koda(history_items):
    """Applatit le dict en un seul niveau. `history_item` égale une ligne de données, une entrée."""
    if not history_items.HasField("trip_update"):
        return

    tu = history_items.trip_update
    trip = tu.trip

    tid = trip.trip_id
    start_date = trip.start_date if trip.HasField("start_date") else None
    feed_ts = tu.timestamp if tu.HasField("timestamp") else None
    vehicle_id = tu.vehicle.id if tu.HasField("vehicle") else None

    for stu in tu.stop_time_update:
        yield {
            "entity_id": e.id,
            "trip_id": tid,
            "start_date": start_date,
            "vehicle_id": vehicle_id,
            "feed_ts": feed_ts,
            "schedule_relationship": trip.schedule_relationship, #VERIFIER QUE JE RECOIS BIEN CORRECTEMENT

            "stop_id": stu.stop_id,
            "stop_sequence": stu.stop_sequence,

            "arrival_delay": stu.arrival.delay if stu.HasField("arrival") else None,
            "arrival_time": stu.arrival.time if stu.HasField("arrival") else None,
            "arrival_uncertainty": stu.arrival.uncertainty if stu.HasField("arrival") else None,

            "departure_delay": stu.departure.delay if stu.HasField("departure") else None,
            "departure_time": stu.departure.time if stu.HasField("departure") else None,
            "departure_uncertainty": stu.departure.uncertainty if stu.HasField("departure") else None,
        }


In [19]:
def enrich_row_inplace_koda(row, lookup, id_key):
    """
    Enrichis le dictionnaire avec une autre dictionnaire basé sur un id. 
    `row`: ligne à adapté avec la référence, 
    `lookup`: dictionnaire d'index,
    `id_key`: La clef sur la quel on va merger les datas.
    """
    ref = lookup.get(row[id_key])
    if ref:
        row.update(ref)
    return row

def enrich_many_inplace_koda(row, lookups):
    """
    lookups = liste de tuples: (id_key, lookup_dict)
    ex: [("trip_id", trip_lookup), ("route_id", route_lookup)]
    """
    for id_key, lookup in lookups:
        key_val = row.get(id_key)
        if not key_val:
            continue
        ref = lookup.get(key_val)
        if ref:
            row.update(ref)
    return row


In [20]:
def iter_history_row_koda(entities, lookups):
    """
    Fais une boucle sur l'applatissemnt des données et le merge
    `entities`: les données,
    `lookup`: dictionnaire d'index,
    `id_key`: La clef sur la quel on va merger les datas.
    """
    for e in entities:
        # Applatit les données
        for row in flatten_history_entity_koda(e):
            # Enrichis/merge les données
            yield enrich_many_inplace_koda(row, lookups)

In [29]:
history_lookups = [
    ("trip_id", ref_trips_corr),
    ("route_id", ref_routes_corr),
#    ("shape_id", ref_shapes_corr),
]
history_data = iter_history_row_koda(history_entities, history_lookups)

In [30]:
print(next(history_data))

{'entity_id': '14010516613288742', 'trip_id': '14010000668271329', 'start_date': '20250316', 'vehicle_id': '9031001004302576', 'feed_ts': 1742165942, 'schedule_relationship': 0, 'stop_id': '9022001006071004', 'stop_sequence': 25, 'arrival_delay': -62, 'arrival_time': 1742165338, 'arrival_uncertainty': 0, 'departure_delay': 5, 'departure_time': 1742165405, 'departure_uncertainty': 0, 'route_id': '9011001004300000', 'direction_id': '0', 'route_short_name': '43', 'route_type': '100'}


In [23]:
# ATTENTION HISTORY DATA N'AFFICHE PAS ROUTES ET TRIPS, il n'affiche que les data brutes
# Recupérer 2 bus par heure c'est suffisant (meme numéro de bus)
# Et faire pour 365 jours et pas que 1 jours

In [24]:
#from itertools import islice

#sample = list(islice(history_data, 2000))
#df_sample = pd.DataFrame(sample)
#df_sample


In [25]:
#df_sample[df_sample['route_type'] == "700"].groupby('route_short_name').count().sort_values(by="route_id", ascending=False)

In [26]:
##from itertools import islice

#bus_number = 18

#filtered_rows = (
    #row for row in history_data
    #if row.get("route_short_name") == bus_number
#)

#df_bus_18 = pd.DataFrame(filtered_rows)


In [None]:
#df_bus_18

In [31]:
bus_number = "18"

rows_bus_18 = [
    row for row in history_data
    if row.get("route_short_name") is not None
    and str(row.get("route_short_name")) == bus_number
]

In [None]:
print(history_data)

<generator object iter_history_row_koda at 0x1a8390f40>


In [32]:
rows_bus_18

[{'entity_id': '14010516613288742',
  'trip_id': '14010000644871372',
  'start_date': '20250316',
  'vehicle_id': '9031001002500145',
  'feed_ts': 1742165987,
  'schedule_relationship': 0,
  'stop_id': '9022001001011001',
  'stop_sequence': 13,
  'arrival_delay': -68,
  'arrival_time': 1742165302,
  'arrival_uncertainty': 0,
  'departure_delay': 58,
  'departure_time': 1742165428,
  'departure_uncertainty': 0,
  'route_id': '9011001001800000',
  'direction_id': '1',
  'route_short_name': '18',
  'route_type': '401'},
 {'entity_id': '14010516613288742',
  'trip_id': '14010000644871372',
  'start_date': '20250316',
  'vehicle_id': '9031001002500145',
  'feed_ts': 1742165987,
  'schedule_relationship': 0,
  'stop_id': '9022001001021001',
  'stop_sequence': 14,
  'arrival_delay': 8,
  'arrival_time': 1742165468,
  'arrival_uncertainty': 0,
  'departure_delay': 52,
  'departure_time': 1742165512,
  'departure_uncertainty': 0,
  'route_id': '9011001001800000',
  'direction_id': '1',
  'route