In [None]:
import pandas as pd
import requests
import json
import os
import io
import py7zr
from google.transit import gtfs_realtime_pb2
import tempfile
from datetime import datetime
from pathlib import Path
from itertools import islice
from collections import defaultdict
from dotenv import load_dotenv

load_dotenv()

In [None]:
api_GTS_RT_KEY = os.getenv("API_GTFS_RT_KEY")
api_koda_key = os.getenv("API_KODA_KEY")


host_api = "https://api.koda.trafiklab.se/KoDa/api/v2"
base_path = "/gtfs-rt"
location_path = "/sl"
type_path = "/TripUpdates"
api_url = host_api + base_path + location_path + type_path

params = {
    "date": "2025-04-09", 
    "key": api_koda_key
}

In [3]:
r = requests.get(f"{api_url}", params=params, timeout=20)

print(r)
print("---"*3)
print(len(r.content), "Bytes")
print("---"*3)
print("Status:", r.status_code)
print("Content-Type:", r.headers.get("Content-Type"))
print("Content-Encoding:", r.headers.get("Content-Encoding"))
print("Début (bytes):", r.content[:8])

<Response [200]>
---------
72638308 Bytes
---------
Status: 200
Content-Type: application/x-7z-compressed
Content-Encoding: None
Début (bytes): b"7z\xbc\xaf'\x1c\x00\x04"


In [8]:
archive_bytes = io.BytesIO(r.content)

with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
    names = z.getnames()

# 1) Essayer d'abord les extensions classiques
candidates = [n for n in names if n.lower().endswith((".pb"))] # autres format possible mais non trouvé ".pbf", ".bin", ".protobuf"

# 2) Si pas d'extension, on détecte les "vrais fichiers" :
if not candidates:
    name_set = set(names)
    candidates = []
    for n in names:
        # si "n/<quelquechose>" existe, alors n est un dossier/prefixe
        is_prefix = any(other.startswith(n + "/") for other in names)
        if not is_prefix:
            candidates.append(n)

print("Total entrées:", len(names))
print("Candidats fichiers:", len(candidates))
print("Exemples candidats:", candidates[:10])


Total entrées: 6088
Candidats fichiers: 6059
Exemples candidats: ['sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-08T23-59-40Z.pb', 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-08T23-59-54Z.pb', 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-00-08Z.pb', 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-00-22Z.pb', 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-00-36Z.pb', 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-00-50Z.pb', 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-01-04Z.pb', 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-01-18Z.pb', 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-01-32Z.pb', 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-01-46Z.pb']


In [None]:
# FAIT LE TOUR D4UNE JOURNEE POUR UNE LIGNE DE BUS
ROUTE_ID = "9011001001800000" # ligne 18
BATCH_SIZE = 200

def chunked(iterable, size):
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            return
        yield chunk

def safe_unlink(p: Path):
    try:
        p.unlink()
    except FileNotFoundError:
        pass

archive_bytes = io.BytesIO(r.content)
archive_bytes.seek(0)

tmpdir = tempfile.mkdtemp(prefix="gtfsrt_")
tmp = Path(tmpdir)

feed = gtfs_realtime_pb2.FeedMessage()
records = defaultdict(list)
bad_files = []

with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
    candidates = [n for n in z.getnames() if n.endswith(".pb")]

    for batch in chunked(candidates, BATCH_SIZE):
        # 1) tentative extraction batch (rapide)
        try:
            z.extract(path=tmpdir, targets=batch)
            extracted = batch
        except Exception as e:
            # 2) si CRC error (ou autre), on tente fichier par fichier pour isoler le(s) mauvais
            extracted = []
            for one in batch:
                try:
                    z.extract(path=tmpdir, targets=[one])
                    extracted.append(one)
                except Exception as e_one:
                    bad_files.append((one, repr(e_one)))
                    # on skip ce fichier
                    continue

        # 3) parser ce qui a été extrait
        for member in extracted:
            p = tmp / member
            if not p.exists():
                continue

            raw = p.read_bytes()
            feed.Clear()
            feed.ParseFromString(raw)

            ts = int(feed.header.timestamp) if feed.header.HasField("timestamp") else 0

            for e in feed.entity:
                if not e.HasField("trip_update"):
                    continue
                tu = e.trip_update
                if tu.trip.route_id != ROUTE_ID:
                    continue

                trip_id = tu.trip.trip_id
                start_date = tu.trip.start_date

                for stu in tu.stop_time_update:
                    records[ts].append({
                        "ts": ts,
                        "trip_id": trip_id,
                        "start_date": start_date,
                        "stop_id": stu.stop_id,
                        "arr_delay": stu.arrival.delay if stu.HasField("arrival") and stu.arrival.HasField("delay") else None,
                        "dep_delay": stu.departure.delay if stu.HasField("departure") and stu.departure.HasField("delay") else None,
                    })

            safe_unlink(p)

print("✅ Terminé.")
print("Bad files:", len(bad_files))
if bad_files:
    print("Exemples bad files:")
    for bf in bad_files[:10]:
        print(bf[0], bf[1])

# Affichage trié
total = sum(len(v) for v in records.values())
print(f"Route {ROUTE_ID} -> {total} enregistrements")
for ts in sorted(records.keys()):
    print(ts, "->", len(records[ts]))


✅ Terminé.
Bad files: 5858
Exemples bad files:
sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-47-02Z.pb CrcError(2607712406, 2657475990, 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-08T23-59-40Z.pb')
sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-47-16Z.pb CrcError(1546658743, 2657475990, 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-08T23-59-40Z.pb')
sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-47-30Z.pb CrcError(4152953785, 2657475990, 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-08T23-59-40Z.pb')
sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-47-44Z.pb CrcError(2380286395, 2657475990, 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-08T23-59-40Z.pb')
sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-47-58Z.pb CrcError(2984422355, 2657475990, 'sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-08T23-59-40Z.pb')
sl/TripUpdates/2025/04/09/00/sl-tripupdates-2025-04-09T00-48-12Z.pb CrcError(3486764571, 2657

In [12]:
print("Nombre de timestamps:", len(records))

Nombre de timestamps: 142


In [None]:
for ts, rows in records.items():
    print("\n" + "=" * 80)
    print(f"Timestamp: {ts}  |  nb lignes: {len(rows)}")
    print("=" * 80)

    for row in rows:
        print(row)


Timestamp: 1744149601  |  nb lignes: 1
{'ts': 1744149601, 'trip_id': '', 'start_date': '20250408', 'stop_id': '9022001001881001', 'arr_delay': -3161, 'dep_delay': -3161}

Timestamp: 1744149618  |  nb lignes: 2
{'ts': 1744149618, 'trip_id': '', 'start_date': '20250408', 'stop_id': '9022001001881001', 'arr_delay': -3161, 'dep_delay': -3161}
{'ts': 1744149618, 'trip_id': '', 'start_date': '20250408', 'stop_id': '9022001001881001', 'arr_delay': -3161, 'dep_delay': -3161}

Timestamp: 1744149636  |  nb lignes: 1
{'ts': 1744149636, 'trip_id': '', 'start_date': '20250408', 'stop_id': '9022001001881001', 'arr_delay': -3161, 'dep_delay': -3161}

Timestamp: 1744149653  |  nb lignes: 1
{'ts': 1744149653, 'trip_id': '', 'start_date': '20250408', 'stop_id': '9022001001881001', 'arr_delay': -3161, 'dep_delay': -3161}

Timestamp: 1744149671  |  nb lignes: 2
{'ts': 1744149671, 'trip_id': '', 'start_date': '20250408', 'stop_id': '9022001001881001', 'arr_delay': -3161, 'dep_delay': -3161}
{'ts': 1744149

In [None]:
rows = []

for e in feed.entity:
    if not e.HasField("trip_update"):
        continue

    tu = e.trip_update

    tr = tu.trip
    
    for stu in tu.stop_time_update:
        row = {
            "entity_id": e.id,
            "trip_id": tr.trip_id,
            "route_id": tr.route_id,
            "start_date": tr.start_date,
            "schedule_relationship": tr.schedule_relationship,
            "vehicle_id": tu.vehicle.id if tu.HasField("vehicle") else None,

            "stop_sequence": stu.stop_sequence,
            "stop_id": stu.stop_id,

            "stop_arrival_delay": stu.arrival.delay if stu.HasField("arrival") else None,
            "stop_arrival_time": stu.arrival.time if stu.HasField("arrival") else None,

            "departure_delay": stu.departure.delay if stu.HasField("departure") else None,
            "departure_time": stu.departure.time if stu.HasField("departure") else None,

            "timestamp": tu.timestamp if tu.timestamp else None
        }
        rows.append(row)

df = pd.DataFrame(rows)

In [7]:
df

Unnamed: 0,entity_id,trip_id,route_id,start_date,schedule_relationship,vehicle_id,stop_sequence,stop_id,stop_arrival_delay,stop_arrival_time,departure_delay,departure_time,timestamp
0,14010516766425338,14010000664756938,,20250409,0,9031001004505106,71,9022001060014001,120.0,1.744182e+09,134.0,1.744182e+09,1744182368
1,14010516766425338,14010000664756938,,20250409,0,9031001004505106,72,9022001050366002,110.0,1.744182e+09,129.0,1.744182e+09,1744182368
2,14010516766425338,14010000664756938,,20250409,0,9031001004505106,73,9022001051554002,-66.0,1.744182e+09,-66.0,1.744182e+09,1744182368
3,14010516766425338,14010000664756938,,20250409,0,9031001004505106,74,9022001060301004,-116.0,1.744182e+09,-98.0,1.744182e+09,1744182368
4,14010516766425338,14010000664756938,,20250409,0,9031001004505106,75,9022001069011002,-127.0,1.744182e+09,-127.0,1.744182e+09,1744182368
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20511,14010516552820808,14010000670834912,,20250409,0,9031001004505524,18,9022001062213001,0.0,1.744184e+09,0.0,1.744184e+09,1744182362
20512,14010516552820808,14010000670834912,,20250409,0,9031001004505524,19,9022001062227001,0.0,1.744184e+09,0.0,1.744184e+09,1744182362
20513,14010516552820808,14010000670834912,,20250409,0,9031001004505524,20,9022001062228002,0.0,1.744184e+09,0.0,1.744184e+09,1744182362
20514,14010516552820808,14010000670834912,,20250409,0,9031001004505524,21,9022001062226002,0.0,1.744184e+09,0.0,1.744184e+09,1744182362


In [8]:
df["stop_arrival_time"] = pd.to_datetime(df["stop_arrival_time"], unit="s", utc=True)
df["departure_time"] = pd.to_datetime(df["departure_time"], unit="s", utc=True)
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
df["start_date"] = pd.to_datetime(df["start_date"], unit="s", utc=True)

df.head(10)

  df["start_date"] = pd.to_datetime(df["start_date"], unit="s", utc=True)


Unnamed: 0,entity_id,trip_id,route_id,start_date,schedule_relationship,vehicle_id,stop_sequence,stop_id,stop_arrival_delay,stop_arrival_time,departure_delay,departure_time,timestamp
0,14010516766425338,14010000664756938,,1970-08-23 09:06:49+00:00,0,9031001004505106,71,9022001060014001,120.0,2025-04-09 06:56:50+00:00,134.0,2025-04-09 06:57:04+00:00,2025-04-09 07:06:08+00:00
1,14010516766425338,14010000664756938,,1970-08-23 09:06:49+00:00,0,9031001004505106,72,9022001050366002,110.0,2025-04-09 06:58:50+00:00,129.0,2025-04-09 06:59:09+00:00,2025-04-09 07:06:08+00:00
2,14010516766425338,14010000664756938,,1970-08-23 09:06:49+00:00,0,9031001004505106,73,9022001051554002,-66.0,2025-04-09 07:02:44+00:00,-66.0,2025-04-09 07:02:44+00:00,2025-04-09 07:06:08+00:00
3,14010516766425338,14010000664756938,,1970-08-23 09:06:49+00:00,0,9031001004505106,74,9022001060301004,-116.0,2025-04-09 07:03:29+00:00,-98.0,2025-04-09 07:03:47+00:00,2025-04-09 07:06:08+00:00
4,14010516766425338,14010000664756938,,1970-08-23 09:06:49+00:00,0,9031001004505106,75,9022001069011002,-127.0,2025-04-09 07:04:58+00:00,-127.0,2025-04-09 07:04:58+00:00,2025-04-09 07:06:08+00:00
5,14010516766425338,14010000664756938,,1970-08-23 09:06:49+00:00,0,9031001004505106,76,9022001069009002,-146.0,2025-04-09 07:05:53+00:00,-146.0,2025-04-09 07:05:53+00:00,2025-04-09 07:06:08+00:00
6,14010516766425338,14010000664756938,,1970-08-23 09:06:49+00:00,0,9031001004505106,77,9022001060080001,-324.0,2025-04-09 07:09:36+00:00,-322.0,2025-04-09 07:09:38+00:00,2025-04-09 07:06:08+00:00
7,14010516479399292,14010000668237873,,1970-08-23 09:06:49+00:00,0,9031001004002216,25,9022001005007002,86.0,2025-04-09 07:05:26+00:00,86.0,2025-04-09 07:05:26+00:00,2025-04-09 07:05:06+00:00
8,14010516900582529,14010000684355399,,1970-08-23 09:06:49+00:00,0,9031001004302315,21,9022001006231001,280.0,2025-04-09 06:56:40+00:00,294.0,2025-04-09 06:57:54+00:00,2025-04-09 07:01:46+00:00
9,14010516900582529,14010000684355399,,1970-08-23 09:06:49+00:00,0,9031001004302315,22,9022001006241001,207.0,2025-04-09 06:59:27+00:00,274.0,2025-04-09 07:00:34+00:00,2025-04-09 07:01:46+00:00


In [9]:
df_with_route_id = df[df["route_id"] != ""]
df_with_route_id

Unnamed: 0,entity_id,trip_id,route_id,start_date,schedule_relationship,vehicle_id,stop_sequence,stop_id,stop_arrival_delay,stop_arrival_time,departure_delay,departure_time,timestamp
17,14050001802168556,,9011001001800000,1970-08-23 09:06:49+00:00,3,9031001002500110,1,9022001001951002,0.0,2025-04-09 05:36:02+00:00,0.0,2025-04-09 05:36:02+00:00,2025-04-09 05:36:06+00:00
18,14050001802168556,,9011001001800000,1970-08-23 09:06:49+00:00,3,9031001002500110,2,9022001001951001,0.0,2025-04-09 05:36:02+00:00,0.0,2025-04-09 05:36:02+00:00,2025-04-09 05:36:06+00:00
19,14050001802168556,,9011001001800000,1970-08-23 09:06:49+00:00,3,9031001002500110,3,9022001001941001,0.0,2025-04-09 05:36:02+00:00,0.0,2025-04-09 05:36:02+00:00,2025-04-09 05:36:06+00:00
20,14050001802168556,,9011001001800000,1970-08-23 09:06:49+00:00,3,9031001002500110,4,9022001001941002,0.0,2025-04-09 05:36:02+00:00,0.0,2025-04-09 05:36:02+00:00,2025-04-09 05:36:06+00:00
21,14050001802168556,,9011001001800000,1970-08-23 09:06:49+00:00,3,9031001002500110,5,9022001001931001,0.0,2025-04-09 05:36:02+00:00,0.0,2025-04-09 05:36:02+00:00,2025-04-09 05:36:06+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,14050001802168556,,9011001001800000,1970-08-23 09:06:49+00:00,3,9031001002500110,114,9022001001311002,0.0,2025-04-09 05:36:02+00:00,0.0,2025-04-09 05:36:02+00:00,2025-04-09 05:36:06+00:00
131,14050001802168556,,9011001001800000,1970-08-23 09:06:49+00:00,3,9031001002500110,115,9022001001321001,0.0,2025-04-09 05:36:02+00:00,0.0,2025-04-09 05:36:02+00:00,2025-04-09 05:36:06+00:00
132,14050001802168556,,9011001001800000,1970-08-23 09:06:49+00:00,3,9031001002500110,116,9022001001321002,0.0,2025-04-09 05:36:02+00:00,0.0,2025-04-09 05:36:02+00:00,2025-04-09 05:36:06+00:00
133,14050001802168556,,9011001001800000,1970-08-23 09:06:49+00:00,3,9031001002500110,117,9022001001331001,0.0,2025-04-09 05:36:02+00:00,0.0,2025-04-09 05:36:02+00:00,2025-04-09 05:36:06+00:00


In [10]:
df_with_route_id["route_id"].unique()

array(['9011001001800000'], dtype=object)