In [2]:
import pandas as pd
import requests
import json
import os
import io
import py7zr
import zipfile
import csv
#from google.transit import gtfs_realtime_pb2
import tempfile
from datetime import datetime
from pathlib import Path
from itertools import islice
from collections import defaultdict
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
KODA_KEY = os.getenv("API_KODA_KEY")
GTFS_RT_KEY = os.getenv("API_GTFS_RT_KEY")  # clé GTFS Regional Realtime
GTFS_REGIONAL_STATIC_KEY = os.getenv("GTFS_REGIONAL_STATIC_KEY")  # clé GTFS Regional Static

In [4]:
# IL EST NECESSAIRE DE RECUPERER LES IDS DES FICHIERS STATIC TOUS LES JOURS CAR LES IDS CHANGENT

In [5]:
# API parameters
host_api = "https://api.koda.trafiklab.se/KoDa/api/v2"

base_rt = "/gtfs-rt"
base_static = "/gtfs-static"
operator = "/sl"
type_path = "/TripUpdates"
url_rt = host_api + base_rt + operator + type_path
url_static = host_api + base_static + operator

params = {
    "date": "2025-03-15", 
    "key": KODA_KEY
}

In [6]:
# The request
request_rt = requests.get(f"{url_rt}", params=params, timeout=20)

print(request_rt)
print("---"*3)
print(len(request_rt.content), "Bytes")
print("---"*3)
print("Status:", request_rt.status_code)
print("Content-Type:", request_rt.headers.get("Content-Type"))
print("Content-Encoding:", request_rt.headers.get("Content-Encoding"))
print("Début (bytes):", request_rt.content[:8])

<Response [200]>
---------
41448769 Bytes
---------
Status: 200
Content-Type: application/x-7z-compressed
Content-Encoding: None
Début (bytes): b"7z\xbc\xaf'\x1c\x00\x04"


In [7]:
# The request
request_static = requests.get(f"{url_static}", params=params, timeout=20)

print(request_static)
print("---"*3)
print(len(request_static.content), "Bytes")
print("---"*3)
print("Status:", request_static.status_code)
print("Content-Type:", request_static.headers.get("Content-Type"))
print("Content-Encoding:", request_static.headers.get("Content-Encoding"))
print("Début (bytes):", request_static.content[:8])

<Response [200]>
---------
52438156 Bytes
---------
Status: 200
Content-Type: application/zip
Content-Encoding: None
Début (bytes): b'PK\x03\x04\x14\x00\x08\x08'


In [8]:
import io, tempfile, shutil
from pathlib import Path
import copy
import py7zr
from google.transit import gtfs_realtime_pb2
from google.protobuf.message import DecodeError
import logging
# ICI CA FONCTIONNE EN 3 MINUTES POUR UN JOUR

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S"
)

logger = logging.getLogger("gtfs")

archive_bytes = io.BytesIO(request_rt.content)

BATCH = 500  # ajuste: 200-800 selon ton disque/RAM

tmpdir = tempfile.mkdtemp(prefix="koda_")
tmp = Path(tmpdir)

rt_entities = []
bad_files = []

feed = gtfs_realtime_pb2.FeedMessage()

# 1) liste des candidats
archive_bytes.seek(0)
with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
    candidates = [n for n in z.getnames() if n.lower().endswith(".pb")]

print("Nb fichiers .pb:", len(candidates))

# 2) extraction + parse par batch
for i in range(0, len(candidates), BATCH):
    batch = candidates[i:i+BATCH]

    logger.info(
        "Batch %d–%d / %d (%.1f%%)",
        i + 1,
        min(i + BATCH, len(candidates)),
        len(candidates),
        100 * (i + len(batch)) / len(candidates)
    )

    try:
        archive_bytes.seek(0)
        with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
            z.extract(path=tmpdir, targets=batch)
    except Exception as e:
        # si un batch plante, on retombe en mode "un par un" juste pour ce batch
        for name in batch:
            try:
                archive_bytes.seek(0)
                with py7zr.SevenZipFile(archive_bytes, mode="r") as z:
                    z.extract(path=tmpdir, targets=[name])
            except Exception as e2:
                bad_files.append((name, f"ExtractError: {e2!r}"))
        continue

    for name in batch:
        p = tmp / name
        try:
            raw = p.read_bytes()
            feed.Clear()
            try:
                feed.ParseFromString(raw)
            except DecodeError as de:
                bad_files.append((name, f"DecodeError: {de!r}"))
                continue

            for entity in feed.entity:
                rt_entities.append(copy.deepcopy(entity))

        except Exception as e:
            bad_files.append((name, f"Read/ParseError: {e!r}"))
        finally:
            p.unlink(missing_ok=True)

shutil.rmtree(tmpdir, ignore_errors=True)

print("✅ Total entités enregistrées:", len(rt_entities))
print("⚠️ Fichiers ignorés:", len(bad_files))
print("Exemples:", bad_files[:5])


17:11:48 | INFO | Batch 1–500 / 6031 (8.3%)


Nb fichiers .pb: 6031


17:11:51 | INFO | Batch 501–1000 / 6031 (16.6%)
17:11:53 | INFO | Batch 1001–1500 / 6031 (24.9%)
17:11:55 | INFO | Batch 1501–2000 / 6031 (33.2%)
17:11:58 | INFO | Batch 2001–2500 / 6031 (41.5%)
17:12:04 | INFO | Batch 2501–3000 / 6031 (49.7%)
17:12:11 | INFO | Batch 3001–3500 / 6031 (58.0%)
17:12:19 | INFO | Batch 3501–4000 / 6031 (66.3%)
17:12:30 | INFO | Batch 4001–4500 / 6031 (74.6%)
17:12:40 | INFO | Batch 4501–5000 / 6031 (82.9%)
17:12:49 | INFO | Batch 5001–5500 / 6031 (91.2%)
17:12:59 | INFO | Batch 5501–6000 / 6031 (99.5%)
17:13:09 | INFO | Batch 6001–6031 / 6031 (100.0%)


✅ Total entités enregistrées: 3113034
⚠️ Fichiers ignorés: 0
Exemples: []


In [9]:
#AFFICHER TOUTES LES COLONNES 
print(rt_entities[:1])

[id: "14010516897425752"
trip_update {
  trip {
    trip_id: "14010000685561305"
    start_date: "20250314"
    schedule_relationship: SCHEDULED
  }
  vehicle {
    id: "9031001003003885"
  }
  stop_time_update {
    stop_sequence: 1
    stop_id: "9022001041441002"
    arrival {
      delay: -131
      time: 1741983709
      uncertainty: 0
    }
    departure {
      delay: 9018
      time: 1741992858
      uncertainty: 0
    }
  }
  timestamp: 1741992858
}
]


In [10]:
rows = []

for e in rt_entities[:100000]: #ATTENTION JE NE LE FAIS QUE SUR LES 10 000 premiers
    if not e.HasField("trip_update"):
        continue

    tu = e.trip_update

    tr = tu.trip
    
    for stu in tu.stop_time_update:
        row = {
            "entity_id": e.id,
            "trip_id": tr.trip_id,
            "start_date": tr.start_date,
            "schedule_relationship": tr.schedule_relationship, #VERIFIER QUE JE RECOIS BIEN 
            "vehicle_id": tu.vehicle.id if tu.HasField("vehicle") else None,

            "stop_sequence": stu.stop_sequence,
            "stop_id": stu.stop_id,

            "stop_arrival_delay": stu.arrival.delay if stu.HasField("arrival") else None,
            "stop_arrival_time": stu.arrival.time if stu.HasField("arrival") else None,
            "stop_uncertainty": stu.arrival.uncertainty if stu.HasField("arrival") else None,

            "departure_delay": stu.departure.delay if stu.HasField("departure") else None,
            "departure_time": stu.departure.time if stu.HasField("departure") else None,
            "departure_uncertainty": stu.departure.uncertainty if stu.HasField("departure") else None,

            "timestamp": tu.timestamp if tu.timestamp else None
        }
        rows.append(row)

df_rt = pd.DataFrame(rows)

In [11]:
df_rt.sample(20)

Unnamed: 0,entity_id,trip_id,start_date,schedule_relationship,vehicle_id,stop_sequence,stop_id,stop_arrival_delay,stop_arrival_time,stop_uncertainty,departure_delay,departure_time,departure_uncertainty,timestamp
1341153,14010516719104362,1.4010000671939376e+16,20250315,0,9031001004505158.0,30,9022001050501008,75.0,1741997000.0,0.0,92.0,1741997000.0,0.0,1741996782
136371,14010516701110999,1.4010000673650788e+16,20250314,0,9031001003007708.0,33,9022001080101001,153.0,1741993000.0,0.0,166.0,1741993000.0,0.0,1741993498
796935,14010516716709946,1.4010000674006898e+16,20250315,0,9031001004505180.0,21,9022001060153001,287.0,1741996000.0,0.0,294.0,1741996000.0,0.0,1741995117
1078478,14010516403339180,1.401000060326908e+16,20250314,0,9031001002500002.0,34,9022001001321001,107.0,1741996000.0,0.0,164.0,1741996000.0,0.0,1741995974
267240,14010516612458431,1.4010000621493472e+16,20250315,0,9031001003003134.0,30,9022001084014001,-26.0,1741996000.0,0.0,-26.0,1741996000.0,0.0,1741993784
1043280,14010516511712441,1.4010000670440972e+16,20250315,0,9031001003003580.0,28,9022001054009002,-399.0,1741998000.0,0.0,-145.0,1741998000.0,0.0,1741995882
713583,14010516766215642,1.4010000664779694e+16,20250315,0,9031001004505140.0,34,9022001064067001,-64.0,1741996000.0,0.0,-64.0,1741996000.0,0.0,1741994929
1517260,14010516898376252,1.4010000685705844e+16,20250315,0,9031001003000134.0,33,9022001040019002,116.0,1741998000.0,0.0,120.0,1741998000.0,0.0,1741997428
534892,14010516612246059,1.4010000669105106e+16,20250315,0,9031001003002514.0,14,9022001080785002,54.0,1741995000.0,0.0,54.0,1741995000.0,0.0,1741994481
541601,14050001795047778,,20250315,1,9031001002520044.0,2,9022001002251002,111.0,1741994000.0,0.0,169.0,1741994000.0,0.0,1741994485


In [12]:
#import io
#import zipfile
# OUVERTURE DU ZIP POUR LES .TXT
#archive_bytes = io.BytesIO(request_static.content)

#with zipfile.ZipFile(archive_bytes, "r") as z:
    #names = z.namelist()

#print("Total entrées:", len(names))
#print("Exemples:", names[:10])


In [13]:


#with zipfile.ZipFile(archive_bytes, "r") as z:
 #   with z.open("trips.txt") as f:
  #      reader = csv.DictReader(io.TextIOWrapper(f, encoding="utf-8"))
   #     trips = list(reader)

#print("Nombre de trips:", len(trips))
#print("Exemple:", trips[0])


In [14]:
archive_bytes_static = io.BytesIO(request_static.content)

with zipfile.ZipFile(archive_bytes_static, "r") as z:
    with z.open("trips.txt") as f:
        df_trips = pd.read_csv(f)

df_trips


Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,shape_id
0,9011001000100000,1,14010000664282006,,1,1014010000482329256
1,9011001000100000,1,14010000664304117,,1,1014010000482329256
2,9011001000100000,1,14010000664309301,,1,1014010000482329256
3,9011001000100000,1,14010000664312304,,1,1014010000482329256
4,9011001000100000,1,14010000664315369,,1,1014010000482329256
...,...,...,...,...,...,...
77693,9011008003100000,555,14010000654851031,,0,6014010000508859291
77694,9011008003100000,555,14010000685458042,,0,6014010000685457806
77695,9011008003100000,555,14010000685457984,,0,6014010000191938186
77696,9011008003100000,555,14010000685458008,,0,6014010000348824586


In [None]:
df_rt["trip_id"] = df_rt["trip_id"].astype("string")
df_trips["trip_id"] = df_trips["trip_id"].astype("string")

df_merge_on_trip = df_rt.merge(df_trips, on="trip_id", how="left")
df_merge_on_trip

Unnamed: 0,entity_id,trip_id,start_date,schedule_relationship,vehicle_id,stop_sequence,stop_id,stop_arrival_delay,stop_arrival_time,stop_uncertainty,departure_delay,departure_time,departure_uncertainty,timestamp,route_id,service_id,trip_headsign,direction_id,shape_id
0,14010516897425752,14010000685561305,20250314,0,9031001003003885,1,9022001041441002,-131.0,1.741984e+09,0.0,9018.0,1.741993e+09,0.0,1741992858,9011001043300000,559.0,,0.0,1.014010e+18
1,14010516479723124,14010000668271329,20250314,0,9031001004302576,25,9022001006071004,-65.0,1.741993e+09,0.0,9.0,1.741993e+09,0.0,1741993168,9011001004300000,440.0,,0.0,4.014010e+18
2,14010516479723124,14010000668271329,20250314,0,9031001004302576,26,9022001006081002,-66.0,1.741993e+09,0.0,31.0,1.741993e+09,0.0,1741993168,9011001004300000,440.0,,0.0,4.014010e+18
3,14010516479723124,14010000668271329,20250314,0,9031001004302576,27,9022001006091002,-30.0,1.741993e+09,0.0,0.0,1.741993e+09,0.0,1741993168,9011001004300000,440.0,,0.0,4.014010e+18
4,14010516479723124,14010000668271329,20250314,0,9031001004302576,28,9022001006101001,0.0,1.741994e+09,0.0,0.0,1.741994e+09,0.0,1741993168,9011001004300000,440.0,,0.0,4.014010e+18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1590444,14010516901444404,14010000686114566,20250315,0,,22,9022001004541001,-27.0,1.741997e+09,0.0,17.0,1.741997e+09,0.0,1741997686,9011001003000000,54.0,,1.0,2.014010e+18
1590445,14010516901444404,14010000686114566,20250315,0,,23,9022001004543001,11.0,1.741997e+09,0.0,47.0,1.741997e+09,0.0,1741997686,9011001003000000,54.0,,1.0,2.014010e+18
1590446,14010516901444404,14010000686114566,20250315,0,,24,9022001004545001,28.0,1.741998e+09,0.0,45.0,1.741998e+09,0.0,1741997686,9011001003000000,54.0,,1.0,2.014010e+18
1590447,14010516901444404,14010000686114566,20250315,0,,25,9022001004547001,47.0,1.741998e+09,0.0,86.0,1.741998e+09,0.0,1741997686,9011001003000000,54.0,,1.0,2.014010e+18


In [16]:
archive_bytes = io.BytesIO(request_static.content)

with zipfile.ZipFile(archive_bytes, "r") as z:
    with z.open("routes.txt") as f:
        df_routes = pd.read_csv(f)

df_routes

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_desc
0,9011001000100000,14010000000001001,1,,700,blåbuss
1,9011001000200000,14010000000001001,2,,700,blåbuss
2,9011001000300000,14010000000001001,3,,700,blåbuss
3,9011001000400000,14010000000001001,4,,700,blåbuss
4,9011001000600000,14010000000001001,6,,700,blåbuss
...,...,...,...,...,...,...
591,9011008002700000,14010000000002071,27,,1000,Waxholmsbolaget
592,9011008002800000,14010000000002071,28,,1000,Waxholmsbolaget
593,9011008002900000,14010000000002071,29,,1000,Waxholmsbolaget
594,9011008003000000,14010000000002071,30,,1000,Waxholmsbolaget


In [17]:
df_merge_on_route = df_merge_on_trip.merge(df_routes, on="route_id", how="left")
df_merge_on_route

Unnamed: 0,entity_id,trip_id,start_date,schedule_relationship,vehicle_id,stop_sequence,stop_id,stop_arrival_delay,stop_arrival_time,stop_uncertainty,...,route_id,service_id,trip_headsign,direction_id,shape_id,agency_id,route_short_name,route_long_name,route_type,route_desc
0,14010516897425752,14010000685561305,20250314,0,9031001003003885,1,9022001041441002,-131.0,1.741984e+09,0.0,...,9011001043300000,559.0,,0.0,1.014010e+18,1.401000e+16,433,,700.0,
1,14010516479723124,14010000668271329,20250314,0,9031001004302576,25,9022001006071004,-65.0,1.741993e+09,0.0,...,9011001004300000,440.0,,0.0,4.014010e+18,1.401000e+16,43,,100.0,Pendeltåg
2,14010516479723124,14010000668271329,20250314,0,9031001004302576,26,9022001006081002,-66.0,1.741993e+09,0.0,...,9011001004300000,440.0,,0.0,4.014010e+18,1.401000e+16,43,,100.0,Pendeltåg
3,14010516479723124,14010000668271329,20250314,0,9031001004302576,27,9022001006091002,-30.0,1.741993e+09,0.0,...,9011001004300000,440.0,,0.0,4.014010e+18,1.401000e+16,43,,100.0,Pendeltåg
4,14010516479723124,14010000668271329,20250314,0,9031001004302576,28,9022001006101001,0.0,1.741994e+09,0.0,...,9011001004300000,440.0,,0.0,4.014010e+18,1.401000e+16,43,,100.0,Pendeltåg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1590444,14010516901444404,14010000686114566,20250315,0,,22,9022001004541001,-27.0,1.741997e+09,0.0,...,9011001003000000,54.0,,1.0,2.014010e+18,1.401000e+16,30,Tvärbanan,900.0,Tvärbanan
1590445,14010516901444404,14010000686114566,20250315,0,,23,9022001004543001,11.0,1.741997e+09,0.0,...,9011001003000000,54.0,,1.0,2.014010e+18,1.401000e+16,30,Tvärbanan,900.0,Tvärbanan
1590446,14010516901444404,14010000686114566,20250315,0,,24,9022001004545001,28.0,1.741998e+09,0.0,...,9011001003000000,54.0,,1.0,2.014010e+18,1.401000e+16,30,Tvärbanan,900.0,Tvärbanan
1590447,14010516901444404,14010000686114566,20250315,0,,25,9022001004547001,47.0,1.741998e+09,0.0,...,9011001003000000,54.0,,1.0,2.014010e+18,1.401000e+16,30,Tvärbanan,900.0,Tvärbanan


In [18]:
df_merge_on_route.groupby("route_short_name").count().sort_values(by="entity_id", ascending=False)

Unnamed: 0_level_0,entity_id,trip_id,start_date,schedule_relationship,vehicle_id,stop_sequence,stop_id,stop_arrival_delay,stop_arrival_time,stop_uncertainty,...,timestamp,route_id,service_id,trip_headsign,direction_id,shape_id,agency_id,route_long_name,route_type,route_desc
route_short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19,46563,46563,46563,46563,46563,46563,46563,46563,46563,46563,...,46563,46563,46563,0,46563,46563,46563,46563,46563,46563
30,42015,42015,42015,42015,0,42015,42015,42015,42015,42015,...,42015,42015,42015,0,42015,42015,42015,42015,42015,42015
13,35659,35659,35659,35659,35659,35659,35659,35659,35659,35659,...,35659,35659,35659,0,35659,35659,35659,35659,35659,35659
43,31159,31159,31159,31159,31159,31159,31159,30737,30737,30737,...,31159,31159,31159,0,31159,31159,31159,0,31159,31159
14,27529,27529,27529,27529,27529,27529,27529,27529,27529,27529,...,27529,27529,27529,0,27529,27529,27529,27529,27529,27529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25F,260,260,260,260,260,260,260,260,260,260,...,260,260,260,0,260,260,260,260,260,260
127,256,256,256,256,256,256,256,256,256,256,...,256,256,256,0,256,256,256,0,256,0
436,167,167,167,167,167,167,167,167,167,167,...,167,167,167,0,167,167,167,0,167,0
80,83,83,83,83,83,83,83,83,83,83,...,83,83,83,0,83,83,83,0,83,83


In [19]:
df_bus = df_merge_on_route[df_merge_on_route["route_type"] == 700]
df_bus

Unnamed: 0,entity_id,trip_id,start_date,schedule_relationship,vehicle_id,stop_sequence,stop_id,stop_arrival_delay,stop_arrival_time,stop_uncertainty,...,route_id,service_id,trip_headsign,direction_id,shape_id,agency_id,route_short_name,route_long_name,route_type,route_desc
0,14010516897425752,14010000685561305,20250314,0,9031001003003885,1,9022001041441002,-131.0,1.741984e+09,0.0,...,9011001043300000,559.0,,0.0,1.014010e+18,1.401000e+16,433,,700.0,
18,14010516701138057,14010000673650562,20250314,0,9031001003007242,56,9022001080389001,354.0,1.741993e+09,0.0,...,9011001081900000,1.0,,1.0,1.014010e+18,1.401000e+16,819,,700.0,
19,14010516701138057,14010000673650562,20250314,0,9031001003007242,57,9022001080391001,380.0,1.741993e+09,0.0,...,9011001081900000,1.0,,1.0,1.014010e+18,1.401000e+16,819,,700.0,
20,14010516701138057,14010000673650562,20250314,0,9031001003007242,58,9022001080393001,375.0,1.741993e+09,0.0,...,9011001081900000,1.0,,1.0,1.014010e+18,1.401000e+16,819,,700.0,
21,14010516701138057,14010000673650562,20250314,0,9031001003007242,59,9022001080395001,368.0,1.741993e+09,0.0,...,9011001081900000,1.0,,1.0,1.014010e+18,1.401000e+16,819,,700.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1590437,14010516719104362,14010000671939377,20250315,0,9031001004505158,45,9022001010346002,123.0,1.741998e+09,0.0,...,9011001069700000,33.0,,1.0,1.014010e+18,1.401000e+16,697,,700.0,
1590438,14010516719104362,14010000671939377,20250315,0,9031001004505158,46,9022001010357001,94.0,1.741998e+09,0.0,...,9011001069700000,33.0,,1.0,1.014010e+18,1.401000e+16,697,,700.0,
1590439,14010516719104362,14010000671939377,20250315,0,9031001004505158,47,9022001010359003,86.0,1.741998e+09,0.0,...,9011001069700000,33.0,,1.0,1.014010e+18,1.401000e+16,697,,700.0,
1590440,14010516719104362,14010000671939377,20250315,0,9031001004505158,48,9022001010363008,51.0,1.741998e+09,0.0,...,9011001069700000,33.0,,1.0,1.014010e+18,1.401000e+16,697,,700.0,


In [20]:
archive_bytes = io.BytesIO(request_static.content)

with zipfile.ZipFile(archive_bytes, "r") as z:
    with z.open("shapes.txt") as f:
        df_shapes = pd.read_csv(f)

df_shapes.head(20)

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,1014010000482329256,59.341873,18.118316,1,0.0
1,1014010000482329256,59.341854,18.118403,2,0.01
2,1014010000482329256,59.341754,18.118334,3,11.74
3,1014010000482329256,59.34126,18.118048,4,69.17
4,1014010000482329256,59.34093,18.116765,5,150.88
5,1014010000482329256,59.340944,18.116479,6,167.29
6,1014010000482329256,59.340572,18.115033,7,259.42
7,1014010000482329256,59.340453,18.11487,8,275.54
8,1014010000482329256,59.340353,18.11426,9,312.01
9,1014010000482329256,59.341099,18.113851,10,398.27


In [21]:
archive_bytes = io.BytesIO(request_static.content)

with zipfile.ZipFile(archive_bytes, "r") as z:
    with z.open("stops.txt") as f:
        df_stops = pd.read_csv(f)

df_stops.head(20)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station,platform_code
0,9021001000101000,Stavsnäs,59.286405,18.7047,1,,
1,9021001000102000,Styrsvik,59.280184,18.731426,1,,
2,9021001000103000,Nämdöböte,59.208731,18.74006,1,,
3,9021001000104000,Aspö,59.215843,18.75341,1,,
4,9021001000105000,Idöborg,59.204309,18.757769,1,,
5,9021001000106000,Östanvik,59.197456,18.739825,1,,
6,9021001000107000,Kalkberget,59.191628,18.719581,1,,
7,9021001000108000,Västanvik,59.187559,18.706893,1,,
8,9021001000109000,Solvik,59.184758,18.700458,1,,
9,9021001000110000,Orrön,59.18051,18.700629,1,,


In [24]:
df_bus["stop_id"] = df_bus["stop_id"].astype("string")
df_stops["stop_id"] = df_stops["stop_id"].astype("string")

df_global = df_bus.merge(df_stops, on="stop_id", how="left")
df_global.sample(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bus["stop_id"] = df_bus["stop_id"].astype("string")


Unnamed: 0,entity_id,trip_id,start_date,schedule_relationship,vehicle_id,stop_sequence,stop_id,stop_arrival_delay,stop_arrival_time,stop_uncertainty,...,route_short_name,route_long_name,route_type,route_desc,stop_name,stop_lat,stop_lon,location_type,parent_station,platform_code
652330,14010516762564309,14010000664246596,20250315,0,9031001001007221,9,9022001010373001,272.0,1741995000.0,0.0,...,3,,700.0,blåbuss,Polhemsgatan,59.330021,18.035851,0,9021001000000000.0,
844919,14010516655107176,14010000672924232,20250315,0,9031001001508552,18,9022001015753002,99.0,1741996000.0,0.0,...,315,,700.0,,Hillesberg,59.375383,17.69652,0,9021001000000000.0,
1173062,14010516611398281,14010000621494172,20250315,0,9031001003007258,21,9022001080563002,-167.0,1741997000.0,0.0,...,832,,700.0,,Idunvägen,59.163457,18.145077,0,9021001000000000.0,
395682,14010516678578186,14010000673294131,20250315,0,9031001003005532,5,9022001070132001,128.0,1741994000.0,0.0,...,744,,700.0,,Lidavägen,59.19521,18.00463,0,9021001000000000.0,
757213,14010516466804936,14010000669427205,20250315,0,9031001001508558,49,9022001015344001,115.0,1741998000.0,0.0,...,317,,700.0,,Ekerö centrum,59.290163,17.810422,0,9021001000000000.0,
17512,14010516897425837,14010000685561703,20250314,0,9031001003003883,32,9022001041407001,-148.0,1741995000.0,0.0,...,433,,700.0,,Stavsnäs by,59.287271,18.688902,0,9021001000000000.0,
858545,14010516835161072,14010000676771362,20250315,0,9031001003001781,10,9022001070258002,-252.0,1741996000.0,0.0,...,715V,,700.0,,Tumba station,59.199829,17.834698,0,9021001000000000.0,B
165782,14010516629566419,14010000671986683,20250314,0,9031001004505160,6,9022001060109001,142.0,1741994000.0,0.0,...,604,,700.0,,Enmans väg,59.423547,18.043285,0,9021001000000000.0,
1176402,14010516611751853,14010000573391027,20250315,0,9031001003003903,14,9022001080105001,-61.0,1741997000.0,0.0,...,807,,700.0,,Tyrestavägen,59.179988,18.179563,0,9021001000000000.0,
1036836,14010516610992164,14010000562053166,20250315,0,9031001003003139,17,9022001040048002,0.0,1741998000.0,0.0,...,401,,700.0,,Stavsborgsskolan,59.257687,18.180421,0,9021001000000000.0,


In [39]:
df_global.columns

Index(['entity_id', 'trip_id', 'start_date', 'schedule_relationship',
       'vehicle_id', 'stop_sequence', 'stop_id', 'stop_arrival_delay',
       'stop_arrival_time', 'stop_uncertainty', 'departure_delay',
       'departure_time', 'departure_uncertainty', 'timestamp', 'route_id',
       'service_id', 'trip_headsign', 'direction_id', 'shape_id', 'agency_id',
       'route_short_name', 'route_long_name', 'route_type', 'route_desc',
       'stop_name', 'stop_lat', 'stop_lon', 'location_type', 'parent_station',
       'platform_code'],
      dtype='object')

In [None]:
df_final = df_global.drop(columns=["route_type", "route_desc", "trip_headsign", "stop_name", "location_type", "route_long_name", "parent_station", "platform_code"])
df_final

Unnamed: 0,entity_id,trip_id,start_date,schedule_relationship,vehicle_id,stop_sequence,stop_id,stop_arrival_delay,stop_arrival_time,stop_uncertainty,...,timestamp,route_id,service_id,trip_headsign,direction_id,shape_id,agency_id,route_short_name,stop_lat,stop_lon
0,14010516897425752,14010000685561305,20250314,0,9031001003003885,1,9022001041441002,-131.0,1.741984e+09,0.0,...,1741992858,9011001043300000,559.0,,0.0,1.014010e+18,1.401000e+16,433,59.313241,18.710174
1,14010516701138057,14010000673650562,20250314,0,9031001003007242,56,9022001080389001,354.0,1.741993e+09,0.0,...,1741993174,9011001081900000,1.0,,1.0,1.014010e+18,1.401000e+16,819,59.227323,18.223415
2,14010516701138057,14010000673650562,20250314,0,9031001003007242,57,9022001080391001,380.0,1.741993e+09,0.0,...,1741993174,9011001081900000,1.0,,1.0,1.014010e+18,1.401000e+16,819,59.229433,18.224001
3,14010516701138057,14010000673650562,20250314,0,9031001003007242,58,9022001080393001,375.0,1.741993e+09,0.0,...,1741993174,9011001081900000,1.0,,1.0,1.014010e+18,1.401000e+16,819,59.230134,18.219161
4,14010516701138057,14010000673650562,20250314,0,9031001003007242,59,9022001080395001,368.0,1.741993e+09,0.0,...,1741993174,9011001081900000,1.0,,1.0,1.014010e+18,1.401000e+16,819,59.231038,18.214168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1235433,14010516719104362,14010000671939377,20250315,0,9031001004505158,45,9022001010346002,123.0,1.741998e+09,0.0,...,1741997651,9011001069700000,33.0,,1.0,1.014010e+18,1.401000e+16,697,59.349929,18.025418
1235434,14010516719104362,14010000671939377,20250315,0,9031001004505158,46,9022001010357001,94.0,1.741998e+09,0.0,...,1741997651,9011001069700000,33.0,,1.0,1.014010e+18,1.401000e+16,697,59.348664,18.028293
1235435,14010516719104362,14010000671939377,20250315,0,9031001004505158,47,9022001010359003,86.0,1.741998e+09,0.0,...,1741997651,9011001069700000,33.0,,1.0,1.014010e+18,1.401000e+16,697,59.345396,18.033970
1235436,14010516719104362,14010000671939377,20250315,0,9031001004505158,48,9022001010363008,51.0,1.741998e+09,0.0,...,1741997651,9011001069700000,33.0,,1.0,1.014010e+18,1.401000e+16,697,59.339344,18.039321


In [27]:
bus_numbers = df_final["route_short_name"].unique()
print(bus_numbers)

['433' '819' '555' '676' '858' '777' '176' '422' '784' '474' '783' '677'
 '579' '875' '57' '471' '26M' '161' '807' '830' '1' '172' '670' '541'
 '641' '832' '144' '540' '564H' '414' '744' '4' '708' '710' '173' '574'
 '854' '824' '165' '177' '835' '517' '118' '119' '163' '179' '873' '129'
 '113' '831' '409' '704' '61' '552V' '178' '753' '575' '53' '816' '514'
 '401' '713' '553V' '752' '564V' '527' '534' '834' '525' '3' '54' '6' '2'
 '188' '559' '607' '610' '533' '117' '801' '606' '702' '545' '637' '605'
 '25F' '116' '838' '758' '601' '69' '611' '629' '751' '683' '829' '839'
 '436' '402' '513' '181' '25M' '315' '656' '206' '507' '115' '647' '568'
 '665' '787' '160' '66' '536' '721X' '124' '552H' '437' '633' '142' '553H'
 '802' '544' '754' '180' '547' '542' '759' '844' '127' '421' '707' '725'
 '175' '715V' '50' '639' '430' '716' '133' '114' '528' '134' '143' '570'
 '548' '718' '55' '67' '604' '205' '201' '615' '727' '520' '846' '780X'
 '550' '506' '509' '515' '504' '562' '580' '526' '584' 

In [34]:
bus_number_koda = ['433','819','555','676','858','777','176','422','784','474','783','677','579','875','57','471','26M','161','807','830','1','172','670','541','641','832','144','540','564H','414','744','4','708','710','173','574','854','824','165','177','835','517','118','119','163','179','873','129','113','831','409','704','61','552V','178','753','575','53','816','514','401','713','553V','752','564V','527','534','834','525','3','54','6','2','188','559','607','610','533','117','801','606','702','545','637','605','25F','116','838','758','601','69','611','629','751','683','829','839','436','402','513','181','25M','315','656','206','507','115','647','568','665','787','160','66','536','721X','124','552H','437','633','142','553H','802','544','754','180','547','542','759','844','127','421','707','725','175','715V','50','639','430','716','133','114','528','134','143','570','548','718','55','67','604','205','201','615','727','520','846','780X','550','506','509','515','504','562','580','526','584','623H','682','582','317','303','512','714','837','501','684','112','504Y','203','531','418','622','581','147','212','505','723','617','843','141','443','429','849','523','657','529','697','836','690','721','417','795','691','695','623V','847','311','96','491','428','396','696','309','496','91','591','291','440','94','788','699','792']
bus_numbers_GTFS = ['444','783','677','865','636','645','579','176','172','626','707','708','676','555','840','177','541','422','848','161','819','173','163','434','540','179','514X','639','26M','702','178','758','4','552H','57','564H','835','727','641','474','780X','3','777','740','440','1','66','742','74','839','787','751','710','753','713','428X','25M','637','610','601','524','430X','552V','621','824','830','118','76','517','165','144','509','807','471','61','849','744','546H','113','119','705','55','831','506','670','53','567','67','238','2','519','607','627','564V','553V','832','544','558','409','542','340','873','559','875','816','858','754','414','433','504','438','632','520','652','515','833','129','709','788','181','629','605','65','837','135','802','801','401','6','615','611','160','756','533','660','465','50','410','180','117','175','309','134','560','580','852','845','112','550','204','665','402','562','714','508','350','437','684','664','5','188','616','523','443','553H','704','114','69','71T','659','54','536','505','854','606','56','142','164','604','602','143','688','829','185V','206','205','201','568','575','612','631X','141','838','582','417','750','512','442','525','584','634','655','547','759','844','721X','302','421','531','124','501','821','570','748','147','184','683','545','529','718','752','623V','723','834','847','548','116','583','738','658','534','715V','317','526','846','682','719','622','716','778','617','458','127','503','461','657','633']

In [36]:
bus_number_koda = set(bus_number_koda)
bus_numbers_GTFS = set(bus_numbers_GTFS)

correspondances = list(set(bus_number_koda).intersection(bus_numbers_GTFS))
print(sorted(correspondances))

['1', '112', '113', '114', '116', '117', '118', '119', '124', '127', '129', '134', '141', '142', '143', '144', '147', '160', '161', '163', '165', '172', '173', '175', '176', '177', '178', '179', '180', '181', '188', '2', '201', '205', '206', '25M', '26M', '3', '309', '317', '4', '401', '402', '409', '414', '417', '421', '422', '433', '437', '440', '443', '471', '474', '50', '501', '504', '505', '506', '509', '512', '515', '517', '520', '523', '525', '526', '529', '53', '531', '533', '534', '536', '54', '540', '541', '542', '544', '545', '547', '548', '55', '550', '552H', '552V', '553H', '553V', '555', '559', '562', '564H', '564V', '568', '57', '570', '575', '579', '580', '582', '584', '6', '601', '604', '605', '606', '607', '61', '610', '611', '615', '617', '622', '623V', '629', '633', '637', '639', '641', '657', '66', '665', '67', '670', '676', '677', '682', '683', '684', '69', '702', '704', '707', '708', '710', '713', '714', '715V', '716', '718', '721X', '723', '727', '744', '751', '

In [38]:
print(len(correspondances))
print(len(bus_numbers_GTFS))
print(len(bus_number_koda))

177
247
222
