In [1]:
import logging
from datetime import datetime, timezone, timedelta
from collections import defaultdict
import random
import gc
import json
import pandas as pd
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv

# Let python-dotenv populate the process environment (presumably from a local
# .env file), then read the connection string from it.
# DATABASE_URL is None when the variable is not set.
load_dotenv()
DATABASE_URL = os.environ.get("DATABASE_URL")


In [2]:
# Named logger used by the cells below.
logger = logging.getLogger("RUN_TRANSPORT")

# Root logging configuration: INFO and above, rendered as
# "time | level | logger name | message".
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    level=logging.INFO,
)

In [21]:
def load_parquet_to_neon(table_name, data_array) -> None:
    """Append records to a database table, then print that table's row count.

    Despite the name, no parquet file is read here: ``data_array`` is passed
    straight to ``pd.DataFrame``. The number of records about to be written is
    printed first, and the total row count of ``table_name`` is printed after
    the insert.

    Args:
        table_name: Destination table. Rows are appended (``if_exists="append"``).
        data_array: Anything ``pd.DataFrame`` accepts; the notebook passes a
            list of dicts.
    """
    engine = create_engine(DATABASE_URL)
    try:
        records_df = pd.DataFrame(data_array)
        print(len(records_df))

        records_df.to_sql(
            table_name,
            engine,
            if_exists="append",
            index=False,
            method="multi",
            chunksize=10_000,
        )

        logger.info("OK: %s chargée", table_name)

        # Count the table that was actually written to (this used to be
        # hard-coded to stg_transport_archive). The identifier is quoted by the
        # dialect because a table name cannot be sent as a bound parameter.
        quoted_table = engine.dialect.identifier_preparer.quote(table_name)
        count_df = pd.read_sql(f"SELECT COUNT(*) FROM {quoted_table}", engine)
        print(count_df.head())
    finally:
        # A new engine is created on every call; release its connection pool
        # so repeated calls do not accumulate open connections.
        engine.dispose()

In [26]:
import json
from pathlib import Path
from datetime import datetime, timezone, timedelta

# Date window (YYYY-MM-DD) identifying the archive file to process.
DATE_BEGIN, DATE_END = "2025-09-22", "2025-09-28"

# Directory containing the archive JSON files, relative to the notebook's
# working directory. Files are named
# history_transport_<DATE_BEGIN>-<DATE_END>.json.
base_path = "../data/S3/"

def transform_S3_to_neon(file_name, bus_nbr="541", base_dir=None):
    """Load an archive JSON file and reshape its records for the database.

    For every record this:
      * drops the identifier/time keys listed in ``KEYS_TO_REMOVE``;
      * replaces ``timestamp`` (epoch seconds) with ``timestamp_rounded``, the
        nearest full hour as a UTC ISO-8601 string (half hours round up), and
        ``hour``, the UTC hour of day of that rounded time;
      * adds ``bus_nbr``.

    Args:
        file_name: Name of the JSON file inside the source directory.
        bus_nbr: Value stored in each record's ``bus_nbr`` field.
        base_dir: Directory containing the file. Defaults to the module-level
            ``base_path``.

    Returns:
        The list of records loaded from the file, modified in place.

    Raises:
        ValueError: If a record has no ``timestamp`` value.
    """
    source_dir = base_path if base_dir is None else base_dir
    json_path = Path(source_dir) / file_name
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    KEYS_TO_REMOVE = {"trip_id", "route_id", "start_date", "vehicle_id", "arrival_time", "departure_time"}

    for index, row in enumerate(data):
        ts = row.pop("timestamp", None)
        if ts is None:
            # Fail with the offending row instead of an opaque
            # "NoneType + int" TypeError from the arithmetic below.
            raise ValueError(f"row {index} of {json_path} has no 'timestamp' value")

        for key in KEYS_TO_REMOVE:
            row.pop(key, None)

        # Adding half an hour before flooring rounds to the nearest hour.
        ts_hour = ((ts + 1800) // 3600) * 3600

        row["timestamp_rounded"] = datetime.fromtimestamp(ts_hour, tz=timezone.utc).isoformat()
        row["hour"] = (ts_hour // 3600) % 24
        row["bus_nbr"] = bus_nbr

    return data



In [27]:
# Bind the transport logger explicitly. The end of this cell rebinds `logger`
# to "NEON LOADER", so without this line a re-run of the cell would log the
# transform preview under the loader's name.
logger = logging.getLogger("RUN_TRANSPORT")

# Parsing doubles as a format check on the configured dates.
start = datetime.strptime(DATE_BEGIN, "%Y-%m-%d")
end = datetime.strptime(DATE_END, "%Y-%m-%d")

# NOTE(review): `datas`, `bad_days` and `current` are not used in this cell.
# They are kept in case later cells rely on them; confirm and remove if not.
datas = []
bad_days = []

current = start

# TODO: verify that every record is actually sent to the database.
# Transform the archive file into database-ready rows.
json_name = f"history_transport_{DATE_BEGIN}-{DATE_END}.json"
datas_S3 = transform_S3_to_neon(json_name)


logger.info(datas_S3[:20])
logger.info(len(datas_S3))

# Load into the database. load_parquet_to_neon logs through the module-level
# `logger`, so rebinding it here tags the loader's messages as "NEON LOADER".
logger = logging.getLogger("NEON LOADER")

load_parquet_to_neon("stg_transport_archive", datas_S3)

2026-01-03 15:46:18,417 | INFO | NEON LOADER | [{'direction_id': '1', 'stop_sequence': 24, 'arrival_delay': 0, 'departure_delay': 0, 'timestamp_rounded': '2025-09-26T08:00:00+00:00', 'hour': 8, 'bus_nbr': '541'}, {'direction_id': '1', 'stop_sequence': 30, 'arrival_delay': 88, 'departure_delay': 105, 'timestamp_rounded': '2025-09-26T05:00:00+00:00', 'hour': 5, 'bus_nbr': '541'}, {'direction_id': '0', 'stop_sequence': 30, 'arrival_delay': 36, 'departure_delay': 36, 'timestamp_rounded': '2025-09-26T15:00:00+00:00', 'hour': 15, 'bus_nbr': '541'}, {'direction_id': '0', 'stop_sequence': 17, 'arrival_delay': 128, 'departure_delay': 139, 'timestamp_rounded': '2025-09-26T18:00:00+00:00', 'hour': 18, 'bus_nbr': '541'}, {'direction_id': '0', 'stop_sequence': 16, 'arrival_delay': 105, 'departure_delay': 110, 'timestamp_rounded': '2025-09-26T04:00:00+00:00', 'hour': 4, 'bus_nbr': '541'}, {'direction_id': '1', 'stop_sequence': 9, 'arrival_delay': 3, 'departure_delay': 6, 'timestamp_rounded': '2025-0

144


2026-01-03 15:46:18,865 | INFO | NEON LOADER | OK: stg_transport_archive chargée


   count
0   1872
