# Task 2 – Testing / Analyse-Queries

Alle SQL-Statements liegen in `sql/analysis/*.sql`; dieses Notebook lädt sie nur und führt sie aus.

In [1]:
# Notebook setup: make the project code importable from this notebook.
import sys
from pathlib import Path

repo_root = Path("..").resolve()
src_path = (repo_root / "src").as_posix()

# The following cell imports with the `src.` prefix (`from src.timetable_etl...`).
# That form resolves only when the directory *containing* `src` (the repo root)
# is on sys.path, so the repo root is registered in addition to `src` itself.
# `src` is inserted last so it stays at the front of the path, as before.
for entry in (repo_root.as_posix(), src_path):
    if entry not in sys.path:
        sys.path.insert(0, entry)
print("Using repo root:", repo_root)


Using repo root: C:\CodingProjects\DIA\DBahn-berlin\repo


In [None]:
from src.timetable_etl.config import Settings
from src.timetable_etl.db import connect
from src.timetable_etl.sql import load_sql

# Build the settings object (presumably read from environment variables, going by
# `from_env` — confirm in timetable_etl.config) and open the single database
# connection that every query cell below reuses.
settings = Settings.from_env()
conn = connect(settings)


## 1) Random Stop

In [None]:
# Run the random-stop query and show the single row it returns.
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/random_stop.sql"))
    row = cur.fetchone()

# fetchone() yields None when the query produced no row; indexing into None
# would raise a TypeError, so that case is reported explicitly instead.
if row is None:
    print("Keine Stops gefunden.")
else:
    print("PK =", row[0])
    print("rest =", {"eva": row[1], "ar_ts": row[2], "dp_ts": row[3]})

## 2) Durchschnittliche Abfahrtsverspätung global

In [None]:
# Run the global average query; its single row unpacks to (average, count).
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/avg_departure_delay_global.sql"))
    avg_delay_seconds, n = cur.fetchone()

print("N =", n)
if avg_delay_seconds is None:
    # No average available: print None for both units.
    print("Ø Abfahrtsverspätung (Sek.) =", None)
    print("Ø Abfahrtsverspätung (Min.) =", None)
else:
    print("Ø Abfahrtsverspätung (Sek.) =", round(avg_delay_seconds, 2))
    # Convert to float before dividing so the minute value is a plain float.
    print("Ø Abfahrtsverspätung (Min.) =", round((float(avg_delay_seconds) / 60.0), 2))


## 3) Ø Abfahrtsverspätung nach Station (größte zuerst)

In [None]:
# Fetch the per-station averages; each row unpacks to (eva, name, n, avg_s).
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/avg_departure_delay_by_station.sql"))
    rows = cur.fetchall()

# Print only the first 20 rows (per the section heading the query is expected
# to return the largest averages first — confirm in the SQL file).
top_rows = rows[:20]
for eva, name, n, avg_s in top_rows:
    avg_min = avg_s / 60
    print(f"{eva} | {name:40s} | n={n:7d} | Ø={avg_min:7.2f} min")


## 4) Ø Abfahrtsverspätung für eine Station (ILIKE)

In [None]:
def avg_delay_for_station(conn, station_name: str):
    """Run the single-station average-delay query and return its first row.

    Args:
        conn: Open database connection providing ``cursor()``.
        station_name: Value bound to the ``station_name`` query parameter.

    Returns:
        The first result row, or whatever ``fetchone()`` yields when the
        query returns no rows (the caller checks for ``None``).
    """
    params = {"station_name": station_name}
    with conn.cursor() as cur:
        query = load_sql("analysis/avg_departure_delay_for_station.sql")
        cur.execute(query, params)
        return cur.fetchone()

# Look up the average departure delay for one station name and report it.
station = "Berlin Hauptbahnhof"
row = avg_delay_for_station(conn, station)

if row is not None:
    # The row unpacks to (eva, name, n, avg_s); avg_s is shown in minutes.
    eva, name, n, avg_s = row
    print(f"{eva} | {name} | n={n} | Ø={avg_s/60:.2f} min")
else:
    print("Keine passende Station oder keine gültigen Stop-Daten gefunden.")


## 5) Stationen visualisieren (Folium)

In [None]:
import folium

# Load the station rows; each row is consumed below as (eva, name, lat, lon).
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/stations_in_bbox.sql"))
    rows = cur.fetchall()

# Optional: Berlin-BBox filter
# Keep only rows that have both coordinates and lie inside the bounding box.
BERLIN_BBOX = {"lat_min": 52.3, "lat_max": 52.7, "lon_min": 13.0, "lon_max": 13.85}
rows = [
    r
    for r in rows
    if r[2] is not None
    and r[3] is not None
    and BERLIN_BBOX["lat_min"] <= r[2] <= BERLIN_BBOX["lat_max"]
    and BERLIN_BBOX["lon_min"] <= r[3] <= BERLIN_BBOX["lon_max"]
]

if not rows:
    # Hard-coded fallback centre used when no station survives the filter.
    center = [52.52, 13.405]
else:
    # Centre the map on the mean coordinate of the remaining stations.
    lats = [r[2] for r in rows]
    lons = [r[3] for r in rows]
    center = [sum(lats) / len(lats), sum(lons) / len(lons)]

m = folium.Map(location=center, zoom_start=11, tiles="OpenStreetMap")
# Draw at most 2000 markers.
for eva, name, lat, lon in rows[:2000]:
    marker = folium.CircleMarker([lat, lon], radius=4, popup=f"{name} ({eva})")
    marker.add_to(m)

m


## 6) Nächste Station zu einer Koordinate

In [None]:
def nearest_station(conn, lat: float, lon: float):
    """Run the nearest-station query for a coordinate and return its first row.

    Args:
        conn: Open database connection providing ``cursor()``.
        lat: Value bound to the ``lat`` query parameter.
        lon: Value bound to the ``lon`` query parameter.

    Returns:
        Whatever ``fetchone()`` yields for the query (the row layout is
        defined by ``analysis/nearest_station.sql``).
    """
    coords = {"lat": lat, "lon": lon}
    with conn.cursor() as cur:
        query = load_sql("analysis/nearest_station.sql")
        cur.execute(query, coords)
        return cur.fetchone()

# Query coordinate for the nearest-station lookup; the raw result row is printed.
lat = 52.5200
lon = 13.4050
row = nearest_station(conn, lat, lon)
print(row)


## 7) Station nach Name suchen

In [None]:
def get_station_by_name(conn, station_name: str):
    """Look up a station by name and return its id and coordinates.

    Args:
        conn: Open database connection providing ``cursor()``.
        station_name: Value bound to the ``name`` query parameter.

    Returns:
        A dict with the keys ``eva``, ``latitude`` and ``longitude`` built
        from the first result row, or ``None`` when the query returns no row.
    """
    with conn.cursor() as cur:
        query = load_sql("analysis/get_station_by_name.sql")
        cur.execute(query, {"name": station_name})
        hit = cur.fetchone()
    if hit is not None:
        # The row is expected to hold exactly three values.
        eva, lat, lon = hit
        return {"eva": eva, "latitude": lat, "longitude": lon}
    return None

# Example lookup: prints the station dict, or None when no row matches.
print(get_station_by_name(conn, "Hackescher Markt"))


## 8) Cancelled stops in Snapshot (YYMMDDHH)

In [None]:
# Snapshot key in YYMMDDHH form, bound to the query's `snap` parameter.
snap = "25100113"  # YYMMDDHH
query_params = {"snap": snap}
with conn.cursor() as cur:
    query = load_sql("analysis/cancelled_stops_in_snapshot.sql")
    cur.execute(query, query_params)
    row = cur.fetchone()
# Show the raw first result row.
print(row)


In [None]:
# Release the database connection opened at the top of the notebook; cells that
# use `conn` cannot be re-run after this without reconnecting first.
conn.close()
print("Done.")
