# Task 2 – Testing / Analyse Queries

Alle SQL-Statements liegen in `sql/analysis/*.sql` und werden hier nur geladen.

In [1]:
# Notebook setup: allow imports from ../src
import sys
from pathlib import Path

repo_root = Path("..").resolve()
if repo_root.as_posix() not in sys.path:
    sys.path.insert(0, repo_root.as_posix())
print("Using repo root:", repo_root)

Using repo root: C:\CodingProjects\DIA\DBahn-berlin\repo


In [2]:
from src.timetable_etl.config import Settings
from src.timetable_etl.db import connect
from src.timetable_etl.sql import load_sql

settings = Settings.from_env()
conn = connect(settings)


## 1) Random Stop

In [3]:
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/random_stop.sql"))
    row = cur.fetchone()

print(row)

('2797420159025155747-2509191044-25', 8089095, datetime.datetime(2025, 9, 19, 11, 37, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), datetime.datetime(2025, 9, 19, 11, 38, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), datetime.datetime(2025, 9, 19, 11, 39, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), datetime.datetime(2025, 9, 19, 11, 39, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), None, None)


## 2) Durchschnittliche Abfahrtsverspätung global

In [4]:
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/avg_departure_delay_global.sql"))
    avg_delay_seconds, n = cur.fetchone()

print("N =", n)
print("Ø Abfahrtsverspätung (Sek.) =", round(avg_delay_seconds, 2) if avg_delay_seconds is not None else None)
print("Ø Abfahrtsverspätung (Min.) =", round((float(avg_delay_seconds) / 60.0), 2) if avg_delay_seconds is not None else None)


N = 123.6616516491692155
Ø Abfahrtsverspätung (Sek.) = 870984
Ø Abfahrtsverspätung (Min.) = 14516.4


## 3) Ø Abfahrtsverspätung nach Station (größte zuerst)

In [5]:
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/avg_departure_delay_by_station.sql"))
    rows = cur.fetchall()

for eva, name, n, avg_s in rows[:20]:
    print(f"{eva} | {name:40s} | n={n:7d} | Ø={avg_s/60:7.2f} min")


8010404 | Berlin-Spandau                           | n=  10801 | Ø=   9.92 min
8011160 | Berlin Hauptbahnhof                      | n=  10697 | Ø=   9.57 min
8011113 | Berlin Südkreuz                          | n=   4780 | Ø=   6.47 min
8010406 | Berlin Zoologischer Garten               | n=   7209 | Ø=   5.86 min
8010255 | Berlin Ostbahnhof                        | n=   7984 | Ø=   5.03 min
8010405 | Berlin-Wannsee                           | n=   4536 | Ø=   4.87 min
8010403 | Berlin-Charlottenburg                    | n=   3533 | Ø=   4.80 min
8011102 | Berlin Gesundbrunnen                     | n=   6582 | Ø=   4.13 min
8011155 | Alexanderplatz                           | n=   6299 | Ø=   3.99 min
8011306 | Berlin-Friedrichstraße                   | n=   6586 | Ø=   3.91 min
8011162 | Berlin Ostkreuz                          | n=  10124 | Ø=   3.62 min
8011118 | Berlin Potsdamer Platz                   | n=   4349 | Ø=   3.55 min
8011041 | Berlin-Lichterfelde Ost                  |

## 4) Ø Abfahrtsverspätung für eine Station (ILIKE)

In [6]:
def avg_delay_for_station(conn, station_name: str):
    with conn.cursor() as cur:
        cur.execute(load_sql("analysis/avg_departure_delay_for_station.sql"), {"station_name": station_name})
        return cur.fetchone()

station = "Berlin Hauptbahnhof"
row = avg_delay_for_station(conn, station)

if row is None:
    print("Keine passende Station oder keine gültigen Stop-Daten gefunden.")
else:
    eva, name, n, avg_s = row
    print(f"{eva} | {name} | n={n} | Ø={avg_s/60:.2f} min")


8011160 | Berlin Hauptbahnhof | n=10697 | Ø=9.57 min


## 5) Stationen visualisieren (Folium)

In [7]:
import folium

with conn.cursor() as cur:
    cur.execute(load_sql("analysis/stations_in_bbox.sql"))
    rows = cur.fetchall()

# Optional: Berlin-BBox filter
BERLIN_BBOX = {"lat_min": 52.3, "lat_max": 52.7, "lon_min": 13.0, "lon_max": 13.85}
rows = [r for r in rows if r[2] is not None and r[3] is not None and (BERLIN_BBOX["lat_min"] <= r[2] <= BERLIN_BBOX["lat_max"]) and (BERLIN_BBOX["lon_min"] <= r[3] <= BERLIN_BBOX["lon_max"])]

if rows:
    lats = [r[2] for r in rows]
    lons = [r[3] for r in rows]
    center = [sum(lats) / len(lats), sum(lons) / len(lons)]
else:
    center = [52.52, 13.405]

m = folium.Map(location=center, zoom_start=11, tiles="OpenStreetMap")
for eva, name, lat, lon in rows[:2000]:
    folium.CircleMarker([lat, lon], radius=4, popup=f"{name} ({eva})").add_to(m)

m

## 6) Nächste Station zu einer Koordinate

In [8]:
def nearest_station(conn, lat: float, lon: float):
    with conn.cursor() as cur:
        cur.execute(load_sql("analysis/nearest_station.sql"), {"lat": lat, "lon": lon})
        return cur.fetchone()

lat, lon = 52.5, 13.5
row = nearest_station(conn, lat, lon)
print(row)


(8089006, 'Betriebsbahnhof Berlin-Rummelsburg', 52.493827, 13.497786, 702.578990929422)


## 7) Station nach Name suchen

In [9]:
def get_station_by_name(conn, station_name: str):
    with conn.cursor() as cur:
        cur.execute(load_sql("analysis/get_station_by_name.sql"), {"name": station_name})
        row = cur.fetchone()
    if row is None:
        return None
    eva, lat, lon = row
    return {"eva": eva, "latitude": lat, "longitude": lon}

print(get_station_by_name(conn, "Hackescher Markt"))


{'eva': 8089017, 'latitude': 52.522622, 'longitude': 13.402364}


## 8) Cancelled stops in Snapshot (YYMMDDHH)

In [10]:
snap = "25100113"  # YYMMDDHH
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/cancelled_stops_in_snapshot.sql"), {"snap": snap})
    row = cur.fetchone()
print(row)


(11,)


In [11]:
conn.close()