# Task 2 – Investigation Queries on DWH

All SQL statements are located in `sql/analysis/*.sql`.
Furthermore, this notebook includes some queries that were not required for the task; we added them because we wanted to investigate our data a little further.

In [1]:
# Notebook setup: allow imports from ../src
import sys
from pathlib import Path

# Resolve the parent of the current working directory (the repository root)
# and put it at the front of sys.path so that `src.*` packages are importable.
repo_root = Path("..").resolve()
_repo_root_entry = repo_root.as_posix()
if _repo_root_entry not in sys.path:
    sys.path.insert(0, _repo_root_entry)

In [2]:
from src.timetable_etl.config import Settings
from src.timetable_etl.db import connect
from src.timetable_etl.sql import load_sql

# Build the ETL settings via Settings.from_env()
# (presumably read from environment variables — confirm in config.py).
settings = Settings.from_env()
# Open the connection shared by every query cell below; it is closed in the final cell.
conn = connect(settings)

## (Not Task Relevant): Random Stop

This query yields a random stop. We used it as a quick sanity check to verify that our pipeline actually produces the intended output.

In [3]:
# Run the random-stop query and keep only the first result row.
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/random_stop.sql"))
    row = cur.fetchone()  # a single row tuple, or None if the query returned nothing

print(row)

('-8470659884147991810-2509271851-11', 8089091, datetime.datetime(2025, 9, 27, 19, 23, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), datetime.datetime(2025, 9, 27, 19, 23, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), None, None, None, None)


## (Not Task Relevant): Visualization of all Stations

This query was helpful for us to visualize all stations and verify that the coordinates were extracted correctly.

In [4]:
import folium

# Fetch all station rows; each row is unpacked below as (eva, name, lat, lon).
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/stations_in_bbox.sql"))
    rows = cur.fetchall()

# Bounding box used to drop rows with missing coordinates or coordinates
# outside the Berlin area.
BERLIN_BBOX = {"lat_min": 52.3, "lat_max": 52.7, "lon_min": 13.0, "lon_max": 13.85}
rows = [
    r
    for r in rows
    if r[2] is not None
    and r[3] is not None
    and BERLIN_BBOX["lat_min"] <= r[2] <= BERLIN_BBOX["lat_max"]
    and BERLIN_BBOX["lon_min"] <= r[3] <= BERLIN_BBOX["lon_max"]
]

# Centre the map on the mean position of the remaining stations; when nothing
# is left, fall back to a fixed default point inside the bounding box.
if not rows:
    center = [52.52, 13.405]
else:
    lats = [r[2] for r in rows]
    lons = [r[3] for r in rows]
    center = [sum(lats) / len(lats), sum(lons) / len(lons)]

m = folium.Map(location=center, zoom_start=11, tiles="OpenStreetMap")
# Only the first 2000 stations get a marker (presumably to keep the map light).
for eva, name, lat, lon in rows[:2000]:
    marker = folium.CircleMarker([lat, lon], radius=4, popup=f"{name} ({eva})")
    marker.add_to(m)

m

## Task 2.1 Return Station identifier and coordinates (given name)

In [5]:
def get_station_by_name(conn, station_name: str):
    """Look up a station's identifier and coordinates by its name.

    Executes ``analysis/get_station_by_name.sql`` with ``station_name`` bound
    to the query parameter ``name`` and reads the first result row.

    Args:
        conn: Open database connection providing ``cursor()`` as a context manager.
        station_name: Station name passed to the query.

    Returns:
        A dict with the keys ``eva``, ``latitude`` and ``longitude``, or
        ``None`` when the query returns no row.
    """
    with conn.cursor() as cursor:
        cursor.execute(
            load_sql("analysis/get_station_by_name.sql"),
            {"name": station_name},
        )
        match = cursor.fetchone()

    if match is not None:
        # The query is expected to yield exactly three columns.
        station_eva, station_lat, station_lon = match
        return {
            "eva": station_eva,
            "latitude": station_lat,
            "longitude": station_lon,
        }
    return None

# Example lookup: prints the station dict, or None if no station matches the name.
print(get_station_by_name(conn, "Hackescher Markt"))

{'eva': 8089017, 'latitude': 52.522622, 'longitude': 13.402364}


## Task 2.2 Return closest Station

In [6]:
def nearest_station(conn, lat: float, lon: float):
    """Return the station closest to the given coordinate.

    Executes ``analysis/nearest_station.sql`` with the named parameters
    ``lat`` and ``lon`` and returns the first result row unchanged.

    Args:
        conn: Open database connection providing ``cursor()`` as a context manager.
        lat: Latitude of the query point.
        lon: Longitude of the query point.

    Returns:
        The first row produced by the query, or ``None`` if it yields no rows.
    """
    query_point = {"lat": lat, "lon": lon}
    with conn.cursor() as cursor:
        cursor.execute(load_sql("analysis/nearest_station.sql"), query_point)
        closest = cursor.fetchone()
    return closest

# Example query point (latitude, longitude).
lat, lon = 52.5, 13.5
row = nearest_station(conn, lat, lon)
# Prints the raw result row; the saved output shows (eva, name, lat, lon, distance).
# NOTE(review): the distance unit is defined in the SQL file — presumably metres, confirm.
print(row)

(8089006, 'Betriebsbahnhof Berlin-Rummelsburg', 52.493827, 13.497786, 702.578990929422)


## Task 2.3 Number of Cancelled Stops per Snapshot

In [7]:
# Snapshot identifier to inspect; passed to the query as the named parameter `snap`.
snap = "25100113"  # YYMMDDHH
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/cancelled_stops_in_snapshot.sql"), {"snap": snap})
    row = cur.fetchone()  # first result row; the saved output shows a one-element tuple
print(row)

(11,)


## Task 2.4 Average Delay by Station

In [8]:
def avg_delay_for_station(conn, station_name: str):
    """Fetch the average departure delay for one station.

    Executes ``analysis/avg_departure_delay_for_station.sql`` with
    ``station_name`` bound to the query parameter of the same name and
    returns the first result row unchanged.

    Args:
        conn: Open database connection providing ``cursor()`` as a context manager.
        station_name: Station name passed to the query.

    Returns:
        The first row produced by the query, or ``None`` if it yields no rows.
        The calling cell unpacks the row as ``(eva, name, n, avg_s)``.
    """
    bind_params = {"station_name": station_name}
    with conn.cursor() as cursor:
        cursor.execute(
            load_sql("analysis/avg_departure_delay_for_station.sql"),
            bind_params,
        )
        result = cursor.fetchone()
    return result

station = "Berlin Hauptbahnhof"
row = avg_delay_for_station(conn, station)

# No row means the query returned nothing for this station name.
if row is not None:
    eva, name, n, avg_s = row
    # avg_s is divided by 60 so the average is displayed in minutes.
    print(f"{eva} | {name} | n={n} | Ø={avg_s/60:.2f} min")
else:
    print("Station name is invalid")


8011160 | Berlin Hauptbahnhof | n=10697 | Ø=9.57 min


## Task 2.4 Average Delay for all Stations

In [9]:
# Load the per-station average departure delay for all stations.
with conn.cursor() as cur:
    cur.execute(load_sql("analysis/avg_departure_delay_by_station.sql"))
    rows = cur.fetchall()

# Print only the first 20 rows as an aligned table; avg_s is divided by 60 to show minutes.
# NOTE(review): row order comes from the SQL file — the saved output appears sorted by descending average delay.
for eva, name, n, avg_s in rows[:20]:
    print(f"{eva} | {name:40s} | n={n:7d} | Avg Delay={avg_s/60:7.2f} min")

8010404 | Berlin-Spandau                           | n=  10801 | Avg Delay=   9.92 min
8011160 | Berlin Hauptbahnhof                      | n=  10697 | Avg Delay=   9.57 min
8011113 | Berlin Südkreuz                          | n=   4780 | Avg Delay=   6.47 min
8010406 | Berlin Zoologischer Garten               | n=   7209 | Avg Delay=   5.86 min
8010255 | Berlin Ostbahnhof                        | n=   7984 | Avg Delay=   5.03 min
8010405 | Berlin-Wannsee                           | n=   4536 | Avg Delay=   4.87 min
8010403 | Berlin-Charlottenburg                    | n=   3533 | Avg Delay=   4.80 min
8011102 | Berlin Gesundbrunnen                     | n=   6582 | Avg Delay=   4.13 min
8011155 | Alexanderplatz                           | n=   6299 | Avg Delay=   3.99 min
8011306 | Berlin-Friedrichstraße                   | n=   6586 | Avg Delay=   3.91 min
8011162 | Berlin Ostkreuz                          | n=  10124 | Avg Delay=   3.62 min
8011118 | Berlin Potsdamer Platz           

In [10]:
# Close the connection opened in the setup cell; re-run that cell before executing any query cell again.
conn.close()