# Task 1 – ETL Pipeline

The first cells only make sure that the repository root is set correctly (added to the import path) and that the database connection is established.

In [None]:
import sys
from pathlib import Path

# Resolve the parent of the current working directory and put it at the
# front of the import path (only once), so that the ``src`` package used by
# the following cells can be imported.
# NOTE(review): assumes the notebook is run from a directory one level below
# the repository root -- confirm.
repo_root = Path("..").resolve()
_root_entry = repo_root.as_posix()
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

In [None]:
from src.timetable_etl.config import Settings
from src.timetable_etl.db import connect

# Build the pipeline configuration via ``Settings.from_env()`` (presumably
# read from environment variables -- verify in ``config``) and echo it so the
# active configuration is visible in the notebook output.
# NOTE(review): if ``Settings`` carries database credentials, confirm that
# its printed form masks them before sharing the notebook output.
settings = Settings.from_env()
print(settings)
# Open the database connection that is passed to every ETL step below.
conn = connect(settings)

## Step 0: Deduplication Investigation

This step documents our investigation: duplicate keys exist in the ``stop_id`` attribute, and thus deduplication is necessary.

In [None]:
import os

from src.timetable_etl.dedup_invest import count_stop_id_duplicates

# Run the duplicate check on the ``timetables`` directory under the
# repository root. The path is handed over as a plain string, exactly as
# ``os.path.join`` would produce it.
timetables_dir = str(repo_root / "timetables")
count_stop_id_duplicates(timetables_dir)

In [None]:
# Same duplicate check, this time on the ``timetable_changes`` directory
# under the repository root (path passed as a plain string).
timetable_changes_dir = str(repo_root / "timetable_changes")
count_stop_id_duplicates(timetable_changes_dir)

## Step 1: Build Station Table

The imported function reads the ``stations_data.json`` and builds the station dimension table from it.

In [None]:
from src.timetable_etl.stations import import_stationen

# Load the station data from the JSON file configured in
# ``settings.station_json_path`` using the open connection.
# Per the label printed below, the return value is taken to be the number of
# upserted station rows -- TODO confirm against ``import_stationen``.
n = import_stationen(conn, settings.station_json_path)
print("Upserted station rows:", n)

## Step 2: Build other Dimension Tables + Fact Table

The imported function builds the other dimension tables. It also iterates over the ``timetables`` directory and inserts the planned values into the stops table.

In [None]:
from src.timetable_etl.stops_planned import import_stops_from_archives

# All tuning knobs for the planned-stops import come straight from the
# settings object; collect them first so the call itself stays short.
planned_options = {
    "pattern": settings.archive_pattern,
    "timezone": settings.timezone,
    "match_threshold": settings.match_threshold,
    "ambiguity_delta": settings.ambiguity_delta,
    "batch_size": settings.planned_batch_size,
}

# Run the import over the planned archives; the bare name on the last line
# makes the notebook display the result.
planned_res = import_stops_from_archives(
    conn, settings.planned_archives_path, **planned_options
)
planned_res

## Step 3: Insert Change Values to Fact Table

The imported function iterates over the ``timetable_changes`` directory and updates the stops table accordingly.

In [None]:
from src.timetable_etl.stops_changed import process_change_archives

# Options for processing the change archives, taken from the settings object.
change_options = {
    "pattern": settings.archive_pattern,
    "batch_size": settings.change_batch_size,
}

# Process the change archives; the bare name on the last line makes the
# notebook display the result.
changed_res = process_change_archives(
    conn, settings.changes_archives_path, **change_options
)
changed_res


In [None]:
# Release the database connection now that all ETL steps have run.
conn.close()